7 Replies Latest reply on Apr 20, 2011 1:07 PM by MicahVillmow

    My code uses uncached reads despite const * restrict pointers

    Wibowit
      sprofile (in Linux) shows Cache Hit of 0.00 %, so no cache is used

      Hi,

      My program is here: http://www63.zippyshare.com/v/93264665/file.html (note that it's a Java program). It's an ST5 implementation that accepts two parameters: the first is the input file, the second is the output file.

      The first problem is that sprofile hogs my RAM. I have 8 GiB of RAM, but after about 30 kernel invocations memory fills up, then the system becomes unresponsive for a moment (or sometimes permanently), and then profiling speeds up. I don't know what's happening there, but such high memory usage suggests a memory leak.

      My main problem is that I've used const * restrict modifiers, but decompiled code shows only uncached fetches: 8  VFETCH R2.x___, R0.w, fc173  MEGA(4)

       I'm concerned about the two attached kernels. They do not use the cache; however, using the cache should greatly reduce memory fetches. How can I enable caching?

      BTW: My code doesn't compile if I add __read_only or similar modifiers. Why? The compiler says that __read_only is an unrecognized identifier.

      /*
       * Step a cyclic index forward by one: returns pos + 1, or 0 when pos + 1
       * is not below `length`. The comparison mixes uint and int exactly as the
       * inline expression it replaces did.
       */
      uint nextCyclic(uint pos, int length)
      {
          return (pos + 1) < length ? pos + 1 : 0;
      }

      /*
       * Look up tuples[slot], keep its low 24 bits as a position, step that
       * position back by one (position 0 wraps to length - 1) and return the
       * byte of `data` found there.
       */
      uchar precedingByte(__global ulong * tuples, __global uchar const * restrict data, int length, size_t slot)
      {
          uint position = tuples[slot] & 0xFFFFFF;

          if (position == 0) {
              position = length - 1;
          } else {
              position = position - 1;
          }
          return data[position];
      }

      /*
       * One work-item per global id `gid`: builds a 64-bit tuple and stores it
       * in tuples[gid].
       *
       *   bits 63..32  four consecutive bytes of `data` starting at index gid,
       *                first byte most significant, the index wrapping to 0
       *                when it reaches `length`
       *   low word     (fifth byte << 24) + gid
       *
       * data    input bytes (read only)
       * length  wrap-around limit for indices into `data`
       * tuples  output, one ulong per work-item
       *
       * NOTE(review): gid is presumably expected to fit in 24 bits, since
       * retrieveST5 masks tuples with 0xFFFFFF; nothing here enforces it.
       */
      __kernel void generateTuples(__global uchar const * restrict data, __private int length, __global ulong * tuples)
      {
          size_t gid = get_global_id(0);
          uint cursor = gid;
          uint prefix = 0;

          /* Accumulate four bytes, most significant first, advancing cyclically. */
          for (int i = 0; i < 4; ++i) {
              prefix = (prefix << 8) + data[cursor];
              cursor = nextCyclic(cursor, length);
          }

          /* `cursor` now addresses the fifth byte of the cyclic window. */
          ulong packed = convert_ulong(prefix) << 32;
          packed |= (convert_uint(data[cursor]) << 24) + gid;
          tuples[gid] = packed;
      }

      /*
       * One work-item per global id `gid`: reads the 16 tuples
       * tuples[gid * 16 .. gid * 16 + 15], and for each one fetches the byte of
       * `data` located one position before the tuple's low-24-bit position
       * (cyclically, position 0 maps to length - 1). The 16 bytes are written,
       * in tuple order, as output[gid].
       *
       * tuples  input tuples; only the low 24 bits of each are used here
       * data    input bytes (read only)
       * length  wrap-around limit for positions in `data`
       * output  one uchar16 per work-item
       */
      __kernel void retrieveST5(__global ulong * tuples, __global uchar const * restrict data, __private int length, __global uchar16 * output)
      {
          size_t gid = get_global_id(0);
          size_t first = gid * 16;

          uchar16 chunk = (uchar16)(
              precedingByte(tuples, data, length, first + 0),
              precedingByte(tuples, data, length, first + 1),
              precedingByte(tuples, data, length, first + 2),
              precedingByte(tuples, data, length, first + 3),
              precedingByte(tuples, data, length, first + 4),
              precedingByte(tuples, data, length, first + 5),
              precedingByte(tuples, data, length, first + 6),
              precedingByte(tuples, data, length, first + 7),
              precedingByte(tuples, data, length, first + 8),
              precedingByte(tuples, data, length, first + 9),
              precedingByte(tuples, data, length, first + 10),
              precedingByte(tuples, data, length, first + 11),
              precedingByte(tuples, data, length, first + 12),
              precedingByte(tuples, data, length, first + 13),
              precedingByte(tuples, data, length, first + 14),
              precedingByte(tuples, data, length, first + 15));

          output[gid] = chunk;
      }