Hi,
I have a small kernel, very simple. If you have the following input
"1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
It should create a simple scan
1 2 3 4 5 6 7 8 ...
It works only on NVidia SDK but not on AMD one !
But I got wrong results.
__kernel void kernel__scanIntra(__global uint* input, uint size) { size_t idx = get_global_id(0); const uint lane = get_local_id(0); const uint bid = get_group_id(0); if (lane >= 1 && idx < size) input[idx] = input[idx - 1] + input[idx]; barrier(CLK_LOCAL_MEM_FENCE); if (lane >= 2 && idx < size) input[idx] = input[idx - 2] + input[idx]; barrier(CLK_LOCAL_MEM_FENCE); if (lane >= 4 && idx < size) input[idx] = input[idx - 4] + input[idx]; barrier(CLK_LOCAL_MEM_FENCE); if (lane >= 8 && idx < size) input[idx] = input[idx - 8] + input[idx]; barrier(CLK_LOCAL_MEM_FENCE); if (lane >= 16 && idx < size) input[idx] = input[idx - 16] + input[idx]; barrier(CLK_LOCAL_MEM_FENCE); }