
Binary array to list conversion
genaganna Sep 22, 2011 10:28 AM (in response to eugenek)Originally posted by: eugenek I wonder if anyone ever encountered this problem. For its apparent simplicity (and usefulness), I've been unable to find any standard library implementations, or algorithms, or even a common name for it.
Suppose I have a long binary array of 0's and 1's. (Assume that the array is >1 MB long.) Most bits are 0's, but some are 1's. I need to generate a list of integers with coordinates of '1's in the array.
I have a rather kludgy solution (involving three kernel launches, local memory, calls to an undocumented AMDspecific function popcnt(), and a lot of conditionals), but I wonder if there's a known better way to do this.
We can help you if you copy your kernel code here.

Binary array to list conversion
eugenek Sep 22, 2011 8:10 PM (in response to genaganna)Here's my current kernel. It's rather complicated, I took out boundary checks to make it more readable. You'd launch maskToList_phase1 with 1024x256 grid, maskToList_phase2 with 1x256 grid, and maskToList_phase3 with 4096x256 grid.
__kernel void maskToList_phase1(__global uint4* in, __global int* helper, __global int* helper2, int bytesPerThread) { int lid = get_local_id(0); int gid = get_global_id(0)  lid; helper += gid*4; int in_offset = get_global_id(0)*4*bytesPerThread; int i; uint4 count=(uint4)(0,0,0,0); in+=in_offset/16; __local int counts[1024]; for(i=0; i<bytesPerThread/4; i++) { uint4 x = *(in+i); count += popcnt(x); } count.y += count.x; count.w += count.z; count.z += count.y; count.w += count.y; counts[4*lid] = count.x; counts[4*lid+1] = count.y; counts[4*lid+2] = count.z; counts[4*lid+3] = count.w; barrier(CLK_LOCAL_MEM_FENCE); for(int scale=4; scale<256; scale*=2) { int dx = lid & (scale1); int id = (lid & ~(scale1))*2 + scale; counts[id+dx] += counts[id1]; counts[id+dx+512] += counts[id1+512]; barrier(CLK_LOCAL_MEM_FENCE); } counts[256+lid] += counts[2561]; counts[512+256+lid] += counts[512+2561]; barrier(CLK_LOCAL_MEM_FENCE); counts[lid+512] += counts[5121]; counts[lid+256+512] += counts[5121]; barrier(CLK_LOCAL_MEM_FENCE); helper[lid*4] = counts[lid*4]; helper[lid*4+1] = counts[lid*4+1]; helper[lid*4+2] = counts[lid*4+2]; helper[lid*4+3] = counts[lid*4+3]; if(lid == 255) helper2[gid/256] = counts[10241]; } __kernel void maskToList_phase2(__global int* input) { __local int temp[1024]; int tid = get_local_id(0); int4 in = vload4(0, input+4*tid); in.y += in.x; in.w += in.z; in.z += in.y; in.w += in.y; vstore4(in, 0, temp+4*tid); barrier(CLK_LOCAL_MEM_FENCE); for(int scale=4; scale<256; scale*=2) { int dx = tid & (scale1); int id = (tid & ~(scale1))*2 + scale; temp[id+dx] += temp[id1]; temp[id+512+dx] += temp[id+5121]; barrier(CLK_LOCAL_MEM_FENCE); } temp[256+tid] += temp[2561]; temp[512+256+tid] += temp[512+2561]; barrier(CLK_LOCAL_MEM_FENCE); temp[512+tid] += temp[5121]; temp[512+256+tid] += temp[5121]; barrier(CLK_LOCAL_MEM_FENCE); input[4*tid] = temp[4*tid]; input[4*tid+1] = temp[4*tid+1]; input[4*tid+2] = temp[4*tid+2]; input[4*tid+3] = temp[4*tid+3]; } __kernel void maskToList_phase3(__global uint* in, __global ulong* out, __global int* helper, __global int* helper2, int bytesPerThread) { int tid = get_global_id(0); int blockIdx = tid / 1024; int localIdx = tid & 1023; int in_offset = (tid & ~3)*bytesPerThread + (tid & 3)*4; in += in_offset/4; helper += blockIdx*1024; int i, count; out += (blockIdx==0)?0:helper2[blockIdx1]; out += (localIdx==0)?0:helper[localIdx1]; ulong base = in_offset*8*mult; for(i=0; i<bytesPerThread; i+=4) { uint v = in[i]; int pos; while((pos = clz(v)) != 32) { pos=31pos; *out=base+i*32+pos; out++; v &= ~(1<<pos); } } }
