When compiling the kernel attached to this post, this error message in returned:
0 llc 0x00000000009e18cf 1 llc 0x00000000009e2aa9 2 libpthread.so.0 0x00007f7a76fbb080 3 llc 0x00000000005fc31a 4 llc 0x000000000060227d 5 llc 0x0000000000602b8f 6 llc 0x00000000006033ef 7 llc 0x00000000005f676a 8 llc 0x00000000007d251a 9 llc 0x00000000007d2d60 10 llc 0x00000000007d2f9c 11 llc 0x000000000041ba06 12 libc.so.6 0x00007f7a76c585a6 __libc_start_main + 230 13 llc 0x0000000000419779 Stack dump: 0. Program arguments: /home/fr4nz/atistream/bin/x86_64/llc -mcpu=gummy -mattr=mwgs-3-256-1-1 -regalloc=linearscan -mtriple=amdil-pc-amdopencl /tmp/OCLOZTphA.bc -f -o /tmp/OCLOZTphA.il 1. Running pass 'AMD IL Control Flow Graph structurizer Pass' on function '@__OpenCL_intersect_kernel' Segmentation fault |
Specs: Ubuntu 9.04 64bit, ATI 5770, OpenCL 2.01 and 10.2 catalyst drivers.
The problem seems to reside in this "for" loop:
for(init.w=(k-3); init.w>=0; init.w--) tmpIntersect &= tmpBuffer[tmpCandItems[lid + init.w*lsize]]; |
In fact, if I modify the kernel in order to make a "forward" loop:
for(init.w=0; init.w<(k-2); init.w++) tmpIntersect &= tmpBuffer[tmpCandItems[lid + init.w*lsize]]; |
then the kernel is correctly compiled.
__kernel void intersect(__global uint4 *inputVD, __global uint *inputPrefixCk, __local uint4 *tmpBuffer, __local uint4 *tempIntersect, // Not used; __local uint *tmpCandItems, __local uint *tempCOUNTS, // Not used; __global uint *outputCounts, const int k, const int numItems, const int dimRowVec4, const int numIntersections) { const int gid = get_global_id(0); const int lid = get_local_id(0); const int lsize = get_local_size(0); uint4 tmpCOUNTS = 0; uint4 tmpIntersect; uint4 init; init.z = 1; init.x = (gid < numIntersections) ? 0 : k; for( ; init.x<k; init.x++) tmpCandItems[lid + init.x*lsize] = inputPrefixCk[gid*k + init.x]; for(init.x=0; init.x<dimRowVec4; init.x++) { init.y = (lid < numItems) ? lid : numItems; for( ; init.y<numItems; init.y+=lsize) tmpBuffer[init.y] = inputVD[init.y*dimRowVec4 + init.x]; barrier(CLK_LOCAL_MEM_FENCE); if(gid < numIntersections) { tmpIntersect = tmpBuffer[tmpCandItems[lid + (k-1)*lsize]] & tmpBuffer[tmpCandItems[lid + (k-2)*lsize]]; for(init.w=(k-3); init.w>=0; init.w--) tmpIntersect &= tmpBuffer[tmpCandItems[lid + init.w*lsize]]; // Bit count for(init.w=0 ; init.w<32 ; init.w++) { tmpCOUNTS += tmpIntersect & (uint4)init.z; tmpIntersect = tmpIntersect >> (uint4)init.z; } } } if(gid < numIntersections) outputCounts[gid] = tmpCOUNTS.x + tmpCOUNTS.y + tmpCOUNTS.z + tmpCOUNTS.w; }