Fr4nz

Bug in OpenCL compiler? Segmentation fault...

Discussion created by Fr4nz on Mar 12, 2010
Latest reply on Mar 15, 2010 by omkaranathan

When compiling the kernel attached to this post, this error message in returned:

 

0   llc             0x00000000009e18cf
1   llc             0x00000000009e2aa9
2   libpthread.so.0 0x00007f7a76fbb080
3   llc             0x00000000005fc31a
4   llc             0x000000000060227d
5   llc             0x0000000000602b8f
6   llc             0x00000000006033ef
7   llc             0x00000000005f676a
8   llc             0x00000000007d251a
9   llc             0x00000000007d2d60
10  llc             0x00000000007d2f9c
11  llc             0x000000000041ba06
12  libc.so.6       0x00007f7a76c585a6 __libc_start_main + 230
13  llc             0x0000000000419779
Stack dump:
0.    Program arguments: /home/fr4nz/atistream/bin/x86_64/llc -mcpu=gummy -mattr=mwgs-3-256-1-1 -regalloc=linearscan -mtriple=amdil-pc-amdopencl /tmp/OCLOZTphA.bc -f -o /tmp/OCLOZTphA.il
1.    Running pass 'AMD IL Control Flow Graph structurizer Pass' on function '@__OpenCL_intersect_kernel'
Segmentation fault


Specs: Ubuntu 9.04 64bit, ATI 5770, OpenCL 2.01 and 10.2 catalyst drivers.

The problem seems to reside in this "for" loop:

 

for(init.w=(k-3); init.w>=0; init.w--)
                tmpIntersect &= tmpBuffer[tmpCandItems[lid + init.w*lsize]];


In fact, if I modify the kernel in order to make a "forward" loop:

 

for(init.w=0; init.w<(k-2); init.w++)
                tmpIntersect &= tmpBuffer[tmpCandItems[lid + init.w*lsize]];


then the kernel is correctly compiled.

__kernel void intersect(__global uint4 *inputVD, __global uint *inputPrefixCk, __local uint4 *tmpBuffer, __local uint4 *tempIntersect, // Not used; __local uint *tmpCandItems, __local uint *tempCOUNTS, // Not used; __global uint *outputCounts, const int k, const int numItems, const int dimRowVec4, const int numIntersections) { const int gid = get_global_id(0); const int lid = get_local_id(0); const int lsize = get_local_size(0); uint4 tmpCOUNTS = 0; uint4 tmpIntersect; uint4 init; init.z = 1; init.x = (gid < numIntersections) ? 0 : k; for( ; init.x<k; init.x++) tmpCandItems[lid + init.x*lsize] = inputPrefixCk[gid*k + init.x]; for(init.x=0; init.x<dimRowVec4; init.x++) { init.y = (lid < numItems) ? lid : numItems; for( ; init.y<numItems; init.y+=lsize) tmpBuffer[init.y] = inputVD[init.y*dimRowVec4 + init.x]; barrier(CLK_LOCAL_MEM_FENCE); if(gid < numIntersections) { tmpIntersect = tmpBuffer[tmpCandItems[lid + (k-1)*lsize]] & tmpBuffer[tmpCandItems[lid + (k-2)*lsize]]; for(init.w=(k-3); init.w>=0; init.w--) tmpIntersect &= tmpBuffer[tmpCandItems[lid + init.w*lsize]]; // Bit count for(init.w=0 ; init.w<32 ; init.w++) { tmpCOUNTS += tmpIntersect & (uint4)init.z; tmpIntersect = tmpIntersect >> (uint4)init.z; } } } if(gid < numIntersections) outputCounts[gid] = tmpCOUNTS.x + tmpCOUNTS.y + tmpCOUNTS.z + tmpCOUNTS.w; }

Outcomes