Archives Discussions

d_a_a_ · ‎11-26-2010

The kernel attached below crashes the OpenCL compiler on my machine[1] when targeting the GPU device--it works under the Nvidia's OpenCL compiler, though. Strangely enough, when I change the value of WGS to 3 or less the compiler doesn't crash anymore.

1. GNU/Linux 2.6.36 64-bit, fglrx 10.11, AMD SDK v2.2, Radeon HD5750.

// ---------- BEGIN kernel #define WGS 256 #define NUM_POINTS 8192 #define X_DIM 4 #define MAX_TREE_SIZE 50 #define MAX_FLOAT 3.40282e+38f #define TOP ( stack[stack_top] ) #define POP ( stack[stack_top--] ) #define PUSH(arity, exp) stack[stack_top + 1 - arity] = (exp); stack_top = stack_top + 1 - arity; #define ARG(n) (stack[stack_top - n]) #define CREATE_STACK( type, size ) type stack[size]; int stack_top = -1; #define NODE program[op] #define INTERPRETER_CORE case 1:PUSH(1,ARG(0)) break; case 6:PUSH(2,(ARG(1) == 0.0f ? 1.0f : ARG(0)/ARG(1))) break; case 3:PUSH(2,ARG(0) + ARG(1)) break; case 4:PUSH(2,ARG(0) - ARG(1)) break; case 5:PUSH(2,ARG(0) * ARG(1)) break; #define COMPACT_RANGE 4194303 #define SCALE_FACTOR 16 #define ARITY( packed ) ((packed & 0xE0000000) >> 29) #define INDEX( packed ) ((packed & 0x1FC00000) >> 22) #define AS_INT( packed ) (packed & 0x3FFFFF) #define AS_FLOAT( packed ) ((float)( packed & 0x3FFFFF ) * SCALE_FACTOR / COMPACT_RANGE) __kernel void evaluate( __global const uint* pop, __global const float* X, __global const float* Y, __global float* E, __local uint* program ) { __local float PE[WGS]; __local unsigned int program_size; CREATE_STACK( float, MAX_TREE_SIZE ); uint i_id = get_local_id( 0 ); uint g_id = get_group_id( 0 ); uint wg_size = get_local_size( 0 ); if( i_id == 0 ) program_size = pop[(MAX_TREE_SIZE + 1) * g_id]; barrier(CLK_LOCAL_MEM_FENCE); if( i_id < program_size ) program[i_id] = pop[(MAX_TREE_SIZE + 1) * g_id + i_id + 1]; barrier(CLK_LOCAL_MEM_FENCE); PE[i_id] = 0.0f; for( uint iter = 0; iter < NUM_POINTS/wg_size; ++iter ) { // ------------------------------- // Calls the interpreter (C macro) // ------------------------------- for( int op = program_size; op-- ; ) switch( INDEX( program[op] ) ) { INTERPRETER_CORE default: PUSH( 0, X[iter * wg_size + NUM_POINTS * AS_INT( program[op] ) + i_id] ); } // ------------------------------- PE[i_id] += pown( POP - Y[ iter * wg_size + i_id ], 2 ); } for( uint d = 2; d<= WGS; d *= 2 ) { barrier(CLK_LOCAL_MEM_FENCE); if( i_id % d == 0 ) PE[i_id] += PE[i_id + d/2]; } if( i_id == 0 ) E[g_id] = ( isinf( PE[0] ) || isnan( PE[0] ) ) ? MAX_FLOAT : PE[0] / (float) NUM_POINTS; } // ------------------- END kernel I get this GDB output when I try to build the above kernel: Program received signal SIGSEGV, Segmentation fault. 0x00007ffff47e1edd in ?? () from /usr/lib/libaticaldd.so (gdb) bt #0 0x00007ffff47e1edd in ?? () from /usr/lib/libaticaldd.so #1 0x00007ffff47d9c20 in ?? () from /usr/lib/libaticaldd.so #2 0x00007ffff47dabf7 in ?? () from /usr/lib/libaticaldd.so #3 0x00007ffff47b6935 in ?? () from /usr/lib/libaticaldd.so #4 0x00007ffff47b6d4f in ?? () from /usr/lib/libaticaldd.so #5 0x00007ffff47b824a in ?? () from /usr/lib/libaticaldd.so #6 0x00007ffff47b9070 in ?? () from /usr/lib/libaticaldd.so #7 0x00007ffff47b5ab3 in ?? () from /usr/lib/libaticaldd.so #8 0x00007ffff47575aa in ?? () from /usr/lib/libaticaldd.so #9 0x00007ffff46c72fc in ?? () from /usr/lib/libaticaldd.so #10 0x00007ffff46c758c in ?? () from /usr/lib/libaticaldd.so #11 0x00007ffff46c7fe9 in ?? () from /usr/lib/libaticaldd.so #12 0x00007ffff48bf715 in ?? () from /usr/lib/libaticaldd.so #13 0x00007ffff465c61a in ?? () from /usr/lib/libaticaldd.so #14 0x00007ffff465dacd in ?? () from /usr/lib/libaticaldd.so #15 0x00007ffff5d757c7 in ?? () from /opt/ati-stream-sdk-v2.2-lnx64/lib/x86_64/libatiocl64.so #16 0x00007ffff5d77b19 in ?? () from /opt/ati-stream-sdk-v2.2-lnx64/lib/x86_64/libatiocl64.so #17 0x00007ffff5d799b9 in ?? () from /opt/ati-stream-sdk-v2.2-lnx64/lib/x86_64/libatiocl64.so #18 0x00007ffff5d8782a in ?? () from /opt/ati-stream-sdk-v2.2-lnx64/lib/x86_64/libatiocl64.so #19 0x00007ffff5d8a9a7 in ?? () from /opt/ati-stream-sdk-v2.2-lnx64/lib/x86_64/libatiocl64.so #20 0x00007ffff5d5e329 in ?? () from /opt/ati-stream-sdk-v2.2-lnx64/lib/x86_64/libatiocl64.so #21 0x00007ffff5da27f8 in ?? () from /opt/ati-stream-sdk-v2.2-lnx64/lib/x86_64/libatiocl64.so #22 0x00007ffff5d3cdd3 in clBuildProgram () from /opt/ati-stream-sdk-v2.2-lnx64/lib/x86_64/libatiocl64.so #23 0x0000000000434841 in cl::Program::build (this=0x7fffffffdf90, devices=..., options=0x0, notifyFptr=0, data=0x0) at /opt/ati-stream-sdk-v2.2-lnx64/include/CL/cl.hpp:2474 #24 0x0000000000432131 in GP::BuildKernel (this=0x6666b0) at /home/douglas/gpocl/gpocl/trunk/src/gp/GP.cc:689 #25 0x0000000000424ac9 in GP::Run (this=0x6666b0) at /home/douglas/gpocl/gpocl/trunk/src/gp/GP.h:133 #26 0x0000000000423fa7 in main (argc=22, argv=0x7fffffffe438) at /home/douglas/gpocl/gpocl/trunk/src/gp/gpocl.cc:54

MicahVillmow · ‎11-29-2010

Thanks for reporting this, I have reported to the correct team and it should be fixed in a future catalyst release.

keldor314 · ‎12-01-2010

Posting this here since I can't for the life of me find the "new topic" button.

I've found another case that makes the compiler explode. In this case, rather than actually crashing, it simply gets stuck in some sort of memory allocation loop, quickly grabbing all available system memory and proceeding to continue allocating into the page file until I force quit the process. The longest I waited had it allocating 16 GB of memory.

I've attached a kernel that reproduces the problem. The kernel is generated at runtime, and is dependent on the input data. The problem is not specific to this particular input data/kernel combination, and occurs on every varient I've tested.

It's worth noting that the code compiles and works correctly on Nvidia hardware.

There are two defines that are set as compiler parameters - BLOCK_SIZE = 64 and NUM_ITERATIONS = 100.

Win7 x64, AMD SDK 2.2, Radeon 5770 + GeForce GTX 295

#pragma OPENCL_EXTENSION cl_khr_fp64 : disable #define PI 3.1415926536f const sampler_t paletteSampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_REPEAT | CLK_FILTER_LINEAR; constant int shift1[4] = {6, 2, 13, 3}; constant int shift2[4] = {13, 27, 21, 12}; constant int shift3[4] = {18, 2, 7, 13}; constant unsigned int offset[4] = {4294967294, 4294967288, 4294967280, 4294967168}; unsigned int TausStep(unsigned int z, int s1, int s2, int s3, unsigned int M) { unsigned int b = ((z << s1) ^ z) >> s2; return (((z & M) << s3) ^b); } unsigned int RAND_INT(local volatile unsigned int* randStates) { unsigned int index = get_local_id(0); unsigned int i2 = get_local_id(0)&~31; randStates[index] = TausStep(randStates[index], shift1[index&3], shift2[index&3], shift3[index&3], offset[index&3]); return (randStates[index&31+i2]^randStates[(index+1)&31+i2]^randStates[(index+2)&31+i2]^randStates[(index+3)&31+i2]); } #define randInt(void) RAND_INT(randStates) unsigned int RAND_INT_WARP(local volatile unsigned int* randStates) { unsigned int index = get_local_id(0); unsigned int i2 = get_local_id(0)&~31; randStates[index] = TausStep(randStates[index], shift1[index&3], shift2[index&3], shift3[index&3], offset[index&3]); return (randStates[i2]^randStates[i2+1]^randStates[i2+2]^randStates[i2+3]); } #define randIntWarp(void) RAND_INT_WARP(randStates) float RAND_FLOAT(local volatile unsigned int* randStates) { unsigned int y = randInt(); return as_float((y&0x007FFFFF)|0x3F800000)-1.0f; } #define randFloat(void) RAND_FLOAT(randStates) float RAND_FLOAT_WARP(local volatile unsigned int* randStates) { unsigned int y = randIntWarp(); return as_float((y&0x007FFFFF)|0x3F800000)-1.0f; } #define randFloatWarp(void) RAND_FLOAT_WARP(randStates) float4 loadPoint(int nodeIndex, local volatile unsigned int * randStates,local volatile unsigned int * pointOffset,local volatile float * pointStage, global volatile float* pointPool) { unsigned int index = get_local_id(0); pointOffset[index] = 4*(nodeIndex*NUM_POINTS_PER_NODE+randInt()%NUM_POINTS_PER_NODE); for (int n = (index&~3); n < ((index&~3)+4);n++) { pointStage[n*4+(index&3)]=pointPool[pointOffset+(index&3)]; } return (float4)(pointStage[index*4],pointStage[index*4+1],pointStage[index*4+2],pointStage[index*4+3]); } void storePoint(float4 point, local volatile unsigned int * pointOffset,local volatile float * pointStage, global volatile float* pointPool) { unsigned int index = get_local_id(0); pointStage[index*4] = point.x; pointStage[index*4+1]=point.y; pointStage[index*4+2]=point.z; pointStage[index*4+3]=point.w; for (int n = (index&~3); n < ((index&~3)+4); n++) { pointPool[pointOffset+(index&3)]=pointStage[n*4+(index&3)]; } } int2 rasterizePoint(float4 point, int2 dim, local volatile unsigned int* randStates, constant float* flameBuffer) { point = point+(float4)(flameBuffer[70],flameBuffer[74],flameBuffer[78],0.0f); float4 pos = point.xxxx*(float4)(flameBuffer[67],flameBuffer[71],flameBuffer[75],0.0f)+point.yyyy*(float4)(flameBuffer[68],flameBuffer[72],flameBuffer[76],0.0f)+point.zzzz*(float4)(flameBuffer[69],flameBuffer[73],flameBuffer[77],0.0f); float xd = (0.5f+0.5f*pos.x)*(float)dim.x; float yd = ((0.5f+0.5f*pos.y))*(float)dim.y; return (int2)((int)xd,(int)yd); } void drawPoint(global volatile float4* renderBuffer, image2d_t palette, int2 dim, float4 point, local volatile unsigned int* randStates, float opacity, constant float* flameBuffer) { int2 coords = rasterizePoint(point, dim, randStates,flameBuffer); if ((coords.x >= 0.0f)&&(coords.y >= 0.0f)&&(coords.x < dim.x)&&(coords.y < dim.y)) { float4 color = opacity*read_imagef(palette, paletteSampler, (float2)(point.w,0.0f)); renderBuffer[coords.y*dim.x+coords.x]+=color; } } float4 linear(float4 pos, float weight, float r2, float r, float r2D2, float r2D, float theta, float phi, float psi, local unsigned int * randStates, constant float* flameBuffer,int offset) { float xout,yout,zout=0.0f; float x = pos.x; float y = pos.y; float z = pos.z; float a = flameBuffer[offset]; float b = flameBuffer[offset+1]; float c = flameBuffer[offset+2]; float d = flameBuffer[offset+3]; float e = flameBuffer[offset+4]; float f = flameBuffer[offset+5]; float g = flameBuffer[offset+6]; float h = flameBuffer[offset+7]; float i = flameBuffer[offset+8]; float j = flameBuffer[offset+9]; float k = flameBuffer[offset+10]; float l = flameBuffer[offset+11]; zout=z; xout=x; yout=y; zout=z; return (float4) (xout,yout,zout,0.0f); } float4 bubble$0033D(float4 pos, float weight, float r2, float r, float r2D2, float r2D, float theta, float phi, float psi, local unsigned int * randStates, constant float* flameBuffer,int offset) { float xout,yout,zout=0.0f; float x = pos.x; float y = pos.y; float z = pos.z; float a = flameBuffer[offset]; float b = flameBuffer[offset+1]; float c = flameBuffer[offset+2]; float d = flameBuffer[offset+3]; float e = flameBuffer[offset+4]; float f = flameBuffer[offset+5]; float g = flameBuffer[offset+6]; float h = flameBuffer[offset+7]; float i = flameBuffer[offset+8]; float j = flameBuffer[offset+9]; float k = flameBuffer[offset+10]; float l = flameBuffer[offset+11]; zout=z; xout=x*4.0f/(r2+4.0f); yout=y*4.0f/(r2+4.0f); zout=z*4.0f/(r2+4.0f); return (float4) (xout,yout,zout,0.0f); } float4 julia$0033Dz(float4 pos, float julia3Dz_power, float weight, float r2, float r, float r2D2, float r2D, float theta, float phi, float psi, local unsigned int * randStates, constant float* flameBuffer,int offset) { float xout,yout,zout=0.0f; float x = pos.x; float y = pos.y; float z = pos.z; float a = flameBuffer[offset]; float b = flameBuffer[offset+1]; float c = flameBuffer[offset+2]; float d = flameBuffer[offset+3]; float e = flameBuffer[offset+4]; float f = flameBuffer[offset+5]; float g = flameBuffer[offset+6]; float h = flameBuffer[offset+7]; float i = flameBuffer[offset+8]; float j = flameBuffer[offset+9]; float k = flameBuffer[offset+10]; float l = flameBuffer[offset+11]; zout=z; float R2 = x*x+y*y; float R = native_powr(R2,(.5f/julia3Dz_power)); xout= R*native_cos(atan2(y,x)+2.0f*3.14159f*(float)(randInt()%abs((int)julia3Dz_power))/julia3Dz_power); yout= R*native_sin(atan2(y,x)+2.0f*3.14159f*(float)(randInt()%abs((int)julia3Dz_power))/julia3Dz_power); float z2 = R*z/(native_sqrt(R2)*fabs(julia3Dz_power)); zout= -fabs(z2); return (float4) (xout,yout,zout,0.0f); } float4 xform$0031(float4 point, local unsigned int * randStates, constant float* flameBuffer) { float4 outpos = (float4)(0.0f,0.0f,0.0f,0.0f); float4 pos = point.xxxx*(float4)(flameBuffer[0],flameBuffer[4],flameBuffer[8],0.0f)+point.yyyy*(float4)(flameBuffer[1],flameBuffer[5],flameBuffer[9],0.0f)+point.zzzz*(float4)(flameBuffer[2],flameBuffer[6],flameBuffer[10],0.0f)+(float4)(flameBuffer[3],flameBuffer[7],flameBuffer[11],0.0f); float r2D2 = pos.x*pos.x+pos.y*pos.y; float r2 = pos.x*pos.x+pos.y*pos.y+pos.z*pos.z; float r2D = sqrt(r2D2); float r = sqrt(r2); float theta = atan2(pos.y,pos.x); float phi = atan2(pos.z,r2D); float psi = atan2(pos.x,pos.y); outpos+=((float)flameBuffer[14])*julia$0033Dz(pos,flameBuffer[15], flameBuffer[14], r2, r, r2D2, r2D, theta, phi, psi, randStates, flameBuffer,0); float col = mix(point.w,flameBuffer[12],flameBuffer[13]); return (float4)(outpos.x,outpos.y,outpos.z,col); } float4 xform$0032(float4 point, local unsigned int * randStates, constant float* flameBuffer) { float4 outpos = (float4)(0.0f,0.0f,0.0f,0.0f); float4 pos = point.xxxx*(float4)(flameBuffer[16],flameBuffer[20],flameBuffer[24],0.0f)+point.yyyy*(float4)(flameBuffer[17],flameBuffer[21],flameBuffer[25],0.0f)+point.zzzz*(float4)(flameBuffer[18],flameBuffer[22],flameBuffer[26],0.0f)+(float4)(flameBuffer[19],flameBuffer[23],flameBuffer[27],0.0f); float r2D2 = pos.x*pos.x+pos.y*pos.y; float r2 = pos.x*pos.x+pos.y*pos.y+pos.z*pos.z; float r2D = sqrt(r2D2); float r = sqrt(r2); float theta = atan2(pos.y,pos.x); float phi = atan2(pos.z,r2D); float psi = atan2(pos.x,pos.y); outpos+=((float)flameBuffer[30])*linear(pos, flameBuffer[30], r2, r, r2D2, r2D, theta, phi, psi, randStates, flameBuffer,16); float col = mix(point.w,flameBuffer[28],flameBuffer[29]); return (float4)(outpos.x,outpos.y,outpos.z,col); } float4 xform$0033(float4 point, local unsigned int * randStates, constant float* flameBuffer) { float4 outpos = (float4)(0.0f,0.0f,0.0f,0.0f); float4 pos = point.xxxx*(float4)(flameBuffer[31],flameBuffer[35],flameBuffer[39],0.0f)+point.yyyy*(float4)(flameBuffer[32],flameBuffer[36],flameBuffer[40],0.0f)+point.zzzz*(float4)(flameBuffer[33],flameBuffer[37],flameBuffer[41],0.0f)+(float4)(flameBuffer[34],flameBuffer[38],flameBuffer[42],0.0f); float r2D2 = pos.x*pos.x+pos.y*pos.y; float r2 = pos.x*pos.x+pos.y*pos.y+pos.z*pos.z; float r2D = sqrt(r2D2); float r = sqrt(r2); float theta = atan2(pos.y,pos.x); float phi = atan2(pos.z,r2D); float psi = atan2(pos.x,pos.y); outpos+=((float)flameBuffer[45])*bubble$0033D(pos, flameBuffer[45], r2, r, r2D2, r2D, theta, phi, psi, randStates, flameBuffer,31); float col = mix(point.w,flameBuffer[43],flameBuffer[44]); return (float4)(outpos.x,outpos.y,outpos.z,col); } kernel void renderBatch(global volatile float *renderBuffer1,global unsigned int* randSeeds, global unsigned int* discardStates, global volatile float *pointPool, global int* stateStacks, global int* oldStateIndices, constant float* flameBuffer, read_only image2d_t palette, int xdim, int ydim) { int2 dimension = (int2)(xdim,ydim); local volatile unsigned int randStates[BLOCK_SIZE]; randStates[get_local_id(0)]=randSeeds[get_global_id(1)*get_global_size(0)+get_global_id(0)]; local volatile unsigned int pointOffset[BLOCK_SIZE]; local volatile float pointStage[BLOCK_SIZE*4]; local int stateStack[BLOCK_SIZE]; const int warpOffset = get_local_id(0)&~31; global volatile float4 * renderBuffer = (global float4*) renderBuffer1; float4 oldPoint; int dstate = 0; float rnd = 0.0f; float4 point; float colIndex; float opacity; unsigned int discard = discardStates[(get_global_id(1)*get_global_size(0)+get_global_id(0))>>5]; unsigned int iterCount; stateStack[get_local_id(0)] = stateStacks[get_global_id(1)*get_global_size(0)+get_global_id(0)]; unsigned int state=oldStateIndices[(get_global_id(1)*get_global_size(0)+get_global_id(0))>>5]; unsigned int nextState=0; rnd = randFloatWarp(); switch (stateStack[state+warpOffset]) { case 0: { oldPoint = loadPoint(0,randStates,pointOffset,pointStage,pointPool); if (rnd <= (float)0.5) { state += as_int(flameBuffer[52]); dstate = as_int(flameBuffer[52]); state &= 31; opacity = flameBuffer[53]; discard = as_int(flameBuffer[54]); nextState = 2; } else { state += as_int(flameBuffer[48]); dstate = as_int(flameBuffer[48]); state &= 31; opacity = flameBuffer[49]; discard = as_int(flameBuffer[50]); nextState = 1; } }break; case 1: { oldPoint = loadPoint(1,randStates,pointOffset,pointStage,pointPool); if (rnd <= (float)0.5) { state += as_int(flameBuffer[60]); dstate = as_int(flameBuffer[60]); state &= 31; opacity = flameBuffer[61]; discard = as_int(flameBuffer[62]); nextState = 1; } else { state += as_int(flameBuffer[56]); dstate = as_int(flameBuffer[56]); state &= 31; opacity = flameBuffer[57]; discard = as_int(flameBuffer[58]); nextState = 0; } }break; case 2: { oldPoint = loadPoint(2,randStates,pointOffset,pointStage,pointPool); state += as_int(flameBuffer[64]); dstate = as_int(flameBuffer[64]); state &= 31; opacity = flameBuffer[65]; discard = as_int(flameBuffer[66]); nextState = 0; }break; } for (int iterations=0; iterations<MAX_ITERATIONS; iterations++) { switch (nextState) { case 0: { if (discard != 0) oldPoint = loadPoint(stateStack[state+warpOffset],randStates,pointOffset,pointStage,pointPool); iterCount = as_int(oldPoint.w)&0xFF800000; colIndex = as_float((as_int(oldPoint.w)&0x007FFFFF)|0x3F800000) -1.0f; oldPoint.w=colIndex; point = xform$0031(oldPoint, randStates, flameBuffer); if (iterCount > (100<<23)) { if (opacity!=0.0f) drawPoint(renderBuffer,palette,dimension,point,randStates,opacity,flameBuffer); } else iterCount += 1<<23; if (isfinite(point.x+point.y+point.z)==0) { point.x=randFloat(); point.y=randFloat(); point.z=randFloat(); point.w=randFloat(); iterCount=95<<23; } point.w=as_float(iterCount|(as_int(point.w+1.0f)&0x007FFFFF)); stateStack[state+warpOffset]=0; oldPoint = loadPoint(stateStack[state+warpOffset],randStates,pointOffset,pointStage,pointPool); storePoint(point,pointOffset,pointStage,pointPool); rnd = randFloatWarp(); if (rnd <= (float)0.5) { state += as_int(flameBuffer[52]); dstate = as_int(flameBuffer[52]); state &= 31; opacity = flameBuffer[53]; discard = as_int(flameBuffer[54]); nextState = 2; } else { state += as_int(flameBuffer[48]); dstate = as_int(flameBuffer[48]); state &= 31; opacity = flameBuffer[49]; discard = as_int(flameBuffer[50]); nextState = 1; } }break; case 1: { if (discard != 0) oldPoint = loadPoint(stateStack[state+warpOffset],randStates,pointOffset,pointStage,pointPool); iterCount = as_int(oldPoint.w)&0xFF800000; colIndex = as_float((as_int(oldPoint.w)&0x007FFFFF)|0x3F800000) -1.0f; oldPoint.w=colIndex; point = xform$0032(oldPoint, randStates, flameBuffer); if (iterCount > (100<<23)) { if (opacity!=0.0f) drawPoint(renderBuffer,palette,dimension,point,randStates,opacity,flameBuffer); } else iterCount += 1<<23; if (isfinite(point.x+point.y+point.z)==0) { point.x=randFloat(); point.y=randFloat(); point.z=randFloat(); point.w=randFloat(); iterCount=95<<23; } point.w=as_float(iterCount|(as_int(point.w+1.0f)&0x007FFFFF)); stateStack[state+warpOffset]=1; oldPoint = loadPoint(stateStack[state+warpOffset],randStates,pointOffset,pointStage,pointPool); storePoint(point,pointOffset,pointStage,pointPool); rnd = randFloatWarp(); if (rnd <= (float)0.5) { state += as_int(flameBuffer[60]); dstate = as_int(flameBuffer[60]); state &= 31; opacity = flameBuffer[61]; discard = as_int(flameBuffer[62]); nextState = 1; } else { state += as_int(flameBuffer[56]); dstate = as_int(flameBuffer[56]); state &= 31; opacity = flameBuffer[57]; discard = as_int(flameBuffer[58]); nextState = 0; } }break; case 2: { if (discard != 0) oldPoint = loadPoint(stateStack[state+warpOffset],randStates,pointOffset,pointStage,pointPool); iterCount = as_int(oldPoint.w)&0xFF800000; colIndex = as_float((as_int(oldPoint.w)&0x007FFFFF)|0x3F800000) -1.0f; oldPoint.w=colIndex; point = xform$0033(oldPoint, randStates, flameBuffer); if (iterCount > (100<<23)) { if (opacity!=0.0f) drawPoint(renderBuffer,palette,dimension,point,randStates,opacity,flameBuffer); } else iterCount += 1<<23; if (isfinite(point.x+point.y+point.z)==0) { point.x=randFloat(); point.y=randFloat(); point.z=randFloat(); point.w=randFloat(); iterCount=95<<23; } point.w=as_float(iterCount|(as_int(point.w+1.0f)&0x007FFFFF)); stateStack[state+warpOffset]=2; oldPoint = loadPoint(stateStack[state+warpOffset],randStates,pointOffset,pointStage,pointPool); storePoint(point,pointOffset,pointStage,pointPool); rnd = randFloatWarp(); state += as_int(flameBuffer[64]); dstate = as_int(flameBuffer[64]); state &= 31; opacity = flameBuffer[65]; discard = as_int(flameBuffer[66]); nextState = 0; }break; } } randSeeds[get_global_id(1)*get_global_size(0)+get_global_id(0)]=randStates[get_local_id(0)]; stateStacks[get_global_id(1)*get_global_size(0)+get_global_id(0)]=stateStack[get_local_id(0)]; oldStateIndices[(get_global_id(1)*get_global_size(0)+get_global_id(0))>>5] = state; discardStates[(get_global_id(1)*get_global_size(0)+get_global_id(0))>>5] = discard; }

MicahVillmow · ‎12-01-2010

keldor314,
This should work fine with our upcoming SDK.

himanshu_gautam · ‎12-01-2010

new topic is available at the top left corner of the page as the first button. It can be found on any forum topic page.

d_a_a_ · ‎12-02-2010

It would be nice if the AMD developer team could release incremental bug-fix versions of the SDK (x.y.z?) as well make available alpha/beta/RC releases of the upcoming SDKs.

General purpose computing on the GPU and the OpenCL specification are relatively new and so very immature by now; therefore, frequent releases would provide more immediate feedback from the users and also allow them to have immediate access to improved/fixed versions. So, why not?

himanshu_gautam · ‎12-02-2010

I think AMD tries to release a new SDK version quarterly. But you can send suggestions and feature request at forums or as tickets. New features are generally implemented on the basis of Market needs.

d_a_a_ · ‎12-02-2010

In my opinion the three months gap is too much for an immature and fast evolving technology. I know that new features and big improvements take time, but why not release incremental bug-fix releases in a shorter period? I'm sure that your internal SDK has already many bug fixes that could make many users happy right now.

Moreover, releasing alpha/beta/RC versions (targeted at users willing to contribute) doesn't preclude the AMD's current quarterly release schedule.

MicahVillmow · ‎12-02-2010

d.a.a.,
I've forwarded your request to the product manager, but I can't promise anything.

empty_knapsack · ‎12-02-2010

I've already wrote it elsewhere at forums but I'll repeat it here --

I have no idea why ATI not using free help from developers by publishing pre-release alpha/beta version of SDK. It is MUCH higher chances that bugs will be spotted and new features suggested if releases made frequently not just 3-4 times an year. This is especially true for ATI Stream compared to NVIDIA CUDA as Stream much younger and needs serious improvements to become competitive. Check out CUDA SDK release cycles -- its also only several times per year BUT there are plenty of intermediate releases available for registered developers. So if you're really interested in technology you can get latest possible version. Yes, it's possible that it'll contain some odd bugs and it also possible that it'll solve most annoying current bugs, so it's way more better (and I don't want to start again story about unsupported 5970s and it's just way annoying already).

You can forward this to product manager too.

d_a_a_ · ‎12-02-2010

@ MicahVillmow

Thank you MicahVillmow and those from AMD who actively participate in this forum.

@empty_knapsack

I totally agree.

d_a_a_ · ‎02-17-2011

Hi,

For your information, I'm still having problems with some kernels crashing the AMD's OpenCL compiler (the attached kernel is an example). I've upgraded to AMD APP SDK v2.3 and fglrx 11.2, and the compiler fails both for HD5750 and HD6970 GPUs (two different systems running 64-bit Debian GNU/Linux with kernel 2.6.37).

// ---------- BEGIN kernel #define LOCAL_SIZE 256 #define POP_SIZE 15360 #define NUM_POINTS 4096 #define X_DIM 3 #define MAX_TREE_SIZE 256 #define MAX_FLOAT 3.40282e+38f #define TOP ( stack[stack_top] ) #define POP ( stack[stack_top--] ) #define PUSH(arity, exp) stack[stack_top + 1 - arity] = (exp); stack_top += 1 - arity; #define PUSH_0( value ) stack[++stack_top] = (value); #define PUSH_1( exp ) stack[stack_top] = (exp); #define PUSH_2( exp ) stack[stack_top - 1] = (exp); --stack_top; #define PUSH_3( exp ) stack[stack_top - 2] = (exp); stack_top -= 2; #define ARG(n) (stack[stack_top - n]) #define STACK_SIZE 128 #define CREATE_STACK float stack[STACK_SIZE]; int stack_top = -1; #define NODE program[op] #define INTERPRETER_CORE case 0: PUSH_0(AS_FLOAT( NODE )) break; case 34: PUSH_1(-ARG(0)) break; case 3: PUSH_2(ARG(0) + ARG(1)) break; case 15: PUSH_2(ARG(0) - ARG(1)) break; case 16: PUSH_2(ARG(0) * ARG(1)) break; case 5: PUSH_2(native_divide(ARG(0), ARG(1))) break; // ----------------------------------------------------------------------------- // The errors metric // ----------------------------------------------------------------------------- // absolute difference #define ERROR_METRIC( actual, expected ) fabs( actual - expected ) // square of the difference //#define ERROR_METRIC( actual, expected ) pown( actual - expected, 2 ) // ----------------------------------------------------------------------------- #define MAX_INT_VALUE 4194303 // 2^22 - 1 #define COMPACT_RANGE MAX_INT_VALUE // 2^22 - 1 #define SCALE_FACTOR 16 // Range of possible float values: [0.0, SCALE_FACTOR] /* Structure of a program (individual) | | +----+-----+----+--------------+------------- |size|arity|type| index/value | ... | 32 | 3 | 7 | 22 | +----+-----+----+--------------+------------- | first element | second ... */ #define ARITY( packed ) ((packed & 0xE0000000) >> 29) // 0xE0000000 = 11100000 00000000 00000000 00000000 #define INDEX( packed ) ((packed & 0x1FC00000) >> 22) // 0x1FC00000 = 00011111 11000000 00000000 00000000 #define AS_INT( packed ) (packed & 0x3FFFFF) // 0x3FFFFF = 00000000 00111111 11111111 11111111 #define AS_FLOAT( packed ) ((float)( packed & 0x3FFFFF ) * SCALE_FACTOR / COMPACT_RANGE) // 0x3FFFFF = 00000000 00111111 11111111 11111111 __kernel void evaluate( __global const uint* pop, __global const float* X, #ifdef Y_DOES_NOT_FIT_IN_CONSTANT_BUFFER __global const #else __constant #endif float* Y, __global float* E, __local uint* program ) { __local float PE[LOCAL_SIZE]; __local uint program_size; CREATE_STACK uint lo_id = get_local_id( 0 ); uint gr_id = get_group_id( 0 ); // Get the actual program's size if( lo_id == 0 ) program_size = pop[(MAX_TREE_SIZE + 1) * gr_id]; barrier(CLK_LOCAL_MEM_FENCE); #ifndef PROGRAM_TREE_DOES_NOT_FIT_IN_LOCAL_SIZE if( lo_id < program_size ) program[lo_id] = pop[(MAX_TREE_SIZE + 1) * gr_id + lo_id + 1]; #else // Too few workers for the program_size, thus we need to do the work iteratively for( uint i = 0; i < ceil( program_size / (float) LOCAL_SIZE ); ++i ) { uint index = i * LOCAL_SIZE + lo_id; if( index < program_size ) program[index] = pop[(MAX_TREE_SIZE + 1) * gr_id + index + 1]; } #endif barrier(CLK_LOCAL_MEM_FENCE); PE[lo_id] = 0.0f; #ifndef NUM_POINTS_IS_NOT_DIVISIBLE_BY_LOCAL_SIZE /* When we know that NUM_POINTS is divisible by LOCAL_SIZE then we can avoid a comparison in each iteration due to the guarantee of not having work-items accessing beyond the available amount of points. */ for( uint iter = 0; iter < NUM_POINTS/LOCAL_SIZE; ++iter ) { #else for( uint iter = 0; iter < ceil( NUM_POINTS / (float) LOCAL_SIZE ); ++iter ) { //if( iter == ceil( NUM_POINTS / (float) LOCAL_SIZE) - 1 && lo_id < NUM_POINTS % LOCAL_SIZE ) if( iter * LOCAL_SIZE + lo_id < NUM_POINTS ) { #endif // ------------------------------- // Calls the interpreter (C macro) // ------------------------------- for( int op = program_size; op-- ; ) switch( INDEX( program[op] ) ) { INTERPRETER_CORE default: // Coalesced access pattern PUSH_0( X[iter * LOCAL_SIZE + NUM_POINTS * AS_INT( program[op] ) + lo_id] ); } // ------------------------------- PE[lo_id] += ERROR_METRIC( POP, Y[iter * LOCAL_SIZE + lo_id] ); // Avoid further calculations if the current one has overflown the float // (i.e., it is inf or NaN). if( isinf( PE[lo_id] ) || isnan( PE[lo_id] ) ) break; #ifdef NUM_POINTS_IS_NOT_DIVISIBLE_BY_LOCAL_SIZE } #endif } /* Parallel way to perform reduction within the work-group: | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | _ |_______________| -> (lo_id = 0) | |_______________| -> (lo_id = 1) | t0 |_______________| -> (lo_id = 2) | (s=4) |_______________| -> (lo_id = 3) _| _ |_______| -> (lo_id = 0) | t1 |_______| -> (lo_id = 1) _| (s=2) _ |___| -> (lo_id = 0) _| t2 (s=1) L----> total sum is stored on the first work-item */ for( uint s = LOCAL_SIZE_ROUNDED_UP_TO_POWER_OF_2 / 2; s > 0; s >>= 1 ) { barrier(CLK_LOCAL_MEM_FENCE); #ifndef LOCAL_SIZE_IS_NOT_POWER_OF_2 if( lo_id < s ) #else /* LOCAL_SIZE is not power of 2, so we need to perform an additional * check to ensure that no access beyond PE's range will occur. */ if( (lo_id < s) && (lo_id + s < LOCAL_SIZE) ) #endif PE[lo_id] += PE[lo_id + s]; } // Store on the global memory (to be read by the host) if( lo_id == 0 ) // Check for infinity/NaN E[gr_id] = ( isinf( PE[0] ) || isnan( PE[0] ) ) ? MAX_FLOAT : PE[0]; } // ------------------- END kernel GDB output: Program received signal SIGSEGV, Segmentation fault. 0x00007ffff43400cd in ?? () from /usr/lib/libaticaldd.so (gdb) bt #0 0x00007ffff43400cd in ?? () from /usr/lib/libaticaldd.so #1 0x00007ffff4338450 in ?? () from /usr/lib/libaticaldd.so #2 0x00007ffff433873d in ?? () from /usr/lib/libaticaldd.so #3 0x00007ffff4314025 in ?? () from /usr/lib/libaticaldd.so #4 0x00007ffff431430f in ?? () from /usr/lib/libaticaldd.so #5 0x00007ffff4315d4a in ?? () from /usr/lib/libaticaldd.so #6 0x00007ffff4316b70 in ?? () from /usr/lib/libaticaldd.so #7 0x00007ffff4314763 in ?? () from /usr/lib/libaticaldd.so #8 0x00007ffff42b06f5 in ?? () from /usr/lib/libaticaldd.so #9 0x00007ffff421d0fc in ?? () from /usr/lib/libaticaldd.so #10 0x00007ffff421d395 in ?? () from /usr/lib/libaticaldd.so #11 0x00007ffff421def7 in ?? () from /usr/lib/libaticaldd.so #12 0x00007ffff4422149 in ?? () from /usr/lib/libaticaldd.so #13 0x00007ffff41b595a in ?? () from /usr/lib/libaticaldd.so #14 0x00007ffff41b6e0d in ?? () from /usr/lib/libaticaldd.so #15 0x00007ffff594a187 in ?? () from /opt/ati-stream-sdk-v2.3-lnx64/lib/x86_64/libatiocl64.so #16 0x00007ffff594c5d2 in ?? () from /opt/ati-stream-sdk-v2.3-lnx64/lib/x86_64/libatiocl64.so #17 0x00007ffff594f559 in ?? () from /opt/ati-stream-sdk-v2.3-lnx64/lib/x86_64/libatiocl64.so #18 0x00007ffff595c765 in ?? () from /opt/ati-stream-sdk-v2.3-lnx64/lib/x86_64/libatiocl64.so #19 0x00007ffff595faaa in ?? () from /opt/ati-stream-sdk-v2.3-lnx64/lib/x86_64/libatiocl64.so #20 0x00007ffff593144c in ?? () from /opt/ati-stream-sdk-v2.3-lnx64/lib/x86_64/libatiocl64.so #21 0x00007ffff597e16a in ?? () from /opt/ati-stream-sdk-v2.3-lnx64/lib/x86_64/libatiocl64.so #22 0x00007ffff590f77a in clBuildProgram () from /opt/ati-stream-sdk-v2.3-lnx64/lib/x86_64/libatiocl64.so #23 0x0000000000439c57 in cl::Program::build (this=0x7fffffffda60, devices=..., options=0xffd1d8 " -D LOCAL_SIZE_ROUNDED_UP_TO_POWER_OF_2=256", notifyFptr=0, data=0x0) at /opt/ati-stream-sdk-v2.3-lnx64/include/CL/cl.hpp:2474 #24 0x00000000004370d1 in GP::BuildKernel (this=0x6818b0) at /home/douglas/gpocl/gpocl/trunk/src/gp/GP.cc:777 #25 0x00000000004280f0 in GP::Run (this=0x6818b0) at /home/douglas/gpocl/gpocl/trunk/src/gp/GP.h:148 #26 0x0000000000427289 in main (argc=13, argv=0x7fffffffdff8) at /home/douglas/gpocl/gpocl/trunk/src/gp/gpocl.cc:70

Archives Discussions

Kernel crashing the AMD's OpenCL compiler