ThijsWithaar

CL_BUILD_PROGRAM_FAILURE on kernel with long*

Discussion created by ThijsWithaar on May 13, 2010
Latest reply on May 15, 2010 by ThijsWithaar
Why does this fail?

Hi,

I'm getting a CL_BUILD_PROGRAM_FAILURE on a kernel which I don't really understand. Can someone point out what I'm doing wrong ?

The code pre-calculates derivatives for Lucas-Kanade image registration. Since I have a ati 4650, i'm not using image_2d types.

At the very end of the kernel code there's one line that, when enabled, seems to cause the CL_BUILD_PROGRAM_FAILURE.

 

#define clip(mi,x,ma) (min(max(mi,x),ma)) // Fetch a 4x1 block of unaligned pixel data uchar4 pixfetch(const __global uchar4 *im, int idx, int shift) { // Load pixel data: uchar8 p; p.lo = im[idx ]; p.hi = im[idx+1]; // Shift it long m = as_long(p); m <<= shift; // And get the leftmost 4 bytes: p = as_uchar8( m ); return p.lo; } // Get image gradient of 2x1 pixels, using central differences. short8 LK_gradient(const __global uchar4 *im, int2 dim, // Dimension of image, in pixels int2 pos // Position, in pixels ) { // buffer index for pixel left of the block: int stride = dim.x>>2; int idxyU= clip(0, pos.y-1, dim.y-1) * stride; int idxy = clip(0, pos.y , dim.y-1) * stride; int idxyD= clip(0, pos.y+1, dim.y-1) * stride; // Index for pixel left of pos.x, and 4 pixels right of that: int idxx = clip( 0, pos.x-1, dim.x-4); int bidx = idxx >> 2; // buffer index, in uchar4 elements int shift= (idxx & 3) << 3; // shift in bits for the uchar4 buffer. // Load pixel data: uchar4 pC = pixfetch(im, idxyU + bidx, shift); uchar4 pD = pixfetch(im, idxy + bidx, shift); uchar4 pU = pixfetch(im, idxyD + bidx, shift); // Calculate central differences: short8 ret; ret.lo.lo = (short2)(pC.s2 - pC.s0, pD.s1 - pU.s1); // pixel 1: dx,dy ret.lo.hi = (short2)(pC.s3 - pC.s1, pD.s2 - pU.s2); // pixel 2: dx,dy // Store the fetched pixels: ret.hi.s0 = pC.s1; ret.hi.s1 = pC.s2; return ret; } // Lucas-Kanade derivatives for affine transformation /* void LK_affineDeriv(short2 dIdxy, int2 dxy, long dIdp[6]) { dIdp[0] = dIdxy.x * dxy.x; dIdp[1] = dIdxy.x * dxy.y; dIdp[2] = dIdxy.x * 1; dIdp[3] = dIdxy.y * dxy.x; dIdp[4] = dIdxy.y * dxy.y; dIdp[5] = dIdxy.y * 1; }*/ // Calculates covariance of LK-derivatives in 2x1 blocks // Global dimensions should be set to: // {imageWidth/2, imageHeight } // // *A can be used for conjugate gradient optimization for Lucas-Kanade __kernel void LK_calcDeriv(const __global uchar4 *im, int border, __local long *tmpA, __global long *A, __global uchar4 *imout) { // Thread index: int tid = get_local_id(0) * get_local_size(0) + get_local_id(1); int tSize = get_local_size(0)*get_local_size(1); // Pixel dimensions: int2 pos = (get_global_id(0)*2 , get_global_id(1) ); int2 dim = (get_global_size(0)*2, get_global_size(1)); int2 dxy = pos - (dim/2); // Fetch pixels: short8 ret = LK_gradient(im, dim, pos); short4 dIdxy = ret.lo; long dIdp1; // dIdp2[6]; //LK_affineDeriv(dIdxy.lo, dxy, dIdp1); //dxy.x++; //LK_affineDeriv(dIdxy.hi, dxy, dIdp2); dIdp1 = dIdxy.lo.x * dxy.x; // Build covariance: int Aoff = tid * (6+5+4+3+2+1); for(int i=0; i < 6; i++) { for(int j=i; j < 6; j++) { // Does work: //tmpA[Aoff++] = (long)i; // Does not: tmpA[Aoff++] = dIdp1; // + dIdp2[i] * dIdp2[j]; // What's actually desired: //tmpA[Aoff++] = dIdp1[i] * dIdp1[j]; // + dIdp2[i] * dIdp2[j]; } } // Sum intermediate results of tmpA into A: //sumAndStoreVector(tmpA, N, tid, tSize, A); }

Outcomes