Kim2

Kernel crashes kernel analyzer

Discussion created by Kim2 on Mar 23, 2011
Latest reply on Mar 23, 2011 by Kim2

I'm trying to port some code from C++ to opencl but some part of it is crashing the opencl kernel analyzer 1.7. The code is meant to perform a bitonic sort using a relative Z-order comparison function. There should be no problem with the bitonic sort code since it's from an Nvidia sample.

Thanks in advance to anyone who takes a look for me.

#define LOCAL_SIZE_LIMIT 512U typedef union { //! Union accessor of floating point type float d; //! Union accessor as an array of integer types unsigned long int i[1]; } sep_float_sig; typedef struct { sep_float_sig sig; int exp; float val; } sep_float; void xor_sep_float_sig(sep_float_sig *a, sep_float_sig *b, sep_float_sig *c, const sep_float_sig zero) { for(unsigned int i = 0; i < sizeof(sep_float_sig)/sizeof(unsigned long int); ++i) { c[0].i[i]=(a[0].i[i]^b[0].i[i])|zero.i[i]; } } int msdb(sep_float x, sep_float y) { const sep_float_sig lzero = {0.5}; if(x.val == y.val) return 0; else if(x.exp == y.exp) { xor_sep_float_sig(&x.sig, &y.sig, &x.sig, lzero); frexp(x.sig.d-0.5, &y.exp); return x.exp+y.exp; } else if(x.exp > y.exp) return x.exp; else return y.exp; } bool lt_func(const float4 p, const float4 q) { int y, x; unsigned int k, j; sep_float pc, qc; j = 0; x = -INT_MAX; if((p.x < 0) != (q.x < 0)) return p.x < q.x; pc.val = p.x; pc.sig.d = (float) frexp(p.x, &pc.exp); qc.val = q.x; qc.sig.d = (float) frexp(q.x, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 0; x = y; } if((p.y < 0) != (q.y < 0)) return p.y < q.y; pc.val = p.y; pc.sig.d = (float) frexp(p.y, &pc.exp); qc.val = q.y; qc.sig.d = (float) frexp(q.y, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 1; x = y; } if((p.z < 0) != (q.z < 0)) return p.z < q.z; pc.val = p.z; pc.sig.d = (float) frexp(p.z, &pc.exp); qc.val = q.z; qc.sig.d = (float) frexp(q.z, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 2; x = y; } if(j == 0) { return p.x < q.x; } else if(j == 1) { return p.y < q.y; } else if(j == 2) { return p.z < q.z; } } inline void ComparatorPrivate( uint *keyA, float4 *valA, uint *keyB, float4 *valB, uint dir ){ if( (lt_func(*valA, *valB)) == dir ){ uint t; float4 tf; t = *keyA; *keyA = *keyB; *keyB = t; tf = *valA; *valA = *valB; *valB = tf; } } inline void ComparatorLocal( __local uint *keyA, __local float4 *valA, __local uint *keyB, __local float4 *valB, uint dir ){ if( (lt_func(*valA, *valB)) == dir ){ uint t; float4 tf; t = *keyA; *keyA = *keyB; *keyB = t; tf = *valA; *valA = *valB; *valB = tf; } } __kernel void bitonicSortLocal1( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal ){ __local uint l_key[LOCAL_SIZE_LIMIT]; __local float4 l_val[LOCAL_SIZE_LIMIT]; //Offset to the beginning of subarray and load data d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1); for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){ //Bitonic merge uint ddd = (comparatorI & (size / 2)) != 0; for(uint stride = size / 2; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } } //Odd / even arrays of LOCAL_SIZE_LIMIT elements //sorted in opposite directions { uint ddd = (get_group_id(0) & 1); for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } } barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; } //Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT __kernel void bitonicMergeGlobal( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal, uint arrayLength, uint size, uint stride, uint dir ){ uint global_comparatorI = get_global_id(0); uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); //Bitonic merge uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); uint keyA = d_SrcKey[pos + 0]; float4 valA = d_SrcVal[pos + 0]; uint keyB = d_SrcKey[pos + stride]; float4 valB = d_SrcVal[pos + stride]; ComparatorPrivate( &keyA, &valA, &keyB, &valB, ddd ); d_DstKey[pos + 0] = keyA; d_DstVal[pos + 0] = valA; d_DstKey[pos + stride] = keyB; d_DstVal[pos + stride] = valB; } //Combined bitonic merge steps for //'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2] __kernel void bitonicMergeLocal( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal, uint arrayLength, uint stride, uint size, uint dir ){ __local uint l_key[LOCAL_SIZE_LIMIT]; __local float4 l_val[LOCAL_SIZE_LIMIT]; d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; //Bitonic merge uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); for(; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; }

Outcomes