I'm trying to port some code from C++ to opencl but some part of it is crashing the opencl kernel analyzer 1.7. The code is meant to perform a bitonic sort using a relative Z-order comparison function. There should be no problem with the bitonic sort code since it's from an Nvidia sample.
Thanks in advance to anyone who takes a look for me.
#define LOCAL_SIZE_LIMIT 512U typedef union { //! Union accessor of floating point type float d; //! Union accessor as an array of integer types unsigned long int i[1]; } sep_float_sig; typedef struct { sep_float_sig sig; int exp; float val; } sep_float; void xor_sep_float_sig(sep_float_sig *a, sep_float_sig *b, sep_float_sig *c, const sep_float_sig zero) { for(unsigned int i = 0; i < sizeof(sep_float_sig)/sizeof(unsigned long int); ++i) { c[0].i=(a[0].i^b[0].i)|zero.i; } } int msdb(sep_float x, sep_float y) { const sep_float_sig lzero = {0.5}; if(x.val == y.val) return 0; else if(x.exp == y.exp) { xor_sep_float_sig(&x.sig, &y.sig, &x.sig, lzero); frexp(x.sig.d-0.5, &y.exp); return x.exp+y.exp; } else if(x.exp > y.exp) return x.exp; else return y.exp; } bool lt_func(const float4 p, const float4 q) { int y, x; unsigned int k, j; sep_float pc, qc; j = 0; x = -INT_MAX; if((p.x < 0) != (q.x < 0)) return p.x < q.x; pc.val = p.x; pc.sig.d = (float) frexp(p.x, &pc.exp); qc.val = q.x; qc.sig.d = (float) frexp(q.x, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 0; x = y; } if((p.y < 0) != (q.y < 0)) return p.y < q.y; pc.val = p.y; pc.sig.d = (float) frexp(p.y, &pc.exp); qc.val = q.y; qc.sig.d = (float) frexp(q.y, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 1; x = y; } if((p.z < 0) != (q.z < 0)) return p.z < q.z; pc.val = p.z; pc.sig.d = (float) frexp(p.z, &pc.exp); qc.val = q.z; qc.sig.d = (float) frexp(q.z, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 2; x = y; } if(j == 0) { return p.x < q.x; } else if(j == 1) { return p.y < q.y; } else if(j == 2) { return p.z < q.z; } } inline void ComparatorPrivate( uint *keyA, float4 *valA, uint *keyB, float4 *valB, uint dir ){ if( (lt_func(*valA, *valB)) == dir ){ uint t; float4 tf; t = *keyA; *keyA = *keyB; *keyB = t; tf = *valA; *valA = *valB; *valB = tf; } } inline void ComparatorLocal( __local uint *keyA, __local float4 *valA, __local uint *keyB, __local float4 *valB, uint dir ){ if( (lt_func(*valA, *valB)) == dir ){ uint t; float4 tf; t = *keyA; *keyA = *keyB; *keyB = t; tf = *valA; *valA = *valB; *valB = tf; } } __kernel void bitonicSortLocal1( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal ){ __local uint l_key[LOCAL_SIZE_LIMIT]; __local float4 l_val[LOCAL_SIZE_LIMIT]; //Offset to the beginning of subarray and load data d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1); for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){ //Bitonic merge uint ddd = (comparatorI & (size / 2)) != 0; for(uint stride = size / 2; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } } //Odd / even arrays of LOCAL_SIZE_LIMIT elements //sorted in opposite directions { uint ddd = (get_group_id(0) & 1); for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } } barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; } //Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT __kernel void bitonicMergeGlobal( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal, uint arrayLength, uint size, uint stride, uint dir ){ uint global_comparatorI = get_global_id(0); uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); //Bitonic merge uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); uint keyA = d_SrcKey[pos + 0]; float4 valA = d_SrcVal[pos + 0]; uint keyB = d_SrcKey[pos + stride]; float4 valB = d_SrcVal[pos + stride]; ComparatorPrivate( &keyA, &valA, &keyB, &valB, ddd ); d_DstKey[pos + 0] = keyA; d_DstVal[pos + 0] = valA; d_DstKey[pos + stride] = keyB; d_DstVal[pos + stride] = valB; } //Combined bitonic merge steps for //'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2] __kernel void bitonicMergeLocal( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal, uint arrayLength, uint stride, uint size, uint dir ){ __local uint l_key[LOCAL_SIZE_LIMIT]; __local float4 l_val[LOCAL_SIZE_LIMIT]; d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; //Bitonic merge uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); for(; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; }
Please post the system info:CPU,GPU,SDK,DRIVER,OS.
Are you saying you are able to run the kernels inside a program, but not able to profile it. I get some compilation errors in clBuildProgram.
Trying to build the kernel also crashes my program. My apologies, I should have said it crashes kernel analyzer 1.7, which I'm using to debug. I'll fix up the OP.
CPU: Intel i7 920
OS: Windows 7 64 bit
GPU: HD5870
SDK: 2.3
Catalyst 11.2
Kim2,
I get the following build log while trying to build it.
Platform 0 : Advanced Micro Devices, Inc.
Selected Platform Vendor : Advanced Micro Devices, Inc.
Device 0 : Juniper
BUILD LOG
************************************************
C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(183): warning:
double-precision constant is represented as single-precision
constant because double is not enabled
const sep_float_sig lzero = {0.5};
^
C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(190): warning:
double-precision constant is represented as single-precision
constant because double is not enabled
frexp(x.sig.d-0.5, &y.exp);
^
C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(264): warning: missing
return statement at end of non-void function "lt_func"
}
^
C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(202): warning: variable "k"
was declared but never referenced
unsigned int k, j;
^
C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(302): error: identifier
"LOCAL_SIZE_LIMIT" is undefined
__local uint l_key[LOCAL_SIZE_LIMIT];
^
C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(400): error: identifier
"LOCAL_SIZE_LIMIT" is undefined
__local uint l_key[LOCAL_SIZE_LIMIT];
^
2 errors detected in the compilation of "C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl".
************************************************
Error: clBuildProgram failed. Error code : CL_BUILD_PROGRAM_FAILURE
Press any key to continue . . .
I also saw you using inline inside the kernel, which I don't think is allowed.
Thanks.
Try to remove these errors before trying it in SKA.
I think you may have accidentally clipped off the #define LOCAL_SIZE_LIMIT when you copy-pasted the code. I fixed up the warnings but I'm still getting a crash. Here's the code fixed for the warnings.
Thanks for your help, by the way.
#define LOCAL_SIZE_LIMIT 512U typedef union { //! Union accessor of floating point type float d; //! Union accessor as an array of integer types unsigned long int i[1]; } sep_float_sig; typedef struct { sep_float_sig sig; int exp; float val; } sep_float; void xor_sep_float_sig(sep_float_sig *a, sep_float_sig *b, sep_float_sig *c, const sep_float_sig zero) { for(unsigned int i = 0; i < sizeof(sep_float_sig)/sizeof(unsigned long int); ++i) { c[0].i=(a[0].i^b[0].i)|zero.i; } } int msdb(sep_float x, sep_float y) { const sep_float_sig lzero = {0.5}; if(x.val == y.val) return 0; else if(x.exp == y.exp) { xor_sep_float_sig(&x.sig, &y.sig, &x.sig, lzero); frexp(x.sig.d-0.5, &y.exp); return x.exp+y.exp; } else if(x.exp > y.exp) return x.exp; else return y.exp; } bool lt_func(const float4 p, const float4 q) { bool result; int y, x; unsigned int j; sep_float pc, qc; j = 0; x = -INT_MAX; if((p.x < 0) != (q.x < 0)) return p.x < q.x; pc.val = p.x; pc.sig.d = (float) frexp(p.x, &pc.exp); qc.val = q.x; qc.sig.d = (float) frexp(q.x, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 0; x = y; } if((p.y < 0) != (q.y < 0)) return p.y < q.y; pc.val = p.y; pc.sig.d = (float) frexp(p.y, &pc.exp); qc.val = q.y; qc.sig.d = (float) frexp(q.y, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 1; x = y; } if((p.z < 0) != (q.z < 0)) return p.z < q.z; pc.val = p.z; pc.sig.d = (float) frexp(p.z, &pc.exp); qc.val = q.z; qc.sig.d = (float) frexp(q.z, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 2; x = y; } if(j == 0) { result = p.x < q.x; } else if(j == 1) { result = p.y < q.y; } else if(j == 2) { result = p.z < q.z; } return result; } void ComparatorPrivate( uint *keyA, float4 *valA, uint *keyB, float4 *valB, uint dir ){ if( (lt_func(*valA, *valB)) == dir ){ uint t; float4 tf; t = *keyA; *keyA = *keyB; *keyB = t; tf = *valA; *valA = *valB; *valB = tf; } } void ComparatorLocal( __local uint *keyA, __local float4 *valA, __local uint *keyB, __local float4 *valB, uint dir ){ if( (lt_func(*valA, *valB)) == dir ){ uint t; float4 tf; t = *keyA; *keyA = *keyB; *keyB = t; tf = *valA; *valA = *valB; *valB = tf; } } __kernel void bitonicSortLocal1( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal ){ __local uint l_key[LOCAL_SIZE_LIMIT]; __local float4 l_val[LOCAL_SIZE_LIMIT]; //Offset to the beginning of subarray and load data d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1); for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){ //Bitonic merge uint ddd = (comparatorI & (size / 2)) != 0; for(uint stride = size / 2; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } } //Odd / even arrays of LOCAL_SIZE_LIMIT elements //sorted in opposite directions { uint ddd = (get_group_id(0) & 1); for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } } barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; } //Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT __kernel void bitonicMergeGlobal( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal, uint arrayLength, uint size, uint stride, uint dir ){ uint global_comparatorI = get_global_id(0); uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); //Bitonic merge uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); uint keyA = d_SrcKey[pos + 0]; float4 valA = d_SrcVal[pos + 0]; uint keyB = d_SrcKey[pos + stride]; float4 valB = d_SrcVal[pos + stride]; ComparatorPrivate( &keyA, &valA, &keyB, &valB, ddd ); d_DstKey[pos + 0] = keyA; d_DstVal[pos + 0] = valA; d_DstKey[pos + stride] = keyB; d_DstVal[pos + stride] = valB; } //Combined bitonic merge steps for //'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2] __kernel void bitonicMergeLocal( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal, uint arrayLength, uint stride, uint size, uint dir ){ __local uint l_key[LOCAL_SIZE_LIMIT]; __local float4 l_val[LOCAL_SIZE_LIMIT]; d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; //Bitonic merge uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); for(; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; }
Hi kim,
Yeah i missed the upper line. Sorry about that. I am also able to reproduce the crash in SKA. It would be nice if you can also post the host code.
Thanks for reporting this.
Here's the host code that comes with the bitonic sort kernel for launching it. I haven't actually coded up any host code yet for my particular case (sorting float4 vectors) I've only tried to build the kernel.
/* * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ // standard utilities and systems includes #include <oclUtils.h> //////////////////////////////////////////////////////////////////////////////// // OpenCL launcher for bitonic sort kernel //////////////////////////////////////////////////////////////////////////////// //OpenCL bitonic sort program static cl_program cpBitonicSort; //OpenCL bitonic sort kernels static cl_kernel ckBitonicSortLocal1, ckBitonicMergeGlobal, ckBitonicMergeLocal; //Default command queue for bitonic kernels static cl_command_queue cqDefaultCommandQue; extern "C" void initBitonicSort(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv) { cl_int ciErrNum; size_t kernelLength; shrLog("...loading BitonicSort_b.cl\n"); char *cBitonicSort = oclLoadProgSource(shrFindFilePath("BitonicSort_b.cl", argv[0]), "// My comment\n", &kernelLength); oclCheckError(cBitonicSort != NULL, shrTRUE); shrLog("...creating bitonic sort program\n"); cpBitonicSort = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBitonicSort, &kernelLength, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("...building bitonic sort program\n"); ciErrNum = clBuildProgram(cpBitonicSort, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpBitonicSort, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpBitonicSort, oclGetFirstDev(cxGPUContext), "oclBitonicSort.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } shrLog("...creating bitonic sort kernels\n"); ckBitonicSortLocal1 = clCreateKernel(cpBitonicSort, "bitonicSortLocal1", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); ckBitonicMergeGlobal = clCreateKernel(cpBitonicSort, "bitonicMergeGlobal", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); ckBitonicMergeLocal = clCreateKernel(cpBitonicSort, "bitonicMergeLocal", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Save default command queue cqDefaultCommandQue = cqParamCommandQue; //Discard temp storage free(cBitonicSort); } extern "C" void closeBitonicSort(void) { cl_int ciErrNum; ciErrNum = clReleaseKernel(ckBitonicMergeLocal); ciErrNum |= clReleaseKernel(ckBitonicMergeGlobal); ciErrNum |= clReleaseKernel(ckBitonicSortLocal1); ciErrNum |= clReleaseKernel(ckBitonicSortLocal); ciErrNum |= clReleaseProgram(cpBitonicSort); oclCheckError(ciErrNum, CL_SUCCESS); } static cl_uint factorRadix2(cl_uint& log2L, cl_uint L){ if(!L){ log2L = 0; return 0; }else{ for(log2L = 0; (L & 1) == 0; L >>= 1, log2L++); return L; } } //Note: logically shared with BitonicSort_b.cl! static const unsigned int LOCAL_SIZE_LIMIT = 512U; extern"C" void bitonicSort( cl_command_queue cqCommandQueue, cl_mem d_DstKey, cl_mem d_DstVal, cl_mem d_SrcKey, cl_mem d_SrcVal, unsigned int batch, unsigned int arrayLength, unsigned int dir ){ if(arrayLength < 2) return; //Only power-of-two array lengths are supported so far cl_uint log2L; cl_uint factorizationRemainder = factorRadix2(log2L, arrayLength); oclCheckError( factorizationRemainder == 1, shrTRUE ); if(!cqCommandQueue) cqCommandQueue = cqDefaultCommandQue; dir = (dir != 0); cl_int ciErrNum; size_t localWorkSize, globalWorkSize; //Launch bitonicSortLocal1 ciErrNum = clSetKernelArg(ckBitonicSortLocal1, 0, sizeof(cl_mem), (void *)&d_DstKey); ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 1, sizeof(cl_mem), (void *)&d_DstVal); ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 2, sizeof(cl_mem), (void *)&d_SrcKey); ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 3, sizeof(cl_mem), (void *)&d_SrcVal); oclCheckError(ciErrNum, CL_SUCCESS); localWorkSize = LOCAL_SIZE_LIMIT / 2; globalWorkSize = batch * arrayLength / 2; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); for(unsigned int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) { for(unsigned stride = size / 2; stride > 0; stride >>= 1) { if(stride >= LOCAL_SIZE_LIMIT) { //Launch bitonicMergeGlobal ciErrNum = clSetKernelArg(ckBitonicMergeGlobal, 0, sizeof(cl_mem), (void *)&d_DstKey); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 1, sizeof(cl_mem), (void *)&d_DstVal); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 2, sizeof(cl_mem), (void *)&d_DstKey); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 3, sizeof(cl_mem), (void *)&d_DstVal); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 4, sizeof(cl_uint), (void *)&arrayLength); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 5, sizeof(cl_uint), (void *)&size); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 6, sizeof(cl_uint), (void *)&stride); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 7, sizeof(cl_uint), (void *)&dir); oclCheckError(ciErrNum, CL_SUCCESS); localWorkSize = LOCAL_SIZE_LIMIT / 4; globalWorkSize = batch * arrayLength / 2; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicMergeGlobal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); } else { //Launch bitonicMergeLocal ciErrNum = clSetKernelArg(ckBitonicMergeLocal, 0, sizeof(cl_mem), (void *)&d_DstKey); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 1, sizeof(cl_mem), (void *)&d_DstVal); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 2, sizeof(cl_mem), (void *)&d_DstKey); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 3, sizeof(cl_mem), (void *)&d_DstVal); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 4, sizeof(cl_uint), (void *)&arrayLength); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 5, sizeof(cl_uint), (void *)&stride); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 6, sizeof(cl_uint), (void *)&size); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 7, sizeof(cl_uint), (void *)&dir); oclCheckError(ciErrNum, CL_SUCCESS); localWorkSize = LOCAL_SIZE_LIMIT / 2; globalWorkSize = batch * arrayLength / 2; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); break; } } } }
After a little trial and error I've determined the line that causes the crash is:
#26: const sep_float_sig lzero = {0.5};
Replaced with:
sep_float_sig lzero;
lzero.d = 0.5;
Compiles fine now.