7 Replies Latest reply on Mar 23, 2011 12:52 PM by Kim2

    Kernel crashes kernel analyzer

    Kim2

      I'm trying to port some code from C++ to opencl but some part of it is crashing the opencl kernel analyzer 1.7. The code is meant to perform a bitonic sort using a relative Z-order comparison function. There should be no problem with the bitonic sort code since it's from an Nvidia sample.

      Thanks in advance to anyone who takes a look for me.

      #define LOCAL_SIZE_LIMIT 512U typedef union { //! Union accessor of floating point type float d; //! Union accessor as an array of integer types unsigned long int i[1]; } sep_float_sig; typedef struct { sep_float_sig sig; int exp; float val; } sep_float; void xor_sep_float_sig(sep_float_sig *a, sep_float_sig *b, sep_float_sig *c, const sep_float_sig zero) { for(unsigned int i = 0; i < sizeof(sep_float_sig)/sizeof(unsigned long int); ++i) { c[0].i[i]=(a[0].i[i]^b[0].i[i])|zero.i[i]; } } int msdb(sep_float x, sep_float y) { const sep_float_sig lzero = {0.5}; if(x.val == y.val) return 0; else if(x.exp == y.exp) { xor_sep_float_sig(&x.sig, &y.sig, &x.sig, lzero); frexp(x.sig.d-0.5, &y.exp); return x.exp+y.exp; } else if(x.exp > y.exp) return x.exp; else return y.exp; } bool lt_func(const float4 p, const float4 q) { int y, x; unsigned int k, j; sep_float pc, qc; j = 0; x = -INT_MAX; if((p.x < 0) != (q.x < 0)) return p.x < q.x; pc.val = p.x; pc.sig.d = (float) frexp(p.x, &pc.exp); qc.val = q.x; qc.sig.d = (float) frexp(q.x, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 0; x = y; } if((p.y < 0) != (q.y < 0)) return p.y < q.y; pc.val = p.y; pc.sig.d = (float) frexp(p.y, &pc.exp); qc.val = q.y; qc.sig.d = (float) frexp(q.y, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 1; x = y; } if((p.z < 0) != (q.z < 0)) return p.z < q.z; pc.val = p.z; pc.sig.d = (float) frexp(p.z, &pc.exp); qc.val = q.z; qc.sig.d = (float) frexp(q.z, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 2; x = y; } if(j == 0) { return p.x < q.x; } else if(j == 1) { return p.y < q.y; } else if(j == 2) { return p.z < q.z; } } inline void ComparatorPrivate( uint *keyA, float4 *valA, uint *keyB, float4 *valB, uint dir ){ if( (lt_func(*valA, *valB)) == dir ){ uint t; float4 tf; t = *keyA; *keyA = *keyB; *keyB = t; tf = *valA; *valA = *valB; *valB = tf; } } inline void ComparatorLocal( __local uint *keyA, __local float4 *valA, __local uint *keyB, __local float4 *valB, uint dir ){ if( (lt_func(*valA, *valB)) == dir ){ uint t; float4 tf; t = *keyA; *keyA = *keyB; *keyB = t; tf = *valA; *valA = *valB; *valB = tf; } } __kernel void bitonicSortLocal1( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal ){ __local uint l_key[LOCAL_SIZE_LIMIT]; __local float4 l_val[LOCAL_SIZE_LIMIT]; //Offset to the beginning of subarray and load data d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1); for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){ //Bitonic merge uint ddd = (comparatorI & (size / 2)) != 0; for(uint stride = size / 2; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } } //Odd / even arrays of LOCAL_SIZE_LIMIT elements //sorted in opposite directions { uint ddd = (get_group_id(0) & 1); for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } } barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; } //Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT __kernel void bitonicMergeGlobal( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal, uint arrayLength, uint size, uint stride, uint dir ){ uint global_comparatorI = get_global_id(0); uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); //Bitonic merge uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); uint keyA = d_SrcKey[pos + 0]; float4 valA = d_SrcVal[pos + 0]; uint keyB = d_SrcKey[pos + stride]; float4 valB = d_SrcVal[pos + stride]; ComparatorPrivate( &keyA, &valA, &keyB, &valB, ddd ); d_DstKey[pos + 0] = keyA; d_DstVal[pos + 0] = valA; d_DstKey[pos + stride] = keyB; d_DstVal[pos + stride] = valB; } //Combined bitonic merge steps for //'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2] __kernel void bitonicMergeLocal( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal, uint arrayLength, uint stride, uint size, uint dir ){ __local uint l_key[LOCAL_SIZE_LIMIT]; __local float4 l_val[LOCAL_SIZE_LIMIT]; d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; //Bitonic merge uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); for(; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; }

        • Kernel crashes stream profiler
          himanshu.gautam

          Please post the system info:CPU,GPU,SDK,DRIVER,OS.

          Are you saying you are able to run the kernels inside a program, but not able to profile it. I get some compilation errors in clBuildProgram.

           

            • Kernel crashes kernel analyzer
              Kim2

              Trying to build the kernel also crashes my program. My apologies, I should have said it crashes kernel analyzer 1.7, which I'm using to debug. I'll fix up the OP.

               

              CPU: Intel i7 920

              OS: Windows 7 64 bit

              GPU: HD5870

              SDK: 2.3

              Catalyst 11.2

                • Kernel crashes kernel analyzer
                  himanshu.gautam

                  Kim2,

                  I get the following build log while trying to build it.

                  Platform 0 : Advanced Micro Devices, Inc.

                  Selected Platform Vendor : Advanced Micro Devices, Inc.
                  Device 0 : Juniper


                                          BUILD LOG
                   ************************************************
                  C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(183): warning:
                            double-precision constant is represented as single-precision
                            constant because double is not enabled
                        const sep_float_sig lzero = {0.5};
                                                     ^

                  C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(190): warning:
                            double-precision constant is represented as single-precision
                            constant because double is not enabled
                            frexp(x.sig.d-0.5, &y.exp);
                                          ^

                  C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(264): warning: missing
                            return statement at end of non-void function "lt_func"
                    }
                    ^

                  C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(202): warning: variable "k"
                            was declared but never referenced
                        unsigned int k, j;
                                     ^

                  C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(302): error: identifier
                            "LOCAL_SIZE_LIMIT" is undefined
                        __local uint l_key[LOCAL_SIZE_LIMIT];
                                           ^

                  C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl(400): error: identifier
                            "LOCAL_SIZE_LIMIT" is undefined
                        __local uint l_key[LOCAL_SIZE_LIMIT];
                                           ^

                  2 errors detected in the compilation of "C:\Users\himanshu\AppData\Local\Temp\OCL6022.tmp.cl".

                   ************************************************
                  Error: clBuildProgram failed. Error code : CL_BUILD_PROGRAM_FAILURE
                  Press any key to continue . . .

                   

                   

                  I also saw you using inline inside the kernel, which I don't think is allowed.

                  Thanks.

                  Try to remove these errors before trying it in SKA.

                    • Kernel crashes kernel analyzer
                      Kim2

                      I think you may have accidentally clipped off the #define LOCAL_SIZE_LIMIT when you copy-pasted the code. I fixed up the warnings but I'm still getting a crash. Here's the code fixed for the warnings.

                      Thanks for your help, by the way.

                      #define LOCAL_SIZE_LIMIT 512U typedef union { //! Union accessor of floating point type float d; //! Union accessor as an array of integer types unsigned long int i[1]; } sep_float_sig; typedef struct { sep_float_sig sig; int exp; float val; } sep_float; void xor_sep_float_sig(sep_float_sig *a, sep_float_sig *b, sep_float_sig *c, const sep_float_sig zero) { for(unsigned int i = 0; i < sizeof(sep_float_sig)/sizeof(unsigned long int); ++i) { c[0].i[i]=(a[0].i[i]^b[0].i[i])|zero.i[i]; } } int msdb(sep_float x, sep_float y) { const sep_float_sig lzero = {0.5}; if(x.val == y.val) return 0; else if(x.exp == y.exp) { xor_sep_float_sig(&x.sig, &y.sig, &x.sig, lzero); frexp(x.sig.d-0.5, &y.exp); return x.exp+y.exp; } else if(x.exp > y.exp) return x.exp; else return y.exp; } bool lt_func(const float4 p, const float4 q) { bool result; int y, x; unsigned int j; sep_float pc, qc; j = 0; x = -INT_MAX; if((p.x < 0) != (q.x < 0)) return p.x < q.x; pc.val = p.x; pc.sig.d = (float) frexp(p.x, &pc.exp); qc.val = q.x; qc.sig.d = (float) frexp(q.x, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 0; x = y; } if((p.y < 0) != (q.y < 0)) return p.y < q.y; pc.val = p.y; pc.sig.d = (float) frexp(p.y, &pc.exp); qc.val = q.y; qc.sig.d = (float) frexp(q.y, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 1; x = y; } if((p.z < 0) != (q.z < 0)) return p.z < q.z; pc.val = p.z; pc.sig.d = (float) frexp(p.z, &pc.exp); qc.val = q.z; qc.sig.d = (float) frexp(q.z, &qc.exp); y = msdb(pc, qc); if(x < y) { j = 2; x = y; } if(j == 0) { result = p.x < q.x; } else if(j == 1) { result = p.y < q.y; } else if(j == 2) { result = p.z < q.z; } return result; } void ComparatorPrivate( uint *keyA, float4 *valA, uint *keyB, float4 *valB, uint dir ){ if( (lt_func(*valA, *valB)) == dir ){ uint t; float4 tf; t = *keyA; *keyA = *keyB; *keyB = t; tf = *valA; *valA = *valB; *valB = tf; } } void ComparatorLocal( __local uint *keyA, __local float4 *valA, __local uint *keyB, __local float4 *valB, uint dir ){ if( (lt_func(*valA, *valB)) == dir ){ uint t; float4 tf; t = *keyA; *keyA = *keyB; *keyB = t; tf = *valA; *valA = *valB; *valB = tf; } } __kernel void bitonicSortLocal1( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal ){ __local uint l_key[LOCAL_SIZE_LIMIT]; __local float4 l_val[LOCAL_SIZE_LIMIT]; //Offset to the beginning of subarray and load data d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; uint comparatorI = get_global_id(0) & ((LOCAL_SIZE_LIMIT / 2) - 1); for(uint size = 2; size < LOCAL_SIZE_LIMIT; size <<= 1){ //Bitonic merge uint ddd = (comparatorI & (size / 2)) != 0; for(uint stride = size / 2; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } } //Odd / even arrays of LOCAL_SIZE_LIMIT elements //sorted in opposite directions { uint ddd = (get_group_id(0) & 1); for(uint stride = LOCAL_SIZE_LIMIT / 2; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } } barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; } //Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT __kernel void bitonicMergeGlobal( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal, uint arrayLength, uint size, uint stride, uint dir ){ uint global_comparatorI = get_global_id(0); uint comparatorI = global_comparatorI & (arrayLength / 2 - 1); //Bitonic merge uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1)); uint keyA = d_SrcKey[pos + 0]; float4 valA = d_SrcVal[pos + 0]; uint keyB = d_SrcKey[pos + stride]; float4 valB = d_SrcVal[pos + stride]; ComparatorPrivate( &keyA, &valA, &keyB, &valB, ddd ); d_DstKey[pos + 0] = keyA; d_DstVal[pos + 0] = valA; d_DstKey[pos + stride] = keyB; d_DstVal[pos + stride] = valB; } //Combined bitonic merge steps for //'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2] __kernel void bitonicMergeLocal( __global uint *d_DstKey, __global float4 *d_DstVal, __global uint *d_SrcKey, __global float4 *d_SrcVal, uint arrayLength, uint stride, uint size, uint dir ){ __local uint l_key[LOCAL_SIZE_LIMIT]; __local float4 l_val[LOCAL_SIZE_LIMIT]; d_SrcKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_SrcVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstKey += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); d_DstVal += get_group_id(0) * LOCAL_SIZE_LIMIT + get_local_id(0); l_key[get_local_id(0) + 0] = d_SrcKey[ 0]; l_val[get_local_id(0) + 0] = d_SrcVal[ 0]; l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcKey[(LOCAL_SIZE_LIMIT / 2)]; l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)] = d_SrcVal[(LOCAL_SIZE_LIMIT / 2)]; //Bitonic merge uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1); uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 ); for(; stride > 0; stride >>= 1){ barrier(CLK_LOCAL_MEM_FENCE); uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1)); ComparatorLocal( &l_key[pos + 0], &l_val[pos + 0], &l_key[pos + stride], &l_val[pos + stride], ddd ); } barrier(CLK_LOCAL_MEM_FENCE); d_DstKey[ 0] = l_key[get_local_id(0) + 0]; d_DstVal[ 0] = l_val[get_local_id(0) + 0]; d_DstKey[(LOCAL_SIZE_LIMIT / 2)] = l_key[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; d_DstVal[(LOCAL_SIZE_LIMIT / 2)] = l_val[get_local_id(0) + (LOCAL_SIZE_LIMIT / 2)]; }

                        • Kernel crashes kernel analyzer
                          himanshu.gautam

                          Hi kim,

                          Yeah i missed the upper line. Sorry about that. I am also able to reproduce the crash in SKA. It would be nice if you can also post the host code.

                          Thanks for reporting this.

                            • Kernel crashes kernel analyzer
                              Kim2

                              Here's the host code that comes with the bitonic sort kernel for launching it. I haven't actually coded up any host code yet for my particular case (sorting float4 vectors) I've only tried to build the kernel.

                               

                               

                              /* * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ // standard utilities and systems includes #include <oclUtils.h> //////////////////////////////////////////////////////////////////////////////// // OpenCL launcher for bitonic sort kernel //////////////////////////////////////////////////////////////////////////////// //OpenCL bitonic sort program static cl_program cpBitonicSort; //OpenCL bitonic sort kernels static cl_kernel ckBitonicSortLocal1, ckBitonicMergeGlobal, ckBitonicMergeLocal; //Default command queue for bitonic kernels static cl_command_queue cqDefaultCommandQue; extern "C" void initBitonicSort(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv) { cl_int ciErrNum; size_t kernelLength; shrLog("...loading BitonicSort_b.cl\n"); char *cBitonicSort = oclLoadProgSource(shrFindFilePath("BitonicSort_b.cl", argv[0]), "// My comment\n", &kernelLength); oclCheckError(cBitonicSort != NULL, shrTRUE); shrLog("...creating bitonic sort program\n"); cpBitonicSort = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBitonicSort, &kernelLength, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("...building bitonic sort program\n"); ciErrNum = clBuildProgram(cpBitonicSort, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpBitonicSort, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpBitonicSort, oclGetFirstDev(cxGPUContext), "oclBitonicSort.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } shrLog("...creating bitonic sort kernels\n"); ckBitonicSortLocal1 = clCreateKernel(cpBitonicSort, "bitonicSortLocal1", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); ckBitonicMergeGlobal = clCreateKernel(cpBitonicSort, "bitonicMergeGlobal", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); ckBitonicMergeLocal = clCreateKernel(cpBitonicSort, "bitonicMergeLocal", &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Save default command queue cqDefaultCommandQue = cqParamCommandQue; //Discard temp storage free(cBitonicSort); } extern "C" void closeBitonicSort(void) { cl_int ciErrNum; ciErrNum = clReleaseKernel(ckBitonicMergeLocal); ciErrNum |= clReleaseKernel(ckBitonicMergeGlobal); ciErrNum |= clReleaseKernel(ckBitonicSortLocal1); ciErrNum |= clReleaseKernel(ckBitonicSortLocal); ciErrNum |= clReleaseProgram(cpBitonicSort); oclCheckError(ciErrNum, CL_SUCCESS); } static cl_uint factorRadix2(cl_uint& log2L, cl_uint L){ if(!L){ log2L = 0; return 0; }else{ for(log2L = 0; (L & 1) == 0; L >>= 1, log2L++); return L; } } //Note: logically shared with BitonicSort_b.cl! static const unsigned int LOCAL_SIZE_LIMIT = 512U; extern"C" void bitonicSort( cl_command_queue cqCommandQueue, cl_mem d_DstKey, cl_mem d_DstVal, cl_mem d_SrcKey, cl_mem d_SrcVal, unsigned int batch, unsigned int arrayLength, unsigned int dir ){ if(arrayLength < 2) return; //Only power-of-two array lengths are supported so far cl_uint log2L; cl_uint factorizationRemainder = factorRadix2(log2L, arrayLength); oclCheckError( factorizationRemainder == 1, shrTRUE ); if(!cqCommandQueue) cqCommandQueue = cqDefaultCommandQue; dir = (dir != 0); cl_int ciErrNum; size_t localWorkSize, globalWorkSize; //Launch bitonicSortLocal1 ciErrNum = clSetKernelArg(ckBitonicSortLocal1, 0, sizeof(cl_mem), (void *)&d_DstKey); ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 1, sizeof(cl_mem), (void *)&d_DstVal); ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 2, sizeof(cl_mem), (void *)&d_SrcKey); ciErrNum |= clSetKernelArg(ckBitonicSortLocal1, 3, sizeof(cl_mem), (void *)&d_SrcVal); oclCheckError(ciErrNum, CL_SUCCESS); localWorkSize = LOCAL_SIZE_LIMIT / 2; globalWorkSize = batch * arrayLength / 2; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicSortLocal1, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); for(unsigned int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) { for(unsigned stride = size / 2; stride > 0; stride >>= 1) { if(stride >= LOCAL_SIZE_LIMIT) { //Launch bitonicMergeGlobal ciErrNum = clSetKernelArg(ckBitonicMergeGlobal, 0, sizeof(cl_mem), (void *)&d_DstKey); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 1, sizeof(cl_mem), (void *)&d_DstVal); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 2, sizeof(cl_mem), (void *)&d_DstKey); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 3, sizeof(cl_mem), (void *)&d_DstVal); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 4, sizeof(cl_uint), (void *)&arrayLength); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 5, sizeof(cl_uint), (void *)&size); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 6, sizeof(cl_uint), (void *)&stride); ciErrNum |= clSetKernelArg(ckBitonicMergeGlobal, 7, sizeof(cl_uint), (void *)&dir); oclCheckError(ciErrNum, CL_SUCCESS); localWorkSize = LOCAL_SIZE_LIMIT / 4; globalWorkSize = batch * arrayLength / 2; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicMergeGlobal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); } else { //Launch bitonicMergeLocal ciErrNum = clSetKernelArg(ckBitonicMergeLocal, 0, sizeof(cl_mem), (void *)&d_DstKey); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 1, sizeof(cl_mem), (void *)&d_DstVal); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 2, sizeof(cl_mem), (void *)&d_DstKey); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 3, sizeof(cl_mem), (void *)&d_DstVal); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 4, sizeof(cl_uint), (void *)&arrayLength); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 5, sizeof(cl_uint), (void *)&stride); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 6, sizeof(cl_uint), (void *)&size); ciErrNum |= clSetKernelArg(ckBitonicMergeLocal, 7, sizeof(cl_uint), (void *)&dir); oclCheckError(ciErrNum, CL_SUCCESS); localWorkSize = LOCAL_SIZE_LIMIT / 2; globalWorkSize = batch * arrayLength / 2; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBitonicMergeLocal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); break; } } } }

                                • Kernel crashes kernel analyzer
                                  Kim2

                                  After a little trial and error I've determined the line that causes the crash is:

                                   

                                  #26: const sep_float_sig lzero = {0.5};

                                   

                                  Replaced with:

                                   

                                      sep_float_sig lzero;
                                      lzero.d = 0.5;   

                                   

                                  Compiles fine now.