karbous

Why can I debug only first compiled OpenCL kernel with GDB?

Discussion created by karbous on Sep 12, 2010
Latest reply on May 23, 2011 by himanshu.gautam

I'm having problem debugging OpenCL kernels with gdb and I discovered, that it depends which kernel I compile with clBuildProgram as first (only first one I can debug). So I tried to change SDK sample code (Binary Search), I just duplicated the kernel and analogously, ability to debug the kernel depends on the order of clCreateKernel commands. (I enclosed the changed Binary Search code. Only second created kernel I can debug, so only binarySearch2)

Am I doing something wrong?

I'm running ati-stream-sdk-v2.2-ln32 on Ubuntu Lucid 10.04, GDB 7.1-ubuntu. 

//************************************************************************************// ///* BinarySearch.cpp - added kernel2 */// #include "BinarySearch.hpp" #include <malloc.h> /* * \brief set up program input data */ int BinarySearch::setupBinarySearch() { /* allocate and init memory used by host */ cl_uint inputSizeBytes = length * sizeof(cl_uint); input = (cl_uint *) malloc(inputSizeBytes); if(input==NULL) { sampleCommon->error("Failed to allocate host memory. (input)"); return SDK_FAILURE; } cl_uint max = length * 20; /* random initialisation of input */ input[0] = 0; for(cl_uint i = 1; i < length; i++) input[i] = input[i-1] + (cl_uint) (max * rand()/(float)RAND_MAX); #if defined (_WIN32) output = (cl_uint *)_aligned_malloc(sizeof(cl_uint4), 16); #else output = (cl_uint *)memalign(16, sizeof(cl_uint4)); #endif if(output==NULL) { sampleCommon->error("Failed to allocate host memory. (output)"); return SDK_FAILURE; } /* * Unless quiet mode has been enabled, print the INPUT array. */ if(!quiet) { sampleCommon->printArray<cl_uint>( "Sorted Input", input, length, 1); } return SDK_SUCCESS; } /* * \brief OpenCL related initialisations are done here. * Context, Device list, Command Queue are set up. * Calls are made to set up OpenCL memory buffers that this program uses * and to load the programs into memory and get kernel handles. * Load and build OpenCL program and get kernel handles. * Set up OpenCL memory buffers used by this program. */ int BinarySearch::setupCL(void) { cl_int status = 0; size_t deviceListSize; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateContextFromType failed.")) return SDK_FAILURE; /* First, get the size of device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id *)malloc(deviceListSize); if(devices==NULL) { sampleCommon->error("Failed to allocate memory (devices)."); return SDK_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetGetContextInfo failed.")) return SDK_FAILURE; { /* The block is to move the declaration of prop closer to its use */ cl_command_queue_properties prop = 0; if(timing) prop |= CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue( context, devices[0], prop, &status); if(!sampleCommon->checkVal( status, 0, "clCreateCommandQueue failed.")) return SDK_FAILURE; } inputBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * length, input, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (inputBuffer)")) return SDK_FAILURE; outputBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint4), output, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (outputBuffer)")) return SDK_FAILURE; /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); kernelPath.append("BinarySearch_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; /* create a cl program executable for all the devices specified */ status = clBuildProgram(program, 1, devices, "-g", NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { return SDK_FAILURE; } buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sampleCommon->error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) { return SDK_FAILURE; } } /* get a kernel object handle for a kernel with the given name */ kernel1 = clCreateKernel(program, "binarySearch1", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) return SDK_FAILURE; kernel2 = clCreateKernel(program, "binarySearch2", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int BinarySearch::runCLKernels(void) { cl_int status; cl_event events[2]; size_t globalThreads[1]; size_t localThreads[1]; localThreads[0] = 256; numSubdivisions = length / (cl_uint)localThreads[0]; if(numSubdivisions < localThreads[0]) { numSubdivisions = (cl_uint)localThreads[0]; } globalThreads[0] = numSubdivisions; /* Check group size against kernelWorkGroupSize */ status = clGetKernelWorkGroupInfo(kernel1, devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetKernelWorkGroupInfo failed.")) { return SDK_FAILURE; } if((cl_uint)(localThreads[0]) > kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << localThreads[0] << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelWorkGroupSize << std::endl; std::cout << "Changing the group size to " << kernelWorkGroupSize << std::endl; } localThreads[0] = kernelWorkGroupSize; numSubdivisions = length / (cl_uint)localThreads[0]; if(numSubdivisions < localThreads[0]) { numSubdivisions = (cl_uint)localThreads[0]; } globalThreads[0] = numSubdivisions; } //return SDK_SUCCESS; cl_uint globalLowerBound = 0; cl_uint globalUpperBound = length - 1; cl_uint subdivSize = (globalUpperBound - globalLowerBound + 1)/numSubdivisions; cl_uint isElementFound = 0; if((input[0] > findMe) || (input[length-1] < findMe)) { output[0] = 0; output[1] = length - 1; output[2] = 0; return SDK_SUCCESS; } output[3] = 1; /*** Set appropruiate arguments to the kernel ***/ /* * First argument of the kernel is the output buffer */ status = clSetKernelArg( kernel1, 0, sizeof(cl_mem), (void *)&outputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 0(OutputBuffer) failed.")) return SDK_FAILURE; status = clSetKernelArg( kernel2, 0, sizeof(cl_mem), (void *)&outputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 0(OutputBuffer) failed.")) return SDK_FAILURE; /* * Second argument is input buffer */ status = clSetKernelArg( kernel1, 1, sizeof(cl_mem), (void *)&inputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 1(inputBuffer) failed.")) return SDK_FAILURE; status = clSetKernelArg( kernel2, 1, sizeof(cl_mem), (void *)&inputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 1(inputBuffer) failed.")) return SDK_FAILURE; /* * Third is the element we are looking for */ status = clSetKernelArg( kernel1, 2, sizeof(cl_uint), (void *)&findMe); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 2(findMe) failed.")) return SDK_FAILURE; status = clSetKernelArg( kernel2, 2, sizeof(cl_uint), (void *)&findMe); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 2(findMe) failed.")) return SDK_FAILURE; while(subdivSize > 1 && output[3] != 0) { output[3] = 0; /* Enqueue readBuffer*/ status = clEnqueueWriteBuffer( commandQueue, outputBuffer, CL_TRUE, 0, sizeof(cl_uint4), output, 0, NULL, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueWriteBuffer failed.")) return SDK_FAILURE; /* Wait for the write buffer to finish execution */ status = clWaitForEvents(1, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; clReleaseEvent(events[1]); /* * Fourth argument is the lower bound for the full segment for this pass. * Each thread derives its own lower and upper bound from this. */ status = clSetKernelArg( kernel2, 3, sizeof(cl_uint), (void *)&globalLowerBound); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 3(globalLowerBound) failed.")) return SDK_FAILURE; /* * Similar to the above, but it is the upper bound */ status = clSetKernelArg( kernel2, 4, sizeof(cl_uint), (void *)&globalUpperBound); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 4(globalUpperBound) failed.")) return SDK_FAILURE; /* * The size of the subdivision for each thread */ status = clSetKernelArg( kernel2, 5, sizeof(cl_uint), (void *)&subdivSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 5(sumdivSize) failed.")) return SDK_FAILURE; /* * Enqueue a kernel run call */ status = clEnqueueNDRangeKernel(commandQueue, kernel2, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) return SDK_FAILURE; } while(subdivSize > 1 && output[3] != 0) { output[3] = 0; /* Enqueue readBuffer*/ status = clEnqueueWriteBuffer( commandQueue, outputBuffer, CL_TRUE, 0, sizeof(cl_uint4), output, 0, NULL, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueWriteBuffer failed.")) return SDK_FAILURE; /* Wait for the write buffer to finish execution */ status = clWaitForEvents(1, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; clReleaseEvent(events[1]); /* * Fourth argument is the lower bound for the full segment for this pass. * Each thread derives its own lower and upper bound from this. */ status = clSetKernelArg( kernel1, 3, sizeof(cl_uint), (void *)&globalLowerBound); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 3(globalLowerBound) failed.")) return SDK_FAILURE; /* * Similar to the above, but it is the upper bound */ status = clSetKernelArg( kernel1, 4, sizeof(cl_uint), (void *)&globalUpperBound); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 4(globalUpperBound) failed.")) return SDK_FAILURE; /* * The size of the subdivision for each thread */ status = clSetKernelArg( kernel1, 5, sizeof(cl_uint), (void *)&subdivSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 5(sumdivSize) failed.")) return SDK_FAILURE; /* * Enqueue a kernel run call */ status = clEnqueueNDRangeKernel(commandQueue, kernel1, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) return SDK_FAILURE; /* wait for the kernel call to finish execution */ status = clWaitForEvents(1, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; /* Enqueue readBuffer*/ status = clEnqueueReadBuffer( commandQueue, outputBuffer, CL_TRUE, 0, sizeof(cl_uint4), output, 0, NULL, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) return SDK_FAILURE; /* Wait for the read buffer to finish execution */ status = clWaitForEvents(1, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; clReleaseEvent(events[1]); globalLowerBound = output[0]; globalUpperBound = output[1]; subdivSize = (globalUpperBound - globalLowerBound + 1)/numSubdivisions; } for(cl_uint i=globalLowerBound; i<= globalUpperBound; i++) { if(input[i] == findMe) { output[0] = i; output[1] = i+1; output[2] = 1; return SDK_SUCCESS; } } /* The findMe element is not found from globalLowerBound to globalUpperBound */ output[2] = 0; return SDK_SUCCESS; } /** * CPU verification for the BinarySearch algorithm */ int BinarySearch::binarySearchCPUReference() { cl_uint globalLowerBound = output[0]; cl_uint globalUpperBound = output[1]; cl_uint isElementFound = output[2]; if(isElementFound) { if(input[globalLowerBound] == findMe) return 1; else return 0; } else { for(cl_uint i=0; i< length; i++) { if(input[i] == findMe) return 0; } return 1; } } int BinarySearch::initialize() { /*Call base class Initialize to get default configuration*/ if(!this->SDKSample::initialize()) return SDK_FAILURE; /* Now add customized options */ streamsdk::Option* array_length = new streamsdk::Option; if(!array_length) { sampleCommon->error("Memory allocation error.\n"); return SDK_FAILURE; } array_length->_sVersion = "x"; array_length->_lVersion = "length"; array_length->_description = "Lenght of the input array"; array_length->_type = streamsdk::CA_ARG_INT; array_length->_value = &length; sampleArgs->AddOption(array_length); delete array_length; streamsdk::Option* find_me = new streamsdk::Option; if(!find_me) { sampleCommon->error("Memory allocation error.\n"); return SDK_FAILURE; } find_me->_sVersion = "f"; find_me->_lVersion = "find"; find_me->_description = "element to be found"; find_me->_type = streamsdk::CA_ARG_INT; find_me->_value = &findMe; sampleArgs->AddOption(find_me); delete find_me; streamsdk::Option* sub_div = new streamsdk::Option; if(!sub_div) { sampleCommon->error("Memory allocation error.\n"); return SDK_FAILURE; } sub_div->_sVersion = "d"; sub_div->_lVersion = "divisions"; sub_div->_description = "number of subdivisions"; sub_div->_type = streamsdk::CA_ARG_INT; sub_div->_value = &numSubdivisions; sampleArgs->AddOption(sub_div); delete sub_div; streamsdk::Option* num_iterations = new streamsdk::Option; if(!num_iterations) { sampleCommon->error("Memory allocation error.\n"); return SDK_FAILURE; } num_iterations->_sVersion = "i"; num_iterations->_lVersion = "iterations"; num_iterations->_description = "Number of iterations for kernel execution"; num_iterations->_type = streamsdk::CA_ARG_INT; num_iterations->_value = &iterations; sampleArgs->AddOption(num_iterations); delete num_iterations; return SDK_SUCCESS; } int BinarySearch::setup() { if(!sampleCommon->isPowerOf2(length)) length = sampleCommon->roundToPowerOf2(length); if(setupBinarySearch()!=SDK_SUCCESS) return SDK_FAILURE; int timer = sampleCommon->createTimer(); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); if(setupCL()!=SDK_SUCCESS) return SDK_FAILURE; setupTime = (cl_double)(sampleCommon->readTimer(timer)); return SDK_SUCCESS; } int BinarySearch::run() { int timer = sampleCommon->createTimer(); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); std::cout << "Executing kernel for " << iterations << " iterations" << std::endl; std::cout << "-------------------------------------------" << std::endl; for(int i = 0; i < iterations; i++) { /* Arguments are set and execution call is enqueued on command buffer */ if(runCLKernels()!=SDK_SUCCESS) return SDK_FAILURE; } sampleCommon->stopTimer(timer); totalKernelTime = (double)(sampleCommon->readTimer(timer)) / iterations; if(!quiet) { cl_uint globalLowerBound = output[0]; cl_uint globalUpperBound = output[1]; cl_uint isElementFound = output[2]; printf("l = %d, u = %d, isfound = %d, fm = %d\n", globalLowerBound, globalUpperBound, isElementFound, findMe); } return SDK_SUCCESS; } int BinarySearch::verifyResults() { if(verify) { verificationInput = (cl_uint *) malloc(length*sizeof(cl_int)); if(verificationInput==NULL) { sampleCommon->error("Failed to allocate host memory. (verificationInput)"); return SDK_FAILURE; } memcpy(verificationInput, input, length*sizeof(cl_int)); /* reference implementation * it overwrites the input array with the output */ int refTimer = sampleCommon->createTimer(); sampleCommon->resetTimer(refTimer); sampleCommon->startTimer(refTimer); cl_int verified = binarySearchCPUReference(); sampleCommon->stopTimer(refTimer); referenceKernelTime = sampleCommon->readTimer(refTimer); /* compare the results and see if they match */ if(verified) { std::cout<<"Passed!\n"; return SDK_SUCCESS; } else { std::cout<<"Failed\n"; return SDK_FAILURE; } } return SDK_SUCCESS; } void BinarySearch::printStats() { std::string strArray[3] = {"Length", "Time(sec)", "kernelTime(sec)"}; std::string stats[3]; totalTime = setupTime + totalKernelTime; stats[0] = sampleCommon->toString(length , std::dec); stats[1] = sampleCommon->toString(totalTime, std::dec); stats[2] = sampleCommon->toString(totalKernelTime, std::dec); this->SDKSample::printStats(strArray, stats, 3); } int BinarySearch::cleanup() { /* Releases OpenCL resources (Context, Memory etc.) */ cl_int status; status = clReleaseKernel(kernel1); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseKernel(kernel2); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseProgram(program); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; status = clReleaseMemObject(inputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseMemObject failed.")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseMemObject failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(commandQueue); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseContext(context); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseContext failed.")) return SDK_FAILURE; /* release program resources (input memory etc.) */ if(input) free(input); if(output) { #if defined (_WIN32) _aligned_free(output); #else free(output); #endif } if(devices) free(devices); if(verificationInput) free(verificationInput); return SDK_SUCCESS; } int main(int argc, char * argv[]) { BinarySearch clBinarySearch("OpenCL Binary Search"); if(clBinarySearch.initialize()!=SDK_SUCCESS) return SDK_FAILURE; if(!clBinarySearch.parseCommandLine(argc, argv)) return SDK_FAILURE; if(clBinarySearch.setup()!=SDK_SUCCESS) return SDK_FAILURE; if(clBinarySearch.run()!=SDK_SUCCESS) return SDK_FAILURE; if(clBinarySearch.verifyResults()!=SDK_SUCCESS) return SDK_FAILURE; if(clBinarySearch.cleanup()!=SDK_SUCCESS) return SDK_FAILURE; clBinarySearch.printStats(); return SDK_SUCCESS; } //************************************************************************************// ///* BinarySearch_Kernels.hpp - simply added declaration of kernel1 and kernel2 */// #include <SDKCommandArgs.hpp> #include <SDKFile.hpp> /** * BinarySearch * Class implements OpenCL Binary Search sample * Derived from SDKSample base class */ class BinarySearch : public SDKSample { cl_uint seed; /**< Seed value for random number generation */ cl_double setupTime; /**< Time for setting up Opencl */ cl_double totalKernelTime; /**< Time for kernel execution */ cl_double totalProgramTime; /**< Time for program execution */ cl_double referenceKernelTime; /**< Time for reference implementation */ cl_uint findMe; /**< Search for this number in the input array */ cl_uint *input; /**< Input array */ cl_uint length; /**< Length of the input array */ cl_uint *output; /**< Output array */ cl_uint *verificationInput; /**< Input array for reference implementation */ cl_context context; /**< CL context */ cl_device_id *devices; /**< CL device list */ cl_mem inputBuffer; /**< CL memory buffer */ cl_mem outputBuffer; /**< CL output memory buffer */ cl_command_queue commandQueue; /**< CL command queue */ cl_program program; /**< CL program */ cl_kernel kernel1; /**< CL kernel */ cl_kernel kernel2; cl_uint numSubdivisions; /**< number of subdivisions to do N'ary Search */ size_t kernelWorkGroupSize; /**< Group Size returned by kernel */ int iterations; /**< Number of iterations for kernel execution */ public: /** * Constructor * Initialize member variables * @param name name of sample (string) */ BinarySearch(std::string name) : SDKSample(name) { seed = 123; input = NULL; output = NULL; verificationInput = NULL; findMe = 5; numSubdivisions = 8; /**< 8-ary Search by Default */ length = 512; setupTime = 0; totalKernelTime = 0; iterations = 1; } /** * Constructor * Initialize member variables * @param name name of sample (const char*) */ BinarySearch(const char* name) : SDKSample(name) { seed = 123; input = NULL; output = NULL; verificationInput = NULL; findMe = 5; numSubdivisions = 8; /**< Binary Search by Default */ length = 512; setupTime = 0; totalKernelTime = 0; iterations = 1; } /** * Allocate and initialize host memory array with random values * @return 1 on success and 0 on failure */ int setupBinarySearch(); /** * OpenCL related initialisations. * Set up Context, Device list, Command Queue, Memory buffers * Build CL kernel program executable * @return 1 on success and 0 on failure */ int setupCL(); /** * Set values for kernels' arguments, enqueue calls to the kernels * on to the command queue, wait till end of kernel execution. * Get kernel start and end time if timing is enabled * @return 1 on success and 0 on failure */ int runCLKernels(); /** * CPU verification for the BinarySearch algorithm */ int binarySearchCPUReference(); /** * Override from SDKSample. Print sample stats. */ void printStats(); /** * Override from SDKSample. Initialize * command line parser, add custom options */ int initialize(); /** * Override from SDKSample, adjust width and height * of execution domain, perform all sample setup */ int setup(); /** * Override from SDKSample * Run OpenCL BinarySearch */ int run(); /** * Override from SDKSample * Cleanup memory allocations */ int cleanup(); /** * Override from SDKSample * Verify against reference implementation */ int verifyResults(); }; #endif //************************************************************************************// ///* BinarySearch_Kernels.cl - simply duplicated original kernel*/// __kernel void binarySearch1( __global uint4 * outputArray, __const __global uint * sortedArray, const unsigned int findMe, const unsigned int globalLowerBound, const unsigned int globalUpperBound, const unsigned int subdivSize) { unsigned int tid = get_global_id(0); /* lower bound and upper bound are computed from segment and total search space for this pass * The total search space is known from global lower and upper bounds for this pass. */ unsigned int lowerBound = globalLowerBound + subdivSize * tid; unsigned int upperBound = lowerBound + subdivSize - 1; /* Then we find the elements at the two ends of the search space for this thread */ unsigned int lowerBoundElement = sortedArray[lowerBound]; unsigned int upperBoundElement = sortedArray[upperBound]; /* If the element to be found does not lie between them, then nothing left to do in this thread */ if( (lowerBoundElement > findMe) || (upperBoundElement < findMe)) { return; } else { /* However, if the element does lie between the lower and upper bounds of this thread's searchspace * we need to narrow down the search further in this search space */ /* The search space for this thread is marked in the output as being the total search space for the next pass */ outputArray[0].x = lowerBound; outputArray[0].y = upperBound; outputArray[0].w = 1; } } __kernel void binarySearch2( __global uint4 * outputArray, __const __global uint * sortedArray, const unsigned int findMe, const unsigned int globalLowerBound, const unsigned int globalUpperBound, const unsigned int subdivSize) { unsigned int tid = get_global_id(0); /* lower bound and upper bound are computed from segment and total search space for this pass * The total search space is known from global lower and upper bounds for this pass. */ unsigned int lowerBound = globalLowerBound + subdivSize * tid; unsigned int upperBound = lowerBound + subdivSize - 1; /* Then we find the elements at the two ends of the search space for this thread */ unsigned int lowerBoundElement = sortedArray[lowerBound]; unsigned int upperBoundElement = sortedArray[upperBound]; /* If the element to be found does not lie between them, then nothing left to do in this thread */ if( (lowerBoundElement > findMe) || (upperBoundElement < findMe)) { return; } else { /* However, if the element does lie between the lower and upper bounds of this thread's searchspace * we need to narrow down the search further in this search space */ /* The search space for this thread is marked in the output as being the total search space for the next pass */ outputArray[0].x = lowerBound; outputArray[0].y = upperBound; outputArray[0].w = 1; } }

Outcomes