2 Replies Latest reply on May 23, 2011 6:02 AM by himanshu.gautam

    Why can I debug only first compiled OpenCL kernel with GDB?

    karbous

      I'm having problem debugging OpenCL kernels with gdb and I discovered, that it depends which kernel I compile with clBuildProgram as first (only first one I can debug). So I tried to change SDK sample code (Binary Search), I just duplicated the kernel and analogously, ability to debug the kernel depends on the order of clCreateKernel commands. (I enclosed the changed Binary Search code. Only second created kernel I can debug, so only binarySearch2)

      Am I doing something wrong?

      I'm running ati-stream-sdk-v2.2-ln32 on Ubuntu Lucid 10.04, GDB 7.1-ubuntu. 

      //************************************************************************************// ///* BinarySearch.cpp - added kernel2 */// #include "BinarySearch.hpp" #include <malloc.h> /* * \brief set up program input data */ int BinarySearch::setupBinarySearch() { /* allocate and init memory used by host */ cl_uint inputSizeBytes = length * sizeof(cl_uint); input = (cl_uint *) malloc(inputSizeBytes); if(input==NULL) { sampleCommon->error("Failed to allocate host memory. (input)"); return SDK_FAILURE; } cl_uint max = length * 20; /* random initialisation of input */ input[0] = 0; for(cl_uint i = 1; i < length; i++) input[i] = input[i-1] + (cl_uint) (max * rand()/(float)RAND_MAX); #if defined (_WIN32) output = (cl_uint *)_aligned_malloc(sizeof(cl_uint4), 16); #else output = (cl_uint *)memalign(16, sizeof(cl_uint4)); #endif if(output==NULL) { sampleCommon->error("Failed to allocate host memory. (output)"); return SDK_FAILURE; } /* * Unless quiet mode has been enabled, print the INPUT array. */ if(!quiet) { sampleCommon->printArray<cl_uint>( "Sorted Input", input, length, 1); } return SDK_SUCCESS; } /* * \brief OpenCL related initialisations are done here. * Context, Device list, Command Queue are set up. * Calls are made to set up OpenCL memory buffers that this program uses * and to load the programs into memory and get kernel handles. * Load and build OpenCL program and get kernel handles. * Set up OpenCL memory buffers used by this program. */ int BinarySearch::setupCL(void) { cl_int status = 0; size_t deviceListSize; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateContextFromType failed.")) return SDK_FAILURE; /* First, get the size of device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id *)malloc(deviceListSize); if(devices==NULL) { sampleCommon->error("Failed to allocate memory (devices)."); return SDK_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetGetContextInfo failed.")) return SDK_FAILURE; { /* The block is to move the declaration of prop closer to its use */ cl_command_queue_properties prop = 0; if(timing) prop |= CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue( context, devices[0], prop, &status); if(!sampleCommon->checkVal( status, 0, "clCreateCommandQueue failed.")) return SDK_FAILURE; } inputBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * length, input, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (inputBuffer)")) return SDK_FAILURE; outputBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint4), output, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (outputBuffer)")) return SDK_FAILURE; /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); kernelPath.append("BinarySearch_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; /* create a cl program executable for all the devices specified */ status = clBuildProgram(program, 1, devices, "-g", NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { return SDK_FAILURE; } buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sampleCommon->error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) { return SDK_FAILURE; } } /* get a kernel object handle for a kernel with the given name */ kernel1 = clCreateKernel(program, "binarySearch1", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) return SDK_FAILURE; kernel2 = clCreateKernel(program, "binarySearch2", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int BinarySearch::runCLKernels(void) { cl_int status; cl_event events[2]; size_t globalThreads[1]; size_t localThreads[1]; localThreads[0] = 256; numSubdivisions = length / (cl_uint)localThreads[0]; if(numSubdivisions < localThreads[0]) { numSubdivisions = (cl_uint)localThreads[0]; } globalThreads[0] = numSubdivisions; /* Check group size against kernelWorkGroupSize */ status = clGetKernelWorkGroupInfo(kernel1, devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetKernelWorkGroupInfo failed.")) { return SDK_FAILURE; } if((cl_uint)(localThreads[0]) > kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << localThreads[0] << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelWorkGroupSize << std::endl; std::cout << "Changing the group size to " << kernelWorkGroupSize << std::endl; } localThreads[0] = kernelWorkGroupSize; numSubdivisions = length / (cl_uint)localThreads[0]; if(numSubdivisions < localThreads[0]) { numSubdivisions = (cl_uint)localThreads[0]; } globalThreads[0] = numSubdivisions; } //return SDK_SUCCESS; cl_uint globalLowerBound = 0; cl_uint globalUpperBound = length - 1; cl_uint subdivSize = (globalUpperBound - globalLowerBound + 1)/numSubdivisions; cl_uint isElementFound = 0; if((input[0] > findMe) || (input[length-1] < findMe)) { output[0] = 0; output[1] = length - 1; output[2] = 0; return SDK_SUCCESS; } output[3] = 1; /*** Set appropruiate arguments to the kernel ***/ /* * First argument of the kernel is the output buffer */ status = clSetKernelArg( kernel1, 0, sizeof(cl_mem), (void *)&outputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 0(OutputBuffer) failed.")) return SDK_FAILURE; status = clSetKernelArg( kernel2, 0, sizeof(cl_mem), (void *)&outputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 0(OutputBuffer) failed.")) return SDK_FAILURE; /* * Second argument is input buffer */ status = clSetKernelArg( kernel1, 1, sizeof(cl_mem), (void *)&inputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 1(inputBuffer) failed.")) return SDK_FAILURE; status = clSetKernelArg( kernel2, 1, sizeof(cl_mem), (void *)&inputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 1(inputBuffer) failed.")) return SDK_FAILURE; /* * Third is the element we are looking for */ status = clSetKernelArg( kernel1, 2, sizeof(cl_uint), (void *)&findMe); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 2(findMe) failed.")) return SDK_FAILURE; status = clSetKernelArg( kernel2, 2, sizeof(cl_uint), (void *)&findMe); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 2(findMe) failed.")) return SDK_FAILURE; while(subdivSize > 1 && output[3] != 0) { output[3] = 0; /* Enqueue readBuffer*/ status = clEnqueueWriteBuffer( commandQueue, outputBuffer, CL_TRUE, 0, sizeof(cl_uint4), output, 0, NULL, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueWriteBuffer failed.")) return SDK_FAILURE; /* Wait for the write buffer to finish execution */ status = clWaitForEvents(1, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; clReleaseEvent(events[1]); /* * Fourth argument is the lower bound for the full segment for this pass. * Each thread derives its own lower and upper bound from this. */ status = clSetKernelArg( kernel2, 3, sizeof(cl_uint), (void *)&globalLowerBound); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 3(globalLowerBound) failed.")) return SDK_FAILURE; /* * Similar to the above, but it is the upper bound */ status = clSetKernelArg( kernel2, 4, sizeof(cl_uint), (void *)&globalUpperBound); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 4(globalUpperBound) failed.")) return SDK_FAILURE; /* * The size of the subdivision for each thread */ status = clSetKernelArg( kernel2, 5, sizeof(cl_uint), (void *)&subdivSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 5(sumdivSize) failed.")) return SDK_FAILURE; /* * Enqueue a kernel run call */ status = clEnqueueNDRangeKernel(commandQueue, kernel2, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) return SDK_FAILURE; } while(subdivSize > 1 && output[3] != 0) { output[3] = 0; /* Enqueue readBuffer*/ status = clEnqueueWriteBuffer( commandQueue, outputBuffer, CL_TRUE, 0, sizeof(cl_uint4), output, 0, NULL, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueWriteBuffer failed.")) return SDK_FAILURE; /* Wait for the write buffer to finish execution */ status = clWaitForEvents(1, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; clReleaseEvent(events[1]); /* * Fourth argument is the lower bound for the full segment for this pass. * Each thread derives its own lower and upper bound from this. */ status = clSetKernelArg( kernel1, 3, sizeof(cl_uint), (void *)&globalLowerBound); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 3(globalLowerBound) failed.")) return SDK_FAILURE; /* * Similar to the above, but it is the upper bound */ status = clSetKernelArg( kernel1, 4, sizeof(cl_uint), (void *)&globalUpperBound); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 4(globalUpperBound) failed.")) return SDK_FAILURE; /* * The size of the subdivision for each thread */ status = clSetKernelArg( kernel1, 5, sizeof(cl_uint), (void *)&subdivSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg 5(sumdivSize) failed.")) return SDK_FAILURE; /* * Enqueue a kernel run call */ status = clEnqueueNDRangeKernel(commandQueue, kernel1, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) return SDK_FAILURE; /* wait for the kernel call to finish execution */ status = clWaitForEvents(1, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; /* Enqueue readBuffer*/ status = clEnqueueReadBuffer( commandQueue, outputBuffer, CL_TRUE, 0, sizeof(cl_uint4), output, 0, NULL, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) return SDK_FAILURE; /* Wait for the read buffer to finish execution */ status = clWaitForEvents(1, &events[1]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; clReleaseEvent(events[1]); globalLowerBound = output[0]; globalUpperBound = output[1]; subdivSize = (globalUpperBound - globalLowerBound + 1)/numSubdivisions; } for(cl_uint i=globalLowerBound; i<= globalUpperBound; i++) { if(input[i] == findMe) { output[0] = i; output[1] = i+1; output[2] = 1; return SDK_SUCCESS; } } /* The findMe element is not found from globalLowerBound to globalUpperBound */ output[2] = 0; return SDK_SUCCESS; } /** * CPU verification for the BinarySearch algorithm */ int BinarySearch::binarySearchCPUReference() { cl_uint globalLowerBound = output[0]; cl_uint globalUpperBound = output[1]; cl_uint isElementFound = output[2]; if(isElementFound) { if(input[globalLowerBound] == findMe) return 1; else return 0; } else { for(cl_uint i=0; i< length; i++) { if(input[i] == findMe) return 0; } return 1; } } int BinarySearch::initialize() { /*Call base class Initialize to get default configuration*/ if(!this->SDKSample::initialize()) return SDK_FAILURE; /* Now add customized options */ streamsdk::Option* array_length = new streamsdk::Option; if(!array_length) { sampleCommon->error("Memory allocation error.\n"); return SDK_FAILURE; } array_length->_sVersion = "x"; array_length->_lVersion = "length"; array_length->_description = "Lenght of the input array"; array_length->_type = streamsdk::CA_ARG_INT; array_length->_value = &length; sampleArgs->AddOption(array_length); delete array_length; streamsdk::Option* find_me = new streamsdk::Option; if(!find_me) { sampleCommon->error("Memory allocation error.\n"); return SDK_FAILURE; } find_me->_sVersion = "f"; find_me->_lVersion = "find"; find_me->_description = "element to be found"; find_me->_type = streamsdk::CA_ARG_INT; find_me->_value = &findMe; sampleArgs->AddOption(find_me); delete find_me; streamsdk::Option* sub_div = new streamsdk::Option; if(!sub_div) { sampleCommon->error("Memory allocation error.\n"); return SDK_FAILURE; } sub_div->_sVersion = "d"; sub_div->_lVersion = "divisions"; sub_div->_description = "number of subdivisions"; sub_div->_type = streamsdk::CA_ARG_INT; sub_div->_value = &numSubdivisions; sampleArgs->AddOption(sub_div); delete sub_div; streamsdk::Option* num_iterations = new streamsdk::Option; if(!num_iterations) { sampleCommon->error("Memory allocation error.\n"); return SDK_FAILURE; } num_iterations->_sVersion = "i"; num_iterations->_lVersion = "iterations"; num_iterations->_description = "Number of iterations for kernel execution"; num_iterations->_type = streamsdk::CA_ARG_INT; num_iterations->_value = &iterations; sampleArgs->AddOption(num_iterations); delete num_iterations; return SDK_SUCCESS; } int BinarySearch::setup() { if(!sampleCommon->isPowerOf2(length)) length = sampleCommon->roundToPowerOf2(length); if(setupBinarySearch()!=SDK_SUCCESS) return SDK_FAILURE; int timer = sampleCommon->createTimer(); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); if(setupCL()!=SDK_SUCCESS) return SDK_FAILURE; setupTime = (cl_double)(sampleCommon->readTimer(timer)); return SDK_SUCCESS; } int BinarySearch::run() { int timer = sampleCommon->createTimer(); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); std::cout << "Executing kernel for " << iterations << " iterations" << std::endl; std::cout << "-------------------------------------------" << std::endl; for(int i = 0; i < iterations; i++) { /* Arguments are set and execution call is enqueued on command buffer */ if(runCLKernels()!=SDK_SUCCESS) return SDK_FAILURE; } sampleCommon->stopTimer(timer); totalKernelTime = (double)(sampleCommon->readTimer(timer)) / iterations; if(!quiet) { cl_uint globalLowerBound = output[0]; cl_uint globalUpperBound = output[1]; cl_uint isElementFound = output[2]; printf("l = %d, u = %d, isfound = %d, fm = %d\n", globalLowerBound, globalUpperBound, isElementFound, findMe); } return SDK_SUCCESS; } int BinarySearch::verifyResults() { if(verify) { verificationInput = (cl_uint *) malloc(length*sizeof(cl_int)); if(verificationInput==NULL) { sampleCommon->error("Failed to allocate host memory. (verificationInput)"); return SDK_FAILURE; } memcpy(verificationInput, input, length*sizeof(cl_int)); /* reference implementation * it overwrites the input array with the output */ int refTimer = sampleCommon->createTimer(); sampleCommon->resetTimer(refTimer); sampleCommon->startTimer(refTimer); cl_int verified = binarySearchCPUReference(); sampleCommon->stopTimer(refTimer); referenceKernelTime = sampleCommon->readTimer(refTimer); /* compare the results and see if they match */ if(verified) { std::cout<<"Passed!\n"; return SDK_SUCCESS; } else { std::cout<<"Failed\n"; return SDK_FAILURE; } } return SDK_SUCCESS; } void BinarySearch::printStats() { std::string strArray[3] = {"Length", "Time(sec)", "kernelTime(sec)"}; std::string stats[3]; totalTime = setupTime + totalKernelTime; stats[0] = sampleCommon->toString(length , std::dec); stats[1] = sampleCommon->toString(totalTime, std::dec); stats[2] = sampleCommon->toString(totalKernelTime, std::dec); this->SDKSample::printStats(strArray, stats, 3); } int BinarySearch::cleanup() { /* Releases OpenCL resources (Context, Memory etc.) */ cl_int status; status = clReleaseKernel(kernel1); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseKernel(kernel2); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseProgram(program); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; status = clReleaseMemObject(inputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseMemObject failed.")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseMemObject failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(commandQueue); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseContext(context); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseContext failed.")) return SDK_FAILURE; /* release program resources (input memory etc.) */ if(input) free(input); if(output) { #if defined (_WIN32) _aligned_free(output); #else free(output); #endif } if(devices) free(devices); if(verificationInput) free(verificationInput); return SDK_SUCCESS; } int main(int argc, char * argv[]) { BinarySearch clBinarySearch("OpenCL Binary Search"); if(clBinarySearch.initialize()!=SDK_SUCCESS) return SDK_FAILURE; if(!clBinarySearch.parseCommandLine(argc, argv)) return SDK_FAILURE; if(clBinarySearch.setup()!=SDK_SUCCESS) return SDK_FAILURE; if(clBinarySearch.run()!=SDK_SUCCESS) return SDK_FAILURE; if(clBinarySearch.verifyResults()!=SDK_SUCCESS) return SDK_FAILURE; if(clBinarySearch.cleanup()!=SDK_SUCCESS) return SDK_FAILURE; clBinarySearch.printStats(); return SDK_SUCCESS; } //************************************************************************************// ///* BinarySearch_Kernels.hpp - simply added declaration of kernel1 and kernel2 */// #include <SDKCommandArgs.hpp> #include <SDKFile.hpp> /** * BinarySearch * Class implements OpenCL Binary Search sample * Derived from SDKSample base class */ class BinarySearch : public SDKSample { cl_uint seed; /**< Seed value for random number generation */ cl_double setupTime; /**< Time for setting up Opencl */ cl_double totalKernelTime; /**< Time for kernel execution */ cl_double totalProgramTime; /**< Time for program execution */ cl_double referenceKernelTime; /**< Time for reference implementation */ cl_uint findMe; /**< Search for this number in the input array */ cl_uint *input; /**< Input array */ cl_uint length; /**< Length of the input array */ cl_uint *output; /**< Output array */ cl_uint *verificationInput; /**< Input array for reference implementation */ cl_context context; /**< CL context */ cl_device_id *devices; /**< CL device list */ cl_mem inputBuffer; /**< CL memory buffer */ cl_mem outputBuffer; /**< CL output memory buffer */ cl_command_queue commandQueue; /**< CL command queue */ cl_program program; /**< CL program */ cl_kernel kernel1; /**< CL kernel */ cl_kernel kernel2; cl_uint numSubdivisions; /**< number of subdivisions to do N'ary Search */ size_t kernelWorkGroupSize; /**< Group Size returned by kernel */ int iterations; /**< Number of iterations for kernel execution */ public: /** * Constructor * Initialize member variables * @param name name of sample (string) */ BinarySearch(std::string name) : SDKSample(name) { seed = 123; input = NULL; output = NULL; verificationInput = NULL; findMe = 5; numSubdivisions = 8; /**< 8-ary Search by Default */ length = 512; setupTime = 0; totalKernelTime = 0; iterations = 1; } /** * Constructor * Initialize member variables * @param name name of sample (const char*) */ BinarySearch(const char* name) : SDKSample(name) { seed = 123; input = NULL; output = NULL; verificationInput = NULL; findMe = 5; numSubdivisions = 8; /**< Binary Search by Default */ length = 512; setupTime = 0; totalKernelTime = 0; iterations = 1; } /** * Allocate and initialize host memory array with random values * @return 1 on success and 0 on failure */ int setupBinarySearch(); /** * OpenCL related initialisations. * Set up Context, Device list, Command Queue, Memory buffers * Build CL kernel program executable * @return 1 on success and 0 on failure */ int setupCL(); /** * Set values for kernels' arguments, enqueue calls to the kernels * on to the command queue, wait till end of kernel execution. * Get kernel start and end time if timing is enabled * @return 1 on success and 0 on failure */ int runCLKernels(); /** * CPU verification for the BinarySearch algorithm */ int binarySearchCPUReference(); /** * Override from SDKSample. Print sample stats. */ void printStats(); /** * Override from SDKSample. Initialize * command line parser, add custom options */ int initialize(); /** * Override from SDKSample, adjust width and height * of execution domain, perform all sample setup */ int setup(); /** * Override from SDKSample * Run OpenCL BinarySearch */ int run(); /** * Override from SDKSample * Cleanup memory allocations */ int cleanup(); /** * Override from SDKSample * Verify against reference implementation */ int verifyResults(); }; #endif //************************************************************************************// ///* BinarySearch_Kernels.cl - simply duplicated original kernel*/// __kernel void binarySearch1( __global uint4 * outputArray, __const __global uint * sortedArray, const unsigned int findMe, const unsigned int globalLowerBound, const unsigned int globalUpperBound, const unsigned int subdivSize) { unsigned int tid = get_global_id(0); /* lower bound and upper bound are computed from segment and total search space for this pass * The total search space is known from global lower and upper bounds for this pass. */ unsigned int lowerBound = globalLowerBound + subdivSize * tid; unsigned int upperBound = lowerBound + subdivSize - 1; /* Then we find the elements at the two ends of the search space for this thread */ unsigned int lowerBoundElement = sortedArray[lowerBound]; unsigned int upperBoundElement = sortedArray[upperBound]; /* If the element to be found does not lie between them, then nothing left to do in this thread */ if( (lowerBoundElement > findMe) || (upperBoundElement < findMe)) { return; } else { /* However, if the element does lie between the lower and upper bounds of this thread's searchspace * we need to narrow down the search further in this search space */ /* The search space for this thread is marked in the output as being the total search space for the next pass */ outputArray[0].x = lowerBound; outputArray[0].y = upperBound; outputArray[0].w = 1; } } __kernel void binarySearch2( __global uint4 * outputArray, __const __global uint * sortedArray, const unsigned int findMe, const unsigned int globalLowerBound, const unsigned int globalUpperBound, const unsigned int subdivSize) { unsigned int tid = get_global_id(0); /* lower bound and upper bound are computed from segment and total search space for this pass * The total search space is known from global lower and upper bounds for this pass. */ unsigned int lowerBound = globalLowerBound + subdivSize * tid; unsigned int upperBound = lowerBound + subdivSize - 1; /* Then we find the elements at the two ends of the search space for this thread */ unsigned int lowerBoundElement = sortedArray[lowerBound]; unsigned int upperBoundElement = sortedArray[upperBound]; /* If the element to be found does not lie between them, then nothing left to do in this thread */ if( (lowerBoundElement > findMe) || (upperBoundElement < findMe)) { return; } else { /* However, if the element does lie between the lower and upper bounds of this thread's searchspace * we need to narrow down the search further in this search space */ /* The search space for this thread is marked in the output as being the total search space for the next pass */ outputArray[0].x = lowerBound; outputArray[0].y = upperBound; outputArray[0].w = 1; } }