cancel
Showing results for 
Search instead for 
Did you mean: 

OpenCL

Journeyman III
Journeyman III

CPU + GPU crash with AMD APP SDK 2.3 Win7

Here's the complete runtime code:

 

SimpleMultiDevice.hpp -------------------------- #ifndef MULTI_DEVICE_H_ #define MULTI_DEVICE_H_ #include <string.h> #include <cstdlib> #include <iostream> #include <string> #include <fstream> #include <time.h> #include <SDKCommon.hpp> #include <SDKThread.hpp> #define KERNEL_ITERATIONS 100 #define GROUP_SIZE 64 #define NUM_THREADS 1024 * 64 #define MAX_DATA_WIDTH 128 #define MAX_DATA_HEIGHT MAX_DATA_WIDTH #define MAX_DATA_SIZE (MAX_DATA_WIDTH * MAX_DATA_HEIGHT) #define LOCAL_WORK_SIZE 16 #define MAX_TEMPLATES 2 #define MAX_TEMPLATE_ELEMENTS 25 #define DEFAULT_THRESHOLD 0.25f #define THRESHOLD_UPPER 0.555f #define THRESHOLD_LOWER 0.055f typedef struct { float _val1; float _val2; float _val3; float _val4; } PixelType; typedef struct { float _default; float _thresholdHi; float _thresholdLo; float _spare; } ThresholdType; static const int _template[MAX_TEMPLATES][MAX_TEMPLATE_ELEMENTS] = { // template 1 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, // template 2 {1, 1, 0, 2, 2, 1, 1, 0, 2, 2, 1, 1, 0, 2, 2, 1, 1, 0, 2, 2, 1, 1, 0, 2, 2} }; class Device { public: //CL Objects and memory buffers int status; cl_device_type dType; //device type cl_device_id deviceId; //device ID cl_context context; //context cl_command_queue queue; //command-queue cl_mem inputBuffer; //input buffer cl_mem outputBuffer; //output buffer cl_mem templateBuffer; cl_program program; //program object cl_kernel kernel; //kernel object cl_event eventObject; //event object cl_ulong kernelStartTime; //kernel start time cl_ulong kernelEndTime; //kernel end time double elapsedTime; //elapsed time in ms cl_float *output; //output host buffer for verification Device() { output = NULL; } ~Device(); // Create Context int createContext(); // Create Command-queue int createQueue(); // Create input and output buffers and Enqueue write data int createBuffers(); // Initialize Input buffers int enqueueWriteBuffer(); // Initialize Input buffers int enqueueWriteNewBuffer(); // Initialize Input buffers int enqueueWriteTemplateBuffer(); // Set Kernel arguments int setNewKernelArgs(); // Enqueue NDRAnge kernel int enqueueNewKernel(size_t *globalThreads, size_t *localThreads); // Get output data from device int enqueueNewReadData(); // Create Program object int createProgram(const char **source, const size_t *sourceSize); // Build Program source int buildProgram(); // Create Kernel object int createKernel(); // Set Kernel arguments int setKernelArgs(); // Enqueue NDRAnge kernel int enqueueKernel(size_t *globalThreads, size_t *localThreads); // Wait for kernel execution to finish int waitForKernel(); // Get kernel execution time int getProfilingData(); // Get output data from device int enqueueReadData(); // Verify results against host computation int verifyResults(); // Cleanup allocated resources int cleanupResources(); }; /*** GLOBALS ***/ //Separator std::string sep = "----------------------------------------------------------"; bool verify = false; // Pointer to list of devices Device *cpu; Device *gpu; // Number of CPU and GPU devices int numDevices; int numCPUDevices; int numGPUDevices; // Size of input data int width; // Input data is same for all devices cl_float *input; PixelType *_hData; // Host Output data for verification cl_float *verificationOutput; // Kernel source string std::string sourceStr; const char *source; // SDK object streamsdk::SDKCommon sdkObject; // Context properties const cl_context_properties* cprops; cl_context_properties cps[3]; cl_platform_id platform = NULL; // Count for verification cl_uint verificationCount = 0; cl_uint requiredCount = 0; /*** FUNCTION DECLARATIONS ***/ // Read a file into a string std::string convertToString(const char * filename); // Host kernel computation int CPUkernel(); // Runs the kernel on all GPU devices and verify results with host output int runMultiGPU(); // Runs the kernel on a CPU and a GPU and verifies their results with host output int runMultiDevice(); // Calls runMultiGPU and runMultiDevice function int run(void); // Releases program's resources void cleanupHost(void); /* * Prints no more than 256 elements of the given array. * Prints full array if length is less than 256. * * Prints Array name followed by elements. */ void print1DArray(const std::string arrayName, const unsigned int *arrayData, const unsigned int length); #endif /* #ifndef MULTI_DEVICE_H_ */ ----------------------------------------------------------------------------------------------------------------------------------- SimpleMultiDevice.cpp -------------------------- #include "SimpleMultiDevice.hpp" int Device::createContext() { //Create context using current device's ID context = clCreateContext(cprops, 1, &deviceId, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createQueue() { //Create Command-Queue queue = clCreateCommandQueue(context, deviceId, CL_QUEUE_PROFILING_ENABLE, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateCommandQueue failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createBuffers() { // Create input buffer inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; // Create output buffer outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, inputBuffer, 1, 0, width * sizeof(cl_float), input, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueWriteBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteNewBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, inputBuffer, 1, 0, MAX_DATA_SIZE * sizeof(PixelType), _hData, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueWriteNewBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteTemplateBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, templateBuffer, 1, 0, MAX_TEMPLATES * MAX_TEMPLATE_ELEMENTS * sizeof(int), _template, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueWriteTemplateBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::setNewKernelArgs() { ThresholdType _info = {DEFAULT_THRESHOLD, THRESHOLD_UPPER, THRESHOLD_LOWER, 0.0f}; int _type = 0; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(outputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(inputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &templateBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(templateBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 3, 4 * sizeof(cl_float), (void *) &_info); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(_info)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *) &_type); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(_type)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueNewKernel(size_t *globalThreads, size_t *localThreads) { status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueNewKernel failed.")) return SDK_FAILURE; status = clFlush(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFlush failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueNewReadData() { unsigned char _hResult[MAX_DATA_SIZE]; status = clEnqueueReadBuffer(queue, outputBuffer, 1, 0, MAX_DATA_SIZE * sizeof(unsigned char), _hResult, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueNewReadData failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createProgram(const char **source, const size_t *sourceSize) { // Create program with source program = clCreateProgramWithSource(context, 1, source, sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::buildProgram() { char buildOptions[50]; sprintf(buildOptions, "-D KERNEL_ITERATIONS=%d", KERNEL_ITERATIONS); // Build program source status = clBuildProgram(program, 1, &deviceId, buildOptions, 0, 0); /* Print build log here if build program failed */ if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char *buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sdkObject.checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { return SDK_FAILURE; } buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sdkObject.error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sdkObject.checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) { return SDK_FAILURE; } } return SDK_SUCCESS; } int Device::createKernel() { kernel = clCreateKernel(program, "multiDeviceKernel", &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateKernel failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::setKernelArgs() { status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(inputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(outputBuffer)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueKernel(size_t *globalThreads, size_t *localThreads) { status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) return SDK_FAILURE; status = clFlush(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFlush failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::waitForKernel() { status = clFinish(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFinish failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::getProfilingData() { status = clGetEventProfilingInfo(eventObject, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelStartTime, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(start time)")) return SDK_FAILURE; status = clGetEventProfilingInfo(eventObject, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(end time)")) return SDK_FAILURE; //Measure time in ms elapsedTime = 1e-6 * (kernelEndTime - kernelStartTime); return SDK_SUCCESS; } int Device::enqueueReadData() { // Allocate memory if(output == NULL) { output = (cl_float*)malloc(width * sizeof(cl_char)); if(output == NULL) { printf("Failed to allocate output buffer!\n"); return SDK_FAILURE; } } status = clEnqueueReadBuffer(queue, outputBuffer, 1, 0, //width * sizeof(cl_float), width * sizeof(cl_char), output, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::verifyResults() { float error = 0; //compare results between verificationOutput and output host buffers for(int i = 0; i < width; i++) { error += (output - verificationOutput); } error /= width; if(error < 0.001) { std::cout << "Passed!\n"; verificationCount++; } else { std::cout << "Failed!\n"; } return SDK_SUCCESS; } int Device::cleanupResources() { int status = clReleaseCommandQueue(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseKernel(kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; cl_uint programRefCount; status = clGetProgramInfo(program, CL_PROGRAM_REFERENCE_COUNT, sizeof(cl_uint), &programRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetProgramInfo failed.")) return SDK_FAILURE; if(programRefCount) { status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; } cl_uint inputRefCount; status = clGetMemObjectInfo(inputBuffer, CL_MEM_REFERENCE_COUNT, sizeof(cl_uint), &inputRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetMemObjectInfo failed.")) return SDK_FAILURE; if(inputRefCount) { status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; } cl_uint outputRefCount; status = clGetMemObjectInfo(outputBuffer, CL_MEM_REFERENCE_COUNT, sizeof(cl_uint), &outputRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetMemObjectInfo failed.")) return SDK_FAILURE; if(outputRefCount) { status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; } cl_uint contextRefCount; status = clGetContextInfo(context, CL_CONTEXT_REFERENCE_COUNT, sizeof(cl_uint), &contextRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; if(contextRefCount) { status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseContext failed.")) return SDK_FAILURE; } status = clReleaseEvent(eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; return SDK_SUCCESS; } //Thread function for a device void* threadFunc(void *device) { Device *d = (Device*)device; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; d->enqueueKernel(&globalThreads, &localThreads); d->waitForKernel(); return NULL; } Device::~Device() { if(output) { free(output); output = NULL; } } int runMultiGPU() { int status; /////////////////////////////////////////////////////////////////// // Case 1 : Single Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 1 : Single context Single Thread\n" << sep << std::endl; cl_context context = clCreateContextFromType(cprops, CL_DEVICE_TYPE_GPU, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; size_t sourceSize = strlen(source); cl_program program = clCreateProgramWithSource(context, 1, &source, (const size_t*)&sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; char buildOptions[50]; sprintf(buildOptions, "-D KERNEL_ITERATIONS=%d", KERNEL_ITERATIONS); //Build program for all the devices in the context status = clBuildProgram(program, 0, 0, buildOptions, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; //Setup for all GPU devices for(int i = 0; i < numGPUDevices; i++) { gpu.context = context; gpu.program = program; status = gpu.createQueue(); if(status != SDK_SUCCESS) return SDK_FAILURE; status = gpu.createKernel(); if(status != SDK_SUCCESS) return SDK_FAILURE; } // Create buffers // Create input buffer cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; // Create output buffer cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; //Set Buffers for(int i = 0; i < numGPUDevices; i++) { gpu.inputBuffer = inputBuffer; status = gpu.enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; gpu.outputBuffer = outputBuffer; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu.setKernelArgs(); if(status != SDK_SUCCESS) return status; } size_t globalThreads = width; size_t localThreads = GROUP_SIZE; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); for(int i = 0; i < numGPUDevices; i++) { status = gpu.enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; } //Wait for all kernels to finish execution for(int i = 0; i < numGPUDevices; i++) { status = gpu.waitForKernel(); if(status != SDK_SUCCESS) return status; } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu.getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu.elapsedTime << std::endl; } if(verify) { //Enqueue Read output buffer and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu.enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu.verifyResults(); } } //Release the resources on all devices //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object, command-queue, event object for(int i = 0; i < numGPUDevices; i++) { status = clReleaseKernel(gpu.kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu.queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu.eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; } /////////////////////////////////////////////////////////////////// // Case 2 : Multiple Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 2 : Multiple context Single Thread\n" << sep << std::endl; for(int i = 0; i < numGPUDevices; i++) { //Create context for each device status = gpu.createContext(); if(status != SDK_SUCCESS) return status; //Create command-queue; status = gpu.createQueue(); if(status != SDK_SUCCESS) return status; //Create memory buffers status = gpu.createBuffers(); if(status != SDK_SUCCESS) return status; //Initialize input buffer status = gpu.enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //create program object status = gpu.createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; //Build program status = gpu.buildProgram(); if(status != SDK_SUCCESS) return status; //Create kernel objects for each device status = gpu.createKernel(); if(status != SDK_SUCCESS) return status; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu.setKernelArgs(); if(status != SDK_SUCCESS) return status; } //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); for(int i = 0; i < numGPUDevices; i++) { status = gpu.enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; } //Wait for all kernels to finish execution for(int i = 0; i < numGPUDevices; i++) { status = gpu.waitForKernel(); if(status != SDK_SUCCESS) return status; } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu.getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu.elapsedTime << std::endl; } if(verify) { // Read outputdata and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu.enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu.verifyResults(); } } //Release the resources on all devices for(int i = 0; i < numGPUDevices; i++) { status = gpu.cleanupResources(); if(status != SDK_SUCCESS) return status; } //////////////////////////////////////////////////////////////////// // Case 3 : Multiple thread and multiple context for each device //////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 3 : Multiple context Multiple Thread\n" << sep << std::endl; for(int i = 0; i < numGPUDevices; i++) { //Create context for each device status = gpu.createContext(); if(status != SDK_SUCCESS) return status; //Create command-queue; status = gpu.createQueue(); if(status != SDK_SUCCESS) return status; //Create memory buffers status = gpu.createBuffers(); if(status != SDK_SUCCESS) return status; //Initialize input buffer status = gpu.enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //create program object status = gpu.createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; //Build program status = gpu.buildProgram(); if(status != SDK_SUCCESS) return status; //Create kernel objects for each device status = gpu.createKernel(); if(status != SDK_SUCCESS) return status; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu.setKernelArgs(); if(status != SDK_SUCCESS) return status; } //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //Create thread objects streamsdk::SDKThread *gpuThread = new streamsdk::SDKThread[numGPUDevices]; //Start threads for each gpu device for(int i = 0; i < numGPUDevices; i++) { gpuThread.create(threadFunc, (void *)(gpu + i)); } //Join all gpu threads for(int i = 0; i < numGPUDevices; i++) { gpuThread.join(); } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu.getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu.elapsedTime << std::endl; } if(verify) { // Read outputdata and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu.enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu.verifyResults(); } } //Release the resources on all devices for(int i = 0; i < numGPUDevices; i++) { gpu.cleanupResources(); } return 0; } int runMultiDevice() { int status; /////////////////////////////////////////////////////////////////// // Case 1 : Single Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 1 : Single context Single Thread\n" << sep << std::endl; /* Create a list of device IDs having only CPU0 and GPU0 as device IDs */ cl_device_id *devices = (cl_device_id*)malloc(2 * sizeof(cl_device_id)); devices[0] = cpu[0].deviceId; devices[1] = gpu[0].deviceId; cl_context context = clCreateContext(cprops, 2, devices, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; size_t sourceSize = strlen(source); cl_program program = clCreateProgramWithSource(context, 1, &source, (const size_t*)&sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; char buildOptions[50]; sprintf(buildOptions, "-D WIDTH=%d -D HEIGHT=%d -cl-mad-enable", MAX_DATA_WIDTH, MAX_DATA_HEIGHT); //Build program for all the devices in the context status = clBuildProgram(program, 2, devices, buildOptions, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; //Allocate objects for CPU cpu[0].context = context; gpu[0].context = context; cpu[0].program = program; gpu[0].program = program; // Create command queue status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; // Create kernel status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Create queue status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; // Create kernel status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; cl_mem _dData = clCreateBuffer(context, CL_MEM_READ_ONLY, MAX_DATA_WIDTH * MAX_DATA_HEIGHT * sizeof(PixelType), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dDataSarVec)")) return SDK_FAILURE; cl_mem _dTemplate = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, MAX_TEMPLATES * MAX_TEMPLATE_ELEMENTS * sizeof(int), (void*)_template, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dDataSarVec)")) return SDK_FAILURE; cl_mem _dResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, MAX_DATA_WIDTH * MAX_DATA_HEIGHT * sizeof(unsigned char), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dResultDetectMapVec)")) return SDK_FAILURE; cpu[0].inputBuffer = _dData; gpu[0].inputBuffer = _dData; cpu[0].templateBuffer = _dTemplate; gpu[0].templateBuffer = _dTemplate; cpu[0].outputBuffer = _dResult; gpu[0].outputBuffer = _dResult; // Initialize input buffer for both devices status = cpu[0].enqueueWriteNewBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteNewBuffer(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueWriteTemplateBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteTemplateBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setNewKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setNewKernelArgs(); if(status != SDK_SUCCESS) return status; size_t globalThreads2D[2] = {MAX_DATA_WIDTH, MAX_DATA_HEIGHT}; size_t localThreads2D[2] = {LOCAL_WORK_SIZE, LOCAL_WORK_SIZE}; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); status = gpu[0].enqueueNewKernel(globalThreads2D, localThreads2D); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueNewReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueNewKernel(globalThreads2D, localThreads2D); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueNewReadData(); if(status != SDK_SUCCESS) return status; status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(_dData); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dData)")) return SDK_FAILURE; status = clReleaseMemObject(_dTemplate); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dTemplate)")) return SDK_FAILURE; status = clReleaseMemObject(_dResult); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dResult)")) return SDK_FAILURE; //ReleaseCommand-queue status = clReleaseCommandQueue(cpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object status = clReleaseKernel(cpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseKernel(gpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; //Release Event object status = clReleaseEvent(cpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; #if 0 // Create buffers - A buffer is created on all devices sharing a context // So bufffer creation should should not per device in a single-context cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, //width * sizeof(cl_float), width * sizeof(cl_char), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; cpu[0].inputBuffer = inputBuffer; gpu[0].inputBuffer = inputBuffer; cpu[0].outputBuffer = outputBuffer; gpu[0].outputBuffer = outputBuffer; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); status = gpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; //Wait for all kernels to finish execution status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; //ReleaseCommand-queue status = clReleaseCommandQueue(cpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object status = clReleaseKernel(cpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseKernel(gpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; //Release Event object status = clReleaseEvent(cpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; #endif /////////////////////////////////////////////////////////////////// // Case 2 : Multiple Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 2 : Multiple context Single Thread\n" << sep << std::endl; status = cpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = cpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = cpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = cpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = gpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = gpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = gpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; //Start a host timer here //int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //size_t globalThreads = width; //size_t localThreads = 1; status = cpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; //Wait for all kernels to finish execution status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time //double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release the resources on all devices status = cpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; status = gpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; ///////////////////////////////////////////////////////////////////// // Case 3 : Multiple thread and multiple context for each device //////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 3 : Multiple context Multiple Thread\n" << sep << std::endl; status = cpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = cpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = cpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = cpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = gpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = gpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = gpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //Create a thread for CPU and GPU device each streamsdk::SDKThread cpuThread; streamsdk::SDKThread gpuThread; cpuThread.create(threadFunc, (void *)cpu); gpuThread.create(threadFunc, (void *)gpu); cpuThread.join(); gpuThread.join(); //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release the resources on all devices status = cpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; status = gpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; if(devices) { free(devices); devices = NULL; } return 0; } /* * \brief Host Initialization * Allocate and initialize memory * on the host. Print input array. */ int initializeHost(void) { width = NUM_THREADS; input = NULL; verificationOutput = NULL; ///////////////////////////////////////////////////////////////// // Allocate and initialize memory used by host ///////////////////////////////////////////////////////////////// _hData = (PixelType*) malloc (MAX_DATA_SIZE * sizeof(PixelType)); if (_hData == NULL) { printf("\nUnable to allocate memory at %d\n", __LINE__); return SDK_FAILURE; } for (int i = 0; i < MAX_DATA_SIZE; i+=4) { (_hData + i)->_val1 = i * 0.0001f; (_hData + i + 1)->_val2 = (i + 1) * 0.0001f; (_hData + i + 2)->_val3 = (i + 2) * 0.0001f; (_hData + i + 3)->_val4 = (i + 3) * 0.0001f; } #if 0 cl_uint sizeInBytes = width * sizeof(cl_uint); input = (cl_float*) malloc(sizeInBytes); if(input == NULL) { printf("Error: Failed to allocate input memory on host\n"); return SDK_FAILURE; } verificationOutput = (cl_float*) malloc(sizeInBytes); if(verificationOutput == NULL) { printf("Error: Failed to allocate verificationOutput memory on host\n"); return SDK_FAILURE; } //Initilize input data for(int i = 0; i < width; i++) input = (cl_float)i; #endif return SDK_SUCCESS; } /* * Converts the contents of a file into a string */ std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = (size_t)f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } return NULL; } /* * \brief OpenCL related initialization * Create Context, Device list, Command Queue * Create OpenCL memory buffer objects * Load CL file, compile, link CL source * Build program and kernel objects */ int initializeCL(void) { cl_int status = 0; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return SDK_FAILURE; if(numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return SDK_FAILURE; for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms, CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) return SDK_FAILURE; platform = platforms; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } free(platforms); } /* * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. */ cps[0] = CL_CONTEXT_PLATFORM; cps[1] = (cl_context_properties)platform; cps[2] = 0; cprops = (NULL == platform) ? NULL : cps; // Get Number of CPU devices available status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, 0, (cl_uint*)&numCPUDevices); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.(numCPUDevices)")) return SDK_FAILURE; // Get Number of CPU devices available status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, 0, (cl_uint*)&numDevices); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.(numDevices)")) return SDK_FAILURE; // Get number of GPU Devices numGPUDevices = numDevices - numCPUDevices; // If no GPU is present then exit if(numGPUDevices < 1) { std::cout << "Only CPU device is present. Exiting!\n"; return SDK_EXPECTED_FAILURE; } // Allocate memory for list of Devices cpu = new Device[numCPUDevices]; //Get CPU Device IDs cl_device_id* cpuDeviceIDs = new cl_device_id[numCPUDevices]; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numCPUDevices, cpuDeviceIDs, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.")) return SDK_FAILURE; for(int i = 0; i < numCPUDevices; i++) { cpu.dType = CL_DEVICE_TYPE_CPU; cpu.deviceId = cpuDeviceIDs; } delete[] cpuDeviceIDs; gpu = new Device[numGPUDevices]; //Get GPU Device IDs cl_device_id* gpuDeviceIDs = new cl_device_id[numGPUDevices]; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numGPUDevices, gpuDeviceIDs, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.")) return SDK_FAILURE; for(int i = 0; i < numGPUDevices; i++) { gpu.dType = CL_DEVICE_TYPE_GPU; gpu.deviceId = gpuDeviceIDs; } delete[] gpuDeviceIDs; ///////////////////////////////////////////////////////////////// // Load CL file ///////////////////////////////////////////////////////////////// const char *filename = "SimpleMultiDevice_Kernels.cl"; sourceStr = convertToString(filename); source = sourceStr.c_str(); return SDK_SUCCESS; } int run() { int status; // If a GPU is present then run CPU + GPU concurrently if(numGPUDevices > 0 && numCPUDevices > 0) { // 3 tests : // a) Single context - Single thread // b) Multiple context - Single thread // c) Multiple context - Multple Threads // 3 Tests * 2 devices requiredCount += 3 * 2; status = runMultiDevice(); if(status != SDK_SUCCESS) return status; } // If more than 1 GPU is present then run MultiGPU concurrently if(numGPUDevices > 1) { // 3 tests : // a) Single context - Single thread // b) Multiple context - Single thread // c) Multiple context - Multple Threads // 3 Tests * numGPUDevices requiredCount += 3 * numGPUDevices; status = runMultiGPU(); if(status != SDK_SUCCESS) return status; } return SDK_SUCCESS; } /* * \brief Releases program's resources */ void cleanupHost(void) { if(input != NULL) { free(input); input = NULL; } if(verificationOutput != NULL) { free(verificationOutput); verificationOutput = NULL; } if(cpu != NULL) { delete[] cpu; cpu = NULL; } if(gpu != NULL) { delete[] gpu; gpu = NULL; } } /* * \brief Print no more than 256 elements of the given array. * * Print Array name followed by elements. */ void print1DArray( const std::string arrayName, const unsigned int * arrayData, const unsigned int length) { cl_uint i; cl_uint numElementsToPrint = (256 < length) ? 256 : length; std::cout << std::endl; std::cout << arrayName << ":" << std::endl; for(i = 0; i < numElementsToPrint; ++i) { std::cout << arrayData << " "; } std::cout << std::endl; } // OpenCL MAD definition for CPU float mad(float a, float b, float c) { return a * b + c; } // OpenCL HYPOT definition for CPU float hypot(float a, float b) { return sqrt(a * a + b * b); } int CPUKernel() { for(int i = 0; i < width; i++) { float a = mad(input, input, 1); float b = mad(input, input, 2); for(int j = 0; j < KERNEL_ITERATIONS; j++) { a = hypot(a, b); b = hypot(a, b); } verificationOutput = (a + b); } return 0; } int main(int argc, char * argv[]) { for(int i = 1; i < argc; i++) { if(!strcmp(argv, "-e") || !strcmp(argv, "--verify")) verify = true; if(!strcmp(argv, "-h") || !strcmp(argv, "--help")) { printf("Usage\n"); printf("-h, --help\tPrint this help.\n"); printf("-e, --verify\tVerify results against reference implementation.\n"); exit(0); } } int status; // Initialize Host application status = initializeHost(); if(status != SDK_SUCCESS) return status; // Run host computation if verification is true #if 0 CPUKernel(); #endif // Initialize OpenCL resources status = initializeCL(); if(status != SDK_SUCCESS) { if(status == SDK_EXPECTED_FAILURE) return SDK_SUCCESS; else return status; } // Run the CL program status = run(); if(status != SDK_SUCCESS) return status; // Release host resources cleanupHost(); if(verify) { // If any one test fails then print FAILED if(verificationCount != requiredCount) { std::cout << "\n\nFAILED!\n"; return SDK_FAILURE; } else { std::cout << "\n\nPASSED!\n"; return SDK_SUCCESS; } } return SDK_SUCCESS; }

0 Kudos
Reply
Journeyman III
Journeyman III

CPU + GPU crash with AMD APP SDK 2.3 Win7

Originally posted by: wheecheng Here's the complete runtime code:

 

 



Could  you please copy complete code?  .cpp code was incomplete.

0 Kudos
Reply
Journeyman III
Journeyman III

CPU + GPU crash with AMD APP SDK 2.3 Win7

Here's the .cpp code:

#include "SimpleMultiDevice.hpp" int Device::createContext() { //Create context using current device's ID context = clCreateContext(cprops, 1, &deviceId, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createQueue() { //Create Command-Queue queue = clCreateCommandQueue(context, deviceId, CL_QUEUE_PROFILING_ENABLE, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateCommandQueue failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createBuffers() { // Create input buffer inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; // Create output buffer outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, inputBuffer, 1, 0, width * sizeof(cl_float), input, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueWriteBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteNewBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, inputBuffer, 1, 0, MAX_DATA_SIZE * sizeof(PixelType), _hData, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueWriteNewBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteTemplateBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, templateBuffer, 1, 0, MAX_TEMPLATES * MAX_TEMPLATE_ELEMENTS * sizeof(int), _template, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueWriteTemplateBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::setNewKernelArgs() { ThresholdType _info = {DEFAULT_THRESHOLD, THRESHOLD_UPPER, THRESHOLD_LOWER, 0.0f}; int _type = 0; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(outputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(inputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &templateBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(templateBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 3, 4 * sizeof(cl_float), (void *) &_info); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(_info)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *) &_type); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(_type)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueNewKernel(size_t *globalThreads, size_t *localThreads) { status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueNewKernel failed.")) return SDK_FAILURE; status = clFlush(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFlush failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueNewReadData() { unsigned char _hResult[MAX_DATA_SIZE]; status = clEnqueueReadBuffer(queue, outputBuffer, 1, 0, MAX_DATA_SIZE * sizeof(unsigned char), _hResult, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueNewReadData failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createProgram(const char **source, const size_t *sourceSize) { // Create program with source program = clCreateProgramWithSource(context, 1, source, sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::buildProgram() { char buildOptions[50]; sprintf(buildOptions, "-D KERNEL_ITERATIONS=%d", KERNEL_ITERATIONS); // Build program source status = clBuildProgram(program, 1, &deviceId, buildOptions, 0, 0); /* Print build log here if build program failed */ if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char *buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sdkObject.checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { return SDK_FAILURE; } buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sdkObject.error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sdkObject.checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) { return SDK_FAILURE; } } return SDK_SUCCESS; } int Device::createKernel() { kernel = clCreateKernel(program, "multiDeviceKernel", &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateKernel failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::setKernelArgs() { status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(inputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(outputBuffer)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueKernel(size_t *globalThreads, size_t *localThreads) { status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) return SDK_FAILURE; status = clFlush(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFlush failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::waitForKernel() { status = clFinish(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFinish failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::getProfilingData() { status = clGetEventProfilingInfo(eventObject, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelStartTime, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(start time)")) return SDK_FAILURE; status = clGetEventProfilingInfo(eventObject, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(end time)")) return SDK_FAILURE; //Measure time in ms elapsedTime = 1e-6 * (kernelEndTime - kernelStartTime); return SDK_SUCCESS; } int Device::enqueueReadData() { // Allocate memory if(output == NULL) { output = (cl_float*)malloc(width * sizeof(cl_char)); if(output == NULL) { printf("Failed to allocate output buffer!\n"); return SDK_FAILURE; } } status = clEnqueueReadBuffer(queue, outputBuffer, 1, 0, //width * sizeof(cl_float), width * sizeof(cl_char), output, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::verifyResults() { float error = 0; //compare results between verificationOutput and output host buffers for(int i = 0; i < width; i++) { error += (output - verificationOutput); } error /= width; if(error < 0.001) { std::cout << "Passed!\n"; verificationCount++; } else { std::cout << "Failed!\n"; } return SDK_SUCCESS; } int Device::cleanupResources() { int status = clReleaseCommandQueue(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseKernel(kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; cl_uint programRefCount; status = clGetProgramInfo(program, CL_PROGRAM_REFERENCE_COUNT, sizeof(cl_uint), &programRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetProgramInfo failed.")) return SDK_FAILURE; if(programRefCount) { status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; } cl_uint inputRefCount; status = clGetMemObjectInfo(inputBuffer, CL_MEM_REFERENCE_COUNT, sizeof(cl_uint), &inputRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetMemObjectInfo failed.")) return SDK_FAILURE; if(inputRefCount) { status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; } cl_uint outputRefCount; status = clGetMemObjectInfo(outputBuffer, CL_MEM_REFERENCE_COUNT, sizeof(cl_uint), &outputRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetMemObjectInfo failed.")) return SDK_FAILURE; if(outputRefCount) { status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; } cl_uint contextRefCount; status = clGetContextInfo(context, CL_CONTEXT_REFERENCE_COUNT, sizeof(cl_uint), &contextRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; if(contextRefCount) { status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseContext failed.")) return SDK_FAILURE; } status = clReleaseEvent(eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; return SDK_SUCCESS; } //Thread function for a device void* threadFunc(void *device) { Device *d = (Device*)device; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; d->enqueueKernel(&globalThreads, &localThreads); d->waitForKernel(); return NULL; } Device::~Device() { if(output) { free(output); output = NULL; } } int runMultiGPU() { int status; /////////////////////////////////////////////////////////////////// // Case 1 : Single Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 1 : Single context Single Thread\n" << sep << std::endl; cl_context context = clCreateContextFromType(cprops, CL_DEVICE_TYPE_GPU, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; size_t sourceSize = strlen(source); cl_program program = clCreateProgramWithSource(context, 1, &source, (const size_t*)&sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; char buildOptions[50]; sprintf(buildOptions, "-D KERNEL_ITERATIONS=%d", KERNEL_ITERATIONS); //Build program for all the devices in the context status = clBuildProgram(program, 0, 0, buildOptions, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; //Setup for all GPU devices for(int i = 0; i < numGPUDevices; i++) { gpu.context = context; gpu.program = program; status = gpu.createQueue(); if(status != SDK_SUCCESS) return SDK_FAILURE; status = gpu.createKernel(); if(status != SDK_SUCCESS) return SDK_FAILURE; } // Create buffers // Create input buffer cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; // Create output buffer cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; //Set Buffers for(int i = 0; i < numGPUDevices; i++) { gpu.inputBuffer = inputBuffer; status = gpu.enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; gpu.outputBuffer = outputBuffer; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu.setKernelArgs(); if(status != SDK_SUCCESS) return status; } size_t globalThreads = width; size_t localThreads = GROUP_SIZE; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); for(int i = 0; i < numGPUDevices; i++) { status = gpu.enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; } //Wait for all kernels to finish execution for(int i = 0; i < numGPUDevices; i++) { status = gpu.waitForKernel(); if(status != SDK_SUCCESS) return status; } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu.getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu.elapsedTime << std::endl; } if(verify) { //Enqueue Read output buffer and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu.enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu.verifyResults(); } } //Release the resources on all devices //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object, command-queue, event object for(int i = 0; i < numGPUDevices; i++) { status = clReleaseKernel(gpu.kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu.queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu.eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; } /////////////////////////////////////////////////////////////////// // Case 2 : Multiple Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 2 : Multiple context Single Thread\n" << sep << std::endl; for(int i = 0; i < numGPUDevices; i++) { //Create context for each device status = gpu.createContext(); if(status != SDK_SUCCESS) return status; //Create command-queue; status = gpu.createQueue(); if(status != SDK_SUCCESS) return status; //Create memory buffers status = gpu.createBuffers(); if(status != SDK_SUCCESS) return status; //Initialize input buffer status = gpu.enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //create program object status = gpu.createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; //Build program status = gpu.buildProgram(); if(status != SDK_SUCCESS) return status; //Create kernel objects for each device status = gpu.createKernel(); if(status != SDK_SUCCESS) return status; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu.setKernelArgs(); if(status != SDK_SUCCESS) return status; } //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); for(int i = 0; i < numGPUDevices; i++) { status = gpu.enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; } //Wait for all kernels to finish execution for(int i = 0; i < numGPUDevices; i++) { status = gpu.waitForKernel(); if(status != SDK_SUCCESS) return status; } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu.getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu.elapsedTime << std::endl; } if(verify) { // Read outputdata and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu.enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu.verifyResults(); } } //Release the resources on all devices for(int i = 0; i < numGPUDevices; i++) { status = gpu.cleanupResources(); if(status != SDK_SUCCESS) return status; } //////////////////////////////////////////////////////////////////// // Case 3 : Multiple thread and multiple context for each device //////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 3 : Multiple context Multiple Thread\n" << sep << std::endl; for(int i = 0; i < numGPUDevices; i++) { //Create context for each device status = gpu.createContext(); if(status != SDK_SUCCESS) return status; //Create command-queue; status = gpu.createQueue(); if(status != SDK_SUCCESS) return status; //Create memory buffers status = gpu.createBuffers(); if(status != SDK_SUCCESS) return status; //Initialize input buffer status = gpu.enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //create program object status = gpu.createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; //Build program status = gpu.buildProgram(); if(status != SDK_SUCCESS) return status; //Create kernel objects for each device status = gpu.createKernel(); if(status != SDK_SUCCESS) return status; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu.setKernelArgs(); if(status != SDK_SUCCESS) return status; } //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //Create thread objects streamsdk::SDKThread *gpuThread = new streamsdk::SDKThread[numGPUDevices]; //Start threads for each gpu device for(int i = 0; i < numGPUDevices; i++) { gpuThread.create(threadFunc, (void *)(gpu + i)); } //Join all gpu threads for(int i = 0; i < numGPUDevices; i++) { gpuThread.join(); } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu.getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu.elapsedTime << std::endl; } if(verify) { // Read outputdata and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu.enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu.verifyResults(); } } //Release the resources on all devices for(int i = 0; i < numGPUDevices; i++) { gpu.cleanupResources(); } return 0; } int runMultiDevice() { int status; /////////////////////////////////////////////////////////////////// // Case 1 : Single Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 1 : Single context Single Thread\n" << sep << std::endl; /* Create a list of device IDs having only CPU0 and GPU0 as device IDs */ cl_device_id *devices = (cl_device_id*)malloc(2 * sizeof(cl_device_id)); devices[0] = cpu[0].deviceId; devices[1] = gpu[0].deviceId; cl_context context = clCreateContext(cprops, 2, devices, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; size_t sourceSize = strlen(source); cl_program program = clCreateProgramWithSource(context, 1, &source, (const size_t*)&sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; char buildOptions[50]; sprintf(buildOptions, "-D WIDTH=%d -D HEIGHT=%d -cl-mad-enable", MAX_DATA_WIDTH, MAX_DATA_HEIGHT); //Build program for all the devices in the context status = clBuildProgram(program, 2, devices, buildOptions, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; //Allocate objects for CPU cpu[0].context = context; gpu[0].context = context; cpu[0].program = program; gpu[0].program = program; // Create command queue status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; // Create kernel status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Create queue status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; // Create kernel status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; cl_mem _dData = clCreateBuffer(context, CL_MEM_READ_ONLY, MAX_DATA_WIDTH * MAX_DATA_HEIGHT * sizeof(PixelType), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dDataSarVec)")) return SDK_FAILURE; cl_mem _dTemplate = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, MAX_TEMPLATES * MAX_TEMPLATE_ELEMENTS * sizeof(int), (void*)_template, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dDataSarVec)")) return SDK_FAILURE; cl_mem _dResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, MAX_DATA_WIDTH * MAX_DATA_HEIGHT * sizeof(unsigned char), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dResultDetectMapVec)")) return SDK_FAILURE; cpu[0].inputBuffer = _dData; gpu[0].inputBuffer = _dData; cpu[0].templateBuffer = _dTemplate; gpu[0].templateBuffer = _dTemplate; cpu[0].outputBuffer = _dResult; gpu[0].outputBuffer = _dResult; // Initialize input buffer for both devices status = cpu[0].enqueueWriteNewBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteNewBuffer(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueWriteTemplateBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteTemplateBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setNewKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setNewKernelArgs(); if(status != SDK_SUCCESS) return status; size_t globalThreads2D[2] = {MAX_DATA_WIDTH, MAX_DATA_HEIGHT}; size_t localThreads2D[2] = {LOCAL_WORK_SIZE, LOCAL_WORK_SIZE}; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); status = gpu[0].enqueueNewKernel(globalThreads2D, localThreads2D); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueNewReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueNewKernel(globalThreads2D, localThreads2D); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueNewReadData(); if(status != SDK_SUCCESS) return status; status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(_dData); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dData)")) return SDK_FAILURE; status = clReleaseMemObject(_dTemplate); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dTemplate)")) return SDK_FAILURE; status = clReleaseMemObject(_dResult); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dResult)")) return SDK_FAILURE; //ReleaseCommand-queue status = clReleaseCommandQueue(cpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object status = clReleaseKernel(cpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseKernel(gpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; //Release Event object status = clReleaseEvent(cpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; #if 0 // Create buffers - A buffer is created on all devices sharing a context // So bufffer creation should should not per device in a single-context cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, //width * sizeof(cl_float), width * sizeof(cl_char), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; cpu[0].inputBuffer = inputBuffer; gpu[0].inputBuffer = inputBuffer; cpu[0].outputBuffer = outputBuffer; gpu[0].outputBuffer = outputBuffer; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); status = gpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; //Wait for all kernels to finish execution status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; //ReleaseCommand-queue status = clReleaseCommandQueue(cpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object status = clReleaseKernel(cpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseKernel(gpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; //Release Event object status = clReleaseEvent(cpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; #endif /////////////////////////////////////////////////////////////////// // Case 2 : Multiple Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 2 : Multiple context Single Thread\n" << sep << std::endl; status = cpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = cpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = cpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = cpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = gpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = gpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = gpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; //Start a host timer here //int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //size_t globalThreads = width; //size_t localThreads = 1; status = cpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; //Wait for all kernels to finish execution status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time //double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release the resources on all devices status = cpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; status = gpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; ///////////////////////////////////////////////////////////////////// // Case 3 : Multiple thread and multiple context for each device //////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 3 : Multiple context Multiple Thread\n" << sep << std::endl; status = cpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = cpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = cpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = cpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = gpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = gpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = gpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //Create a thread for CPU and GPU device each streamsdk::SDKThread cpuThread; streamsdk::SDKThread gpuThread; cpuThread.create(threadFunc, (void *)cpu); gpuThread.create(threadFunc, (void *)gpu); cpuThread.join(); gpuThread.join(); //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release the resources on all devices status = cpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; status = gpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; if(devices) { free(devices); devices = NULL; } return 0; } /* * \brief Host Initialization * Allocate and initialize memory * on the host. Print input array. */ int initializeHost(void) { width = NUM_THREADS; input = NULL; verificationOutput = NULL; ///////////////////////////////////////////////////////////////// // Allocate and initialize memory used by host ///////////////////////////////////////////////////////////////// _hData = (PixelType*) malloc (MAX_DATA_SIZE * sizeof(PixelType)); if (_hData == NULL) { printf("\nUnable to allocate memory at %d\n", __LINE__); return SDK_FAILURE; } for (int i = 0; i < MAX_DATA_SIZE; i+=4) { (_hData + i)->_val1 = i * 0.0001f; (_hData + i + 1)->_val2 = (i + 1) * 0.0001f; (_hData + i + 2)->_val3 = (i + 2) * 0.0001f; (_hData + i + 3)->_val4 = (i + 3) * 0.0001f; } #if 0 cl_uint sizeInBytes = width * sizeof(cl_uint); input = (cl_float*) malloc(sizeInBytes); if(input == NULL) { printf("Error: Failed to allocate input memory on host\n"); return SDK_FAILURE; } verificationOutput = (cl_float*) malloc(sizeInBytes); if(verificationOutput == NULL) { printf("Error: Failed to allocate verificationOutput memory on host\n"); return SDK_FAILURE; } //Initilize input data for(int i = 0; i < width; i++) input = (cl_float)i; #endif return SDK_SUCCESS; } /* * Converts the contents of a file into a string */ std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = (size_t)f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } return NULL; } /* * \brief OpenCL related initialization * Create Context, Device list, Command Queue * Create OpenCL memory buffer objects * Load CL file, compile, link CL source * Build program and kernel objects */ int initializeCL(void) { cl_int status = 0; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return SDK_FAILURE; if(numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return SDK_FAILURE; for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms, CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) return SDK_FAILURE; platform = platforms; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } free(platforms); } /* * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. */ cps[0] = CL_CONTEXT_PLATFORM; cps[1] = (cl_context_properties)platform; cps[2] = 0; cprops = (NULL == platform) ? NULL : cps; // Get Number of CPU devices available status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, 0, (cl_uint*)&numCPUDevices); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.(numCPUDevices)")) return SDK_FAILURE; // Get Number of CPU devices available status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, 0, (cl_uint*)&numDevices); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.(numDevices)")) return SDK_FAILURE; // Get number of GPU Devices numGPUDevices = numDevices - numCPUDevices; // If no GPU is present then exit if(numGPUDevices < 1) { std::cout << "Only CPU device is present. Exiting!\n"; return SDK_EXPECTED_FAILURE; } // Allocate memory for list of Devices cpu = new Device[numCPUDevices]; //Get CPU Device IDs cl_device_id* cpuDeviceIDs = new cl_device_id[numCPUDevices]; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numCPUDevices, cpuDeviceIDs, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.")) return SDK_FAILURE; for(int i = 0; i < numCPUDevices; i++) { cpu.dType = CL_DEVICE_TYPE_CPU; cpu.deviceId = cpuDeviceIDs; } delete[] cpuDeviceIDs; gpu = new Device[numGPUDevices]; //Get GPU Device IDs cl_device_id* gpuDeviceIDs = new cl_device_id[numGPUDevices]; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numGPUDevices, gpuDeviceIDs, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.")) return SDK_FAILURE; for(int i = 0; i < numGPUDevices; i++) { gpu.dType = CL_DEVICE_TYPE_GPU; gpu.deviceId = gpuDeviceIDs; } delete[] gpuDeviceIDs; ///////////////////////////////////////////////////////////////// // Load CL file ///////////////////////////////////////////////////////////////// const char *filename = "SimpleMultiDevice_Kernels.cl"; sourceStr = convertToString(filename); source = sourceStr.c_str(); return SDK_SUCCESS; } int run() { int status; // If a GPU is present then run CPU + GPU concurrently if(numGPUDevices > 0 && numCPUDevices > 0) { // 3 tests : // a) Single context - Single thread // b) Multiple context - Single thread // c) Multiple context - Multple Threads // 3 Tests * 2 devices requiredCount += 3 * 2; status = runMultiDevice(); if(status != SDK_SUCCESS) return status; } // If more than 1 GPU is present then run MultiGPU concurrently if(numGPUDevices > 1) { // 3 tests : // a) Single context - Single thread // b) Multiple context - Single thread // c) Multiple context - Multple Threads // 3 Tests * numGPUDevices requiredCount += 3 * numGPUDevices; status = runMultiGPU(); if(status != SDK_SUCCESS) return status; } return SDK_SUCCESS; } /* * \brief Releases program's resources */ void cleanupHost(void) { if(input != NULL) { free(input); input = NULL; } if(verificationOutput != NULL) { free(verificationOutput); verificationOutput = NULL; } if(cpu != NULL) { delete[] cpu; cpu = NULL; } if(gpu != NULL) { delete[] gpu; gpu = NULL; } } /* * \brief Print no more than 256 elements of the given array. * * Print Array name followed by elements. */ void print1DArray( const std::string arrayName, const unsigned int * arrayData, const unsigned int length) { cl_uint i; cl_uint numElementsToPrint = (256 < length) ? 256 : length; std::cout << std::endl; std::cout << arrayName << ":" << std::endl; for(i = 0; i < numElementsToPrint; ++i) { std::cout << arrayData << " "; } std::cout << std::endl; } // OpenCL MAD definition for CPU float mad(float a, float b, float c) { return a * b + c; } // OpenCL HYPOT definition for CPU float hypot(float a, float b) { return sqrt(a * a + b * b); } int CPUKernel() { for(int i = 0; i < width; i++) { float a = mad(input, input, 1); float b = mad(input, input, 2); for(int j = 0; j < KERNEL_ITERATIONS; j++) { a = hypot(a, b); b = hypot(a, b); } verificationOutput = (a + b); } return 0; } int main(int argc, char * argv[]) { for(int i = 1; i < argc; i++) { if(!strcmp(argv, "-e") || !strcmp(argv, "--verify")) verify = true; if(!strcmp(argv, "-h") || !strcmp(argv, "--help")) { printf("Usage\n"); printf("-h, --help\tPrint this help.\n"); printf("-e, --verify\tVerify results against reference implementation.\n"); exit(0); } } int status; // Initialize Host application status = initializeHost(); if(status != SDK_SUCCESS) return status; // Run host computation if verification is true #if 0 CPUKernel(); #endif // Initialize OpenCL resources status = initializeCL(); if(status != SDK_SUCCESS) { if(status == SDK_EXPECTED_FAILURE) return SDK_SUCCESS; else return status; } // Run the CL program status = run(); if(status != SDK_SUCCESS) return status; // Release host resources cleanupHost(); if(verify) { // If any one test fails then print FAILED if(verificationCount != requiredCount) { std::cout << "\n\nFAILED!\n"; return SDK_FAILURE; } else { std::cout << "\n\nPASSED!\n"; return SDK_SUCCESS; } } return SDK_SUCCESS; }

0 Kudos
Reply
Journeyman III
Journeyman III

CPU + GPU crash with AMD APP SDK 2.3 Win7

The .cpp exceeds the length allowable for attaching code. Kindly provide alternative means to send the complete .cpp file.

0 Kudos
Reply
Journeyman III
Journeyman III

CPU + GPU crash with AMD APP SDK 2.3 Win7

Originally posted by: wheecheng The .cpp exceeds the length allowable for attaching code. Kindly provide alternative means to send the complete .cpp file.

 

You can file a ticket where you can attach your code in txt form. see following link

http://ad.amddevcentral.com/support/KnowledgeBase/pages/HelpdeskTicketForm.aspx?Category=2

 

 

0 Kudos
Reply
Journeyman III
Journeyman III

CPU + GPU crash with AMD APP SDK 2.3 Win7

Ticket ID 1265

0 Kudos
Reply
Journeyman III
Journeyman III

CPU + GPU crash with AMD APP SDK 2.3 Win7

Originally posted by: wheecheng The .cpp exceeds the length allowable for attaching code. Kindly provide alternative means to send the complete .cpp file.

 

Wheecheng,

Thank you for reporting this. I am able to reproduce this at my end.  Issue is reported to developers.

0 Kudos
Reply
Journeyman III
Journeyman III

CPU + GPU crash with AMD APP SDK 2.3 Win7

Application crashes at line 1199 (status = cpu[0].enqueueNewReadData()). Kindly replace the 3 files (*.cl, *.hpp and *.cpp) with the files I provide to create the crash.

0 Kudos
Reply