17 Replies Latest reply on Feb 18, 2011 6:08 AM by wheecheng

    CPU + GPU crash with AMD APP SDK 2.3 Win7

    wheecheng

      I developed an OpenCL application that uses both CPU and GPU to load balance the computation. The application works in WinXP but crashes in Win7 (clEnqueueMapBuffer).

      In addition, the application works when only one device is used (i.e. CPU or GPU but not combined) in Win7. How can I get both devices working together?

      I have two configurations and both exhibit the same issue:

      Configuration 1: - Intel Core2 Duo

                                 - Asus EAH5670

                                 - AMD APP SDK v2.3 with ATI Catalyst v11.1 (Win7 64 bits)

      Configuration 2: - AMD Fusion G-T56N

                                 - AMD APP SDK v2.3 with ATI Catalyst v11.1 (Win7 64 bits)

        • CPU + GPU crash with AMD APP SDK 2.3 Win7
          LeeHowes

          Which hand am I holding the answer in?

           

          You really need to give us more information to give you an answer to a question like this. A test case of some sort. Otherwise you just have to hope someone else has come across exactly the same bug. Can you give us some code that consistently crashes for you and someone will test it.

            • CPU + GPU crash with AMD APP SDK 2.3 Win7
              wheecheng

              My application works using previous version of the SDK, i.e. v2.2, on WinXP. However, it crashes using SDK v2.3 on WinXP. This is tested in AMD Fusion G-T56N.

                • CPU + GPU crash with AMD APP SDK 2.3 Win7
                  LeeHowes

                  Yes... but what is your application?

                    • CPU + GPU crash with AMD APP SDK 2.3 Win7
                      lordnn

                      I can post my version of application:

                       

                      #include <stdio.h> #include <stdlib.h> #include "CL\cl.h" const char *source = "#define cl_uint unsigned int\n" "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" "\n" "__kernel void testD(const double k, const cl_uint uiCount, __global double *pX, __global double *pY)\n" "{\n" " cl_uint i = get_global_id(0);\n" " if (i >= uiCount)\n" " return;\n" " double k1 = 2.0f / 3.0f;\n" " pY[i] = k * pX[i];\n" "}\n" "\n" "__kernel void testDBug(const cl_uint uiCount, const double k, __global double *pX, __global double *pY)\n" "{\n" " cl_uint i = get_global_id(0);\n" " if (i >= uiCount)\n" " return;\n" " double k1 = 2.0f / 3.0f;\n" " pY[i] = k * pX[i];\n" "}\n"; int main() { cl_int err_code(0); cl_platform_id platform(NULL); cl_uint numPlatforms(0); err_code = clGetPlatformIDs(0, NULL, &numPlatforms); if (CL_SUCCESS != err_code) { printf("1. Unable get platform %d %d\n", err_code, numPlatforms); exit(1); } err_code = clGetPlatformIDs(1, &platform, NULL); if (CL_SUCCESS != err_code) { printf("Unable get platform %d\n", err_code); exit(1); } cl_uint numDevices(0); err_code = clGetDeviceIDs(platform ,CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); cl_device_id *devices = new cl_device_id[numDevices]; err_code = clGetDeviceIDs(platform ,CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_CPU, numDevices, devices, NULL); if (CL_SUCCESS != err_code) { printf("clGetDeviceIDs failed %d\n", err_code); exit(1); } cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &err_code); if (err_code) { printf("Unable create context %d\n", err_code); exit(1); } cl_command_queue *queues = new cl_command_queue[numDevices]; for (int k(0); k < numDevices; ++k) { queues[k] = clCreateCommandQueue(context, devices[k], 0, &err_code); if (err_code) { printf("Unable create command queue %d\n", err_code); exit(1); } } cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &err_code); if (err_code) { printf("Unable create program %d\n", err_code); exit(1); } printf("Program created.\n"); err_code = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err_code) { printf("Unable build program %d\n", err_code); exit(1); } printf("Program builded.\n"); // TEST printf("Test started.\n"); cl_kernel k_testD = clCreateKernel(program, "testD", &err_code); if (CL_SUCCESS != err_code) { printf("clCreateKernel testD failed %d\n", err_code); exit(1); } cl_kernel k_testDBug = clCreateKernel(program, "testDBug", &err_code); if (CL_SUCCESS != err_code) { printf("clCreateKernel testDBug failed %d\n", err_code); exit(1); } cl_uint uiCount(10); double *p_dX2 = new double[uiCount]; double *p_dY2 = new double[uiCount]; for (cl_uint i(0); i < uiCount; ++i) { p_dX2[i] = (double)i; p_dY2[i] = 0; } double dValue(2.0/3.0); err_code = clSetKernelArg(k_testD, 0, sizeof(dValue), (void*)&dValue); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 0 failed %d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testD, 1, sizeof(uiCount), (void*)&uiCount); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 1 failed %d\n", err_code); exit(1); } for (int k(0); k < numDevices; ++k) { cl_mem bufdX = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, uiCount * sizeof(double), p_dX2, &err_code); if (CL_SUCCESS != err_code) { printf("clCreateBuffer X failed %d\n", err_code); exit(1); } cl_mem bufdY = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, uiCount * sizeof(double), p_dY2, &err_code); if (CL_SUCCESS != err_code) { printf("clCreateBuffer Y failed %d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testD, 2, sizeof(bufdX), (void*)&bufdX); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 2 failed %d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testD, 3, sizeof(bufdY), (void*)&bufdY); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 3 failed %d\n", err_code); exit(1); } size_t globalThreads1[1] = { uiCount }; err_code = clEnqueueNDRangeKernel(queues[k], k_testD, 1, NULL, globalThreads1, NULL, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueNDRangeKernel failed %d\n", err_code); exit(1); } err_code = clFinish(queues[k]); if (CL_SUCCESS != err_code) { printf("clFinish failed %d\n", err_code); exit(1); } err_code = clEnqueueReadBuffer(queues[k], bufdX, CL_TRUE, 0, uiCount * sizeof(double), p_dX2, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueMapBuffer X failed %d\n", err_code); exit(1); } err_code = clEnqueueReadBuffer(queues[k], bufdY, CL_TRUE, 0, uiCount * sizeof(double), p_dY2, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueMapBuffer Y failed %d\n", err_code); exit(1); } clReleaseMemObject(bufdX); clReleaseMemObject(bufdY); for (cl_uint i(0); i < uiCount; ++i) { printf("X = %lf Y = %lf\n", p_dX2[i], p_dY2[i]); p_dX2[i] = (double)i; p_dY2[i] = 0; } printf("\n"); } // WARNING err_code = clSetKernelArg(k_testDBug, 0, sizeof(uiCount), (void*)&uiCount); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 0 failed %d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testDBug, 1, sizeof(dValue), (void*)&dValue); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 1 failed %d\n", err_code); exit(1); } for (int k(0); k < numDevices; ++k) { cl_mem bufdX = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, uiCount * sizeof(double), p_dX2, &err_code); if (CL_SUCCESS != err_code) { printf("clCreateBuffer X failed %d\n", err_code); exit(1); } cl_mem bufdY = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, uiCount * sizeof(double), p_dY2, &err_code); if (CL_SUCCESS != err_code) { printf("clCreateBuffer Y failed %d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testDBug, 2, sizeof(bufdX), (void*)&bufdX); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 2 failed %d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testDBug, 3, sizeof(bufdY), (void*)&bufdY); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 3 failed %d\n", err_code); exit(1); } size_t globalThreads1[1] = { uiCount }; err_code = clEnqueueNDRangeKernel(queues[k], k_testDBug, 1, NULL, globalThreads1, NULL, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueNDRangeKernel failed %d\n", err_code); exit(1); } err_code = clFinish(queues[k]); if (CL_SUCCESS != err_code) { printf("clFinish failed %d\n", err_code); exit(1); } err_code = clEnqueueReadBuffer(queues[k], bufdX, CL_TRUE, 0, uiCount * sizeof(double), p_dX2, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueMapBuffer X failed %d\n", err_code); exit(1); } err_code = clEnqueueReadBuffer(queues[k], bufdY, CL_TRUE, 0, uiCount * sizeof(double), p_dY2, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueMapBuffer Y failed %d\n", err_code); exit(1); } clReleaseMemObject(bufdX); clReleaseMemObject(bufdY); for (cl_uint i(0); i < 10; ++i) { printf("X = %lf Y = %lf\n", p_dX2[i], p_dY2[i]); p_dX2[i] = (double)i; p_dY2[i] = 0; } printf("\n"); } delete [] p_dX2; delete [] p_dY2; clReleaseKernel(k_testD); clReleaseKernel(k_testDBug); clReleaseProgram(program); for (int k(0); k < numDevices; ++k) clReleaseCommandQueue(queues[k]); clReleaseContext(context); }

                        • CPU + GPU crash with AMD APP SDK 2.3 Win7
                          genaganna

                           

                          Originally posted by: lordnn I can post my version of application:

                           

                           



                          Could you please tell me what issue you are facing with this code?

                            • CPU + GPU crash with AMD APP SDK 2.3 Win7
                              lordnn

                              This code crashed then testDBug routine enqueued on CPU device.

                                • CPU + GPU crash with AMD APP SDK 2.3 Win7
                                  wheecheng

                                  Using SimpleMultiDevice project with CPU + GPU Test 1: Single Context, Single Thread as the reference, the input data is two-dimensional with WIDTH = HEIGHT = 128 and  each data comprises 4 elements (float).

                                  The kernel code is as below. Below is the summary of my observations:

                                  1. If either line 49 (i.e. _c   = ...) or 52 (i.e. _U = ...) is commented, the application works.

                                  2. If line 49 is replaced with _b += a; and _detC is modified appropriately, the application crashes nevertheless.

                                  3. If the entire nested for loop is commented, the application works.

                                   

                                  #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable #define MAX 25 #define OFFSET 2 #define COMPUTE(_in) ((_in) * (_in)) __kernel void multiDeviceKernel(__global unsigned char *_dout, __global float4 *_din, __constant int *_template, const float4 _val, const int _type ) { int _col = get_global_id(0); int _row = get_global_id(1); float4 _a = {0, 0, 0, 0}; float4 _b = {0, 0, 0, 0}; float4 _c = {0, 0, 0, 0}; int i, j, k, l; int _index; float _detA; float _detC; float _U; for (i = (_row - OFFSET), _index = 0, _a = (float4)0, _b = (float4)0; i <= (_row + OFFSET); i++) { for (j = (_col - OFFSET); j <= (_col + OFFSET); j++) { k = j; if ((j < 0) || (j >= WIDTH)) k = (_col << 1) - j; l = i; if ((i < 0) || (i >= HEIGHT)) l = (_row << 1) - i; if (_template[_type * MAX + _index] == 1) _a += COMPUTE(_din[l * WIDTH + k]); else if (_template[_type * MAX + _index] == 2) _b += COMPUTE(_din[l * WIDTH + k]); _index++; } } _c = _a + _b; _detA = (_a.x * _a.y) - (_a.z * _a.w); _detC = (_c.x * _c.y) + (_c.z * _c.w); _U = (_detC != 0.0f) ? _detC/_detA : _val.x; _dout[_row * WIDTH + _col] = (_U >= _val.z && _U < _val.y); }

                                    • CPU + GPU crash with AMD APP SDK 2.3 Win7
                                      genaganna

                                       

                                      Originally posted by: wheecheng Using SimpleMultiDevice project with CPU + GPU Test 1: Single Context, Single Thread as the reference, the input data is two-dimensional with WIDTH = HEIGHT = 128 and  each data comprises 4 elements (float).

                                       

                                      The kernel code is as below. Below is the summary of my observations:

                                       

                                      1. If either line 49 (i.e. _c   = ...) or 52 (i.e. _U = ...) is commented, the application works.

                                       

                                      2. If line 49 is replaced with _b += a; and _detC is modified appropriately, the application crashes nevertheless.

                                       

                                      3. If the entire nested for loop is commented, the application works.

                                       

                                       

                                      Wheecheng,

                                                    Thanks for giving kernel code.  Please send your runtime code also which allows us to reproduce this at our end.

                                • CPU + GPU crash with AMD APP SDK 2.3 Win7
                                  genaganna

                                   

                                  Originally posted by: lordnn I can post my version of application:

                                   

                                   



                                  Thanks for reporting this issue.

                                    • CPU + GPU crash with AMD APP SDK 2.3 Win7
                                      wheecheng

                                      Here's the complete runtime code:

                                       

                                      SimpleMultiDevice.hpp -------------------------- #ifndef MULTI_DEVICE_H_ #define MULTI_DEVICE_H_ #include <string.h> #include <cstdlib> #include <iostream> #include <string> #include <fstream> #include <time.h> #include <SDKCommon.hpp> #include <SDKThread.hpp> #define KERNEL_ITERATIONS 100 #define GROUP_SIZE 64 #define NUM_THREADS 1024 * 64 #define MAX_DATA_WIDTH 128 #define MAX_DATA_HEIGHT MAX_DATA_WIDTH #define MAX_DATA_SIZE (MAX_DATA_WIDTH * MAX_DATA_HEIGHT) #define LOCAL_WORK_SIZE 16 #define MAX_TEMPLATES 2 #define MAX_TEMPLATE_ELEMENTS 25 #define DEFAULT_THRESHOLD 0.25f #define THRESHOLD_UPPER 0.555f #define THRESHOLD_LOWER 0.055f typedef struct { float _val1; float _val2; float _val3; float _val4; } PixelType; typedef struct { float _default; float _thresholdHi; float _thresholdLo; float _spare; } ThresholdType; static const int _template[MAX_TEMPLATES][MAX_TEMPLATE_ELEMENTS] = { // template 1 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, // template 2 {1, 1, 0, 2, 2, 1, 1, 0, 2, 2, 1, 1, 0, 2, 2, 1, 1, 0, 2, 2, 1, 1, 0, 2, 2} }; class Device { public: //CL Objects and memory buffers int status; cl_device_type dType; //device type cl_device_id deviceId; //device ID cl_context context; //context cl_command_queue queue; //command-queue cl_mem inputBuffer; //input buffer cl_mem outputBuffer; //output buffer cl_mem templateBuffer; cl_program program; //program object cl_kernel kernel; //kernel object cl_event eventObject; //event object cl_ulong kernelStartTime; //kernel start time cl_ulong kernelEndTime; //kernel end time double elapsedTime; //elapsed time in ms cl_float *output; //output host buffer for verification Device() { output = NULL; } ~Device(); // Create Context int createContext(); // Create Command-queue int createQueue(); // Create input and output buffers and Enqueue write data int createBuffers(); // Initialize Input buffers int enqueueWriteBuffer(); // Initialize Input buffers int enqueueWriteNewBuffer(); // Initialize Input buffers int enqueueWriteTemplateBuffer(); // Set Kernel arguments int setNewKernelArgs(); // Enqueue NDRAnge kernel int enqueueNewKernel(size_t *globalThreads, size_t *localThreads); // Get output data from device int enqueueNewReadData(); // Create Program object int createProgram(const char **source, const size_t *sourceSize); // Build Program source int buildProgram(); // Create Kernel object int createKernel(); // Set Kernel arguments int setKernelArgs(); // Enqueue NDRAnge kernel int enqueueKernel(size_t *globalThreads, size_t *localThreads); // Wait for kernel execution to finish int waitForKernel(); // Get kernel execution time int getProfilingData(); // Get output data from device int enqueueReadData(); // Verify results against host computation int verifyResults(); // Cleanup allocated resources int cleanupResources(); }; /*** GLOBALS ***/ //Separator std::string sep = "----------------------------------------------------------"; bool verify = false; // Pointer to list of devices Device *cpu; Device *gpu; // Number of CPU and GPU devices int numDevices; int numCPUDevices; int numGPUDevices; // Size of input data int width; // Input data is same for all devices cl_float *input; PixelType *_hData; // Host Output data for verification cl_float *verificationOutput; // Kernel source string std::string sourceStr; const char *source; // SDK object streamsdk::SDKCommon sdkObject; // Context properties const cl_context_properties* cprops; cl_context_properties cps[3]; cl_platform_id platform = NULL; // Count for verification cl_uint verificationCount = 0; cl_uint requiredCount = 0; /*** FUNCTION DECLARATIONS ***/ // Read a file into a string std::string convertToString(const char * filename); // Host kernel computation int CPUkernel(); // Runs the kernel on all GPU devices and verify results with host output int runMultiGPU(); // Runs the kernel on a CPU and a GPU and verifies their results with host output int runMultiDevice(); // Calls runMultiGPU and runMultiDevice function int run(void); // Releases program's resources void cleanupHost(void); /* * Prints no more than 256 elements of the given array. * Prints full array if length is less than 256. * * Prints Array name followed by elements. */ void print1DArray(const std::string arrayName, const unsigned int *arrayData, const unsigned int length); #endif /* #ifndef MULTI_DEVICE_H_ */ ----------------------------------------------------------------------------------------------------------------------------------- SimpleMultiDevice.cpp -------------------------- #include "SimpleMultiDevice.hpp" int Device::createContext() { //Create context using current device's ID context = clCreateContext(cprops, 1, &deviceId, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createQueue() { //Create Command-Queue queue = clCreateCommandQueue(context, deviceId, CL_QUEUE_PROFILING_ENABLE, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateCommandQueue failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createBuffers() { // Create input buffer inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; // Create output buffer outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, inputBuffer, 1, 0, width * sizeof(cl_float), input, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueWriteBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteNewBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, inputBuffer, 1, 0, MAX_DATA_SIZE * sizeof(PixelType), _hData, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueWriteNewBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteTemplateBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, templateBuffer, 1, 0, MAX_TEMPLATES * MAX_TEMPLATE_ELEMENTS * sizeof(int), _template, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueWriteTemplateBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::setNewKernelArgs() { ThresholdType _info = {DEFAULT_THRESHOLD, THRESHOLD_UPPER, THRESHOLD_LOWER, 0.0f}; int _type = 0; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(outputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(inputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &templateBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(templateBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 3, 4 * sizeof(cl_float), (void *) &_info); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(_info)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *) &_type); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(_type)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueNewKernel(size_t *globalThreads, size_t *localThreads) { status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueNewKernel failed.")) return SDK_FAILURE; status = clFlush(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFlush failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueNewReadData() { unsigned char _hResult[MAX_DATA_SIZE]; status = clEnqueueReadBuffer(queue, outputBuffer, 1, 0, MAX_DATA_SIZE * sizeof(unsigned char), _hResult, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueNewReadData failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createProgram(const char **source, const size_t *sourceSize) { // Create program with source program = clCreateProgramWithSource(context, 1, source, sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::buildProgram() { char buildOptions[50]; sprintf(buildOptions, "-D KERNEL_ITERATIONS=%d", KERNEL_ITERATIONS); // Build program source status = clBuildProgram(program, 1, &deviceId, buildOptions, 0, 0); /* Print build log here if build program failed */ if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char *buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sdkObject.checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { return SDK_FAILURE; } buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sdkObject.error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sdkObject.checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) { return SDK_FAILURE; } } return SDK_SUCCESS; } int Device::createKernel() { kernel = clCreateKernel(program, "multiDeviceKernel", &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateKernel failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::setKernelArgs() { status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(inputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(outputBuffer)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueKernel(size_t *globalThreads, size_t *localThreads) { status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) return SDK_FAILURE; status = clFlush(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFlush failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::waitForKernel() { status = clFinish(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFinish failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::getProfilingData() { status = clGetEventProfilingInfo(eventObject, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelStartTime, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(start time)")) return SDK_FAILURE; status = clGetEventProfilingInfo(eventObject, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(end time)")) return SDK_FAILURE; //Measure time in ms elapsedTime = 1e-6 * (kernelEndTime - kernelStartTime); return SDK_SUCCESS; } int Device::enqueueReadData() { // Allocate memory if(output == NULL) { output = (cl_float*)malloc(width * sizeof(cl_char)); if(output == NULL) { printf("Failed to allocate output buffer!\n"); return SDK_FAILURE; } } status = clEnqueueReadBuffer(queue, outputBuffer, 1, 0, //width * sizeof(cl_float), width * sizeof(cl_char), output, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::verifyResults() { float error = 0; //compare results between verificationOutput and output host buffers for(int i = 0; i < width; i++) { error += (output[i] - verificationOutput[i]); } error /= width; if(error < 0.001) { std::cout << "Passed!\n"; verificationCount++; } else { std::cout << "Failed!\n"; } return SDK_SUCCESS; } int Device::cleanupResources() { int status = clReleaseCommandQueue(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseKernel(kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; cl_uint programRefCount; status = clGetProgramInfo(program, CL_PROGRAM_REFERENCE_COUNT, sizeof(cl_uint), &programRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetProgramInfo failed.")) return SDK_FAILURE; if(programRefCount) { status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; } cl_uint inputRefCount; status = clGetMemObjectInfo(inputBuffer, CL_MEM_REFERENCE_COUNT, sizeof(cl_uint), &inputRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetMemObjectInfo failed.")) return SDK_FAILURE; if(inputRefCount) { status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; } cl_uint outputRefCount; status = clGetMemObjectInfo(outputBuffer, CL_MEM_REFERENCE_COUNT, sizeof(cl_uint), &outputRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetMemObjectInfo failed.")) return SDK_FAILURE; if(outputRefCount) { status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; } cl_uint contextRefCount; status = clGetContextInfo(context, CL_CONTEXT_REFERENCE_COUNT, sizeof(cl_uint), &contextRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; if(contextRefCount) { status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseContext failed.")) return SDK_FAILURE; } status = clReleaseEvent(eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; return SDK_SUCCESS; } //Thread function for a device void* threadFunc(void *device) { Device *d = (Device*)device; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; d->enqueueKernel(&globalThreads, &localThreads); d->waitForKernel(); return NULL; } Device::~Device() { if(output) { free(output); output = NULL; } } int runMultiGPU() { int status; /////////////////////////////////////////////////////////////////// // Case 1 : Single Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 1 : Single context Single Thread\n" << sep << std::endl; cl_context context = clCreateContextFromType(cprops, CL_DEVICE_TYPE_GPU, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; size_t sourceSize = strlen(source); cl_program program = clCreateProgramWithSource(context, 1, &source, (const size_t*)&sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; char buildOptions[50]; sprintf(buildOptions, "-D KERNEL_ITERATIONS=%d", KERNEL_ITERATIONS); //Build program for all the devices in the context status = clBuildProgram(program, 0, 0, buildOptions, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; //Setup for all GPU devices for(int i = 0; i < numGPUDevices; i++) { gpu[i].context = context; gpu[i].program = program; status = gpu[i].createQueue(); if(status != SDK_SUCCESS) return SDK_FAILURE; status = gpu[i].createKernel(); if(status != SDK_SUCCESS) return SDK_FAILURE; } // Create buffers // Create input buffer cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; // Create output buffer cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; //Set Buffers for(int i = 0; i < numGPUDevices; i++) { gpu[i].inputBuffer = inputBuffer; status = gpu[i].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; gpu[i].outputBuffer = outputBuffer; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].setKernelArgs(); if(status != SDK_SUCCESS) return status; } size_t globalThreads = width; size_t localThreads = GROUP_SIZE; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; } //Wait for all kernels to finish execution for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].waitForKernel(); if(status != SDK_SUCCESS) return status; } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu[i].elapsedTime << std::endl; } if(verify) { //Enqueue Read output buffer and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu[i].verifyResults(); } } //Release the resources on all devices //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object, command-queue, event object for(int i = 0; i < numGPUDevices; i++) { status = clReleaseKernel(gpu[i].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu[i].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu[i].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; } /////////////////////////////////////////////////////////////////// // Case 2 : Multiple Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 2 : Multiple context Single Thread\n" << sep << std::endl; for(int i = 0; i < numGPUDevices; i++) { //Create context for each device status = gpu[i].createContext(); if(status != SDK_SUCCESS) return status; //Create command-queue; status = gpu[i].createQueue(); if(status != SDK_SUCCESS) return status; //Create memory buffers status = gpu[i].createBuffers(); if(status != SDK_SUCCESS) return status; //Initialize input buffer status = gpu[i].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //create program object status = gpu[i].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; //Build program status = gpu[i].buildProgram(); if(status != SDK_SUCCESS) return status; //Create kernel objects for each device status = gpu[i].createKernel(); if(status != SDK_SUCCESS) return status; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].setKernelArgs(); if(status != SDK_SUCCESS) return status; } //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; } //Wait for all kernels to finish execution for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].waitForKernel(); if(status != SDK_SUCCESS) return status; } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu[i].elapsedTime << std::endl; } if(verify) { // Read outputdata and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu[i].verifyResults(); } } //Release the resources on all devices for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].cleanupResources(); if(status != SDK_SUCCESS) return status; } //////////////////////////////////////////////////////////////////// // Case 3 : Multiple thread and multiple context for each device //////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 3 : Multiple context Multiple Thread\n" << sep << std::endl; for(int i = 0; i < numGPUDevices; i++) { //Create context for each device status = gpu[i].createContext(); if(status != SDK_SUCCESS) return status; //Create command-queue; status = gpu[i].createQueue(); if(status != SDK_SUCCESS) return status; //Create memory buffers status = gpu[i].createBuffers(); if(status != SDK_SUCCESS) return status; //Initialize input buffer status = gpu[i].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //create program object status = gpu[i].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; //Build program status = gpu[i].buildProgram(); if(status != SDK_SUCCESS) return status; //Create kernel objects for each device status = gpu[i].createKernel(); if(status != SDK_SUCCESS) return status; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].setKernelArgs(); if(status != SDK_SUCCESS) return status; } //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //Create thread objects streamsdk::SDKThread *gpuThread = new streamsdk::SDKThread[numGPUDevices]; //Start threads for each gpu device for(int i = 0; i < numGPUDevices; i++) { gpuThread[i].create(threadFunc, (void *)(gpu + i)); } //Join all gpu threads for(int i = 0; i < numGPUDevices; i++) { gpuThread[i].join(); } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu[i].elapsedTime << std::endl; } if(verify) { // Read outputdata and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu[i].verifyResults(); } } //Release the resources on all devices for(int i = 0; i < numGPUDevices; i++) { gpu[i].cleanupResources(); } return 0; } int runMultiDevice() { int status; /////////////////////////////////////////////////////////////////// // Case 1 : Single Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 1 : Single context Single Thread\n" << sep << std::endl; /* Create a list of device IDs having only CPU0 and GPU0 as device IDs */ cl_device_id *devices = (cl_device_id*)malloc(2 * sizeof(cl_device_id)); devices[0] = cpu[0].deviceId; devices[1] = gpu[0].deviceId; cl_context context = clCreateContext(cprops, 2, devices, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; size_t sourceSize = strlen(source); cl_program program = clCreateProgramWithSource(context, 1, &source, (const size_t*)&sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; char buildOptions[50]; sprintf(buildOptions, "-D WIDTH=%d -D HEIGHT=%d -cl-mad-enable", MAX_DATA_WIDTH, MAX_DATA_HEIGHT); //Build program for all the devices in the context status = clBuildProgram(program, 2, devices, buildOptions, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; //Allocate objects for CPU cpu[0].context = context; gpu[0].context = context; cpu[0].program = program; gpu[0].program = program; // Create command queue status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; // Create kernel status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Create queue status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; // Create kernel status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; cl_mem _dData = clCreateBuffer(context, CL_MEM_READ_ONLY, MAX_DATA_WIDTH * MAX_DATA_HEIGHT * sizeof(PixelType), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dDataSarVec)")) return SDK_FAILURE; cl_mem _dTemplate = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, MAX_TEMPLATES * MAX_TEMPLATE_ELEMENTS * sizeof(int), (void*)_template, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dDataSarVec)")) return SDK_FAILURE; cl_mem _dResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, MAX_DATA_WIDTH * MAX_DATA_HEIGHT * sizeof(unsigned char), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dResultDetectMapVec)")) return SDK_FAILURE; cpu[0].inputBuffer = _dData; gpu[0].inputBuffer = _dData; cpu[0].templateBuffer = _dTemplate; gpu[0].templateBuffer = _dTemplate; cpu[0].outputBuffer = _dResult; gpu[0].outputBuffer = _dResult; // Initialize input buffer for both devices status = cpu[0].enqueueWriteNewBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteNewBuffer(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueWriteTemplateBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteTemplateBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setNewKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setNewKernelArgs(); if(status != SDK_SUCCESS) return status; size_t globalThreads2D[2] = {MAX_DATA_WIDTH, MAX_DATA_HEIGHT}; size_t localThreads2D[2] = {LOCAL_WORK_SIZE, LOCAL_WORK_SIZE}; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); status = gpu[0].enqueueNewKernel(globalThreads2D, localThreads2D); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueNewReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueNewKernel(globalThreads2D, localThreads2D); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueNewReadData(); if(status != SDK_SUCCESS) return status; status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(_dData); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dData)")) return SDK_FAILURE; status = clReleaseMemObject(_dTemplate); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dTemplate)")) return SDK_FAILURE; status = clReleaseMemObject(_dResult); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dResult)")) return SDK_FAILURE; //ReleaseCommand-queue status = clReleaseCommandQueue(cpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object status = clReleaseKernel(cpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseKernel(gpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; //Release Event object status = clReleaseEvent(cpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; #if 0 // Create buffers - A buffer is created on all devices sharing a context // So bufffer creation should should not per device in a single-context cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, //width * sizeof(cl_float), width * sizeof(cl_char), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; cpu[0].inputBuffer = inputBuffer; gpu[0].inputBuffer = inputBuffer; cpu[0].outputBuffer = outputBuffer; gpu[0].outputBuffer = outputBuffer; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); status = gpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; //Wait for all kernels to finish execution status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; //ReleaseCommand-queue status = clReleaseCommandQueue(cpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object status = clReleaseKernel(cpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseKernel(gpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; //Release Event object status = clReleaseEvent(cpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; #endif /////////////////////////////////////////////////////////////////// // Case 2 : Multiple Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 2 : Multiple context Single Thread\n" << sep << std::endl; status = cpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = cpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = cpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = cpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = gpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = gpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = gpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; //Start a host timer here //int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //size_t globalThreads = width; //size_t localThreads = 1; status = cpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; //Wait for all kernels to finish execution status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time //double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release the resources on all devices status = cpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; status = gpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; ///////////////////////////////////////////////////////////////////// // Case 3 : Multiple thread and multiple context for each device //////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 3 : Multiple context Multiple Thread\n" << sep << std::endl; status = cpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = cpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = cpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = cpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = gpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = gpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = gpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //Create a thread for CPU and GPU device each streamsdk::SDKThread cpuThread; streamsdk::SDKThread gpuThread; cpuThread.create(threadFunc, (void *)cpu); gpuThread.create(threadFunc, (void *)gpu); cpuThread.join(); gpuThread.join(); //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release the resources on all devices status = cpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; status = gpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; if(devices) { free(devices); devices = NULL; } return 0; } /* * \brief Host Initialization * Allocate and initialize memory * on the host. Print input array. */ int initializeHost(void) { width = NUM_THREADS; input = NULL; verificationOutput = NULL; ///////////////////////////////////////////////////////////////// // Allocate and initialize memory used by host ///////////////////////////////////////////////////////////////// _hData = (PixelType*) malloc (MAX_DATA_SIZE * sizeof(PixelType)); if (_hData == NULL) { printf("\nUnable to allocate memory at %d\n", __LINE__); return SDK_FAILURE; } for (int i = 0; i < MAX_DATA_SIZE; i+=4) { (_hData + i)->_val1 = i * 0.0001f; (_hData + i + 1)->_val2 = (i + 1) * 0.0001f; (_hData + i + 2)->_val3 = (i + 2) * 0.0001f; (_hData + i + 3)->_val4 = (i + 3) * 0.0001f; } #if 0 cl_uint sizeInBytes = width * sizeof(cl_uint); input = (cl_float*) malloc(sizeInBytes); if(input == NULL) { printf("Error: Failed to allocate input memory on host\n"); return SDK_FAILURE; } verificationOutput = (cl_float*) malloc(sizeInBytes); if(verificationOutput == NULL) { printf("Error: Failed to allocate verificationOutput memory on host\n"); return SDK_FAILURE; } //Initilize input data for(int i = 0; i < width; i++) input[i] = (cl_float)i; #endif return SDK_SUCCESS; } /* * Converts the contents of a file into a string */ std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = (size_t)f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } return NULL; } /* * \brief OpenCL related initialization * Create Context, Device list, Command Queue * Create OpenCL memory buffer objects * Load CL file, compile, link CL source * Build program and kernel objects */ int initializeCL(void) { cl_int status = 0; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return SDK_FAILURE; if(numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return SDK_FAILURE; for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) return SDK_FAILURE; platform = platforms[i]; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } free(platforms); } /* * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. */ cps[0] = CL_CONTEXT_PLATFORM; cps[1] = (cl_context_properties)platform; cps[2] = 0; cprops = (NULL == platform) ? NULL : cps; // Get Number of CPU devices available status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, 0, (cl_uint*)&numCPUDevices); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.(numCPUDevices)")) return SDK_FAILURE; // Get Number of CPU devices available status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, 0, (cl_uint*)&numDevices); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.(numDevices)")) return SDK_FAILURE; // Get number of GPU Devices numGPUDevices = numDevices - numCPUDevices; // If no GPU is present then exit if(numGPUDevices < 1) { std::cout << "Only CPU device is present. Exiting!\n"; return SDK_EXPECTED_FAILURE; } // Allocate memory for list of Devices cpu = new Device[numCPUDevices]; //Get CPU Device IDs cl_device_id* cpuDeviceIDs = new cl_device_id[numCPUDevices]; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numCPUDevices, cpuDeviceIDs, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.")) return SDK_FAILURE; for(int i = 0; i < numCPUDevices; i++) { cpu[i].dType = CL_DEVICE_TYPE_CPU; cpu[i].deviceId = cpuDeviceIDs[i]; } delete[] cpuDeviceIDs; gpu = new Device[numGPUDevices]; //Get GPU Device IDs cl_device_id* gpuDeviceIDs = new cl_device_id[numGPUDevices]; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numGPUDevices, gpuDeviceIDs, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.")) return SDK_FAILURE; for(int i = 0; i < numGPUDevices; i++) { gpu[i].dType = CL_DEVICE_TYPE_GPU; gpu[i].deviceId = gpuDeviceIDs[i]; } delete[] gpuDeviceIDs; ///////////////////////////////////////////////////////////////// // Load CL file ///////////////////////////////////////////////////////////////// const char *filename = "SimpleMultiDevice_Kernels.cl"; sourceStr = convertToString(filename); source = sourceStr.c_str(); return SDK_SUCCESS; } int run() { int status; // If a GPU is present then run CPU + GPU concurrently if(numGPUDevices > 0 && numCPUDevices > 0) { // 3 tests : // a) Single context - Single thread // b) Multiple context - Single thread // c) Multiple context - Multple Threads // 3 Tests * 2 devices requiredCount += 3 * 2; status = runMultiDevice(); if(status != SDK_SUCCESS) return status; } // If more than 1 GPU is present then run MultiGPU concurrently if(numGPUDevices > 1) { // 3 tests : // a) Single context - Single thread // b) Multiple context - Single thread // c) Multiple context - Multple Threads // 3 Tests * numGPUDevices requiredCount += 3 * numGPUDevices; status = runMultiGPU(); if(status != SDK_SUCCESS) return status; } return SDK_SUCCESS; } /* * \brief Releases program's resources */ void cleanupHost(void) { if(input != NULL) { free(input); input = NULL; } if(verificationOutput != NULL) { free(verificationOutput); verificationOutput = NULL; } if(cpu != NULL) { delete[] cpu; cpu = NULL; } if(gpu != NULL) { delete[] gpu; gpu = NULL; } } /* * \brief Print no more than 256 elements of the given array. * * Print Array name followed by elements. */ void print1DArray( const std::string arrayName, const unsigned int * arrayData, const unsigned int length) { cl_uint i; cl_uint numElementsToPrint = (256 < length) ? 256 : length; std::cout << std::endl; std::cout << arrayName << ":" << std::endl; for(i = 0; i < numElementsToPrint; ++i) { std::cout << arrayData[i] << " "; } std::cout << std::endl; } // OpenCL MAD definition for CPU float mad(float a, float b, float c) { return a * b + c; } // OpenCL HYPOT definition for CPU float hypot(float a, float b) { return sqrt(a * a + b * b); } int CPUKernel() { for(int i = 0; i < width; i++) { float a = mad(input[i], input[i], 1); float b = mad(input[i], input[i], 2); for(int j = 0; j < KERNEL_ITERATIONS; j++) { a = hypot(a, b); b = hypot(a, b); } verificationOutput[i] = (a + b); } return 0; } int main(int argc, char * argv[]) { for(int i = 1; i < argc; i++) { if(!strcmp(argv[i], "-e") || !strcmp(argv[i], "--verify")) verify = true; if(!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { printf("Usage\n"); printf("-h, --help\tPrint this help.\n"); printf("-e, --verify\tVerify results against reference implementation.\n"); exit(0); } } int status; // Initialize Host application status = initializeHost(); if(status != SDK_SUCCESS) return status; // Run host computation if verification is true #if 0 CPUKernel(); #endif // Initialize OpenCL resources status = initializeCL(); if(status != SDK_SUCCESS) { if(status == SDK_EXPECTED_FAILURE) return SDK_SUCCESS; else return status; } // Run the CL program status = run(); if(status != SDK_SUCCESS) return status; // Release host resources cleanupHost(); if(verify) { // If any one test fails then print FAILED if(verificationCount != requiredCount) { std::cout << "\n\nFAILED!\n"; return SDK_FAILURE; } else { std::cout << "\n\nPASSED!\n"; return SDK_SUCCESS; } } return SDK_SUCCESS; }

                                        • CPU + GPU crash with AMD APP SDK 2.3 Win7
                                          genaganna

                                           

                                          Originally posted by: wheecheng Here's the complete runtime code:

                                           

                                           



                                          Could  you please copy complete code?  .cpp code was incomplete.

                                            • CPU + GPU crash with AMD APP SDK 2.3 Win7
                                              wheecheng

                                              Here's the .cpp code:

                                              #include "SimpleMultiDevice.hpp" int Device::createContext() { //Create context using current device's ID context = clCreateContext(cprops, 1, &deviceId, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createQueue() { //Create Command-Queue queue = clCreateCommandQueue(context, deviceId, CL_QUEUE_PROFILING_ENABLE, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateCommandQueue failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createBuffers() { // Create input buffer inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; // Create output buffer outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, inputBuffer, 1, 0, width * sizeof(cl_float), input, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueWriteBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteNewBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, inputBuffer, 1, 0, MAX_DATA_SIZE * sizeof(PixelType), _hData, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueWriteNewBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueWriteTemplateBuffer() { // Initialize input buffer status = clEnqueueWriteBuffer(queue, templateBuffer, 1, 0, MAX_TEMPLATES * MAX_TEMPLATE_ELEMENTS * sizeof(int), _template, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueWriteTemplateBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::setNewKernelArgs() { ThresholdType _info = {DEFAULT_THRESHOLD, THRESHOLD_UPPER, THRESHOLD_LOWER, 0.0f}; int _type = 0; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(outputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(inputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &templateBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(templateBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 3, 4 * sizeof(cl_float), (void *) &_info); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(_info)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *) &_type); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(_type)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueNewKernel(size_t *globalThreads, size_t *localThreads) { status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueNewKernel failed.")) return SDK_FAILURE; status = clFlush(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFlush failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueNewReadData() { unsigned char _hResult[MAX_DATA_SIZE]; status = clEnqueueReadBuffer(queue, outputBuffer, 1, 0, MAX_DATA_SIZE * sizeof(unsigned char), _hResult, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "enqueueNewReadData failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::createProgram(const char **source, const size_t *sourceSize) { // Create program with source program = clCreateProgramWithSource(context, 1, source, sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::buildProgram() { char buildOptions[50]; sprintf(buildOptions, "-D KERNEL_ITERATIONS=%d", KERNEL_ITERATIONS); // Build program source status = clBuildProgram(program, 1, &deviceId, buildOptions, 0, 0); /* Print build log here if build program failed */ if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char *buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sdkObject.checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { return SDK_FAILURE; } buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sdkObject.error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sdkObject.checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) { return SDK_FAILURE; } } return SDK_SUCCESS; } int Device::createKernel() { kernel = clCreateKernel(program, "multiDeviceKernel", &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateKernel failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::setKernelArgs() { status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(inputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(outputBuffer)")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::enqueueKernel(size_t *globalThreads, size_t *localThreads) { status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) return SDK_FAILURE; status = clFlush(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFlush failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::waitForKernel() { status = clFinish(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clFinish failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::getProfilingData() { status = clGetEventProfilingInfo(eventObject, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelStartTime, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(start time)")) return SDK_FAILURE; status = clGetEventProfilingInfo(eventObject, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(end time)")) return SDK_FAILURE; //Measure time in ms elapsedTime = 1e-6 * (kernelEndTime - kernelStartTime); return SDK_SUCCESS; } int Device::enqueueReadData() { // Allocate memory if(output == NULL) { output = (cl_float*)malloc(width * sizeof(cl_char)); if(output == NULL) { printf("Failed to allocate output buffer!\n"); return SDK_FAILURE; } } status = clEnqueueReadBuffer(queue, outputBuffer, 1, 0, //width * sizeof(cl_float), width * sizeof(cl_char), output, 0, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) return SDK_FAILURE; return SDK_SUCCESS; } int Device::verifyResults() { float error = 0; //compare results between verificationOutput and output host buffers for(int i = 0; i < width; i++) { error += (output[i] - verificationOutput[i]); } error /= width; if(error < 0.001) { std::cout << "Passed!\n"; verificationCount++; } else { std::cout << "Failed!\n"; } return SDK_SUCCESS; } int Device::cleanupResources() { int status = clReleaseCommandQueue(queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseKernel(kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; cl_uint programRefCount; status = clGetProgramInfo(program, CL_PROGRAM_REFERENCE_COUNT, sizeof(cl_uint), &programRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetProgramInfo failed.")) return SDK_FAILURE; if(programRefCount) { status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; } cl_uint inputRefCount; status = clGetMemObjectInfo(inputBuffer, CL_MEM_REFERENCE_COUNT, sizeof(cl_uint), &inputRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetMemObjectInfo failed.")) return SDK_FAILURE; if(inputRefCount) { status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; } cl_uint outputRefCount; status = clGetMemObjectInfo(outputBuffer, CL_MEM_REFERENCE_COUNT, sizeof(cl_uint), &outputRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetMemObjectInfo failed.")) return SDK_FAILURE; if(outputRefCount) { status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; } cl_uint contextRefCount; status = clGetContextInfo(context, CL_CONTEXT_REFERENCE_COUNT, sizeof(cl_uint), &contextRefCount, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; if(contextRefCount) { status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseContext failed.")) return SDK_FAILURE; } status = clReleaseEvent(eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; return SDK_SUCCESS; } //Thread function for a device void* threadFunc(void *device) { Device *d = (Device*)device; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; d->enqueueKernel(&globalThreads, &localThreads); d->waitForKernel(); return NULL; } Device::~Device() { if(output) { free(output); output = NULL; } } int runMultiGPU() { int status; /////////////////////////////////////////////////////////////////// // Case 1 : Single Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 1 : Single context Single Thread\n" << sep << std::endl; cl_context context = clCreateContextFromType(cprops, CL_DEVICE_TYPE_GPU, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; size_t sourceSize = strlen(source); cl_program program = clCreateProgramWithSource(context, 1, &source, (const size_t*)&sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; char buildOptions[50]; sprintf(buildOptions, "-D KERNEL_ITERATIONS=%d", KERNEL_ITERATIONS); //Build program for all the devices in the context status = clBuildProgram(program, 0, 0, buildOptions, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; //Setup for all GPU devices for(int i = 0; i < numGPUDevices; i++) { gpu[i].context = context; gpu[i].program = program; status = gpu[i].createQueue(); if(status != SDK_SUCCESS) return SDK_FAILURE; status = gpu[i].createKernel(); if(status != SDK_SUCCESS) return SDK_FAILURE; } // Create buffers // Create input buffer cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; // Create output buffer cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; //Set Buffers for(int i = 0; i < numGPUDevices; i++) { gpu[i].inputBuffer = inputBuffer; status = gpu[i].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; gpu[i].outputBuffer = outputBuffer; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].setKernelArgs(); if(status != SDK_SUCCESS) return status; } size_t globalThreads = width; size_t localThreads = GROUP_SIZE; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; } //Wait for all kernels to finish execution for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].waitForKernel(); if(status != SDK_SUCCESS) return status; } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu[i].elapsedTime << std::endl; } if(verify) { //Enqueue Read output buffer and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu[i].verifyResults(); } } //Release the resources on all devices //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object, command-queue, event object for(int i = 0; i < numGPUDevices; i++) { status = clReleaseKernel(gpu[i].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu[i].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu[i].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; } /////////////////////////////////////////////////////////////////// // Case 2 : Multiple Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 2 : Multiple context Single Thread\n" << sep << std::endl; for(int i = 0; i < numGPUDevices; i++) { //Create context for each device status = gpu[i].createContext(); if(status != SDK_SUCCESS) return status; //Create command-queue; status = gpu[i].createQueue(); if(status != SDK_SUCCESS) return status; //Create memory buffers status = gpu[i].createBuffers(); if(status != SDK_SUCCESS) return status; //Initialize input buffer status = gpu[i].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //create program object status = gpu[i].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; //Build program status = gpu[i].buildProgram(); if(status != SDK_SUCCESS) return status; //Create kernel objects for each device status = gpu[i].createKernel(); if(status != SDK_SUCCESS) return status; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].setKernelArgs(); if(status != SDK_SUCCESS) return status; } //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; } //Wait for all kernels to finish execution for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].waitForKernel(); if(status != SDK_SUCCESS) return status; } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu[i].elapsedTime << std::endl; } if(verify) { // Read outputdata and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu[i].verifyResults(); } } //Release the resources on all devices for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].cleanupResources(); if(status != SDK_SUCCESS) return status; } //////////////////////////////////////////////////////////////////// // Case 3 : Multiple thread and multiple context for each device //////////////////////////////////////////////////////////////////// std::cout << sep << "\nMulti GPU Test 3 : Multiple context Multiple Thread\n" << sep << std::endl; for(int i = 0; i < numGPUDevices; i++) { //Create context for each device status = gpu[i].createContext(); if(status != SDK_SUCCESS) return status; //Create command-queue; status = gpu[i].createQueue(); if(status != SDK_SUCCESS) return status; //Create memory buffers status = gpu[i].createBuffers(); if(status != SDK_SUCCESS) return status; //Initialize input buffer status = gpu[i].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //create program object status = gpu[i].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; //Build program status = gpu[i].buildProgram(); if(status != SDK_SUCCESS) return status; //Create kernel objects for each device status = gpu[i].createKernel(); if(status != SDK_SUCCESS) return status; } //Set kernel arguments for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].setKernelArgs(); if(status != SDK_SUCCESS) return status; } //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //Create thread objects streamsdk::SDKThread *gpuThread = new streamsdk::SDKThread[numGPUDevices]; //Start threads for each gpu device for(int i = 0; i < numGPUDevices; i++) { gpuThread[i].create(threadFunc, (void *)(gpu + i)); } //Join all gpu threads for(int i = 0; i < numGPUDevices; i++) { gpuThread[i].join(); } //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); //Get individual timers for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].getProfilingData(); if(status != SDK_SUCCESS) return status; } //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; for(int i = 0; i < numGPUDevices; i++) { std::cout << "Time of GPU" << i << " : " << gpu[i].elapsedTime << std::endl; } if(verify) { // Read outputdata and verify results for(int i = 0; i < numGPUDevices; i++) { status = gpu[i].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for GPU" << i << " : "; gpu[i].verifyResults(); } } //Release the resources on all devices for(int i = 0; i < numGPUDevices; i++) { gpu[i].cleanupResources(); } return 0; } int runMultiDevice() { int status; /////////////////////////////////////////////////////////////////// // Case 1 : Single Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 1 : Single context Single Thread\n" << sep << std::endl; /* Create a list of device IDs having only CPU0 and GPU0 as device IDs */ cl_device_id *devices = (cl_device_id*)malloc(2 * sizeof(cl_device_id)); devices[0] = cpu[0].deviceId; devices[1] = gpu[0].deviceId; cl_context context = clCreateContext(cprops, 2, devices, 0, 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; size_t sourceSize = strlen(source); cl_program program = clCreateProgramWithSource(context, 1, &source, (const size_t*)&sourceSize, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; char buildOptions[50]; sprintf(buildOptions, "-D WIDTH=%d -D HEIGHT=%d -cl-mad-enable", MAX_DATA_WIDTH, MAX_DATA_HEIGHT); //Build program for all the devices in the context status = clBuildProgram(program, 2, devices, buildOptions, 0, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; //Allocate objects for CPU cpu[0].context = context; gpu[0].context = context; cpu[0].program = program; gpu[0].program = program; // Create command queue status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; // Create kernel status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Create queue status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; // Create kernel status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; cl_mem _dData = clCreateBuffer(context, CL_MEM_READ_ONLY, MAX_DATA_WIDTH * MAX_DATA_HEIGHT * sizeof(PixelType), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dDataSarVec)")) return SDK_FAILURE; cl_mem _dTemplate = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, MAX_TEMPLATES * MAX_TEMPLATE_ELEMENTS * sizeof(int), (void*)_template, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dDataSarVec)")) return SDK_FAILURE; cl_mem _dResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, MAX_DATA_WIDTH * MAX_DATA_HEIGHT * sizeof(unsigned char), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(_dResultDetectMapVec)")) return SDK_FAILURE; cpu[0].inputBuffer = _dData; gpu[0].inputBuffer = _dData; cpu[0].templateBuffer = _dTemplate; gpu[0].templateBuffer = _dTemplate; cpu[0].outputBuffer = _dResult; gpu[0].outputBuffer = _dResult; // Initialize input buffer for both devices status = cpu[0].enqueueWriteNewBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteNewBuffer(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueWriteTemplateBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteTemplateBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setNewKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setNewKernelArgs(); if(status != SDK_SUCCESS) return status; size_t globalThreads2D[2] = {MAX_DATA_WIDTH, MAX_DATA_HEIGHT}; size_t localThreads2D[2] = {LOCAL_WORK_SIZE, LOCAL_WORK_SIZE}; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); status = gpu[0].enqueueNewKernel(globalThreads2D, localThreads2D); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueNewReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueNewKernel(globalThreads2D, localThreads2D); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueNewReadData(); if(status != SDK_SUCCESS) return status; status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(_dData); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dData)")) return SDK_FAILURE; status = clReleaseMemObject(_dTemplate); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dTemplate)")) return SDK_FAILURE; status = clReleaseMemObject(_dResult); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (_dResult)")) return SDK_FAILURE; //ReleaseCommand-queue status = clReleaseCommandQueue(cpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object status = clReleaseKernel(cpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseKernel(gpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; //Release Event object status = clReleaseEvent(cpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; #if 0 // Create buffers - A buffer is created on all devices sharing a context // So bufffer creation should should not per device in a single-context cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, width * sizeof(cl_float), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(inputBuffer)")) return SDK_FAILURE; cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, //width * sizeof(cl_float), width * sizeof(cl_char), 0, &status); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateBuffer failed.(outputBuffer)")) return SDK_FAILURE; cpu[0].inputBuffer = inputBuffer; gpu[0].inputBuffer = inputBuffer; cpu[0].outputBuffer = outputBuffer; gpu[0].outputBuffer = outputBuffer; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; size_t globalThreads = width; size_t localThreads = GROUP_SIZE; //Start a host timer here int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); status = gpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = cpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; //Wait for all kernels to finish execution status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release context status = clReleaseContext(context); if(!sdkObject.checkVal(status, CL_SUCCESS, "clCreateContext failed.")) return SDK_FAILURE; //Release memory buffers status = clReleaseMemObject(inputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (inputBuffer)")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseMemObject failed. (outputBuffer)")) return SDK_FAILURE; //ReleaseCommand-queue status = clReleaseCommandQueue(cpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(gpu[0].queue); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; //Release Program object status = clReleaseProgram(program); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; //Release Kernel object status = clReleaseKernel(cpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; status = clReleaseKernel(gpu[0].kernel); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; //Release Event object status = clReleaseEvent(cpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; status = clReleaseEvent(gpu[0].eventObject); if(!sdkObject.checkVal(status, CL_SUCCESS, "clReleaseEvent failed.")) return SDK_FAILURE; #endif /////////////////////////////////////////////////////////////////// // Case 2 : Multiple Context (Single Thread) ////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 2 : Multiple context Single Thread\n" << sep << std::endl; status = cpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = cpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = cpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = cpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = gpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = gpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = gpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; //Start a host timer here //int timer = sdkObject.createTimer(); sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //size_t globalThreads = width; //size_t localThreads = 1; status = cpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueKernel(&globalThreads, &localThreads); if(status != SDK_SUCCESS) return status; //Wait for all kernels to finish execution status = cpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].waitForKernel(); if(status != SDK_SUCCESS) return status; //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time //double totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release the resources on all devices status = cpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; status = gpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; ///////////////////////////////////////////////////////////////////// // Case 3 : Multiple thread and multiple context for each device //////////////////////////////////////////////////////////////////// std::cout << sep << "\nCPU + GPU Test 3 : Multiple context Multiple Thread\n" << sep << std::endl; status = cpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = cpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = cpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = cpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = cpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = cpu[0].createKernel(); if(status != SDK_SUCCESS) return status; status = gpu[0].createContext(); if(status != SDK_SUCCESS) return status; status = gpu[0].createQueue(); if(status != SDK_SUCCESS) return status; status = gpu[0].createBuffers(); if(status != SDK_SUCCESS) return status; status = gpu[0].createProgram(&source, &sourceSize); if(status != SDK_SUCCESS) return status; status = gpu[0].buildProgram(); if(status != SDK_SUCCESS) return status; status = gpu[0].createKernel(); if(status != SDK_SUCCESS) return status; // Initialize input buffer for both devices status = cpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueWriteBuffer(); if(status != SDK_SUCCESS) return status; //Set kernel arguments status = cpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; status = gpu[0].setKernelArgs(); if(status != SDK_SUCCESS) return status; //Start a host timer here sdkObject.resetTimer(timer); sdkObject.startTimer(timer); //Create a thread for CPU and GPU device each streamsdk::SDKThread cpuThread; streamsdk::SDKThread gpuThread; cpuThread.create(threadFunc, (void *)cpu); gpuThread.create(threadFunc, (void *)gpu); cpuThread.join(); gpuThread.join(); //Stop the host timer here sdkObject.stopTimer(timer); //Measure total time totalTime = sdkObject.readTimer(timer); status = cpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; status = gpu[0].getProfilingData(); if(status != SDK_SUCCESS) return status; //Print total time and individual times std::cout << "Total time : " << totalTime * 1000 << std::endl; std::cout << "Time of CPU : " << cpu[0].elapsedTime << std::endl; std::cout << "Time of GPU : " << gpu[0].elapsedTime << std::endl; if(verify) { //Read back output data for verification status = cpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; status = gpu[0].enqueueReadData(); if(status != SDK_SUCCESS) return status; // Verify results std::cout << "Verifying results for CPU : "; cpu[0].verifyResults(); std::cout << "Verifying results for GPU : "; gpu[0].verifyResults(); } //Release the resources on all devices status = cpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; status = gpu[0].cleanupResources(); if(status != SDK_SUCCESS) return status; if(devices) { free(devices); devices = NULL; } return 0; } /* * \brief Host Initialization * Allocate and initialize memory * on the host. Print input array. */ int initializeHost(void) { width = NUM_THREADS; input = NULL; verificationOutput = NULL; ///////////////////////////////////////////////////////////////// // Allocate and initialize memory used by host ///////////////////////////////////////////////////////////////// _hData = (PixelType*) malloc (MAX_DATA_SIZE * sizeof(PixelType)); if (_hData == NULL) { printf("\nUnable to allocate memory at %d\n", __LINE__); return SDK_FAILURE; } for (int i = 0; i < MAX_DATA_SIZE; i+=4) { (_hData + i)->_val1 = i * 0.0001f; (_hData + i + 1)->_val2 = (i + 1) * 0.0001f; (_hData + i + 2)->_val3 = (i + 2) * 0.0001f; (_hData + i + 3)->_val4 = (i + 3) * 0.0001f; } #if 0 cl_uint sizeInBytes = width * sizeof(cl_uint); input = (cl_float*) malloc(sizeInBytes); if(input == NULL) { printf("Error: Failed to allocate input memory on host\n"); return SDK_FAILURE; } verificationOutput = (cl_float*) malloc(sizeInBytes); if(verificationOutput == NULL) { printf("Error: Failed to allocate verificationOutput memory on host\n"); return SDK_FAILURE; } //Initilize input data for(int i = 0; i < width; i++) input[i] = (cl_float)i; #endif return SDK_SUCCESS; } /* * Converts the contents of a file into a string */ std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = (size_t)f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } return NULL; } /* * \brief OpenCL related initialization * Create Context, Device list, Command Queue * Create OpenCL memory buffer objects * Load CL file, compile, link CL source * Build program and kernel objects */ int initializeCL(void) { cl_int status = 0; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return SDK_FAILURE; if(numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return SDK_FAILURE; for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) return SDK_FAILURE; platform = platforms[i]; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } free(platforms); } /* * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. */ cps[0] = CL_CONTEXT_PLATFORM; cps[1] = (cl_context_properties)platform; cps[2] = 0; cprops = (NULL == platform) ? NULL : cps; // Get Number of CPU devices available status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, 0, (cl_uint*)&numCPUDevices); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.(numCPUDevices)")) return SDK_FAILURE; // Get Number of CPU devices available status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, 0, (cl_uint*)&numDevices); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.(numDevices)")) return SDK_FAILURE; // Get number of GPU Devices numGPUDevices = numDevices - numCPUDevices; // If no GPU is present then exit if(numGPUDevices < 1) { std::cout << "Only CPU device is present. Exiting!\n"; return SDK_EXPECTED_FAILURE; } // Allocate memory for list of Devices cpu = new Device[numCPUDevices]; //Get CPU Device IDs cl_device_id* cpuDeviceIDs = new cl_device_id[numCPUDevices]; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numCPUDevices, cpuDeviceIDs, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.")) return SDK_FAILURE; for(int i = 0; i < numCPUDevices; i++) { cpu[i].dType = CL_DEVICE_TYPE_CPU; cpu[i].deviceId = cpuDeviceIDs[i]; } delete[] cpuDeviceIDs; gpu = new Device[numGPUDevices]; //Get GPU Device IDs cl_device_id* gpuDeviceIDs = new cl_device_id[numGPUDevices]; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numGPUDevices, gpuDeviceIDs, 0); if(!sdkObject.checkVal(status, CL_SUCCESS, "clGetDeviceIDs failed.")) return SDK_FAILURE; for(int i = 0; i < numGPUDevices; i++) { gpu[i].dType = CL_DEVICE_TYPE_GPU; gpu[i].deviceId = gpuDeviceIDs[i]; } delete[] gpuDeviceIDs; ///////////////////////////////////////////////////////////////// // Load CL file ///////////////////////////////////////////////////////////////// const char *filename = "SimpleMultiDevice_Kernels.cl"; sourceStr = convertToString(filename); source = sourceStr.c_str(); return SDK_SUCCESS; } int run() { int status; // If a GPU is present then run CPU + GPU concurrently if(numGPUDevices > 0 && numCPUDevices > 0) { // 3 tests : // a) Single context - Single thread // b) Multiple context - Single thread // c) Multiple context - Multple Threads // 3 Tests * 2 devices requiredCount += 3 * 2; status = runMultiDevice(); if(status != SDK_SUCCESS) return status; } // If more than 1 GPU is present then run MultiGPU concurrently if(numGPUDevices > 1) { // 3 tests : // a) Single context - Single thread // b) Multiple context - Single thread // c) Multiple context - Multple Threads // 3 Tests * numGPUDevices requiredCount += 3 * numGPUDevices; status = runMultiGPU(); if(status != SDK_SUCCESS) return status; } return SDK_SUCCESS; } /* * \brief Releases program's resources */ void cleanupHost(void) { if(input != NULL) { free(input); input = NULL; } if(verificationOutput != NULL) { free(verificationOutput); verificationOutput = NULL; } if(cpu != NULL) { delete[] cpu; cpu = NULL; } if(gpu != NULL) { delete[] gpu; gpu = NULL; } } /* * \brief Print no more than 256 elements of the given array. * * Print Array name followed by elements. */ void print1DArray( const std::string arrayName, const unsigned int * arrayData, const unsigned int length) { cl_uint i; cl_uint numElementsToPrint = (256 < length) ? 256 : length; std::cout << std::endl; std::cout << arrayName << ":" << std::endl; for(i = 0; i < numElementsToPrint; ++i) { std::cout << arrayData[i] << " "; } std::cout << std::endl; } // OpenCL MAD definition for CPU float mad(float a, float b, float c) { return a * b + c; } // OpenCL HYPOT definition for CPU float hypot(float a, float b) { return sqrt(a * a + b * b); } int CPUKernel() { for(int i = 0; i < width; i++) { float a = mad(input[i], input[i], 1); float b = mad(input[i], input[i], 2); for(int j = 0; j < KERNEL_ITERATIONS; j++) { a = hypot(a, b); b = hypot(a, b); } verificationOutput[i] = (a + b); } return 0; } int main(int argc, char * argv[]) { for(int i = 1; i < argc; i++) { if(!strcmp(argv[i], "-e") || !strcmp(argv[i], "--verify")) verify = true; if(!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { printf("Usage\n"); printf("-h, --help\tPrint this help.\n"); printf("-e, --verify\tVerify results against reference implementation.\n"); exit(0); } } int status; // Initialize Host application status = initializeHost(); if(status != SDK_SUCCESS) return status; // Run host computation if verification is true #if 0 CPUKernel(); #endif // Initialize OpenCL resources status = initializeCL(); if(status != SDK_SUCCESS) { if(status == SDK_EXPECTED_FAILURE) return SDK_SUCCESS; else return status; } // Run the CL program status = run(); if(status != SDK_SUCCESS) return status; // Release host resources cleanupHost(); if(verify) { // If any one test fails then print FAILED if(verificationCount != requiredCount) { std::cout << "\n\nFAILED!\n"; return SDK_FAILURE; } else { std::cout << "\n\nPASSED!\n"; return SDK_SUCCESS; } } return SDK_SUCCESS; }