I developed an OpenCL application that uses both CPU and GPU to load balance the computation. The application works in WinXP but crashes in Win7 (clEnqueueMapBuffer).
In addition, the application works when only one device is used (i.e. CPU or GPU but not combined) in Win7. How can I get both devices working together?
I have two configurations and both exhibit the same issue:
Configuration 1: - Intel Core2 Duo
- Asus EAH5670
- AMD APP SDK v2.3 with ATI Catalyst v11.1 (Win7 64 bits)
Configuration 2: - AMD Fusion G-T56N
- AMD APP SDK v2.3 with ATI Catalyst v11.1 (Win7 64 bits)
Which hand am I holding the answer in?
You really need to give us more information before anyone can answer a question like this — a test case of some sort. Otherwise you just have to hope someone else has come across exactly the same bug. Can you give us some code that consistently crashes for you, so that someone can test it?
My application works with the previous version of the SDK, i.e. v2.2, on WinXP. However, it crashes with SDK v2.3 on WinXP. This was tested on the AMD Fusion G-T56N.
Yes... but what is your application?
I can post my version of application:
#include <stdio.h> #include <stdlib.h> #include "CL\cl.h" const char *source = "#define cl_uint unsigned int\n" "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" "\n" "__kernel void testD(const double k, const cl_uint uiCount, __global double *pX, __global double *pY)\n" "{\n" " cl_uint i = get_global_id(0);\n" " if (i >= uiCount)\n" " return;\n" " double k1 = 2.0f / 3.0f;\n" " pY = k * pX;\n" "}\n" "\n" "__kernel void testDBug(const cl_uint uiCount, const double k, __global double *pX, __global double *pY)\n" "{\n" " cl_uint i = get_global_id(0);\n" " if (i >= uiCount)\n" " return;\n" " double k1 = 2.0f / 3.0f;\n" " pY = k * pX;\n" "}\n"; int main() { cl_int err_code(0); cl_platform_id platform(NULL); cl_uint numPlatforms(0); err_code = clGetPlatformIDs(0, NULL, &numPlatforms); if (CL_SUCCESS != err_code) { printf("1. Unable get platform %d %d\n", err_code, numPlatforms); exit(1); } err_code = clGetPlatformIDs(1, &platform, NULL); if (CL_SUCCESS != err_code) { printf("Unable get platform %d\n", err_code); exit(1); } cl_uint numDevices(0); err_code = clGetDeviceIDs(platform ,CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); cl_device_id *devices = new cl_device_id[numDevices]; err_code = clGetDeviceIDs(platform ,CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_CPU, numDevices, devices, NULL); if (CL_SUCCESS != err_code) { printf("clGetDeviceIDs failed %d\n", err_code); exit(1); } cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &err_code); if (err_code) { printf("Unable create context %d\n", err_code); exit(1); } cl_command_queue *queues = new cl_command_queue[numDevices]; for (int k(0); k < numDevices; ++k) { queues
= clCreateCommandQueue(context, devices , 0, &err_code); if (err_code) { printf("Unable create command queue %d\n", err_code); exit(1); } } cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &err_code); if (err_code) { printf("Unable create program %d\n", err_code); exit(1); } printf("Program created.\n"); err_code = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err_code) { printf("Unable build program %d\n", err_code); exit(1); } printf("Program builded.\n"); // TEST printf("Test started.\n"); cl_kernel k_testD = clCreateKernel(program, "testD", &err_code); if (CL_SUCCESS != err_code) { printf("clCreateKernel testD failed %d\n", err_code); exit(1); } cl_kernel k_testDBug = clCreateKernel(program, "testDBug", &err_code); if (CL_SUCCESS != err_code) { printf("clCreateKernel testDBug failed %d\n", err_code); exit(1); } cl_uint uiCount(10); double *p_dX2 = new double[uiCount]; double *p_dY2 = new double[uiCount]; for (cl_uint i(0); i < uiCount; ++i) { p_dX2 = (double)i; p_dY2 = 0; } double dValue(2.0/3.0); err_code = clSetKernelArg(k_testD, 0, sizeof(dValue), (void*)&dValue); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 0 failed %d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testD, 1, sizeof(uiCount), (void*)&uiCount); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 1 failed %d\n", err_code); exit(1); } for (int k(0); k < numDevices; ++k) { cl_mem bufdX = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, uiCount * sizeof(double), p_dX2, &err_code); if (CL_SUCCESS != err_code) { printf("clCreateBuffer X failed %d\n", err_code); exit(1); } cl_mem bufdY = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, uiCount * sizeof(double), p_dY2, &err_code); if (CL_SUCCESS != err_code) { printf("clCreateBuffer Y failed %d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testD, 2, sizeof(bufdX), (void*)&bufdX); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 2 failed 
%d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testD, 3, sizeof(bufdY), (void*)&bufdY); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 3 failed %d\n", err_code); exit(1); } size_t globalThreads1[1] = { uiCount }; err_code = clEnqueueNDRangeKernel(queues , k_testD, 1, NULL, globalThreads1, NULL, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueNDRangeKernel failed %d\n", err_code); exit(1); } err_code = clFinish(queues ); if (CL_SUCCESS != err_code) { printf("clFinish failed %d\n", err_code); exit(1); } err_code = clEnqueueReadBuffer(queues , bufdX, CL_TRUE, 0, uiCount * sizeof(double), p_dX2, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueMapBuffer X failed %d\n", err_code); exit(1); } err_code = clEnqueueReadBuffer(queues , bufdY, CL_TRUE, 0, uiCount * sizeof(double), p_dY2, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueMapBuffer Y failed %d\n", err_code); exit(1); } clReleaseMemObject(bufdX); clReleaseMemObject(bufdY); for (cl_uint i(0); i < uiCount; ++i) { printf("X = %lf Y = %lf\n", p_dX2, p_dY2); p_dX2 = (double)i; p_dY2 = 0; } printf("\n"); } // WARNING err_code = clSetKernelArg(k_testDBug, 0, sizeof(uiCount), (void*)&uiCount); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 0 failed %d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testDBug, 1, sizeof(dValue), (void*)&dValue); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 1 failed %d\n", err_code); exit(1); } for (int k(0); k < numDevices; ++k) { cl_mem bufdX = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, uiCount * sizeof(double), p_dX2, &err_code); if (CL_SUCCESS != err_code) { printf("clCreateBuffer X failed %d\n", err_code); exit(1); } cl_mem bufdY = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, uiCount * sizeof(double), p_dY2, &err_code); if (CL_SUCCESS != err_code) { printf("clCreateBuffer Y failed %d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testDBug, 2, sizeof(bufdX), 
(void*)&bufdX); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 2 failed %d\n", err_code); exit(1); } err_code = clSetKernelArg(k_testDBug, 3, sizeof(bufdY), (void*)&bufdY); if (CL_SUCCESS != err_code) { printf("clSetKernelArg 3 failed %d\n", err_code); exit(1); } size_t globalThreads1[1] = { uiCount }; err_code = clEnqueueNDRangeKernel(queues , k_testDBug, 1, NULL, globalThreads1, NULL, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueNDRangeKernel failed %d\n", err_code); exit(1); } err_code = clFinish(queues ); if (CL_SUCCESS != err_code) { printf("clFinish failed %d\n", err_code); exit(1); } err_code = clEnqueueReadBuffer(queues , bufdX, CL_TRUE, 0, uiCount * sizeof(double), p_dX2, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueMapBuffer X failed %d\n", err_code); exit(1); } err_code = clEnqueueReadBuffer(queues , bufdY, CL_TRUE, 0, uiCount * sizeof(double), p_dY2, 0, NULL, NULL); if (CL_SUCCESS != err_code) { printf("clEnqueueMapBuffer Y failed %d\n", err_code); exit(1); } clReleaseMemObject(bufdX); clReleaseMemObject(bufdY); for (cl_uint i(0); i < 10; ++i) { printf("X = %lf Y = %lf\n", p_dX2, p_dY2); p_dX2 = (double)i; p_dY2 = 0; } printf("\n"); } delete [] p_dX2; delete [] p_dY2; clReleaseKernel(k_testD); clReleaseKernel(k_testDBug); clReleaseProgram(program); for (int k(0); k < numDevices; ++k) clReleaseCommandQueue(queues ); clReleaseContext(context); }
Originally posted by: lordnn I can post my version of application:
Could you please tell me what issue you are facing with this code?
This code crashes when the testDBug routine is enqueued on the CPU device.
Using SimpleMultiDevice project with CPU + GPU Test 1: Single Context, Single Thread as the reference, the input data is two-dimensional with WIDTH = HEIGHT = 128 and each data comprises 4 elements (float).
The kernel code is as below. Below is the summary of my observations:
1. If either line 49 (i.e. _c = ...) or 52 (i.e. _U = ...) is commented, the application works.
2. If line 49 is replaced with _b += a; and _detC is modified appropriately, the application crashes nevertheless.
3. If the entire nested for loop is commented, the application works.
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

#define MAX 25
#define OFFSET 2
#define COMPUTE(_in) ((_in) * (_in))

/*
 * One work-item per pixel (global id 0 = column, global id 1 = row).
 *
 * Walks the (2*OFFSET+1) x (2*OFFSET+1) neighbourhood of the pixel in
 * row-major order. Each neighbour's squared float4 value is added to _a when
 * the matching template entry is 1, to _b when it is 2, and skipped otherwise.
 * The template used is the block of MAX ints starting at _template[_type * MAX].
 * Neighbours that fall outside the image are replaced by the sample mirrored
 * about the centre pixel (index 2*centre - neighbour).
 *
 * Writes 1 to _dout[_row * WIDTH + _col] when _val.z <= _U < _val.y, else 0.
 *
 * WIDTH and HEIGHT are not defined in this snippet; presumably they are
 * supplied through the program build options -- confirm against the host code.
 */
__kernel void multiDeviceKernel(__global unsigned char *_dout,
                                __global float4 *_din,
                                __constant int *_template,
                                const float4 _val,
                                const int _type)
{
    const int _col = get_global_id(0);
    const int _row = get_global_id(1);

    float4 _a = (float4)0;
    float4 _b = (float4)0;
    int _index = 0;    /* running position inside the selected template */

    for (int i = _row - OFFSET; i <= _row + OFFSET; i++) {
        /* The mirrored source row depends only on i, so compute it once per row. */
        const int l = ((i < 0) || (i >= HEIGHT)) ? ((_row << 1) - i) : i;

        for (int j = _col - OFFSET; j <= _col + OFFSET; j++) {
            const int k = ((j < 0) || (j >= WIDTH)) ? ((_col << 1) - j) : j;
            const int tag = _template[_type * MAX + _index];

            if (tag == 1)
                _a += COMPUTE(_din[l * WIDTH + k]);
            else if (tag == 2)
                _b += COMPUTE(_din[l * WIDTH + k]);

            _index++;
        }
    }

    float4 _c = _a + _b;

    float _detA = (_a.x * _a.y) - (_a.z * _a.w);
    float _detC = (_c.x * _c.y) + (_c.z * _c.w);

    /* NOTE(review): the guard tests _detC but the divisor is _detA -- confirm
     * this is intended. */
    float _U = (_detC != 0.0f) ? _detC/_detA : _val.x;

    _dout[_row * WIDTH + _col] = (_U >= _val.z && _U < _val.y);
}
Originally posted by: wheecheng Using SimpleMultiDevice project with CPU + GPU Test 1: Single Context, Single Thread as the reference, the input data is two-dimensional with WIDTH = HEIGHT = 128 and each data comprises 4 elements (float).
The kernel code is as below. Below is the summary of my observations:
1. If either line 49 (i.e. _c = ...) or 52 (i.e. _U = ...) is commented, the application works.
2. If line 49 is replaced with _b += a; and _detC is modified appropriately, the application crashes nevertheless.
3. If the entire nested for loop is commented, the application works.
Wheecheng,
Thanks for giving kernel code. Please send your runtime code also which allows us to reproduce this at our end.
Originally posted by: lordnn I can post my version of application:
Thanks for reporting this issue.