petr.machacek

Very poor OpenCL performance

Discussion created by petr.machacek on Aug 14, 2011
Latest reply on Aug 18, 2011 by laobrasuca

Hello,

I'm new to OpenCL, so I guess I'm making some silly mistake.

I tried to create a simple program for image thresholding using OpenCL. A simplified version of the source code is attached to this post. The program works well, but the execution time is very poor.

When I execute the program on a 5760x3240 RGB image, the program output is [times are in milliseconds]:

 

clCreateBuffer: 25.649242
clSetKernelArg: 0.001205
clEnqueueNDRangeKernel: 0.536059
clFinish: 66.903236
clEnqueueReadBuffer: 17.060545

 

When I use IPP's threshold (Intel performance primitives) on the same picture, then the threshold takes 40 ms. 

Why is my OpenCL program running on the GPU so slow? I expected it to be much faster.

My hardware:

Win7 Home, 64bit
CPU AMD Phenom II X4 965, 3.4GHz (4 cores)
6GB RAM
GPU: GIGABYTE, ATI Radeon 5750 
AMD APP SDK v2.4 

Thanks for any hint.. 

 

#inclue <stdio.h> #include <windows.h> #include <CL\opencl.h> typedef struct _PICBUF { unsigned Width; unsigned Height; unsigned BytesPerPixel; unsigned Components; unsigned BitDepth; unsigned Stride; unsigned char *Data; }PICBUF, *PPICBUF; #define SIZEOF_PICBUF(picbuf) \ ((picbuf).Stride * (picbuf).Height) #define COUNTER_TO_MS(Count) ((Count)/1000000.0) const char *KernelSource = "__kernel void threshold(__global uchar *input, __global uchar *output, const uchar thres)\n"\ "{\n"\ " size_t id = get_global_id(0);\n"\ " output[id] = input[id] < thres ? input[id] : thres;\n"\ "}\n"\ "\n"; unsigned GetCounter(double *pdTime); //------------------------------ int main() { cl_context context; cl_context_properties properties[3]; cl_kernel kernel; cl_command_queue command_queue; cl_program program; cl_int err; cl_uint num_of_platforms = 0; cl_platform_id platform_id[2]; cl_device_id device_id; cl_uint num_of_devices = 0; cl_mem input, output; size_t global; cl_uchar thres; PICBUF picIn = {0,}; PICBUF picOut = {0,}; double now, prev; int i = 0; ImageApiReadImage(L"Image.jpg", &picIn); ImageApiAllocPicBufEx(&picOut, &picIn); if(clGetPlatformIDs(2, platform_id, &num_of_platforms) != CL_SUCCESS) { printf("Could not read the platform id\n"); return 1; } //I have two platforms, the index 1 is AMD Accelerated parallel processing if(clGetDeviceIDs(platform_id[1], CL_DEVICE_TYPE_GPU, 1, &device_id, &num_of_devices) != CL_SUCCESS) { printf("Could not read the device id\n"); return 1; } properties[0] = CL_CONTEXT_PLATFORM; properties[1] = (cl_context_properties) platform_id[1]; properties[2] = 0; context = clCreateContext(properties, 1, &device_id, NULL, NULL, &err); command_queue = clCreateCommandQueue(context, device_id, 0, &err); program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err); if(err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL) != CL_SUCCESS) { printf("Could not compile the program.\n"); return 1; } kernel = 
clCreateKernel(program, "threshold", &err); GetCounter(&prev); input = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, SIZEOF_PICBUF(picIn), picIn.Data, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, SIZEOF_PICBUF(picOut), NULL, NULL); GetCounter(&now); printf("clCreateBuffer: %Lf\n", COUNTER_TO_MS(now - prev)); GetCounter(&prev); thres = 128; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err = clSetKernelArg(kernel, 2, sizeof(cl_uchar), &thres); GetCounter(&now); printf("clSetKernelArg: %Lf\n", COUNTER_TO_MS(now - prev)); global = SIZEOF_PICBUF(picIn); GetCounter(&prev); if(err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL)!=CL_SUCCESS) { printf("clEnqueueNDRangeKernel failed\n"); return 0; } GetCounter(&now); printf("clEnqueueNDRangeKernel: %Lf\n", COUNTER_TO_MS(now - prev)); GetCounter(&prev); if(clFinish(command_queue)!= CL_SUCCESS) { printf("clFinish failed\n"); return 1; } GetCounter(&now); printf("clFinish: %Lf\n", COUNTER_TO_MS(now - prev)); GetCounter(&prev); if(clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, SIZEOF_PICBUF(picOut), picOut.Data, 0, NULL, NULL)!=CL_SUCCESS) { printf("clEnqueueReadBuffer failed\n"); return 1; } GetCounter(&now); printf("clEnqueueReadBuffer: %Lf\n", COUNTER_TO_MS(now - prev)); ImageApiWriteImage(L"ImageOcl.jpg", &picOut, &par); ImageApiFinishPicBuf(&picIn); ImageApiFinishPicBuf(&picOut); clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(command_queue); clReleaseContext(context); } //------------------------------ unsigned GetCounter( double *pdDouble ) { unsigned res = 0; static BOOL perfSupported = TRUE; static LARGE_INTEGER freq; LARGE_INTEGER count; static BOOL first = TRUE; if(first) { perfSupported = QueryPerformanceFrequency(&freq); first = FALSE; } if(perfSupported) { BOOL ret = 
QueryPerformanceCounter(&count); if(ret) { *pdTime = (1000000000. * (count.QuadPart / (double)freq.QuadPart)); } else { res = -1; } } else { DWORD ticks = GetTickCount(); *pdTime = (double)(ticks * 1000000); } return res; }

Outcomes