0 Replies Latest reply on Dec 7, 2010 6:40 PM by timchist

    clEnqueueWriteBufferRect and clEnqueueReadBufferRect are slow on ATI 5850

    timchist

      I want to copy a rectangular region of a picture from CPU memory to GPU memory and vice versa. Before OpenCL 1.1 I used a temporary CPU buffer and copied the region in two steps: first copy the region into a contiguous buffer, then copy that buffer to the GPU. The same procedure, in reverse, was applied when a region of a picture needed to be copied from the GPU to the CPU.

      OpenCL 1.1 introduced clEnqueueWriteBufferRect and clEnqueueReadBufferRect, and I tried to use them. However, clEnqueueWriteBufferRect turned out to be about 8 times slower, and clEnqueueReadBufferRect about 15 times slower, than the implementation using a temporary buffer:

      **********
      CPU -> GPU
      **********
      Using temporary CPU mem
      Spent: 1030
      Using clEnqueueWriteBufferRect
      Spent: 8752
      **********
      GPU -> CPU
      **********
      Using temporary CPU mem
      Spent: 1092
      Using clEnqueueReadBufferRect
      Spent: 15897

      Will this be fixed in future versions of the SDK or on newer devices?

      I am using a Core i7 and an ATI Radeon 5850.

      Source code is attached.

      #include <CL/cl.h>
      #include <stdio.h>
      #include <string.h>
      #include <windows.h>

      // Benchmark: moves the inner rectangle of a Width x Height picture (the
      // picture minus a Margin-pixel border) between host and GPU memory in two
      // ways and prints the elapsed GetTickCount() ticks for each:
      //   1. crop on the CPU into a contiguous temporary buffer, combined with
      //      clEnqueueWriteBuffer / clEnqueueReadBuffer;
      //   2. a single clEnqueueWriteBufferRect / clEnqueueReadBufferRect call.

      // Reports a failed OpenCL call (error code and source line) and leaves
      // main() with exit code 1. Everything stored in the Resources object is
      // released by its destructor, so no manual cleanup is needed at the call site.
      #define CHECK_ERR(err)                                              \
          do {                                                            \
              if ((err) != CL_SUCCESS) {                                  \
                  printf("Error %d at %d\n", (int)(err), __LINE__);       \
                  return 1;                                               \
              }                                                           \
          } while (0)

      // Owns every handle and host allocation the benchmark creates so that they
      // are released on every exit path, including early error returns.
      struct Resources
      {
          HMODULE oclLib;
          cl_context context;
          cl_command_queue queue;
          cl_mem bigPicGPU;
          cl_mem smallerPicGPU;
          unsigned int *bigPic;
          unsigned int *smallerPic;

          Resources()
              : oclLib(NULL), context(NULL), queue(NULL), bigPicGPU(NULL),
                smallerPicGPU(NULL), bigPic(NULL), smallerPic(NULL)
          {
          }

          ~Resources()
          {
              if (smallerPicGPU != NULL) clReleaseMemObject(smallerPicGPU);
              if (bigPicGPU != NULL) clReleaseMemObject(bigPicGPU);
              if (queue != NULL) clReleaseCommandQueue(queue);
              if (context != NULL) clReleaseContext(context);
              delete[] smallerPic; // allocated with new[], so delete[] is required
              delete[] bigPic;
              if (oclLib != NULL) FreeLibrary(oclLib);
          }

      private:
          // Non-copyable: a copy would release the same handles twice.
          Resources(const Resources &);
          Resources &operator=(const Resources &);
      };

      // Copies the inner (width - 2*margin) x (height - 2*margin) rectangle of the
      // row-major picture `src` into the tightly packed buffer `dst`.
      // The source starts `margin` rows down and `margin` pixels in, which is the
      // same rectangle the *BufferRect calls in main() address.
      static void cropInnerRegion(unsigned int *dst, const unsigned int *src,
                                  int width, int height, int margin)
      {
          const int innerWidth = width - 2 * margin;
          const int innerHeight = height - 2 * margin;
          const unsigned int *srcRow = src + (size_t)margin * width + margin;

          for (int y = 0; y < innerHeight; y++) {
              memcpy(dst, srcRow, (size_t)innerWidth * sizeof(unsigned int));
              srcRow += width;
              dst += innerWidth;
          }
      }

      //------------------------------------------------------------------------------
      // Runs the benchmark. Returns 0 on success and 1 if OpenCL.dll, the AMD
      // platform or any OpenCL call is unavailable or fails.
      int main(int argc, char* argv[])
      {
          (void)argc;
          (void)argv;

          Resources res;

          // Load by name so the normal DLL search order picks the library that
          // matches the process bitness, instead of a hard-coded SysWOW64 path.
          // LoadLibraryA keeps the narrow literal valid when UNICODE is defined.
          res.oclLib = LoadLibraryA("OpenCL.dll");
          if (res.oclLib == NULL) {
              printf("OpenCL.dll is not present\n");
              return 1;
          }

          // ---- Locate the AMD platform -------------------------------------------
          cl_uint numPlatforms = 0;
          cl_platform_id platform = NULL;
          cl_int err = clGetPlatformIDs(0, NULL, &numPlatforms);
          CHECK_ERR(err);
          if (numPlatforms == 0) {
              printf("No platforms found\n");
              return 1;
          }
          printf("Num platforms: %d\n", (int)numPlatforms);

          cl_platform_id *platforms = new cl_platform_id[numPlatforms];
          err = clGetPlatformIDs(numPlatforms, platforms, &numPlatforms);
          for (cl_uint i = 0; err == CL_SUCCESS && i < numPlatforms; i++) {
              char pbuf[100];
              err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
                                      sizeof(pbuf), pbuf, NULL);
              if (err != CL_SUCCESS) {
                  break;
              }
              printf("%i: %s\n", (int)i, pbuf);
              if (strcmp(pbuf, "Advanced Micro Devices, Inc.") == 0) {
                  platform = platforms[i];
                  break;
              }
          }
          delete[] platforms; // released before any error return below
          CHECK_ERR(err);

          if (platform == NULL) {
              printf("AMD platform is not found\n");
              return 1;
          }

          // ---- Context, device and queue -----------------------------------------
          cl_context_properties ctxProperties[3];
          ctxProperties[0] = CL_CONTEXT_PLATFORM;
          ctxProperties[1] = (cl_context_properties)platform;
          ctxProperties[2] = 0;

          res.context = clCreateContextFromType(ctxProperties, CL_DEVICE_TYPE_GPU,
                                                NULL, NULL, &err);
          CHECK_ERR(err);

          // Only the first GPU is used, so ask for exactly one device rather than
          // allocating a list and taking element 0.
          cl_device_id deviceId = NULL;
          err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &deviceId, NULL);
          CHECK_ERR(err);

          res.queue = clCreateCommandQueue(res.context, deviceId, 0, &err);
          CHECK_ERR(err);

          // ---- Test data ---------------------------------------------------------
          const int IterationCount = 100;
          const int Width = 2048;
          const int Height = 1536;
          const int Margin = 16;
          const int InnerWidth = Width - 2 * Margin;
          const int InnerHeight = Height - 2 * Margin;
          const size_t SOUI = sizeof(unsigned int);
          const size_t BigSize = (size_t)Width * Height;
          const size_t SmallerSize = (size_t)InnerWidth * InnerHeight;

          // Value-initialised ("()") so the benchmark never reads indeterminate memory.
          res.bigPic = new unsigned int[BigSize]();
          res.smallerPic = new unsigned int[SmallerSize]();

          res.bigPicGPU = clCreateBuffer(res.context, CL_MEM_READ_WRITE,
                                         BigSize * SOUI, NULL, &err);
          CHECK_ERR(err);
          res.smallerPicGPU = clCreateBuffer(res.context, CL_MEM_READ_WRITE,
                                             SmallerSize * SOUI, NULL, &err);
          CHECK_ERR(err);

          // Rectangle descriptions shared by both *BufferRect tests. The x
          // components are in bytes, the y components in rows.
          const size_t zeroOrigin[3] = { 0, 0, 0 };
          const size_t innerOrigin[3] = { Margin * SOUI, (size_t)Margin, 0 };
          const size_t region[3] = { InnerWidth * SOUI, (size_t)InnerHeight, 1 };
          const size_t bigRowPitch = Width * SOUI;
          const size_t smallRowPitch = InnerWidth * SOUI;

          // Every loop below runs IterationCount + 1 times: iteration 0 is an
          // untimed warm-up and the clock starts at iteration 1.
          DWORD tickCount = 0;

          puts("**********");
          puts("CPU -> GPU");
          puts("**********");

          puts("Using temporary CPU mem");
          for (int i = 0; i < IterationCount + 1; i++) {
              if (i == 1) {
                  tickCount = GetTickCount();
              }

              cropInnerRegion(res.smallerPic, res.bigPic, Width, Height, Margin);
              err = clEnqueueWriteBuffer(res.queue, res.smallerPicGPU, CL_TRUE, 0,
                                         SmallerSize * SOUI, res.smallerPic,
                                         0, NULL, NULL);
              CHECK_ERR(err);
          }
          printf("Spent: %lu\n", (unsigned long)(GetTickCount() - tickCount));

          puts("Using clEnqueueWriteBufferRect");
          for (int i = 0; i < IterationCount + 1; i++) {
              if (i == 1) {
                  tickCount = GetTickCount();
              }

              // Host side: inner rectangle of bigPic. Device side: packed buffer.
              err = clEnqueueWriteBufferRect(res.queue, res.smallerPicGPU, CL_TRUE,
                                             zeroOrigin, innerOrigin, region,
                                             smallRowPitch, 0, bigRowPitch, 0,
                                             res.bigPic, 0, NULL, NULL);
              CHECK_ERR(err);
          }
          printf("Spent: %lu\n", (unsigned long)(GetTickCount() - tickCount));

          puts("**********");
          puts("GPU -> CPU");
          puts("**********");

          puts("Using temporary CPU mem");
          for (int i = 0; i < IterationCount + 1; i++) {
              if (i == 1) {
                  tickCount = GetTickCount();
              }

              // Reads the whole picture back, then crops it on the CPU.
              err = clEnqueueReadBuffer(res.queue, res.bigPicGPU, CL_TRUE, 0,
                                        BigSize * SOUI, res.bigPic, 0, NULL, NULL);
              CHECK_ERR(err);
              cropInnerRegion(res.smallerPic, res.bigPic, Width, Height, Margin);
          }
          printf("Spent: %lu\n", (unsigned long)(GetTickCount() - tickCount));

          puts("Using clEnqueueReadBufferRect");
          for (int i = 0; i < IterationCount + 1; i++) {
              if (i == 1) {
                  tickCount = GetTickCount();
              }

              // Device side: inner rectangle of bigPicGPU. Host side: packed buffer.
              err = clEnqueueReadBufferRect(res.queue, res.bigPicGPU, CL_TRUE,
                                            innerOrigin, zeroOrigin, region,
                                            bigRowPitch, 0, smallRowPitch, 0,
                                            res.smallerPic, 0, NULL, NULL);
              CHECK_ERR(err);
          }
          printf("Spent: %lu\n", (unsigned long)(GetTickCount() - tickCount));

          return 0; // res releases the GPU objects, host buffers and the DLL handle
      }