
No significant performance improvement using pinned host memory

Question asked by arvin99 on Jan 15, 2014
Latest reply on Jan 18, 2014 by arvin99

Hi,

I wrote a program that transfers data to a DISCRETE GPU in two ways: with pinned host memory (CL_MEM_ALLOC_HOST_PTR) and with pageable memory (default flags).

In the pageable version the buffers are created with the default flags and the data is transferred with clEnqueueWriteBuffer/clEnqueueReadBuffer.
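
For comparison, the pageable version looks roughly like this (a simplified sketch; matrixA/matrixB/matrixC stand for the host arrays and error handling is omitted):

  // Pageable path: device buffers created with default host-memory flags,
  // data moved with explicit blocking write/read calls.
  devA = clCreateBuffer(context, CL_MEM_READ_ONLY,  sizeof(cl_float)*size*size, NULL, &err);
  devB = clCreateBuffer(context, CL_MEM_READ_ONLY,  sizeof(cl_float)*size*size, NULL, &err);
  devC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float)*size*size, NULL, &err);

  clEnqueueWriteBuffer(queue, devA, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL);
  clEnqueueWriteBuffer(queue, devB, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL);

  MatrixMul(devA, devB, devC, size);

  clEnqueueReadBuffer(queue, devC, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);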

 

My questions are:

1. Why is there no significant improvement in transfer rate when I use pinned host memory (non-pageable)? The transfer rate should be higher than with pageable memory (the standard way), shouldn't it?

2. The transfer rate with pinned host memory becomes worse when I call clEnqueueCopyBuffer the SECOND time. I pass the device buffer (not the pinned buffer) as the kernel argument, so why does the second write transfer become slower?
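
To be clear about what MatrixMul does with those arguments: it only sets the device buffers as kernel arguments and enqueues the kernel, roughly like this (a simplified sketch; the kernel and queue handles, launch geometry, and error handling are placeholders, not the real code):

  // Sketch of MatrixMul: only device buffers are passed to the kernel,
  // never the pinned host buffers.
  void MatrixMul(cl_mem devA, cl_mem devB, cl_mem devC, int size)
  {
      clSetKernelArg(kernel, 0, sizeof(cl_mem), &devA);
      clSetKernelArg(kernel, 1, sizeof(cl_mem), &devB);
      clSetKernelArg(kernel, 2, sizeof(cl_mem), &devC);
      clSetKernelArg(kernel, 3, sizeof(cl_int), &size);

      size_t global[2] = { (size_t)size, (size_t)size };
      clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL, 0, NULL, NULL);
  }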


 

Here is the code for pinned host memory:

  // Allocate device memory for input and output
  devA = clCreateBuffer(context, CL_MEM_READ_ONLY,  sizeof(cl_float)*size*size, NULL, &err);
  devB = clCreateBuffer(context, CL_MEM_READ_ONLY,  sizeof(cl_float)*size*size, NULL, &err);
  devC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float)*size*size, NULL, &err);

  // Allocate pinned host memory for input and output
  pinned_A = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*size*size, NULL, &err);
  pinned_B = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*size*size, NULL, &err);
  pinned_C = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float)*size*size, NULL, &err);

  // Map the pinned input buffers so the host can write into them
  cl_float * mapPtrA = (cl_float *)clEnqueueMapBuffer(queue, pinned_A, CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_float)*size*size, 0, NULL, NULL, NULL);
  cl_float * mapPtrB = (cl_float *)clEnqueueMapBuffer(queue, pinned_B, CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_float)*size*size, 0, NULL, NULL, NULL);

  // Fill the matrices through the mapped pointers
  fillMatrix(mapPtrA, size);
  fillMatrix(mapPtrB, size);

  clEnqueueUnmapMemObject(queue, pinned_A, mapPtrA, 0, NULL, NULL);
  clEnqueueUnmapMemObject(queue, pinned_B, mapPtrB, 0, NULL, NULL);

  // Copy from the pinned host buffers to the device buffers
  clEnqueueCopyBuffer(queue, pinned_A, devA, 0, 0, sizeof(cl_float)*size*size, 0, NULL, NULL);
  clEnqueueCopyBuffer(queue, pinned_B, devB, 0, 0, sizeof(cl_float)*size*size, 0, NULL, NULL);

  // Run the kernel on the device buffers
  MatrixMul(devA, devB, devC, size);

  // Copy the result from the device buffer back to the pinned host buffer
  clEnqueueCopyBuffer(queue, devC, pinned_C, 0, 0, sizeof(cl_float)*size*size, 0, NULL, NULL);

  // Map the result buffer for reading on the host
  cl_float * mapPtrC = (cl_float *)clEnqueueMapBuffer(queue, pinned_C, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_float)*size*size, 0, NULL, NULL, NULL);
  //memcpy(matrixC, mapPtrC, sizeof(cl_float)*size*size);
  clEnqueueUnmapMemObject(queue, pinned_C, mapPtrC, 0, NULL, NULL);
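
For reference, the transfer rate of each copy can be measured with OpenCL event profiling (this requires the queue to be created with CL_QUEUE_PROFILING_ENABLE); a minimal sketch, not the exact measurement code:

  // Time a single host-to-device copy via event profiling.
  cl_event copy_event;
  size_t bytes = sizeof(cl_float) * size * size;

  clEnqueueCopyBuffer(queue, pinned_A, devA, 0, 0, bytes, 0, NULL, &copy_event);
  clWaitForEvents(1, &copy_event);

  cl_ulong t_start = 0, t_end = 0;
  clGetEventProfilingInfo(copy_event, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, NULL);
  clGetEventProfilingInfo(copy_event, CL_PROFILING_COMMAND_END,   sizeof(t_end),   &t_end,   NULL);
  clReleaseEvent(copy_event);

  double seconds  = (t_end - t_start) * 1e-9;   // profiling timestamps are in nanoseconds
  double gb_per_s = (bytes / seconds) / 1e9;    // effective transfer rate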
