Raistmer

Inadequate times for memory transfers

Discussion created by Raistmer on Mar 13, 2010
Latest reply on Mar 18, 2010 by Raistmer
Why are they so large, and why do they differ so much?

I transfer data from a pinned buffer in host memory to GPU memory and measure each stage of the transfer: mapping the buffer into host address space, unmapping it, and finally copying the updated buffer to GPU memory.

The times I measured (in ns; `<>` is the mean per call, i.e. total/N):
Mapped/copied region size: 4*2*32k*7 = 1792 kB.
DataMap_ns: total=2.869e+008, N=4688, <>=6.121e+004, min=524 max=1.8e+007
DataUnmap_ns: total=1.591e+011, N=4688, <>=3.393e+007, min=3.356e+007 max=3.557e+007
DataCopy_ns: total=1.956e+008, N=4688, <>=4.172e+004, min=2.933e+004 max=6.052e+005

The mapping is especially interesting: it varies from 524 ns to 18 ms!
Why?
And the data copy itself takes less time on average than mapping or unmapping the buffer! Something is wrong here...

[GPU as usual, HD4870 + CPU Q9450]
Code for these sections:


Mapping: {Timings<T_DataMap> counter;cl_event ev; data_range=(ap_complex*)clEnqueueMapBuffer(cq, cpu_pinned_buf, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(ap_complex)*state.fft_len*DATA_CHUNK_UNROLL, 0, NULL, &ev, &err); if(err != CL_SUCCESS)fprintf(stderr,"ERROR: clEnqueueMapBuffer (data_range): %d\n",err); #if 1 if(ev){ cl_ulong start,end; err=clWaitForEvents(1,&ev); err|=clGetEventProfilingInfo (ev,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&start,NULL); err|=clGetEventProfilingInfo (ev,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&end,NULL); Counters<T_DataMap_ns,cl_ulong>::update(end-start); //fprintf(stderr,"Pass %u: kernel took: %.2e ns, s=%d\n",pass,float(end-start),batchSize); err|=clReleaseEvent(ev);ev=NULL; if(err != CL_SUCCESS)fprintf(stderr,"ERROR: DataCopy event: %d\n",err); } #endif } Unmapping: {Timings<T_DataUnmap> counter1;cl_event ev; err=clEnqueueUnmapMemObject(cq,cpu_pinned_buf,data_range,0,NULL,&ev);data_range=NULL; #if 1 if(ev){ cl_ulong start,end; err=clWaitForEvents(1,&ev); err|=clGetEventProfilingInfo (ev,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&start,NULL); err|=clGetEventProfilingInfo (ev,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&end,NULL); Counters<T_DataUnmap_ns,cl_ulong>::update(end-start); //fprintf(stderr,"Pass %u: kernel took: %.2e ns, s=%d\n",pass,float(end-start),batchSize); err|=clReleaseEvent(ev);ev=NULL; if(err != CL_SUCCESS)fprintf(stderr,"ERROR: DataCopy event: %d\n",err); } #endif } Copying: {Timings<T_DataCopy> counter;cl_event ev; err|=clEnqueueCopyBuffer(cq,cpu_pinned_buf,gpu_data,0,0,sizeof(ap_complex)*fft_len*DATA_CHUNK_UNROLL,0, NULL, &ev); if(err != CL_SUCCESS)fprintf(stderr,"ERROR: CopyBuffer(gpu_data): %d\n",err); #if 1 if(ev){ cl_ulong start,end; err=clWaitForEvents(1,&ev); err|=clGetEventProfilingInfo (ev,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&start,NULL); err|=clGetEventProfilingInfo (ev,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&end,NULL); Counters<T_DataCopy_ns,cl_ulong>::update(end-start); 
//fprintf(stderr,"Pass %u: kernel took: %.2e ns, s=%d\n",pass,float(end-start),batchSize); err|=clReleaseEvent(ev);ev=NULL; if(err != CL_SUCCESS)fprintf(stderr,"ERROR: DataCopy event: %d\n",err); } #endif }

Outcomes