Some more about this issue. The listed code fragment really does take a few times longer under background CPU load, but a much stronger effect (an order of magnitude!) comes from this fragment:
{// Timed scope: read the power array for the current DM trial back from GPU memory.
	Timings< T_oclReadBuf > counter;

	// Each half-chunk is 32768/2 floats; the DM sign selects the even or odd
	// half of the (2*dm_now, 2*dm_now+1) pair. Use size_t for the byte offset:
	// clEnqueueReadBuffer takes size_t, and 131072*dm_now overflows cl_uint
	// for large dm_now.
	const size_t half_chunk = 32768 / 2;
	size_t offset;
	if(state.dm_sign == -1)
		offset = half_chunk * (2 * state.dm_now) * sizeof(cl_float);
	else
		offset = half_chunk * (2 * state.dm_now + 1) * sizeof(cl_float);

	//R: we have whole small DM chunk prepared. Now will retrieve needed power array from GPU memory only
	// Blocking read (CL_TRUE) for now; the event is used solely for profiling.
	cl_event ev = NULL;
	err = clEnqueueReadBuffer(
		cq,
		gpu_power,
		CL_TRUE,
		offset,
		half_chunk * sizeof(cl_float),
		power,
		0,
		NULL,
		&ev);
	if(err != CL_SUCCESS)fprintf(stderr,"Error: clEnqueueReadBuffer failed: %d\n",err);
	if(ev){
		// OpenCL error codes are negative ints — OR-ing them together (the
		// old err|= pattern) yields meaningless values. Check each call.
		cl_ulong start, end;
		cl_int perr = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
		if(perr == CL_SUCCESS)
			perr = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
		if(perr == CL_SUCCESS){
			// NOTE(review): start/end are device-side timestamps; they do NOT
			// include the host-side wait inside the blocking read, which is
			// presumably where the extra wall time under CPU load goes.
			Counters< T_oclFFT2_ns, cl_ulong >::update(end - start);
			fprintf(stderr,"Read buf took:%.2e ns\n",float(end - start));
		}
		cl_int rerr = clReleaseEvent(ev); ev = NULL;
		if(perr != CL_SUCCESS || rerr != CL_SUCCESS){
			// Propagate the first real failure code to the outer err.
			err = (perr != CL_SUCCESS) ? perr : rerr;
			fprintf(stderr,"ERROR: read buf event: %d\n",err);
		}
	}
}
For a completely idle CPU:
class T_main_loop_L1: total=7.51e+011, N=1, <>=7.51e+011, min=7.51e+011, max=7.51e+011
class T_oclReadBuf: total=2.88e+011, N=262015, <>=1.10e+006, min=8.78e+005, max=7.94e+007
class T_oclFFT2_ns: total=4.343e+010, N=262015, <>=1.657e+005, min=1.366e+005 max=1.527e+006
This constitutes roughly 1/3 to 1/2 of the total time.
With low-priority background tasks running:
class T_main_loop_L1: total=3.91e+012, N=1, <>=3.91e+012, min=3.91e+012, max=3.91e+012
class T_oclReadBuf: total=3.27e+012, N=262015, <>=1.25e+007, min=9.46e+005, max=3.51e+008
class T_oclFFT2_ns: total=7.552e+010, N=262015, <>=2.882e+005, min=1.172e+005 max=3.876e+007
Almost all of the time is spent in this small fragment!
Yet the time reported via the ev event is almost the same in both cases. Where is so much time being lost?