I have a kernel running in CAL that is launched by the following code:
// Pause the host thread for one second. This happens BEFORE the start
// timestamp below is taken, so the sleep itself is not part of the measured
// interval.
sleep(1);
// Start of the measured interval (wall-clock time is stored in time2).
clock_gettime(CLOCK_REALTIME, &time2);
// Begin both counters; presumably these are GPU performance counters for
// cache hit rate and idle percentage -- confirm against their definitions.
cacheHitRateCounter.begin();
idlePercentCounter.begin();
// Load both kernels for `device`. NOTE(review): these calls come after time2
// was recorded, so whatever work load() performs is included in the measured
// time.
radiiKernel.load(device);
expKernel.load(device);
// Run kernels, reassigning x,y,z,basis symbols.
//
// Each iteration handles one group of 8 basis buffers: it first launches the
// radii kernel, then launches the exp kernel on that group. The event of each
// exp launch is stored in event[i/8] so it can be polled later.
//
// NOTE(review): the loop reads basisGPU[i+0..i+7], so it assumes kNumSets is
// a multiple of 8 -- confirm.
for(i = 0; i < kNumSets; i+=8)
{
    CALdomain expDomain = {0, 0, kNbas/4, kBlockSize};
    CALevent computeRadiiDone;

    // Radii kernel: coordGPU[i/8] is bound to "i0" and r2GPU[0] to "o0"
    // (assuming assignSymbolAndRun pairs buffers and symbols by index).
    CALbuffer* radiiParameters[] = {coordGPU[i/8], r2GPU[0]};
    const char* radiiSymbols[] = {"i0", "o0"};
    CALdomain radiiDomain = {0, 0, kBlockSize, 8};
    radiiKernel.assignSymbolAndRun(
        radiiParameters,
        radiiSymbols,
        2,
        device,
        "main",
        &computeRadiiDone,
        &radiiDomain);  // fixed: the closing parenthesis was missing here

    // The explicit wait on computeRadiiDone is disabled. r2GPU[0] is written
    // by the radii kernel ("o0") and then read by the exp kernel ("i0"), so
    // this presumably relies on the context executing launches in order --
    // TODO confirm.
    //while(calCtxIsEventDone(device.getContext(),computeRadiiDone) ==
    // CAL_RESULT_PENDING){}

    // Assign symbol names and run kernel: two inputs (r2GPU[0], alphaGPU)
    // followed by the eight basis buffers of this group as outputs o0..o7.
    CALbuffer* expParameters[] = {
        r2GPU[0],
        alphaGPU,
        basisGPU[i+0],
        basisGPU[i+1],
        basisGPU[i+2],
        basisGPU[i+3],
        basisGPU[i+4],
        basisGPU[i+5],
        basisGPU[i+6],
        basisGPU[i+7],
    };
    const char* expSymbols[] = {
        "i0",
        "i1",
        "o0",
        "o1",
        "o2",
        "o3",
        "o4",
        "o5",
        "o6",
        "o7"};
    expKernel.assignSymbolAndRun(
        expParameters,
        expSymbols,
        10,
        device,
        "main",
        &event[i/8],
        &expDomain);
}
// Wait for every exp-kernel launch to finish, then take the end timestamp.
// The launch loop stored one event per group of 8 sets in event[i/8], so
// there are kNumSets/8 events to poll (assuming kNumSets is a multiple of 8).
for(i = 0; i < kNumSets/8; i++)
{
    //printf("%d %d\n",event[i],i);
    // Spin until this launch's own event is no longer pending. The original
    // passed the whole `event` array instead of event[i], so it never polled
    // the individual launches.
    while(calCtxIsEventDone(device.getContext(), event[i])
          == CAL_RESULT_PENDING)
    {
        // busy-wait
    }
}
// End of the measured interval; elapsed time is time3 - time2.
clock_gettime(CLOCK_REALTIME, &time3);
I compute the execution time as the difference between time3 and time2. assignSymbolAndRun is a high-level wrapper that I wrote to make my life easier. The issue is this: when sleep(1) is commented out, the execution takes ~16 ms to complete. When sleep(1) is not commented out, the execution takes over 40 ms. sleep(1) can be substituted with any heavy computation load running on the host and the same result occurs. What could cause this? The sleep occurs before I take my time measurement.