AnsweredAssumed Answered

submit-to-start latency

Question asked by kd2 on May 9, 2013
Latest reply on May 10, 2013 by kd2

With nothing in the queue, my typical kernels have a 1/2 to 1 millisecond latency between CL_PROFILING_COMMAND_SUBMIT and CL_PROFILING_COMMAND_START.

This makes up for a large amount of my total run time. How do I minimize this latency?

 

Here's a simple example. Code below. I'm using AMD-APP-SDK-v2.8-lnx64.tgz with linux kernel 3.4.4, Catalyst-13.3, on a CL_DEVICE_NAME=Tahiti card

 

# ./tst_zer64

    total time   queue->submit   submit->start      start->end

      0.996 ms        0.011 ms        0.930 ms        0.009 ms

      0.516 ms        0.026 ms        0.473 ms        0.005 ms

      0.476 ms        0.006 ms        0.456 ms        0.004 ms

 

# cat tst_zer.cpp

 

#include <CL/cl.h>

#include <stdio.h>

#include <sys/time.h>

 

const char *src = "\n\

__kernel void kernel_0(__global uint *A, __global uint *B) {\n\

   uint gx = get_global_id(0);                         \n\

   B[gx] = A[gx];                                      \n\

}\n";

 

int main(int argc, char **argv)

{

   const size_t NTHD(64), NTESTS(1024);

   cl_platform_id platform;

   cl_device_id dev;

   cl_uint platforms, devs;

   clGetPlatformIDs(1,&platform,&platforms);

   clGetDeviceIDs(platform,CL_DEVICE_TYPE_ALL,1,&dev,&devs);

   cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,(cl_context_properties)platform,0};

   cl_int err(0);

   cl_context ctx = clCreateContext(properties,1,&dev,NULL,NULL,&err);

   cl_command_queue cq = clCreateCommandQueue(ctx,dev,CL_QUEUE_PROFILING_ENABLE,&err);

   cl_program prog = clCreateProgramWithSource(ctx,1,&src,NULL,&err);

   err = clBuildProgram(prog,1,&dev,"",NULL,NULL);

   //

   int *hA(new int [NTESTS]), *hB(new int [NTESTS]);

   cl_mem dA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,NTESTS*4,0,&err);

   cl_mem dB = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,NTESTS*4,0,&err);

   clEnqueueWriteBuffer(cq,dA,CL_TRUE,0,NTESTS*4,hA,0,NULL,NULL);

   clFinish(cq);

   cl_kernel kernel = clCreateKernel(prog,"kernel_0",&err);

   clSetKernelArg(kernel,0,sizeof(cl_mem),(void *)&dA);

   clSetKernelArg(kernel,1,sizeof(cl_mem),(void *)&dB);

   //

   ::printf(" %15s %15s %15s %15s\n", "total time", "queue->submit", "submit->start", "start->end");

   for(int ITER(0); ITER < 3; ITER++) {

     struct timeval time;

     ::gettimeofday(&time,NULL);

     double T0(time.tv_sec+time.tv_usec/1e6);

     cl_event evt;

     clEnqueueNDRangeKernel(cq,kernel,1,NULL,&NTESTS,&NTHD,0,NULL,&evt);

     clFinish(cq);

     ::gettimeofday(&time,NULL);

     double T1(time.tv_sec+time.tv_usec/1e6);

     double dT(T1-T0);

     cl_ulong t0(0), t1(0), t2(0), t3(0);

     clWaitForEvents(1,&evt);

     clGetEventProfilingInfo(evt,CL_PROFILING_COMMAND_QUEUED,sizeof(cl_ulong),&t0,0);

     clGetEventProfilingInfo(evt,CL_PROFILING_COMMAND_SUBMIT,sizeof(cl_ulong),&t1,0);

     clGetEventProfilingInfo(evt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&t2,0);

     clGetEventProfilingInfo(evt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&t3,0);

     double dt_qs((t1-t0)/1e9), dt_ss((t2-t1)/1e9), dt_se((t3-t2)/1e9);

     clEnqueueReadBuffer(cq,dB,CL_FALSE,0,NTESTS*4,hB,0,NULL,NULL);

     clFinish(cq);

     ::printf(" %12.3f ms %12.3f ms %12.3f ms %12.3f ms\n", dT*1e3, dt_qs*1e3, dt_ss*1e3, dt_se*1e3);

   }

   //

   if( hA ) delete [] hA;

   if( hB ) delete [] hB;

   clReleaseCommandQueue(cq);

   clReleaseMemObject(dA);

   clReleaseMemObject(dB);

   clReleaseKernel(kernel);

   clReleaseProgram(prog);

   clReleaseContext(ctx);

   return 0;

}

 

#/usr/bin/g++ -O4 -I/opt/AMDAPP/include -o tst_zer64 tst_zer.cpp /usr/lib64/libOpenCL.so

Outcomes