2 Replies Latest reply on May 10, 2013 3:33 AM by kd2

    submit-to-start latency

    kd2

      With nothing in the queue, my typical kernels have a 1/2 to 1 millisecond latency between CL_PROFILING_COMMAND_SUBMIT and CL_PROFILING_COMMAND_START.

      This makes up for a large amount of my total run time. How do I minimize this latency?

       

      Here's a simple example. Code below. I'm using AMD-APP-SDK-v2.8-lnx64.tgz with linux kernel 3.4.4, Catalyst-13.3, on a CL_DEVICE_NAME=Tahiti card

       

      # ./tst_zer64

          total time   queue->submit   submit->start      start->end

            0.996 ms        0.011 ms        0.930 ms        0.009 ms

            0.516 ms        0.026 ms        0.473 ms        0.005 ms

            0.476 ms        0.006 ms        0.456 ms        0.004 ms

       

      # cat tst_zer.cpp

       

      #include <CL/cl.h>

      #include <stdio.h>

      #include <sys/time.h>

       

      const char *src = "\n\

      __kernel void kernel_0(__global uint *A, __global uint *B) {\n\

         uint gx = get_global_id(0);                         \n\

         B[gx] = A[gx];                                      \n\

      }\n";

       

      int main(int argc, char **argv)

      {

         const size_t NTHD(64), NTESTS(1024);

         cl_platform_id platform;

         cl_device_id dev;

         cl_uint platforms, devs;

         clGetPlatformIDs(1,&platform,&platforms);

         clGetDeviceIDs(platform,CL_DEVICE_TYPE_ALL,1,&dev,&devs);

         cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,(cl_context_properties)platform,0};

         cl_int err(0);

         cl_context ctx = clCreateContext(properties,1,&dev,NULL,NULL,&err);

         cl_command_queue cq = clCreateCommandQueue(ctx,dev,CL_QUEUE_PROFILING_ENABLE,&err);

         cl_program prog = clCreateProgramWithSource(ctx,1,&src,NULL,&err);

         err = clBuildProgram(prog,1,&dev,"",NULL,NULL);

         //

         int *hA(new int [NTESTS]), *hB(new int [NTESTS]);

         cl_mem dA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,NTESTS*4,0,&err);

         cl_mem dB = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,NTESTS*4,0,&err);

         clEnqueueWriteBuffer(cq,dA,CL_TRUE,0,NTESTS*4,hA,0,NULL,NULL);

         clFinish(cq);

         cl_kernel kernel = clCreateKernel(prog,"kernel_0",&err);

         clSetKernelArg(kernel,0,sizeof(cl_mem),(void *)&dA);

         clSetKernelArg(kernel,1,sizeof(cl_mem),(void *)&dB);

         //

         ::printf(" %15s %15s %15s %15s\n", "total time", "queue->submit", "submit->start", "start->end");

         for(int ITER(0); ITER < 3; ITER++) {

           struct timeval time;

           ::gettimeofday(&time,NULL);

           double T0(time.tv_sec+time.tv_usec/1e6);

           cl_event evt;

           clEnqueueNDRangeKernel(cq,kernel,1,NULL,&NTESTS,&NTHD,0,NULL,&evt);

           clFinish(cq);

           ::gettimeofday(&time,NULL);

           double T1(time.tv_sec+time.tv_usec/1e6);

           double dT(T1-T0);

           cl_ulong t0(0), t1(0), t2(0), t3(0);

           clWaitForEvents(1,&evt);

           clGetEventProfilingInfo(evt,CL_PROFILING_COMMAND_QUEUED,sizeof(cl_ulong),&t0,0);

           clGetEventProfilingInfo(evt,CL_PROFILING_COMMAND_SUBMIT,sizeof(cl_ulong),&t1,0);

           clGetEventProfilingInfo(evt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&t2,0);

           clGetEventProfilingInfo(evt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&t3,0);

           double dt_qs((t1-t0)/1e9), dt_ss((t2-t1)/1e9), dt_se((t3-t2)/1e9);

           clEnqueueReadBuffer(cq,dB,CL_FALSE,0,NTESTS*4,hB,0,NULL,NULL);

           clFinish(cq);

           ::printf(" %12.3f ms %12.3f ms %12.3f ms %12.3f ms\n", dT*1e3, dt_qs*1e3, dt_ss*1e3, dt_se*1e3);

         }

         //

         if( hA ) delete [] hA;

         if( hB ) delete [] hB;

         clReleaseCommandQueue(cq);

         clReleaseMemObject(dA);

         clReleaseMemObject(dB);

         clReleaseKernel(kernel);

         clReleaseProgram(prog);

         clReleaseContext(ctx);

         return 0;

      }

       

      #/usr/bin/g++ -O4 -I/opt/AMDAPP/include -o tst_zer64 tst_zer.cpp /usr/lib64/libOpenCL.so

        • Re: submit-to-start latency
          himanshu.gautam

          Here are my numbers for this test:

          cas@cas-MS-7751:~/Desktop/launchtime$ ./a.out

                total time   queue->submit   submit->start      start->end

                  2.577 ms        0.003 ms        2.526 ms        0.021 ms

                  0.092 ms        0.004 ms        0.065 ms        0.015 ms

                  0.073 ms        0.003 ms        0.052 ms        0.012 ms

          Assuming first time is the warm-up time. I guess the submit-start latency is not as high as 500us for me.

          System Details: Ubuntu 12.04, Intel CPU, HD 7770 GPU, Driver 13.4, SDK 2.8

          Can you check with 13.4 driver? Also post kernelLaunch sample's output.

            • Re: submit-to-start latency
              kd2

              Thanks for that. I knew it should have been faster. After switching the driver from 13.3 to 13.4, the times are much better..

               

              # ./tst_zer64

                    total time   queue->submit   submit->start      start->end

                      0.660 ms        0.013 ms        0.594 ms        0.009 ms

                      0.130 ms        0.024 ms        0.080 ms        0.005 ms

                      0.129 ms        0.010 ms        0.079 ms        0.004 ms