Below is pseudo-code for what I tried.
-----------------------Pseudo code ---------------------------------
tm1 = gettimeofday();     // host wall-clock time, start
clEnqueueNDRangeKernel ( que, kernel, 1, NULL, &pe_size, &group_size, 0, NULL, &event );
clWaitForEvents( 1 , &event );     // block until the kernel has finished
clGetEventProfilingInfo ( event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &tc1, NULL );
clGetEventProfilingInfo ( event, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &tc2, NULL );
clReleaseEvent( event );
tm2 = gettimeofday();     // host wall-clock time, end
---------------------End code--------------------------------------
Needless to say, tm2 - tm1 is the elapsed execution time measured on the host CPU, and tc2 - tc1 is the net execution time measured on the GPU.
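For reference, the two counters use different units, so they have to be converted before comparing (the event profiling counters are in nanoseconds, gettimeofday() resolves to microseconds). A minimal sketch of the conversion, assuming tm1/tm2 are struct timeval values and that the command queue was created with CL_QUEUE_PROFILING_ENABLE (otherwise the profiling counters are not valid):

/* sketch: converting both measurements to seconds */
double elapsed_sec = (tm2.tv_sec - tm1.tv_sec)                    /* host: gettimeofday(), microsecond resolution  */
                   + (tm2.tv_usec - tm1.tv_usec) * 1.0E-6;
double net_sec     = (double)(tc2 - tc1) * 1.0E-9;                /* device: profiling counters are in nanoseconds */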
And I got:
Elapsed execution time (tm2 - tm1) : 3 (sec)
but
Net Execution time (tc2 - tc1) : 0.1 (sec)
In that case, 2.9 sec was spent waiting, but I don't know what that 2.9 sec was spent on.
Could anyone explain this waiting time to me?
It is the first kernel run, right? Try measuring the second run.
Yes, it is the first run.
I tried running the kernel 2 times in a row.
= CASE 1 = Using the same kernel.
-----------------------Pseudo code ---------------------------------
tm1 = gettimeofday();
(tc1, tc2) = clEnqueueNDRangeKernel ( kernel );   // shorthand for enqueue + clWaitForEvents + profiling reads, as above
tm2 = gettimeofday();
(tc3, tc4) = clEnqueueNDRangeKernel ( kernel );
tm3 = gettimeofday();
---------------------End code--------------------------------------
Elapsed execution time (tm2 - tm1) : 3 (sec)
Net Execution time (tc2 - tc1) : 0.1 (sec)
Elapsed execution time (tm3 - tm2) : 0.4 (sec)
Net Execution time (tc4 - tc3) : 0.1 (sec)
= CASE 2 = Using different kernels
-----------------------Pseudo code ---------------------------------
tm1 = gettimeofday();
(tc1, tc2) = clEnqueueNDRangeKernel ( kernel_A );
tm2 = gettimeofday();
(tc3, tc4) = clEnqueueNDRangeKernel ( kernel_B );
tm3 = gettimeofday();
---------------------End code--------------------------------------
Elapsed execution time (tm2 - tm1) : 3 (sec)
Net Execution time (tc2 - tc1) : 0.1 (sec)
Elapsed execution time (tm3 - tm2) : 3 (sec)
Net Execution time (tc4 - tc3) : 0.1 (sec)
Is some sort of preparation done for the first kernel run?
My GPU is an HD 4870, with APP SDK 2.4 & Catalyst 11.5 on Ubuntu 10.04 x64.
Well, I don't develop for Unix... I have never encountered a 3-second start-up on the 1st kernel run. I have observed such delays when creating command queues. Maybe the Ubuntu driver delays creating the queue until the kernel runs? Who knows.
There is lazy allocation of memory buffers: they are allocated and initialized at the first run of the kernel, so you can observe a delay on the first run.
But not 3 seconds! Copying 512 MB (the maximum total buffer size for the 4870) to the device cannot take 3 seconds.
ztatsuch, do you have large buffers as parameters in your kernels?
Try using the AMD Profiler to figure out what takes how much time. And, as nou said, there is always deferred allocation, so you should expect the first kernel run to take more time.
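If deferred allocation is the cause, one way to keep it out of the measurement (a sketch, not from the original code) is a throw-away launch before the timed one:

/* hypothetical warm-up: the first launch triggers the driver's deferred
 * buffer allocation/initialization; its result is simply ignored        */
clEnqueueNDRangeKernel( que, kernel, 1, NULL, &pe_size, &group_size, 0, NULL, NULL );
clFinish( que );                 /* make sure the warm-up has completed  */
/* ...then start tm1 and do the measured launch exactly as before...     */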
Do you call clFinish() just before starting to measure the time? This can also be a reason for the long time, as many previously queued commands might be getting executed within your measured timeframe.
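In other words, something along these lines, so that earlier map/unmap commands are not counted in tm2 - tm1 (a sketch, using the names from the pseudo-code above):

clFinish( que );          /* drain everything already queued (maps, unmaps, ...) */
tm1 = gettimeofday();     /* start the host timer only after the queue is idle   */
clEnqueueNDRangeKernel( que, kernel, 1, NULL, &pe_size, &group_size, 0, NULL, &event );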
Hello everybody. Thank you for your replies.
By the way,
I used 4 buffers. Their sizes are 114 MB, 57 MB, 171 MB, and 19 MB respectively.
I don't think they are that huge.
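For reference, those sizes follow from NELEM = 5,000,000 in the code further down:

/* C     : sizeof(cl_float) * NELEM * 6 = 120,000,000 B ~ 114 MB
 * E     : sizeof(cl_float) * NELEM * 3 =  60,000,000 B ~  57 MB
 * V     : sizeof(cl_float) * NELEM * 9 = 180,000,000 B ~ 171 MB
 * Count : sizeof(cl_int)   * NELEM     =  20,000,000 B ~  19 MB
 * total ~ 361 MB, i.e. below the 512 MB mentioned above            */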
Profiler (v2.2) result:
---------------------------------------------------------------------------------------------------
API Name # of Calls Total Time(ms) Avg Time(ms) Max Time(ms) Min Time(ms)
---------------------------------------------------------------------------------------------------
clBuildProgram 1 7118.24952 7118.24952 7118.24952 7118.24952
clWaitForEvents 2 7710.05968 3855.02984 3861.09756 3848.96212
clEnqueueMapBuffer 8 1308.26086 163.53261 437.54849 24.08488
clCreateCommandQueue 1 14.58162 14.58162 14.58162 14.58162
clReleaseProgram 1 0.98791 0.98791 0.98791 0.98791
clCreateProgramWithSource 1 0.02667 0.02667 0.02667 0.02667
clEnqueueUnmapMemObject 4 0.02049 0.00512 0.01453 0.00187
clEnqueueNDRangeKernel 2 0.02230 0.01115 0.01121 0.01108
clFlush 14 0.06232 0.00445 0.00934 0.00018
clCreateBuffer 4 0.01382 0.00345 0.00728 0.00072
clCreateContext 1 0.00428 0.00428 0.00428 0.00428
clCreateKernel 2 0.00488 0.00244 0.00368 0.00120
clReleaseKernel 2 0.00379 0.00190 0.00319 0.00060
clGetDeviceInfo 1 0.00270 0.00270 0.00270 0.00270
clGetDeviceIDs 2 0.00182 0.00091 0.00114 0.00068
clReleaseEvent 2 0.00125 0.00063 0.00064 0.00061
clGetContextInfo 6 0.00175 0.00029 0.00064 0.00014
clGetEventProfilingInfo 4 0.00136 0.00034 0.00063 0.00010
clGetPlatformInfo 1 0.00055 0.00055 0.00055 0.00055
clSetKernelArg 8 0.00112 0.00014 0.00053 0.00006
The true values of the measured times are:
(tc[2] - tc[1]) / 1.0E9 : 0.10328 (sec)
(tm[2] - tm[1]) / 1.0E6 : 3.86494 (sec)
(tc[4] - tc[3]) / 1.0E9 : 0.104498(sec)
(tm[3] - tm[2]) / 1.0E6 : 3.8561 (sec)
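One way to narrow down where the missing ~3.7 sec inside clWaitForEvents goes would be to also read the QUEUED and SUBMIT timestamps of the same event (a sketch; this is not in the code below):

cl_ulong t_queued, t_submit, t_start, t_end;
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &t_queued, NULL );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &t_submit, NULL );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START,  sizeof(cl_ulong), &t_start,  NULL );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END,    sizeof(cl_ulong), &t_end,    NULL );
/* queued -> submit : time the command sat in the host-side queue
 * submit -> start  : time in the driver/device before execution starts
 *                    (deferred allocation would show up here)
 * start  -> end    : the ~0.1 sec of actual kernel execution           */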
Below is part of the actual code.
-----------------------Code ---------------------------------
//
// create buffers
//
#define NELEM 5000000
cl_mem C     = clCreateBuffer( DeviceContext, CL_MEM_READ_ONLY,  sizeof(cl_float)*NELEM*6, NULL, NULL ); // 114MB
cl_mem E     = clCreateBuffer( DeviceContext, CL_MEM_WRITE_ONLY, sizeof(cl_float)*NELEM*3, NULL, NULL ); //  57MB
cl_mem V     = clCreateBuffer( DeviceContext, CL_MEM_WRITE_ONLY, sizeof(cl_float)*NELEM*9, NULL, NULL ); // 171MB
cl_mem Count = clCreateBuffer( DeviceContext, CL_MEM_WRITE_ONLY, sizeof(cl_int)*NELEM,     NULL, NULL ); //  19MB

//
// BUFFER MAPPING for initializing
//
float *pC     = (float *)clEnqueueMapBuffer( que, C,     CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_float)*NELEM*6, 0, NULL, NULL, NULL );
float *pE     = (float *)clEnqueueMapBuffer( que, E,     CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_float)*NELEM*3, 0, NULL, NULL, NULL );
float *pV     = (float *)clEnqueueMapBuffer( que, V,     CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_float)*NELEM*9, 0, NULL, NULL, NULL );
int   *pCount = (int   *)clEnqueueMapBuffer( que, Count, CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_int)*NELEM,     0, NULL, NULL, NULL );
clFlush( que );

//
// Test data
//
float *C1 = matgen();                                   // test matrix generation
for( int i = 0; i < NELEM; i++ ) {
    memcpy( &pC[6*i], C1, sizeof(float)*6 );
}
memset( pE,     0, sizeof(float)*NELEM*3 );
memset( pV,     0, sizeof(float)*NELEM*9 );
memset( pCount, 0, sizeof(int)  *NELEM   );

//
// BUFFER UNMAPPING
//
(void)clEnqueueUnmapMemObject( que, C,     pC,     0, NULL, NULL );
(void)clEnqueueUnmapMemObject( que, E,     pE,     0, NULL, NULL );
(void)clEnqueueUnmapMemObject( que, V,     pV,     0, NULL, NULL );
(void)clEnqueueUnmapMemObject( que, Count, pCount, 0, NULL, NULL );
clFlush( que );

//
// Reading kernel source.
//   *
//   *

//
// kernel creation & build
//
cl_program pgm = CREATE_PROGRAM( pgm_source );          // clCreateProgramWithSource()
if( BUILD_PROGRAM( pgm ) ) {                            // clBuildProgram( ..., "-cl-mad-enable -cl-denorms-are-zero", ... )
    // Build o.k.
    string kernel_name1( "kernel_A" );
    string kernel_name2( "kernel_B" );
    cl_kernel kernel1 = CREATE_KERNEL( pgm, kernel_name1 );   // clCreateKernel()
    cl_kernel kernel2 = CREATE_KERNEL( pgm, kernel_name2 );

    // 1st
    clSetKernelArg( kernel1, 0, sizeof(cl_mem), (void *)&C );       // input to the kernel
    clSetKernelArg( kernel1, 1, sizeof(cl_mem), (void *)&E );       // output from the kernel
    clSetKernelArg( kernel1, 2, sizeof(cl_mem), (void *)&V );       // output from the kernel
    clSetKernelArg( kernel1, 3, sizeof(cl_mem), (void *)&Count );   // output from the kernel
    // 2nd
    clSetKernelArg( kernel2, 0, sizeof(cl_mem), (void *)&C );
    clSetKernelArg( kernel2, 1, sizeof(cl_mem), (void *)&E );
    clSetKernelArg( kernel2, 2, sizeof(cl_mem), (void *)&V );
    clSetKernelArg( kernel2, 3, sizeof(cl_mem), (void *)&Count );

    //
    // 1st kernel execution
    //
    tm[1] = timer();                                    // gettimeofday()
    {
        cl_event event;
        cl_ulong device_time_counter[2];
        const size_t pe_size    = NELEM;                // 5000000
        const size_t group_size = 64;
        clEnqueueNDRangeKernel( que, kernel1, 1, NULL, &pe_size, &group_size, 0, NULL, &event );
        clWaitForEvents( 1, &event );
        clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &device_time_counter[0], NULL );
        clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &device_time_counter[1], NULL );
        clReleaseEvent( event ); event = NULL;
        tc[1] = device_time_counter[0];
        tc[2] = device_time_counter[1];
    }
    tm[2] = timer();                                    // gettimeofday()

    //
    // 2nd kernel execution
    //
    {
        cl_event event;
        cl_ulong device_time_counter[2];
        const size_t pe_size    = NELEM;                // 5000000
        const size_t group_size = 64;
        clEnqueueNDRangeKernel( que, kernel2, 1, NULL, &pe_size, &group_size, 0, NULL, &event );
        clWaitForEvents( 1, &event );
        clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &device_time_counter[0], NULL );
        clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &device_time_counter[1], NULL );
        clReleaseEvent( event ); event = NULL;
        tc[3] = device_time_counter[0];
        tc[4] = device_time_counter[1];
    }
    tm[3] = timer();                                    // gettimeofday()

    //
    // clean up
    //
    clReleaseKernel( kernel1 );
    clReleaseKernel( kernel2 );
    clReleaseProgram( pgm );
}
GpuDevice.RELEASE_BUFFER( C );
GpuDevice.RELEASE_BUFFER( E );
GpuDevice.RELEASE_BUFFER( V );
GpuDevice.RELEASE_BUFFER( Count );
---------------------End code--------------------------------------