6 Replies Latest reply on Apr 22, 2010 11:26 AM by noxnet

    Performance drops on iterative kernel calls

    noxnet

      I've written a function to interatively execute a kernel. The function handels buffer declarations/release,
      setting kernel args and calls clEnqueNDRangeKernel iteratively (1000 iterations) and so on.

      When calling the same function several times for the same kernel
      the total execution time of all iterations raises on each function call (not kernel call).

      I'm using OpenCL Profiling to measure average kernel execution time and the average kernel execution
      time is constant on all function calls. So the kernel seems to behave right. I guess this is a memory/buffer
      issue.

      I've discovered this issue when running a kernel of ATI Image Convolution Sample (invariants).
      Using a 512x512 Matrix and a filter of 3x3 running 1000 iterations the average kernel execution time on an HD5450
      is about 6.22 ms. 6.22 * 1000 = 6220 sec when executing the function 4 times in a row i got the following times

      1st call: 8,1 sec
      2st call: 9,2 sec
      3st call: 10,0 sec
      4st call: 11,0 sec

      The recorded times where measured on CODE 1, so no buffer declarations and similar stuff included.

      Also already mentioned i guess it is a memory buffer issue.

      Any ideas?


      Another problem i'm facing is performance problems on an HD5750, I already posted that in another thread.
      On an HD5750 the average kernel execution time is about 5.5 ms which is just slighty faster compared
      to the 6.25 ms of the HD5450.

      I'm using Windows 7-64 Bit, Catalyst 10.3.

      CODE 1: //start total time measurement for(int x = 0; x < iterations; x++) { err = clEnqueueNDRangeKernel(cmd_queue, kernels[kernel_id], 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_event); err |= clFlush(cmd_queue); err_chk(err); assert(err == CL_SUCCESS); time_kernel_exec += get_event_exec_time(kernel_event); } clFinish(cmd_queue); //end total time measurement ################################ MEMORY BUFFERS in_mem = clCreateBuffer(context,CL_MEM_READ_WRITE, buff_size_padded, NULL, &err); err = clEnqueueWriteBuffer(cmd_queue, in_mem, CL_FALSE, 0, buff_size_padded, input, 0, NULL, NULL); filter_mem = clCreateBuffer(context,CL_MEM_READ_ONLY, buff_size_filter, NULL, &err); err = clEnqueueWriteBuffer(cmd_queue, filter_mem, CL_FALSE, 0, buff_size_filter, filter, 0, NULL, NULL); out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buff_size_unpadded, NULL, &err); ################################ KERNEL kernel void Convolve_Inv(__global float * pInput, __constant float * pFilter, __global float * pOutput, const int input_width, const int filter_width) { const int nWidth = get_global_size(0); const int xOut = get_global_id(0); const int yOut = get_global_id(1); const int xInTopLeft = xOut; const int yInTopLeft = yOut; float sum = 0; for (int r = 0; r < FILTER_WIDTH; r++) { const int idxFtmp = r * FILTER_WIDTH; //current row in pFilter const int yIn = yInTopLeft + r; //current row in pInput const int idxIntmp = yIn * input_width + xInTopLeft; //current pos in pInput for (int c = 0; c < FILTER_WIDTH; c++) { const int idxF = idxFtmp + c; const int idxIn = idxIntmp + c; sum += pFilter[idxF]*pInput[idxIn]; } } //for (int r = 0... const int idxOut = yOut * nWidth + xOut; pOutput[idxOut] = sum; }

        • Performance drops on iterative kernel calls
          omkaranathan

          noxnet,

          Could you provide the whole code?(A working testcase). Its easier to try and reproduce the problem that way.

          Thanks

            • Performance drops on iterative kernel calls
              noxnet

              Here is a "quick and dirty" code for testing.

              Average Kernel execution time is between: 6.21 - 6.29 ms (measured by ocl profiling)

              The execution time all kernels (1000 iterations) is
              1st call: 7719ms
              2st call: 8385ms
              3st call: 9849ms
              4st call: 10455ms
              5st call: 11903ms

              After some experiments found out that these problem is due to cl_event passed at
              clEnqueueNDRangeKernel(...).

              The Problem occurs when using the following declaration.

              cl_event kernel_event = NULL;

              ...

              err = clEnqueueNDRangeKernel(cmd_queue, kernels[kernel_id], 2, NULL,
                 global_work_size, local_work_size, 0, NULL, &kernel_event);


              When changing it to


              cl_event * kernel_event = NULL;

              ...

              err = clEnqueueNDRangeKernel(cmd_queue, kernels[kernel_id], 2, NULL,
                 global_work_size, local_work_size, 0, NULL, kernel_event);

              The performance as i expected to be. Same when passing NULL instead of kernel_event.
              1st call: 6554ms
              2st call: 6558ms
              3st call: 6537ms
              4st call: 6548ms
              5st call: 6551ms

              When using cl_event* i don't know how to use ocl profiling.

              Any Ideas why this occurs?

               

              ########################################################## convolve.cl ########################################################## __kernel void convolve(__global float * pInput, __constant float * pFilter, __global float * pOutput, const int input_width, const int filter_width) { const int nWidth = get_global_size(0); const int xOut = get_global_id(0); const int yOut = get_global_id(1); const int xInTopLeft = xOut; const int yInTopLeft = yOut; int padding = (filter_width - 1) / 2; float sum = 0; for (int r = 0; r < FILTER_WIDTH; r++) { const int idxFtmp = r * FILTER_WIDTH; //current row in pFilter const int yIn = yInTopLeft + r; //current row in pInput const int idxIntmp = yIn * input_width + xInTopLeft; //current pos in pInput for (int c = 0; c < FILTER_WIDTH; c++) { const int idxF = idxFtmp + c; const int idxIn = idxIntmp + c; sum += pFilter[idxF]*pInput[idxIn]; } } //for (int r = 0... //float res = pInput[(yOut + padding) * input_width + xOut + padding] + diff_coef * (sum); const int idxOut = yOut * nWidth + xOut; pOutput[idxOut] = sum; } ########################################################## c-File ########################################################## /*! \mainpage OpenCL \author Müller Jochen \date April 2010 */ //#include "stdafx.h" #include <cstdio> #include <cstdlib> #include <iostream> #include <CL/cl.h> #include <assert.h> #include <math.h> #include <ctime> #include <sys/stat.h> #include <windows.h> #include <sys/timeb.h> #include <stdio.h> #include <stdarg.h> #define SEPARATOR "\n----------------------------------------------------------------\n" cl_int err; cl_program program[1]; cl_device_id device1; cl_device_id *device; cl_device_id devs[2]; cl_context context; cl_command_queue cmd_queue; cl_kernel kernels[10]; cl_mem group_mem, local_mem; cl_uint num_devices, num_ret_kernels; cl_event event; float get_event_exec_time(cl_event event); double get_time_ms(timeval tv_start, timeval tv_end); char * load_program_source(const char *filename); int init_opencl(cl_device_type device); int read_kernels(char *file_name, char * build_params); float* invoke_convolve_kernel(float * input, float * filter, int input_width, int filter_width, int iterations, int padding, int kernel_id); void gettimeofday(struct timeval* t,void* timezone) { struct _timeb timebuffer; _ftime( &timebuffer ); t->tv_sec=timebuffer.time; t->tv_usec=1000*timebuffer.millitm; } /*! * the main function of the programm */ int main(int argc, char **argv) { printf("Starting calculations\n"); cl_device_type device_type = CL_DEVICE_TYPE_GPU; int filter_width = 3; int unpadded_width = 512; int unpadded_height = unpadded_width; int iter = 1000; int right_border = 10; int left_border = 10; int top_border = 10; int bottom_border = 10; int init_value = 2; char use_cpu = 'n'; char use_def = 'n'; int dim = unpadded_width + filter_width - 1; int padding = (filter_width - 1) / 2; //calculate padding (on each side) printf(SEPARATOR); printf("Matrix size (unpadded): %d x %d\n", unpadded_width, unpadded_height); printf("Matrix size (padded): \t%d x %d\n", dim ,dim); printf("Filter size: \t\t%d x %d\n", filter_width, filter_width); printf("Iterations: \t\t%d\n", iter); printf(SEPARATOR); //allocating buffers float * input_cpu = (float *) calloc(dim * dim, sizeof(float)); float * input_gpu = (float *) calloc(dim * dim, sizeof(float)); float * result_cpu = (float *) calloc(dim * dim, sizeof(float)); float * result_gpu = (float *) calloc(dim * dim, sizeof(float)); //filling buffers with default values for(int i = 0;i < dim*dim; i++) { input_cpu[i] = 1; input_gpu[i] = 1; } //filter init float * filter = (float *) calloc(filter_width * filter_width, sizeof(float)); for(int i = 0; i < filter_width * filter_width; i++) { filter[i] = 1; } filter[filter_width * padding + padding] = - filter_width * filter_width + 1; /***** OpenCL Calculation *****/ printf("Starting OpenCL Calculation on GPU...\n"); init_opencl(device_type); //setting up build params char tmp[3]; //char * build_tmp = "-cl-fast-relaxed-math -D FILTER_WIDTH="; char * build_tmp = "-D FILTER_WIDTH="; char * build_params = (char*) calloc((strlen(build_tmp)+3),sizeof(char)); strcat(build_params, build_tmp); itoa(filter_width,tmp,10); strcat(build_params,tmp); read_kernels("convolve", build_params); //Warm up call printf(SEPARATOR); printf("WARM UP call START (10 Iterations of first kernel)\n"); result_gpu = invoke_convolve_kernel(input_gpu, filter, dim, filter_width, 10, padding, 0); printf(SEPARATOR); printf("WARM UP call END (10 Iterations of first kernel)\n"); printf(SEPARATOR); char * file_prefix = "diffusion_no_flow_2D_GPU_"; //calling the same function several times for(int i = 0; i < 5; i++) result_gpu = invoke_convolve_kernel(input_gpu, filter, dim, filter_width, iter, padding, 0); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); printf(SEPARATOR); printf("DONE"); getchar(); } int init_opencl(cl_device_type device_type) { cl_int err; cl_uint numPlatforms; size_t deviceListSize; cl_platform_id platform = NULL; /***** Get Platform *****/ err = clGetPlatformIDs(0, NULL, &numPlatforms); err = clGetPlatformIDs(numPlatforms, &platform, NULL); assert(err == CL_SUCCESS); cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; /* Use NULL for backward compatibility */ cl_context_properties* cprops = (NULL == platform) ? NULL : cps; context = clCreateContextFromType( cprops, device_type, NULL, NULL, &err); assert(err == CL_SUCCESS); err |= clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); assert(err == CL_SUCCESS); device = (cl_device_id *)malloc(deviceListSize); /* Now, get the device list data */ err = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, device, NULL); assert(err == CL_SUCCESS); /***** Creating Cmd Queue *****/ cmd_queue = clCreateCommandQueue(context, device[0], CL_QUEUE_PROFILING_ENABLE, &err); assert(err == CL_SUCCESS); return (int)err; } int read_kernels(char *file_name, char * build_params) { size_t program_length; const char* source_path; char *program_source; cl_uint num_kernels = 0; printf(SEPARATOR); char * filename = (char *) calloc(strlen(file_name)+3, sizeof(char)); strcat(filename,file_name); strcat(filename,".cl"); //Creating Programm with Source and Building printf("\nLoading program source from file: %s\n", filename); /***** Reading Source File *****/ program_source = load_program_source(filename); if(program_source == NULL) { printf("Error: can't read program_source"); return 1; } /***** Creating Prgramm *****/ program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, &err); assert(err == CL_SUCCESS); /***** Build Prgramm *****/ err = clBuildProgram(program[0], 1, device, build_params, NULL, NULL); assert(err == CL_SUCCESS); printf("\nProgram built\n"); /***** Reading Kernels *****/ err = clCreateKernelsInProgram(program[0], NULL, NULL, &num_kernels); err = clCreateKernelsInProgram(program[0], num_kernels, kernels, &num_kernels); assert(err == CL_SUCCESS); return num_kernels; } float* invoke_convolve_kernel(float * input, float * filter, int input_width, int filter_width, int iterations, int padding, int kernel_id) { printf(SEPARATOR); printf("\nExecuting kernel: "); char * cbuffer = (char*) malloc(sizeof(char) * 64); clGetKernelInfo(kernels[kernel_id], CL_KERNEL_FUNCTION_NAME, sizeof(char) * 64, cbuffer, NULL); printf("%s \n", cbuffer); struct timeval tv_func_start, tv_func_end, tv_kernel_start, tv_kernel_end, tv_ov_start, tv_ov_end, tv_ov_start_euler,tv_ov_end_euler; void * tz; gettimeofday(&tv_func_start, &tz); cl_int err = 0; cl_event kernel_event = NULL; cl_mem in_mem, out_mem, filter_mem; float * result; int unpadded_width = input_width - 2 * padding; int block_width = 16; int block_height = 16; int local_mem_size = block_width * block_height * sizeof(float) * 4; int max_work_group_size; /***** handling work-group-size limitations *****/ err = clGetKernelWorkGroupInfo(kernels[kernel_id], device[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),&max_work_group_size,NULL); if(max_work_group_size < 64) block_height = block_width = 4; else if(max_work_group_size < 256) block_height = block_width = 8; /***** Declaring buffer sizes *****/ int buff_size_padded = input_width * input_width * sizeof(float); int buff_size_unpadded = unpadded_width * unpadded_width * sizeof(float); int buff_size_filter = filter_width * filter_width * sizeof(float); //tmporary storage for iteration results float * tmp = (float *) malloc(buff_size_padded); memcpy(tmp,input,buff_size_padded); result = (float *) malloc(buff_size_padded); float * result1 = (float *) malloc(buff_size_padded); cl_uint num_args; err = clGetKernelInfo(kernels[kernel_id], CL_KERNEL_NUM_ARGS, sizeof(cl_uint), &num_args, NULL); /***** Creating in/filter/out Buffers *****/ in_mem = clCreateBuffer(context,CL_MEM_READ_WRITE, buff_size_padded, NULL, &err); err |= clEnqueueWriteBuffer(cmd_queue, in_mem, CL_FALSE, 0, buff_size_padded, input, 0, NULL, NULL); assert(err == CL_SUCCESS); filter_mem = clCreateBuffer(context,CL_MEM_READ_ONLY, buff_size_filter, NULL, &err); err |= clEnqueueWriteBuffer(cmd_queue, filter_mem, CL_FALSE, 0, buff_size_filter, filter, 0, NULL, NULL); assert(err == CL_SUCCESS); out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buff_size_unpadded, NULL, &err); assert(err == CL_SUCCESS); err = clFinish(cmd_queue); assert(err == CL_SUCCESS); /***** Setting Kernel Args *****/ err = clSetKernelArg(kernels[kernel_id], 0, sizeof(cl_mem), &in_mem); err |= clSetKernelArg(kernels[kernel_id], 1, sizeof(cl_mem), &filter_mem); err |= clSetKernelArg(kernels[kernel_id], 2, sizeof(cl_mem), &out_mem); err |= clSetKernelArg(kernels[kernel_id], 3, sizeof(int), &input_width); err |= clSetKernelArg(kernels[kernel_id], 4, sizeof(int), &filter_width); /***** Start Kernel Exec *****/ size_t local_work_size[2] = { block_width, block_height }; size_t global_work_size[2] = { unpadded_width , unpadded_width }; //only inner elements are calculated float time_overhead = 0; float time_kernel_exec = 0; gettimeofday(&tv_kernel_start, &tz); for(int x = 0; x < iterations; x++) { err = clEnqueueNDRangeKernel(cmd_queue, kernels[kernel_id], 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_event); err |= clFinish(cmd_queue); assert(err == CL_SUCCESS); time_kernel_exec += get_event_exec_time(kernel_event); } assert(err == CL_SUCCESS); clFinish(cmd_queue); gettimeofday(&tv_kernel_end, &tz); /***** Reading Result *****/ //reading only output size not padded size struct timeval tv_read_start, tv_read_end; err = clEnqueueReadBuffer(cmd_queue, out_mem, CL_TRUE, 0, buff_size_unpadded, tmp, 0, NULL, NULL); assert(err == CL_SUCCESS); //padded write of results for(int i = padding, i1 = 0; i < input_width - padding; i++, i1++) { memcpy(&result[i * input_width + padding], &tmp[i1 * unpadded_width], sizeof(float) * unpadded_width); } //Free Objects clReleaseMemObject(in_mem); clReleaseMemObject(filter_mem); clReleaseMemObject(out_mem); gettimeofday(&tv_func_end, &tz); /***** Logging *****/ printf("Time needed to execute all kernels:\t\t\t %.2f ms\n", get_time_ms(tv_kernel_start,tv_kernel_end)); gettimeofday(&tv_func_end, &tz); printf("Time needed to execute whole function:\t\t\t %.2f ms\n", get_time_ms(tv_func_start,tv_func_end)); if(kernel_event != NULL) { time_kernel_exec = time_kernel_exec / iterations; printf("Average time needed to execute a kernel:\t\t %.2f ms\n", time_kernel_exec); } return result; } float get_event_exec_time(cl_event ev) { cl_ulong start, end; clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); float exec_time_ms = (end - start) * 1.0e-6; return exec_time_ms; } double get_time_ms(timeval tv_start, timeval tv_end) { double ms; //milliseconds double sec; //seconds double us; //microseconds double total_us; double start = tv_start.tv_usec + tv_start.tv_sec*pow((double)10,6); double end = tv_end.tv_usec + tv_end.tv_sec*pow((double)10,6); total_us = end - start; ms = total_us/pow((double)10, 3); return ms; } char * load_program_source(const char *filename) { struct stat statbuf; FILE *fh; char *source; char *output; fh = fopen(filename, "r"); if (fh == 0) return 0; stat(filename, &statbuf); source = (char *) malloc((int)statbuf.st_size); fread(source, statbuf.st_size, 1, fh); source[statbuf.st_size] = '\0'; //removing invalid characters int i = statbuf.st_size; while(i > 0) { char f = source[i]; if(source[i] > 0 && (source[i] == '/' || source[i] =='}')) { i++; break; } i--; } output = (char *) malloc(i+1); memcpy(output, source, i); output[i] = '\0'; return output; }