2 Replies Latest reply on May 19, 2010 4:32 AM by pavandsp

    Huge Performace drop when OpenCL code was Integrated to C++ project

    pavandsp

      Hi

      I am seeing a huge performance drop when an algorithm implemented in OpenCL was integrated to a C++ Project.

      I have called test_main() function from my  C++ project which is a starting point of OpenCL code.This test_main() is called for each video frame continously .The input and output buffers are initilized and freed in C++ project code only.

      The Complete algo is in runCLKernels() including kernel code.I have distributed the algorithm into two kernels.

      It seems there is a huge OpenCL overhead interms of buffers/cpu-gpu readwrite and so on.Please let me know where i can optimize and remove any unnecessary code(buffers inits,).

      Please find the attached kernel and OpenCL application code.I am using HD5850

       

       

      #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable /*! * Sample kernel which multiplies every element of the input array with * a constant and stores it at the corresponding output array */ __kernel void templateKernel(const int linesToProcess, const int lineLen, __global unsigned char * output, __global unsigned char * input, __global int *rhist, __global int *ghist, __global int *bhist) { uint tx = get_global_id(0); //j uint ty = get_global_id(1); //i int aa=0,x=0; if((tx!=0)&&(ty!=0)) { aa = (ty - 1) * 3 * lineLen + (tx - 1) * 3; x = ty * lineLen + tx; if((ty % 2 == 0) && (tx % 2 == 1)) { output[aa] = ( unsigned char)((input[ x - lineLen] + input[x + lineLen]) / 2 ); output[aa + 1] = ( unsigned char)input[x]; output[aa + 2] = ( unsigned char)((input[x-1] + input[x+1]) / 2); } else if ((ty % 2 == 1) && (tx % 2 == 0)) { output[aa] = ( unsigned char)((input[x - 1] + input[ x + 1]) / 2); output[aa + 1] = ( unsigned char)input[x]; output[aa + 2] = ( unsigned char)((input[x - lineLen] + input[x + lineLen]) / 2); } else if ((ty % 2 == 0) && (tx % 2 == 0)) { output[aa] = ( unsigned char)((input[x - lineLen - 1] + input[x - lineLen + 1] + input[ x + lineLen - 1] + input[ x + lineLen + 1]) / 4); output[aa + 1] = ( unsigned char)((input[x - 1] + input[x + 1] + input[x - lineLen] + input[x + lineLen]) / 4); output[aa + 2] = ( unsigned char)input[x]; } else { output[aa] = ( unsigned char)input[x]; output[aa + 1] = ( unsigned char)((input[ x - 1] + input[ x + 1] + input[ x - lineLen] + input[ x + lineLen]) / 4); output[aa + 2] = ( unsigned char)((input[x - lineLen - 1] + input[x - lineLen + 1] + input[x + lineLen - 1] + input[ x + lineLen + 1]) / 4); } atom_inc(rhist+ *(output+(aa + 0))); atom_inc(ghist+ *(output+(aa + 1))); atom_inc(bhist+ *(output+(aa + 2))); }// (tx!=0) } __kernel void gammacolorkernel(__global unsigned char * output, const int rc,const int gc,const int bc , const int ravg,const int gavg,const int bavg, const int ra,const int ga,const int ba) { int B[3]; uint j; uint tx = get_global_id(0) * 3; // Gamma Normalization B[0] = ((output[tx] - rc) * ravg) + ra; B[1] = ((output[tx+1] - gc) * gavg) + ga; B[2] = ((output[tx+2] - bc) * bavg) + ba; for(j=0;j<3;j++) { if(B[j] < 0) B[j] = 0; if(B[j] > 255) B[j] = 255; output[tx+j] = B[j]; } } *************************************************** App code ************************************************* int initializeHost(void) { rhist = NULL; ghist = NULL; bhist = NULL; linesToProcess = 720; lineLen = 1280; width = linesToProcess*lineLen; ///////////////////////////////////////////////////////////////// // Allocate and initialize memory used by host ///////////////////////////////////////////////////////////////// cl_uint sizeInBytes = width * sizeof(cl_uchar); rhist = (cl_int *)malloc(sizeof(cl_int) * 256); ghist = (cl_int *)malloc(sizeof(cl_int) * 256); bhist = (cl_int *)malloc(sizeof(cl_int) * 256); return 0; } /* * Converts the contents of a file into a string */ std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; return s; } return NULL; } int initializeCL(unsigned char *input,unsigned char *output) { cl_int status = 0; cl_event events[1]; size_t deviceListSize; cl_device_type device_type=NULL; cl_uint num_devices; cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { printf("Error: Getting Platforms. (clGetPlatformsIDs)\n"); return 1; } if(numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); platform = platforms[i]; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } delete platforms; } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; cl_context_properties* cprops = (NULL == platform) ? NULL : cps; ///////////////////////////////////////////////////////////////// // Create an OpenCL context ///////////////////////////////////////////////////////////////// context = clCreateContextFromType(cprops, // CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); /* First, get the size of device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); ///////////////////////////////////////////////////////////////// // Detect OpenCL devices ///////////////////////////////////////////////////////////////// devices = (cl_device_id *)malloc(deviceListSize); status=clGetDeviceIDs(platform,CL_DEVICE_TYPE_ALL,0,NULL,&num_devices); status=clGetDeviceIDs(platform,CL_DEVICE_TYPE_ALL,2,devices,&num_devices); status = clGetContextInfo(context,CL_CONTEXT_DEVICES,deviceListSize,devices,NULL); ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// commandQueue = clCreateCommandQueue( context, // devices[0], devices[1], CL_QUEUE_PROFILING_ENABLE, &status); ///////////////////////////////////////////////////////////////// // Create OpenCL memory buffers ///////////////////////////////////////////////////////////////// inputBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uchar) * width, input, &status); outputBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uchar) * width * 3 , output, &status); rhistBuffer = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,sizeof(cl_int) * 256 ,rhist,&status); ghistBuffer = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,sizeof(cl_int) * 256 ,ghist,&status); bhistBuffer = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,sizeof(cl_int) * 256 ,bhist,&status); ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// const char * filename = "Template_Kernels.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(status != CL_SUCCESS) { std::cout<< "Error: Loading Binary into cl_program \ (clCreateProgramWithBinary)\n"; return 1; } status = clBuildProgram(program,1,devices, NULL, NULL, NULL); size_t len; char buffer[4096]; kernel = clCreateKernel(program, "templateKernel", &status); gammacolorkernel = clCreateKernel(program, "gammacolorkernel", &status); return 0; } int runCLKernels(unsigned char *output) { cl_int status; cl_uint maxDims; cl_event events[5]; size_t globalThreads[2]; size_t localThreads[2]; size_t maxWorkGroupSize; size_t maxWorkItemSizes[3]; size_t length; size_t kernelWorkGroupSize; cl_device_type device_type; cl_ulong startTime ,endTime; char devicebuff[100]; cl_uint i,j; /*Histogram Var*/ cl_int percentile5 = 1280*720*.05; cl_int ra=0,rb=0,rc=0,rd=0; cl_int ga=0,gb=0,gc=0,gd=0; cl_int ba=0,bb=0,bc=0,bd=0; cl_int r5sum=0,g5sum=0,b5sum=0; cl_int r95sum=0,g95sum=0,b95sum=0; cl_int A=0, B=0, C=0; globalThreads[0] =1280; globalThreads[1] =720; localThreads[0] =16; localThreads[1] =16; /*** Set appropriate arguments to the kernel ***/ status = clSetKernelArg(kernel,0,sizeof(cl_int),(void *)&linesToProcess); status = clSetKernelArg(kernel,1,sizeof(cl_int),(void *)&lineLen); status = clSetKernelArg(kernel, 2,sizeof(cl_mem),(void *)&outputBuffer); status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&inputBuffer); status = clSetKernelArg(kernel,4, sizeof(cl_mem),(void *)&rhistBuffer); status = clSetKernelArg(kernel,5, sizeof(cl_mem),(void *)&ghistBuffer); status = clSetKernelArg(kernel,6, sizeof(cl_mem),(void *)&bhistBuffer); status = clEnqueueNDRangeKernel( commandQueue, kernel, 2, NULL, globalThreads, localThreads, //NULL, 0, NULL, &events[0]); /* wait for the kernel call to finish execution */ status = clWaitForEvents(1, &events[0]); clReleaseEvent(events[0]); /* Enqueue readBuffer*/ status = clEnqueueReadBuffer( commandQueue, outputBuffer, CL_TRUE, 0, width * 3 * sizeof(cl_uchar), output, 0, NULL, &events[1]); status = clWaitForEvents(1, &events[1]); clReleaseEvent(events[1]); /* Enqueue readBufferof rhist*/ status = clEnqueueReadBuffer(commandQueue,rhistBuffer,CL_TRUE, 0,256 * sizeof(cl_int),rhist, 0, NULL, &events[2]); status = clWaitForEvents(1, &events[2]); clReleaseEvent(events[2]); /* Enqueue readBufferof ghist*/ status = clEnqueueReadBuffer(commandQueue,ghistBuffer,CL_TRUE, 0,256 * sizeof(cl_int),ghist, 0, NULL, &events[3]); status = clWaitForEvents(1, &events[3]); clReleaseEvent(events[3]); /* Enqueue readBufferof bhist*/ status = clEnqueueReadBuffer(commandQueue,bhistBuffer,CL_TRUE, 0,256 * sizeof(cl_int),bhist, 0, NULL, &events[4]); status = clWaitForEvents(1, &events[4]); clReleaseEvent(events[4]); /*Map Buffer*/ output =(cl_uchar *)clEnqueueMapBuffer(commandQueue,outputBuffer,CL_TRUE,CL_MAP_READ|CL_MAP_WRITE,0, sizeof(cl_uchar) * width * 3 , 0,NULL, &events[0],&status); status = clWaitForEvents(1, &events[0]); clReleaseEvent(events[0]); i = linesToProcess; for (j = 1; j <= lineLen; j++) { output[(i - 2) * 3 * lineLen + (j - 1) * 3 + 0] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 0]; output[(i - 2) * 3 * lineLen + (j - 1) * 3 + 1] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 1]; output[(i - 2) * 3 * lineLen + (j - 1) * 3 + 2] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 2]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 0] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 0]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 1] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 1]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 2] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 2]; } j = lineLen; for (i = 1; i <= linesToProcess; i++) { output[(i - 1) * 3 * lineLen + (j - 2) * 3 + 0] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 0]; output[(i - 1) * 3 * lineLen + (j - 2) * 3 + 1] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 1]; output[(i - 1) * 3 * lineLen + (j - 2) * 3 + 2] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 2]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 0] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 0]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 1] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 1]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 2] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 2]; } cl_int cnt = 0; for(i=1;i<255;i++) { if(r5sum < percentile5) { r5sum += rhist[i]; } else if(rc == 0) { rc = i; cnt++; } if(r95sum < percentile5) { r95sum += rhist[255-i]; } else if(rd == 0) { rd = 255-i; cnt++; } if(g5sum < percentile5) { g5sum += ghist[i]; } else if(gc == 0) { gc = i; cnt++; } if(g95sum < percentile5) { g95sum += ghist[255-i]; } else if(gd == 0) { gd = 255-i; cnt++; } if(b5sum < percentile5) { b5sum += bhist[i]; } else if(bc == 0) { bc = i; cnt++; } if(b95sum < percentile5) { b95sum += bhist[255-i]; } else if(bd == 0) { bd = 255-i; cnt++; } if(cnt == 6) break; } ra = rc-30; ga = gc-30; ba = bc-30; rb = rd + 30; gb = gd + 30; bb = bd + 30; cl_int ravg = (rb-ra)/(rd-rc); cl_int gavg = (gb-ga)/(gd-gc); cl_int bavg = (bb-ba)/(bd-bc); clSetKernelArg(gammacolorkernel,0, sizeof(cl_mem),(void *)&outputBuffer); clSetKernelArg(gammacolorkernel,1, sizeof(cl_int),(void *)&rc); clSetKernelArg(gammacolorkernel,2, sizeof(cl_int),(void *)&gc); clSetKernelArg(gammacolorkernel,3, sizeof(cl_int),(void *)&bc); clSetKernelArg(gammacolorkernel,4, sizeof(cl_int),(void *)&ravg); clSetKernelArg(gammacolorkernel,5, sizeof(cl_int),(void *)&gavg); clSetKernelArg(gammacolorkernel,6, sizeof(cl_int),(void *)&bavg); clSetKernelArg(gammacolorkernel,7, sizeof(cl_int),(void *)&ra); clSetKernelArg(gammacolorkernel,8, sizeof(cl_int),(void *)&ga); status=clSetKernelArg(gammacolorkernel,9, sizeof(cl_int),(void *)&ba); /*UnMap Buffer*/ status=clEnqueueUnmapMemObject(commandQueue,outputBuffer,output,0,NULL, &events[0]); status = clWaitForEvents(1, &events[0]); clReleaseEvent(events[0]); /* * Enqueue a gammacolorkernel run call. */ globalThreads[0] =width; localThreads[0] =256; status = clEnqueueNDRangeKernel(commandQueue,gammacolorkernel,1,NULL,globalThreads, localThreads, 0, NULL,&events[0]); status = clWaitForEvents(1, &events[0]); clReleaseEvent(events[0]); status = clEnqueueReadBuffer(commandQueue,outputBuffer,CL_TRUE,0,width * 3 * sizeof(cl_uchar),output,0,NULL, &events[1]); status = clWaitForEvents(1, &events[1]); clReleaseEvent(events[1]); return 0; } /* * \brief Release OpenCL resources (Context, Memory etc.) */ int cleanupCL(void) { cl_int status; status = clReleaseKernel(kernel); status = clReleaseKernel(gammacolorkernel); status = clReleaseProgram(program); status = clReleaseMemObject(inputBuffer); status = clReleaseMemObject(outputBuffer); status = clReleaseMemObject(rhistBuffer); status = clReleaseMemObject(ghistBuffer); status = clReleaseMemObject(bhistBuffer); status = clReleaseCommandQueue(commandQueue); status = clReleaseContext(context); return 0; } void cleanupHost(void) { if(rhist != NULL) { free(rhist); rhist = NULL; } if(ghist != NULL) { free(ghist); ghist = NULL; } if(bhist != NULL) { free(bhist); bhist = NULL; } if(devices != NULL) { free(devices); devices = NULL; } } /* OpenCL code called from C++ Project :This is called in loop for each video frame */ int test_main(unsigned char *input,unsigned char *output) { if(initializeHost()==1) return 1; if(initializeCL(input,output)==1) return 1; if(runCLKernels(output)==1) return 1; if(cleanupCL()==1) return 1; cleanupHost(); return 0; }