Archives Discussions

pavandsp · ‎04-27-2010

Hi

I am seeing a huge performance drop when an algorithm implemented in OpenCL was integrated to a C++ Project.

I have called test_main() function from my C++ project which is a starting point of OpenCL code.This test_main() is called for each video frame continously .The input and output buffers are initilized and freed in C++ project code only.

The Complete algo is in runCLKernels() including kernel code.I have distributed the algorithm into two kernels.

It seems there is a huge OpenCL overhead interms of buffers/cpu-gpu readwrite and so on.Please let me know where i can optimize and remove any unnecessary code(buffers inits,).

Please find the attached kernel and OpenCL application code.I am using HD5850

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable /*! * Sample kernel which multiplies every element of the input array with * a constant and stores it at the corresponding output array */ __kernel void templateKernel(const int linesToProcess, const int lineLen, __global unsigned char * output, __global unsigned char * input, __global int *rhist, __global int *ghist, __global int *bhist) { uint tx = get_global_id(0); //j uint ty = get_global_id(1); //i int aa=0,x=0; if((tx!=0)&&(ty!=0)) { aa = (ty - 1) * 3 * lineLen + (tx - 1) * 3; x = ty * lineLen + tx; if((ty % 2 == 0) && (tx % 2 == 1)) { output[aa] = ( unsigned char)((input[ x - lineLen] + input[x + lineLen]) / 2 ); output[aa + 1] = ( unsigned char)input; output[aa + 2] = ( unsigned char)((input[x-1] + input[x+1]) / 2); } else if ((ty % 2 == 1) && (tx % 2 == 0)) { output[aa] = ( unsigned char)((input[x - 1] + input[ x + 1]) / 2); output[aa + 1] = ( unsigned char)input; output[aa + 2] = ( unsigned char)((input[x - lineLen] + input[x + lineLen]) / 2); } else if ((ty % 2 == 0) && (tx % 2 == 0)) { output[aa] = ( unsigned char)((input[x - lineLen - 1] + input[x - lineLen + 1] + input[ x + lineLen - 1] + input[ x + lineLen + 1]) / 4); output[aa + 1] = ( unsigned char)((input[x - 1] + input[x + 1] + input[x - lineLen] + input[x + lineLen]) / 4); output[aa + 2] = ( unsigned char)input; } else { output[aa] = ( unsigned char)input; output[aa + 1] = ( unsigned char)((input[ x - 1] + input[ x + 1] + input[ x - lineLen] + input[ x + lineLen]) / 4); output[aa + 2] = ( unsigned char)((input[x - lineLen - 1] + input[x - lineLen + 1] + input[x + lineLen - 1] + input[ x + lineLen + 1]) / 4); } atom_inc(rhist+ *(output+(aa + 0))); atom_inc(ghist+ *(output+(aa + 1))); atom_inc(bhist+ *(output+(aa + 2))); }// (tx!=0) } __kernel void gammacolorkernel(__global unsigned char * output, const int rc,const int gc,const int bc , const int ravg,const int gavg,const int bavg, const int ra,const int ga,const int ba) { int B[3]; uint j; uint tx = get_global_id(0) * 3; // Gamma Normalization B[0] = ((output[tx] - rc) * ravg) + ra; B[1] = ((output[tx+1] - gc) * gavg) + ga; B[2] = ((output[tx+2] - bc) * bavg) + ba; for(j=0;j<3;j++) { if(B < 0) B = 0; if(B > 255) B = 255; output[tx+j] = B; } } *************************************************** App code ************************************************* int initializeHost(void) { rhist = NULL; ghist = NULL; bhist = NULL; linesToProcess = 720; lineLen = 1280; width = linesToProcess*lineLen; ///////////////////////////////////////////////////////////////// // Allocate and initialize memory used by host ///////////////////////////////////////////////////////////////// cl_uint sizeInBytes = width * sizeof(cl_uchar); rhist = (cl_int *)malloc(sizeof(cl_int) * 256); ghist = (cl_int *)malloc(sizeof(cl_int) * 256); bhist = (cl_int *)malloc(sizeof(cl_int) * 256); return 0; } /* * Converts the contents of a file into a string */ std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; return s; } return NULL; } int initializeCL(unsigned char *input,unsigned char *output) { cl_int status = 0; cl_event events[1]; size_t deviceListSize; cl_device_type device_type=NULL; cl_uint num_devices; cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { printf("Error: Getting Platforms. (clGetPlatformsIDs)\n"); return 1; } if(numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms, CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); platform = platforms; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } delete platforms; } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; cl_context_properties* cprops = (NULL == platform) ? NULL : cps; ///////////////////////////////////////////////////////////////// // Create an OpenCL context ///////////////////////////////////////////////////////////////// context = clCreateContextFromType(cprops, // CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); /* First, get the size of device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); ///////////////////////////////////////////////////////////////// // Detect OpenCL devices ///////////////////////////////////////////////////////////////// devices = (cl_device_id *)malloc(deviceListSize); status=clGetDeviceIDs(platform,CL_DEVICE_TYPE_ALL,0,NULL,&num_devices); status=clGetDeviceIDs(platform,CL_DEVICE_TYPE_ALL,2,devices,&num_devices); status = clGetContextInfo(context,CL_CONTEXT_DEVICES,deviceListSize,devices,NULL); ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// commandQueue = clCreateCommandQueue( context, // devices[0], devices[1], CL_QUEUE_PROFILING_ENABLE, &status); ///////////////////////////////////////////////////////////////// // Create OpenCL memory buffers ///////////////////////////////////////////////////////////////// inputBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uchar) * width, input, &status); outputBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uchar) * width * 3 , output, &status); rhistBuffer = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,sizeof(cl_int) * 256 ,rhist,&status); ghistBuffer = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,sizeof(cl_int) * 256 ,ghist,&status); bhistBuffer = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,sizeof(cl_int) * 256 ,bhist,&status); ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// const char * filename = "Template_Kernels.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(status != CL_SUCCESS) { std::cout<< "Error: Loading Binary into cl_program \ (clCreateProgramWithBinary)\n"; return 1; } status = clBuildProgram(program,1,devices, NULL, NULL, NULL); size_t len; char buffer[4096]; kernel = clCreateKernel(program, "templateKernel", &status); gammacolorkernel = clCreateKernel(program, "gammacolorkernel", &status); return 0; } int runCLKernels(unsigned char *output) { cl_int status; cl_uint maxDims; cl_event events[5]; size_t globalThreads[2]; size_t localThreads[2]; size_t maxWorkGroupSize; size_t maxWorkItemSizes[3]; size_t length; size_t kernelWorkGroupSize; cl_device_type device_type; cl_ulong startTime ,endTime; char devicebuff[100]; cl_uint i,j; /*Histogram Var*/ cl_int percentile5 = 1280*720*.05; cl_int ra=0,rb=0,rc=0,rd=0; cl_int ga=0,gb=0,gc=0,gd=0; cl_int ba=0,bb=0,bc=0,bd=0; cl_int r5sum=0,g5sum=0,b5sum=0; cl_int r95sum=0,g95sum=0,b95sum=0; cl_int A=0, B=0, C=0; globalThreads[0] =1280; globalThreads[1] =720; localThreads[0] =16; localThreads[1] =16; /*** Set appropriate arguments to the kernel ***/ status = clSetKernelArg(kernel,0,sizeof(cl_int),(void *)&linesToProcess); status = clSetKernelArg(kernel,1,sizeof(cl_int),(void *)&lineLen); status = clSetKernelArg(kernel, 2,sizeof(cl_mem),(void *)&outputBuffer); status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&inputBuffer); status = clSetKernelArg(kernel,4, sizeof(cl_mem),(void *)&rhistBuffer); status = clSetKernelArg(kernel,5, sizeof(cl_mem),(void *)&ghistBuffer); status = clSetKernelArg(kernel,6, sizeof(cl_mem),(void *)&bhistBuffer); status = clEnqueueNDRangeKernel( commandQueue, kernel, 2, NULL, globalThreads, localThreads, //NULL, 0, NULL, &events[0]); /* wait for the kernel call to finish execution */ status = clWaitForEvents(1, &events[0]); clReleaseEvent(events[0]); /* Enqueue readBuffer*/ status = clEnqueueReadBuffer( commandQueue, outputBuffer, CL_TRUE, 0, width * 3 * sizeof(cl_uchar), output, 0, NULL, &events[1]); status = clWaitForEvents(1, &events[1]); clReleaseEvent(events[1]); /* Enqueue readBufferof rhist*/ status = clEnqueueReadBuffer(commandQueue,rhistBuffer,CL_TRUE, 0,256 * sizeof(cl_int),rhist, 0, NULL, &events[2]); status = clWaitForEvents(1, &events[2]); clReleaseEvent(events[2]); /* Enqueue readBufferof ghist*/ status = clEnqueueReadBuffer(commandQueue,ghistBuffer,CL_TRUE, 0,256 * sizeof(cl_int),ghist, 0, NULL, &events[3]); status = clWaitForEvents(1, &events[3]); clReleaseEvent(events[3]); /* Enqueue readBufferof bhist*/ status = clEnqueueReadBuffer(commandQueue,bhistBuffer,CL_TRUE, 0,256 * sizeof(cl_int),bhist, 0, NULL, &events[4]); status = clWaitForEvents(1, &events[4]); clReleaseEvent(events[4]); /*Map Buffer*/ output =(cl_uchar *)clEnqueueMapBuffer(commandQueue,outputBuffer,CL_TRUE,CL_MAP_READ|CL_MAP_WRITE,0, sizeof(cl_uchar) * width * 3 , 0,NULL, &events[0],&status); status = clWaitForEvents(1, &events[0]); clReleaseEvent(events[0]); i = linesToProcess; for (j = 1; j <= lineLen; j++) { output[(i - 2) * 3 * lineLen + (j - 1) * 3 + 0] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 0]; output[(i - 2) * 3 * lineLen + (j - 1) * 3 + 1] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 1]; output[(i - 2) * 3 * lineLen + (j - 1) * 3 + 2] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 2]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 0] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 0]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 1] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 1]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 2] = output[(i - 3) * 3 * lineLen + (j - 1) * 3 + 2]; } j = lineLen; for (i = 1; i <= linesToProcess; i++) { output[(i - 1) * 3 * lineLen + (j - 2) * 3 + 0] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 0]; output[(i - 1) * 3 * lineLen + (j - 2) * 3 + 1] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 1]; output[(i - 1) * 3 * lineLen + (j - 2) * 3 + 2] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 2]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 0] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 0]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 1] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 1]; output[(i - 1) * 3 * lineLen + (j - 1) * 3 + 2] = output[(i - 1) * 3 * lineLen + (j - 3) * 3 + 2]; } cl_int cnt = 0; for(i=1;i<255;i++) { if(r5sum < percentile5) { r5sum += rhist; } else if(rc == 0) { rc = i; cnt++; } if(r95sum < percentile5) { r95sum += rhist[255-i]; } else if(rd == 0) { rd = 255-i; cnt++; } if(g5sum < percentile5) { g5sum += ghist; } else if(gc == 0) { gc = i; cnt++; } if(g95sum < percentile5) { g95sum += ghist[255-i]; } else if(gd == 0) { gd = 255-i; cnt++; } if(b5sum < percentile5) { b5sum += bhist; } else if(bc == 0) { bc = i; cnt++; } if(b95sum < percentile5) { b95sum += bhist[255-i]; } else if(bd == 0) { bd = 255-i; cnt++; } if(cnt == 6) break; } ra = rc-30; ga = gc-30; ba = bc-30; rb = rd + 30; gb = gd + 30; bb = bd + 30; cl_int ravg = (rb-ra)/(rd-rc); cl_int gavg = (gb-ga)/(gd-gc); cl_int bavg = (bb-ba)/(bd-bc); clSetKernelArg(gammacolorkernel,0, sizeof(cl_mem),(void *)&outputBuffer); clSetKernelArg(gammacolorkernel,1, sizeof(cl_int),(void *)&rc); clSetKernelArg(gammacolorkernel,2, sizeof(cl_int),(void *)&gc); clSetKernelArg(gammacolorkernel,3, sizeof(cl_int),(void *)&bc); clSetKernelArg(gammacolorkernel,4, sizeof(cl_int),(void *)&ravg); clSetKernelArg(gammacolorkernel,5, sizeof(cl_int),(void *)&gavg); clSetKernelArg(gammacolorkernel,6, sizeof(cl_int),(void *)&bavg); clSetKernelArg(gammacolorkernel,7, sizeof(cl_int),(void *)&ra); clSetKernelArg(gammacolorkernel,8, sizeof(cl_int),(void *)&ga); status=clSetKernelArg(gammacolorkernel,9, sizeof(cl_int),(void *)&ba); /*UnMap Buffer*/ status=clEnqueueUnmapMemObject(commandQueue,outputBuffer,output,0,NULL, &events[0]); status = clWaitForEvents(1, &events[0]); clReleaseEvent(events[0]); /* * Enqueue a gammacolorkernel run call. */ globalThreads[0] =width; localThreads[0] =256; status = clEnqueueNDRangeKernel(commandQueue,gammacolorkernel,1,NULL,globalThreads, localThreads, 0, NULL,&events[0]); status = clWaitForEvents(1, &events[0]); clReleaseEvent(events[0]); status = clEnqueueReadBuffer(commandQueue,outputBuffer,CL_TRUE,0,width * 3 * sizeof(cl_uchar),output,0,NULL, &events[1]); status = clWaitForEvents(1, &events[1]); clReleaseEvent(events[1]); return 0; } /* * \brief Release OpenCL resources (Context, Memory etc.) */ int cleanupCL(void) { cl_int status; status = clReleaseKernel(kernel); status = clReleaseKernel(gammacolorkernel); status = clReleaseProgram(program); status = clReleaseMemObject(inputBuffer); status = clReleaseMemObject(outputBuffer); status = clReleaseMemObject(rhistBuffer); status = clReleaseMemObject(ghistBuffer); status = clReleaseMemObject(bhistBuffer); status = clReleaseCommandQueue(commandQueue); status = clReleaseContext(context); return 0; } void cleanupHost(void) { if(rhist != NULL) { free(rhist); rhist = NULL; } if(ghist != NULL) { free(ghist); ghist = NULL; } if(bhist != NULL) { free(bhist); bhist = NULL; } if(devices != NULL) { free(devices); devices = NULL; } } /* OpenCL code called from C++ Project :This is called in loop for each video frame */ int test_main(unsigned char *input,unsigned char *output) { if(initializeHost()==1) return 1; if(initializeCL(input,output)==1) return 1; if(runCLKernels(output)==1) return 1; if(cleanupCL()==1) return 1; cleanupHost(); return 0; }

omkaranathan · ‎05-18-2010

Pavan,

What are you trying to do?

You seem to be calling the whole cl program from your app, which means you are doing the whole setup every frame! This explains why your application is slow. If you are trying to speedup your GL program, take a look at the GL interoperability samples.

pavandsp · ‎05-19-2010

Hi Omkar,

Thanks for the reply and yaa i have modifed it by bringup the setup outside the frame loop and the speed is fine.

I am not using GL api's ...why is that u mentioned about GL interoperability?

did u see any GL apis in my code.

Thanks

Pavan

Archives Discussions

Huge Performace drop when OpenCL code was Integrated to C++ project