licoah

Bad performance on OpenCL

Discussion created by licoah on Feb 9, 2010
Latest reply on Feb 10, 2010 by licoah

I wrote a program, and I got really bad performance: about 3.3 GB/s.

And when I use the combination of clCreateBuffer and clEnqueueWriteBuffer, the execution time of the kernel (enqueue) is less than the time when I just use clCreateBuffer with CL_MEM_USE_HOST_PTR. Why?

And when I use local memory, the execution time also increases. Why?

Could somebody help me with this?

#include "main.hpp"
#include "array.h"

using namespace std;

#define KERNEL_VERSION 2

//! number of images -> currently this is fixed
const int nimages = 1;
//! pad size -> this does not have to be changed
int padding = 1; // 16 on GPU, 1 on CPU
//! do computation on CPU or GPU
const int location = onGPU;
//! number of multigrid levels
const int nlevels = 1;
//! number of multigrid V(2,2)-cycles -> this does not have to be changed
int iters = 200;
//! problem size for solver (has to be a multiple of 2^levels)
int nrows = 2048;
int ncols = nrows;
//! sizes of images (have to fit into problem size nrows >= imagerows+4, ncols >= imagecols+4)
int imagerows = 400;
int imagecols = 1100;
//! work-group geometry; the kernel is launched with (localNumWorkitems+1) items per dimension
int localNumWorkitems_Y = 15;
int numGroup_Y = 128;
int localNumWorkitems_X = 15;
int numGroup_X = 128;

/*** GLOBALS ***/
Array2D<float> gradx;   //!< x-gradient result, read back from the device
Array2D<float> grady;   //!< y-gradient result, read back from the device
Array2D<float> *image;  //!< input image array (new[]'d in main, released in cleanupHost)
Array2D<float> img;     //!< padded copy of image[0] uploaded to the device

/*
 * Converts the contents of a file into a string.
 *
 * \param filename path of the file to read (binary mode)
 * \return the full file contents, or an empty string if the file
 *         could not be opened.
 *
 * FIX: the original returned NULL from a std::string-returning function
 * (undefined behavior — std::string(NULL) crashes) and leaked the
 * `new char[size+1]` buffer. Reading through the stream buffer avoids
 * both problems.
 */
std::string convertToString(const char *filename)
{
    std::ifstream f(filename, std::ios::in | std::ios::binary);
    if (!f.is_open()) {
        return std::string(); // empty string signals failure to the caller
    }
    std::ostringstream contents;
    contents << f.rdbuf();
    return contents.str();
}

/*
 * \brief OpenCL related initialization
 *   Create Context, Device list, Command Queue
 *   Create OpenCL memory buffer objects
 *   Load CL file, compile, link CL source
 *   Build program and kernel objects
 *
 * \return 0 on success, 1 on any OpenCL error (a message is printed).
 */
int initializeCL(void)
{
    cl_int status = 0;
    size_t deviceListSize;

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    cout << "platform: " << numPlatforms << endl;
    if (status != CL_SUCCESS) {
        std::cout << "Error: Getting Platforms. (clGetPlatformsIDs)\n";
        return 1;
    }
    if (numPlatforms > 0) {
        cl_platform_id *platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if (status != CL_SUCCESS) {
            std::cout << "Error: Getting Platform Ids. (clGetPlatformsIDs)\n";
            delete [] platforms; // FIX: do not leak on the error path
            return 1;
        }
        for (unsigned int i = 0; i < numPlatforms; ++i) {
            char pbuff[100];
            status = clGetPlatformInfo(platforms[i],
                                       CL_PLATFORM_VENDOR,
                                       sizeof(pbuff),
                                       pbuff,
                                       NULL);
            platform = platforms[i];
            if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
                break; // prefer the AMD platform
            }
        }
        delete [] platforms; // FIX: array delete (original used scalar `delete`)
    }

    /*
     * If we could find our platform, use it. Otherwise pass a NULL and get
     * whatever the implementation thinks we should be using.
     */
    cl_context_properties cps[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };
    cl_context_properties *cprops = (NULL == platform) ? NULL : cps;

    /////////////////////////////////////////////////////////////////
    // Create an OpenCL context
    /////////////////////////////////////////////////////////////////
    context = clCreateContextFromType(cprops, CL_DEVICE_TYPE_GPU,
                                      NULL, NULL, &status);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Creating Context. (clCreateContextFromType)\n";
        return 1;
    }

    /* First, get the size of device list data */
    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL,
                              &deviceListSize);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Getting Context Info \
(device list size, clGetContextInfo)\n";
        return 1;
    }

    /////////////////////////////////////////////////////////////////
    // Detect OpenCL devices
    /////////////////////////////////////////////////////////////////
    cout << "device list: " << deviceListSize << endl;
    devices = (cl_device_id *)malloc(deviceListSize);
    if (devices == 0) {
        std::cout << "Error: No devices found.\n";
        return 1;
    }

    /* Now, get the device list data */
    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize,
                              devices, NULL);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Getting Context Info \
(device list, clGetContextInfo)\n";
        return 1;
    }

    /////////////////////////////////////////////////////////////////
    // Create an OpenCL command queue (on the first device)
    /////////////////////////////////////////////////////////////////
    commandQueue = clCreateCommandQueue(context,
                                        devices[0],
                                        0,
                                        &status);
    if (status != CL_SUCCESS) {
        std::cout << "Creating Command Queue. (clCreateCommandQueue)\n";
        return 1;
    }

    /////////////////////////////////////////////////////////////////
    // Create OpenCL memory buffers
    /////////////////////////////////////////////////////////////////
    imageBuffer = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY, // alternative: CL_MEM_USE_HOST_PTR with img.begin()
                                 sizeof(cl_float) * length,
                                 NULL,
                                 &status);
    if (status != CL_SUCCESS) {
        std::cout << "Error: clCreateBuffer (inputBuffer)\n";
        return 1;
    }

    /* Upload the image data. FIX: the original ignored this status. */
    status = clEnqueueWriteBuffer(commandQueue, imageBuffer, CL_TRUE, 0,
                                  sizeof(cl_float) * length, img.begin(),
                                  0, NULL, NULL);
    if (status != CL_SUCCESS) {
        std::cout << "Error: clEnqueueWriteBuffer (imageBuffer)\n";
        return 1;
    }

    gradxBuffer = clCreateBuffer(context,
                                 CL_MEM_WRITE_ONLY,
                                 sizeof(cl_float) * length,
                                 NULL,
                                 &status);
    if (status != CL_SUCCESS) {
        std::cout << "Error: clCreateBuffer (outputBuffer)\n" << status;
        return 1;
    }

    gradyBuffer = clCreateBuffer(context,
                                 CL_MEM_WRITE_ONLY,
                                 sizeof(cl_float) * length,
                                 NULL,
                                 &status);
    if (status != CL_SUCCESS) {
        std::cout << "Error: clCreateBuffer (outputBuffer)\n";
        return 1;
    }

    /////////////////////////////////////////////////////////////////
    // Load CL file, build CL program object, create CL kernel object
    /////////////////////////////////////////////////////////////////
    const char *filename = "gradient.cl";
    std::string sourceStr = convertToString(filename);
    if (sourceStr.empty()) {
        // FIX: convertToString now reports failure with an empty string
        std::cout << "Error: could not read kernel source file " << filename << "\n";
        return 1;
    }
    const char *source = sourceStr.c_str();
    size_t sourceSize[] = { strlen(source) };
    program = clCreateProgramWithSource(context,
                                        1,
                                        &source,
                                        sourceSize,
                                        &status);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Loading Binary into cl_program \
(clCreateProgramWithBinary)\n";
        return 1;
    }

    /* create a cl program executable for all the devices specified */
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Building Program (clBuildProgram)" << endl;
        size_t len;
        char buffer[2048];
        clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG,
                              sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        /* FIX: CL_PROGRAM_BUILD_STATUS returns a cl_build_status enum,
         * not a string — the original printed raw bytes with %s. */
        cl_build_status buildStatus;
        clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_STATUS,
                              sizeof(buildStatus), &buildStatus, NULL);
        printf("build status: %d\n", (int)buildStatus);
        clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_OPTIONS,
                              sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return 1;
    }

    /* get a kernel object handle for a kernel with the given name */
    kernel = clCreateKernel(program, "templateKernel", &status);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Creating Kernel from program. (clCreateKernel)\n";
        return 1;
    }
    return 0;
}

/*
 * \brief Run OpenCL program
 *
 *   Bind host variables to kernel arguments
 *   Run the CL kernel (10 timed iterations) and read back the gradients
 *
 * \return 0 on success, 1 on any OpenCL error (a message is printed).
 */
int runCLKernels(void)
{
    cl_int status;
    cl_uint maxDims;
    cl_event events[2];
    /* NOTE(review): the +1 makes the launch geometry (blockSize+1)^2 per
     * group, matching the local-memory tile in the kernel — confirm this
     * is intentional; the commented-out variant used localNumWorkitems
     * directly. */
    size_t globalThreads[2] = { (localNumWorkitems_Y + 1) * numGroup_Y,
                                (localNumWorkitems_X + 1) * numGroup_X };
    size_t localThreads[2]  = { localNumWorkitems_Y + 1,
                                localNumWorkitems_X + 1 };
    size_t maxWorkGroupSize;
    size_t maxWorkItemSizes[3];

    /*
     * Query device capabilities: maximum work-group size, maximum
     * work-item dimensions, and the maximum work-item sizes.
     */
    clGetDeviceInfo(devices[0],
                    CL_DEVICE_MAX_WORK_GROUP_SIZE,
                    sizeof(size_t),
                    (void *)&maxWorkGroupSize,
                    NULL);
    clGetDeviceInfo(devices[0],
                    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                    sizeof(cl_uint),
                    (void *)&maxDims,
                    NULL);
    clGetDeviceInfo(devices[0],
                    CL_DEVICE_MAX_WORK_ITEM_SIZES,
                    sizeof(size_t) * maxDims,
                    (void *)maxWorkItemSizes,
                    NULL);

    cout << "local threads x : " << localThreads[0] << endl;
    cout << "local threads y : " << localThreads[1] << endl;
    cout << "local threads total : " << localThreads[0] * localThreads[1] << endl;

    if (localThreads[0] > maxWorkItemSizes[0] ||
        localThreads[1] > maxWorkItemSizes[1] ||
        localThreads[0] * localThreads[1] > maxWorkGroupSize) {
        std::cout << "Unsupported: Device does not support requested number of work items.";
        return 1;
    }

    /*** Set appropriate arguments to the kernel ***/
    /* the gradx array to the kernel */
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&gradxBuffer);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (gradx)\n";
        return 1;
    }
    /* the grady array to the kernel */
    status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&gradyBuffer);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (grady)\n";
        return 1;
    }
    /* the input array to the kernel */
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&imageBuffer);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (image)\n";
        return 1;
    }

    /* the constants to the kernel */
#if KERNEL_VERSION == 2
    cout << "version 2" << endl;
    status = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&width);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (width)\n";
        return 1;
    }
    /* local memory tile: (blockSize+1)^2 floats */
    status = clSetKernelArg(kernel, 3,
                            sizeof(cl_float) * (localNumWorkitems_Y + 1) * (localNumWorkitems_X + 1),
                            NULL);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (blocksize)\n";
        return 1;
    }
    status = clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&localNumWorkitems_X);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (blocksize)\n";
        return 1;
    }
#elif KERNEL_VERSION == 1
    cout << "version 1" << endl;
    status = clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&width);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (width)\n";
        return 1;
    }
#else
    cout << " Not implemented yet !!" << endl;
#endif

    /*
     * Enqueue a kernel run call (10 iterations, timed).
     */
    CPerfCounter timer;
    timer.GetElapsedTime();
    timer.Reset();
    timer.Start();
    for (int i = 0; i < 10; ++i) {
        status = clEnqueueNDRangeKernel(commandQueue,
                                        kernel,
                                        2,
                                        NULL,
                                        globalThreads,
                                        localThreads,
                                        0,
                                        NULL,
                                        &events[0]);
        if (status != CL_SUCCESS) {
            std::cout << "Error: Enqueueing kernel onto command queue. \
(clEnqueueNDRangeKernel)\n";
            return 1;
        }

        /* wait for the kernel call to finish execution */
        status = clWaitForEvents(1, &events[0]);
        if (status != CL_SUCCESS) {
            std::cout << "Error: Waiting for kernel run to finish. \
(clWaitForEvents)\n";
            return 1;
        }
    }
    timer.Stop();
    cout << "Enqueue " << timer.GetElapsedTime() << endl;
    clReleaseEvent(events[0]);

    /* Enqueue readBuffer (blocking reads, timed) */
    timer.Reset();
    timer.Start();
    status = clEnqueueReadBuffer(commandQueue,
                                 gradxBuffer,
                                 CL_TRUE,
                                 0,
                                 length * sizeof(cl_float),
                                 gradx.begin(),
                                 0,
                                 NULL,
                                 &events[1]);
    if (status != CL_SUCCESS) {
        std::cout << "Error: clEnqueueReadBuffer failed. \
(clEnqueueReadBuffer)\n";
        return 1;
    }
    status = clEnqueueReadBuffer(commandQueue,
                                 gradyBuffer,
                                 CL_TRUE,
                                 0,
                                 length * sizeof(cl_float),
                                 grady.begin(),
                                 0,
                                 NULL,
                                 &events[1]);
    if (status != CL_SUCCESS) {
        std::cout << "Error: clEnqueueReadBuffer failed. \
(clEnqueueReadBuffer)\n";
        return 1;
    }

    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[1]);
    if (status != CL_SUCCESS) {
        std::cout << "Error: Waiting for read buffer call to finish. \
(clWaitForEvents)\n";
        return 1;
    }
    timer.Stop();
    cout << "read " << timer.GetElapsedTime() << endl;
    clReleaseEvent(events[1]);
    return 0;
}

/*
 * \brief Release OpenCL resources (Context, Memory etc.)
 *
 * \return 0 on success, 1 if any release call failed.
 */
int cleanupCL(void)
{
    cl_int status;

    status = clReleaseKernel(kernel);
    if (status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseKernel \n";
        return 1;
    }
    status = clReleaseProgram(program);
    if (status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseProgram\n";
        return 1;
    }
    status = clReleaseMemObject(gradxBuffer);
    if (status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseMemObject (gradxbuffer)\n";
        return 1;
    }
    status = clReleaseMemObject(gradyBuffer);
    if (status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseMemObject (gradybuffer)\n";
        return 1;
    }
    status = clReleaseMemObject(imageBuffer);
    if (status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseMemObject (imageBuffer)\n";
        return 1;
    }
    status = clReleaseCommandQueue(commandQueue);
    if (status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseCommandQueue\n";
        return 1;
    }
    status = clReleaseContext(context);
    if (status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseContext\n";
        return 1;
    }
    return 0;
}

/*
 * \brief Releases program's host-side resources.
 */
void cleanupHost(void)
{
    if (image != NULL) {
        /* FIX: image was allocated with new[] in main();
         * the original called free() on it, which is undefined behavior. */
        delete [] image;
        image = NULL;
    }
    if (devices != NULL) {
        free(devices); // allocated with malloc() in initializeCL()
        devices = NULL;
    }
}

int main(int argc, char *argv[])
{
    /* Grow the solver grid if the image does not fit. */
    if (nrows < max(imagerows, imagecols)) {
        nrows = max(imagerows, imagecols) + 4;
        ncols = nrows;
        cout << "solver sizes to small changed to " << nrows << " x " << ncols << endl;
    }
    /* The solver size has to be a multiple of 2^(nlevels-1). */
    int levelspad = int(pow(2.0f, nlevels - 1));
    if (nrows % levelspad != 0) {
        cout << "Wrong solver size " << nrows << " x " << ncols
             << " has to be a multiple of " << levelspad << endl;
        nrows += levelspad - nrows % levelspad;
        ncols = nrows;
        cout << "Problem size changed to " << nrows << " x " << ncols << endl;
    }
    if (location == onCPU)
        cout << "compute on CPU" << endl;
    else
        cout << "compute on GPU" << endl;
    unsigned int cpu_thread_id = 1;

    // Storage for solution, right hand side, residual, and images on each level
    Array2D<float> *Sol = new Array2D<float>[nlevels];
    Array2D<float> *RHS = new Array2D<float>[nlevels];
    Array2D<float> *Res = new Array2D<float>[nlevels];
    image = new Array2D<float>[nimages];

    // read input image
    readImageSiemens<unsigned short, cl_float>("comp_1_06_cropped.dri",
                                               imagerows, imagecols,
                                               nrows + 1, ncols + 1,
                                               image[0]);
    image[0].plotpgm("bild.pgm");
    cout << "image width " << image[0].ncols() << endl;

    // allocate memory on each level for solution, RHS, and residual on CPU or GPU
    // setting: node-based grid and Dirichlet boundaries -> add 1 point in each direction
    int sizeaddrow = 1, sizeaddcol = 1;
    for (int i = 0; i < nlevels; i++) {
        Sol[i].resize(nrows + sizeaddrow, ncols + sizeaddcol, onCPU, padding);
        RHS[i].resize(nrows + sizeaddrow, ncols + sizeaddcol, onCPU, padding);
        Res[i].resize(nrows + sizeaddrow, ncols + sizeaddcol, onCPU, padding);
        cout << "lev: " << i << " " << nrows + sizeaddrow << " " << ncols + sizeaddcol
             << " real size " << Sol[i].nrows() << " " << Sol[i].ncols() << endl;
        nrows = (nrows / 2);
        ncols = (ncols / 2);
    }

    // memory on CPU for solution and image
    Array2D<float> lsg(Sol[0].nrows(), Sol[0].ncols(), onHost, padding);
    // gradient images
    gradx.resize(RHS[0].nrows(), RHS[0].ncols(), onCPU, padding);
    grady.resize(RHS[0].nrows(), RHS[0].ncols(), onCPU, padding);
    img.resize(image[0].nrows(), image[0].ncols(), onCPU, padding);

    CPerfCounter timer;
    timer.GetElapsedTime();
    timer.Reset();
    width = img.ncols();
    length = img.getsize();
    img = image[0];

    // Initialize OpenCL resources
    timer.Start();
    if (initializeCL() == 1)
        return 1;
    timer.Stop();
    cout << "initializeCL " << timer.GetElapsedTime() << endl;
    img.plotpgm("bild.pgm");

    // Run the CL program
    if (runCLKernels() == 1)
        return 1;
    gradx.plotpgm("gradx.pgm");
    grady.plotpgm("grady.pgm");

    // Releases OpenCL resources
    if (cleanupCL() == 1)
        return 1;
    // FIX: release host resources and the Res array (both were leaked)
    cleanupHost();
    delete [] Sol;
    delete [] RHS;
    delete [] Res;
    return 0;
}

***********************************************************************

kernel:

__kernel void templateKernel(__global float *image,
                             __global float *gradx,
                             __global float *grady,
                             __local float *block,
                             const uint width, // the width of image
                             const uint blockSize)
{
    uint globalIdx = get_global_id(0);
    uint globalIdy = get_global_id(1);
    uint pos = globalIdy * width + globalIdx;
    /* forward differences in x and y */
    gradx[pos] = image[pos + 1] - image[pos];
    grady[pos] = image[pos + width] - image[pos];
    /*
    uint localIdx = get_local_id(0);
    uint localIdy = get_local_id(1);
    int bx = get_group_id(0);
    int by = get_group_id(1);
    int image_width = bx * blockSize + localIdx;
    int image_height = by * blockSize + localIdy;
    uint pos = image_height*width + image_width;
    uint pos_block = localIdy*(blockSize+1) + localIdx;
    // copy from input to local memory
    block[pos_block] = image[pos];
    // wait until the whole block is filled
    barrier(CLK_LOCAL_MEM_FENCE);
    float xval = block[pos_block + 1] - block[pos_block];
    float yval = block[pos_block + blockSize + 1] - block[pos_block];
    pos = pos + localIdx/blockSize * 5000000 + localIdy/blockSize * 5000000;
    gradx[pos] = xval;
    grady[pos] = yval;
    */
}

Outcomes