2 Replies Latest reply on Feb 10, 2010 8:55 PM by licoah

    bad performance on Opencl

    licoah

      I wrote a program, and I got really bad performance — about 3.3 GB/s.

      And when I use the combination of clCreateBuffer and clEnqueueWriteBuffer, the execution time of the kernel (enqueue) is less than the time when I just use clCreateBuffer with CL_MEM_USE_HOST_PTR. Why?

      And when I use local memory, the execution time also increases. Why?

      Maybe somebody can help me?

      #include "main.hpp" #include "array.h" using namespace std; #define KERNEL_VERSION 2 //! number of images -> currently this is fixed const int nimages = 1; //! pad size -> this does not have to be changed int padding = 1; // 16 on GPU, 1 on CPU //! do computation on CPU or GPU const int location = onGPU; //! number of multigrid levels const int nlevels = 1; //! number of multigrid V(2,2)-cycles -> this does not have to be changed int iters = 200; //! problem size for solver (has to be a multiple of 2^levels) int nrows = 2048; int ncols = nrows; //! sizes of images (have to fit into problem size nrows >= imagerows+4, ncols >= imagecols+4) int imagerows = 400; int imagecols = 1100; int localNumWorkitems_Y = 15; int numGroup_Y = 128; int localNumWorkitems_X = 15; int numGroup_X = 128; /*** GLOBALS ***/ Array2D<float> gradx; Array2D<float> grady; Array2D<float> *image; Array2D<float> img; /* * Converts the contents of a file into a string */ std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; return s; } return NULL; } /* * \brief OpenCL related initialization * Create Context, Device list, Command Queue * Create OpenCL memory buffer objects * Load CL file, compile, link CL source * Build program and kernel objects */ int initializeCL(void) { cl_int status = 0; size_t deviceListSize; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); cout << "platform: "<<numPlatforms<<endl; if(status != CL_SUCCESS) { std::cout << "Error: Getting Platforms. 
(clGetPlatformsIDs)\n"; return 1; } if(numPlatforms > 0) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platform Ids. (clGetPlatformsIDs)\n"; return 1; } for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); platform = platforms[i]; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } delete platforms; } /* * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; cl_context_properties* cprops = (NULL == platform) ? NULL : cps; ///////////////////////////////////////////////////////////////// // Create an OpenCL context ///////////////////////////////////////////////////////////////// context = clCreateContextFromType(cprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); if(status != CL_SUCCESS) { std::cout<<"Error: Creating Context. 
(clCreateContextFromType)\n"; return 1; } /* First, get the size of device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(status != CL_SUCCESS) { std::cout<< "Error: Getting Context Info \ (device list size, clGetContextInfo)\n"; return 1; } ///////////////////////////////////////////////////////////////// // Detect OpenCL devices ///////////////////////////////////////////////////////////////// cout<<"device list: "<<deviceListSize<<endl; devices = (cl_device_id *)malloc(deviceListSize); if(devices == 0) { std::cout<<"Error: No devices found.\n"; return 1; } /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(status != CL_SUCCESS) { std::cout<< "Error: Getting Context Info \ (device list, clGetContextInfo)\n"; return 1; } ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// commandQueue = clCreateCommandQueue( context, devices[0],//*************************************** 0, &status); if(status != CL_SUCCESS) { std::cout<<"Creating Command Queue. 
(clCreateCommandQueue)\n"; return 1; } ///////////////////////////////////////////////////////////////// // Create OpenCL memory buffers ///////////////////////////////////////////////////////////////// imageBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY,//| CL_MEM_USE_HOST_PTR, sizeof(cl_float) * length, NULL,//img.begin(), &status); if(status != CL_SUCCESS) { std::cout<<"Error: clCreateBuffer (inputBuffer)\n"; return 1; } status = clEnqueueWriteBuffer (commandQueue, imageBuffer, CL_TRUE, 0, sizeof(cl_float) * length, img.begin(), 0, NULL, NULL); gradxBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY,// | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * length, NULL,//gradx.begin(), &status); if(status != CL_SUCCESS) { std::cout<<"Error: clCreateBuffer (outputBuffer)\n"<<status; return 1; } gradyBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, //| CL_MEM_USE_HOST_PTR, sizeof(cl_float) * length, NULL,//grady.begin(), &status); if(status != CL_SUCCESS) { std::cout<<"Error: clCreateBuffer (outputBuffer)\n"; return 1; } ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// const char * filename = "gradient.cl";//?????????????????????? 
std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(status != CL_SUCCESS) { std::cout<< "Error: Loading Binary into cl_program \ (clCreateProgramWithBinary)\n"; return 1; } /* create a cl program executable for all the devices specified */ status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: Building Program (clBuildProgram)" <<endl; size_t len; char buffer[2048]; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_STATUS, sizeof(buffer), buffer, &len); printf("%s\n", buffer); clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_OPTIONS, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return 1; } /* get a kernel object handle for a kernel with the given name */ kernel = clCreateKernel(program, "templateKernel", &status); if(status != CL_SUCCESS) { std::cout<<"Error: Creating Kernel from program. (clCreateKernel)\n"; return 1; } return 0; } /* * \brief Run OpenCL program * * Bind host variables to kernel arguments * Run the CL kernel */ int runCLKernels(void) { cl_int status; cl_uint maxDims; cl_event events[2]; //size_t globalThreads[2] = {localNumWorkitems_Y * numGroup_Y, localNumWorkitems_X * numGroup_X}; // to be changed //size_t localThreads[2] = {localNumWorkitems_Y, localNumWorkitems_X};// to be changed size_t globalThreads[2] = {(localNumWorkitems_Y+1) * numGroup_Y, (localNumWorkitems_X+1) * numGroup_X}; size_t localThreads[2] = {localNumWorkitems_Y+1, localNumWorkitems_X+1}; size_t maxWorkGroupSize; size_t maxWorkItemSizes[3]; /** * Query device capabilities. 
Maximum * work item dimensions and the maximmum * work item sizes */ clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL); clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDims, NULL); clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDims, (void*)maxWorkItemSizes, NULL); cout<<"local threads x : "<< localThreads[0]<<endl; cout<<"local threads y : "<< localThreads[1]<<endl; cout<<"local threads total : "<< localThreads[0]*localThreads[1]<<endl; if(localThreads[0] > maxWorkItemSizes[0] || localThreads[1] > maxWorkItemSizes[1]|| localThreads[0]*localThreads[1] > maxWorkGroupSize) { std::cout<<"Unsupported: Device does not support requested number of work items."; return 1; } /*** Set appropriate arguments to the kernel ***/ /* the gradx array to the kernel */ status = clSetKernelArg( kernel, 1, sizeof(cl_mem), (void *)&gradxBuffer); if(status != CL_SUCCESS) { std::cout<<"Error: Setting kernel argument. (gradx)\n"; return 1; } /* the grady array to the kernel */ status = clSetKernelArg( kernel, 2, sizeof(cl_mem), (void *)&gradyBuffer); if(status != CL_SUCCESS) { std::cout<<"Error: Setting kernel argument. (grady)\n"; return 1; } /* the input array to the kernel */ status = clSetKernelArg( kernel, 0, sizeof(cl_mem), (void *)&imageBuffer); if(status != CL_SUCCESS) { std::cout<<"Error: Setting kernel argument. (image)\n"; return 1; } /* the constant to the kernel */ #if KERNEL_VERSION == 2 cout<<"version 2"<<endl; status = clSetKernelArg( kernel, 4, sizeof(cl_int), (void *)&width); if(status != CL_SUCCESS) { std::cout<<"Error: Setting kernel argument. (width)\n"; return 1; } /*local memory*/ status = clSetKernelArg( kernel, 3, sizeof(cl_float)*(localNumWorkitems_Y+1)*(localNumWorkitems_X+1), NULL); if(status != CL_SUCCESS) { std::cout<<"Error: Setting kernel argument. 
(blocksize)\n"; return 1; } status = clSetKernelArg( kernel, 5, sizeof(cl_int), (void*)&localNumWorkitems_X); if(status != CL_SUCCESS) { std::cout<<"Error: Setting kernel argument. (blocksize)\n"; return 1; } #elif KERNEL_VERSION == 1 cout<<"version 1"<<endl; status = clSetKernelArg( kernel, 3, sizeof(cl_int), (void *)&width); if(status != CL_SUCCESS) { std::cout<<"Error: Setting kernel argument. (width)\n"; return 1; } #else cout<< " Not implemented yet !!"<<endl; #endif /* * Enqueue a kernel run call. */ CPerfCounter timer; timer.GetElapsedTime(); timer.Reset(); timer.Start(); for(int i=0;i<10;++i) { status = clEnqueueNDRangeKernel( commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(status != CL_SUCCESS) { std::cout<< "Error: Enqueueing kernel onto command queue. \ (clEnqueueNDRangeKernel)\n"; return 1; } /* wait for the kernel call to finish execution */ status = clWaitForEvents(1, &events[0]); if(status != CL_SUCCESS) { std::cout<< "Error: Waiting for kernel run to finish. \ (clWaitForEvents)\n"; return 1; } } timer.Stop(); cout<<"Enqueue "<<timer.GetElapsedTime()<<endl; clReleaseEvent(events[0]); /* Enqueue readBuffer*/ timer.Reset(); timer.Start(); status = clEnqueueReadBuffer( commandQueue, gradxBuffer, CL_TRUE, 0, length * sizeof(cl_float), gradx.begin(), 0, NULL, &events[1]); if(status != CL_SUCCESS) { std::cout << "Error: clEnqueueReadBuffer failed. \ (clEnqueueReadBuffer)\n"; return 1; } status = clEnqueueReadBuffer( commandQueue, gradyBuffer, CL_TRUE, 0, length * sizeof(cl_float), grady.begin(), 0, NULL, &events[1]); if(status != CL_SUCCESS) { std::cout << "Error: clEnqueueReadBuffer failed. \ (clEnqueueReadBuffer)\n"; return 1; } /* Wait for the read buffer to finish execution */ status = clWaitForEvents(1, &events[1]); if(status != CL_SUCCESS) { std::cout<< "Error: Waiting for read buffer call to finish. 
\ (clWaitForEvents)\n"; return 1; } timer.Stop(); cout<<"read "<<timer.GetElapsedTime()<<endl; clReleaseEvent(events[1]); return 0; } /* * \brief Release OpenCL resources (Context, Memory etc.) */ int cleanupCL(void) { cl_int status; status = clReleaseKernel(kernel); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseKernel \n"; return 1; } status = clReleaseProgram(program); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseProgram\n"; return 1; } status = clReleaseMemObject(gradxBuffer); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseMemObject (gradxbuffer)\n"; return 1; } status = clReleaseMemObject(gradyBuffer); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseMemObject (gradybuffer)\n"; return 1; } status = clReleaseMemObject(imageBuffer); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseMemObject (imageBuffer)\n"; return 1; } status = clReleaseCommandQueue(commandQueue); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseCommandQueue\n"; return 1; } status = clReleaseContext(context); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseContext\n"; return 1; } return 0; } /* * \brief Releases program's resources */ void cleanupHost(void) { if(image != NULL) { free(image); image = NULL; } /* if(gradx != NULL) { free(gradx); gradx = NULL; } if(grady != NULL) { free(grady); grady = NULL; } */ if(devices != NULL) { free(devices); devices = NULL; } } int main(int argc, char * argv[]) { if (nrows < max(imagerows,imagecols)) { nrows = max(imagerows,imagecols)+4; ncols = nrows; cout << "solver sizes to small changed to " << nrows << " x " << ncols << endl; } int levelspad = int(pow(2.0f,nlevels-1)); if (nrows % levelspad != 0) { cout << "Wrong solver size " << nrows << " x " << ncols << " has to be a multiple of " << levelspad << endl; nrows += levelspad - nrows%levelspad; ncols = nrows; cout << "Problem size changed to " << nrows << " x " << ncols << endl; } if ( location == onCPU ) cout << "compute on CPU" << 
endl; else cout << "compute on GPU" << endl; unsigned int cpu_thread_id = 1; // Storage for solution, right hand side, residual, and images on each level Array2D<float>* Sol = new Array2D<float>[nlevels]; Array2D<float>* RHS = new Array2D<float>[nlevels]; Array2D<float>* Res = new Array2D<float>[nlevels]; //Array2D<float>* images = new Array2D<float>[nimages]; image = new Array2D<float>[nimages]; // read input image readImageSiemens<unsigned short,cl_float>("comp_1_06_cropped.dri", imagerows,imagecols, nrows+1 , ncols+1, image[0]); image[0].plotpgm("bild.pgm"); cout<<"image width "<<image[0].ncols()<<endl; // copy image[0] data from host to deviceListSize // allocate memory on each level for solution, RHS, and residual on CPU or GPU // setting: node-based grid and Dirichlet boundaries -> add 1 point in each direction int sizeaddrow = 1, sizeaddcol = 1; // in Brook+ mem alloc on GPU not needed! for (int i = 0; i < nlevels; i++ ) { Sol[i].resize ( nrows+sizeaddrow,ncols+sizeaddcol, onCPU, padding ); RHS[i].resize ( nrows+sizeaddrow,ncols+sizeaddcol, onCPU, padding ); Res[i].resize ( nrows+sizeaddrow,ncols+sizeaddcol, onCPU, padding ); cout << "lev: " << i << " " << nrows+sizeaddrow << " " << ncols+sizeaddcol << " real size " << Sol[i].nrows() << " " << Sol[i].ncols() << endl; nrows= ( nrows/2 ); ncols= ( ncols/2 ); } // memory on CPU for solution and image Array2D<float> lsg ( Sol[0].nrows(),Sol[0].ncols(), onHost, padding ); //Array2D<float> img ( image[0].nrows(),image[0].ncols(), onGPU ); // gradient images gradx.resize( RHS[0].nrows(),RHS[0].ncols(), onCPU, padding ); grady.resize( RHS[0].nrows(),RHS[0].ncols(), onCPU, padding ); img.resize ( image[0].nrows(),image[0].ncols(), onCPU, padding ); CPerfCounter timer; timer.GetElapsedTime(); timer.Reset(); width = img.ncols(); length = img.getsize(); // Initialize Host application // if(initializeHost(img)==1) // return 1; img = image[0]; // Initialize OpenCL resources timer.Start(); if(initializeCL()==1) return 1; 
timer.Stop(); cout << "initializeCL " << timer.GetElapsedTime() << endl; img.plotpgm("bild.pgm"); // Run the CL program if(runCLKernels()==1) return 1; gradx.plotpgm ( "gradx.pgm" ); grady.plotpgm ( "grady.pgm" ); // Releases OpenCL resources if(cleanupCL()==1) return 1; delete [] Sol; delete [] RHS; return 0; } *********************************************************************** kernel: __kernel void templateKernel(__global float* image, __global float* gradx, __global float* grady, __local float* block, const uint width,// the width of image const uint blockSize) { uint globalIdx = get_global_id(0); uint globalIdy = get_global_id(1); uint pos = globalIdy * width + globalIdx; gradx[pos] = image[pos+1]- image[pos]; grady[pos] = image[pos+width] - image[pos]; /* uint localIdx = get_local_id(0); uint localIdy = get_local_id(1); int bx = get_group_id(0); int by = get_group_id(1); int image_width = bx * blockSize + localIdx; int image_height = by * blockSize + localIdy; uint pos = image_height*width + image_width; uint pos_block = localIdy*(blockSize+1) + localIdx; // copy from input to local memory block[pos_block] = image[pos]; // wait until the whole block is filled barrier(CLK_LOCAL_MEM_FENCE); float xval = block[pos_block + 1] - block[pos_block]; float yval = block[pos_block + blockSize + 1] - block[pos_block]; pos = pos + localIdx/blockSize * 5000000 + localIdy/blockSize * 5000000; gradx[pos] = xval; grady[pos] = yval; */ }

        • bad performance on Opencl
          n0thing

          How are you measuring kernel execution time? You should use queue profiling to get accurate times.

          clCreateBuffer + clEnqueueWriteBuffer will result in immediate transfer of data to the device over the PCIe bus. So when you measure your kernel time by using host timers over the runCLKernels() function, it will not include the bus transfer time.

          Using CL_MEM_USE_HOST_PTR will result in a transfer over the PCIe bus as soon as the kernel is invoked, so using host timers will result in the bus transfer time being added to the kernel execution time.

          If you are on windows, then you can use the profiler to accurately measure your kernel time.

          What group size are you using?