Archives Discussions

Acen · ‎02-15-2011

I have measured the time to tranfer a buffer from the CPU to the GPU. It appears that there is a threshold of more than 1 ms whatever the buffer size is. The procedure I use is:

clCreateBuffer
clEnqueueWriteBuffer with option CL_FALSE
clWaitForEvents
clGetEventProfilingInfo

Do I missed something or is it normal to have a 1 ms latency?

genaganna · ‎02-15-2011

Originally posted by: Acen I have measured the time to tranfer a buffer from the CPU to the GPU. It appears that there is a threshold of more than 1 ms whatever the buffer size is. The procedure I use is:

clCreateBuffer
clEnqueueWriteBuffer with option CL_FALSE
clWaitForEvents
clGetEventProfilingInfo
Do I missed something or is it normal to have a 1 ms latency?

Are you calculating average time or not? please copy your code here.

Acen · ‎02-16-2011

Thank you for your help. Here is my code.

#include "Test.hpp" /******************************************************* * Host Initialization: Allocate and initialize memory ********************************************************/ void initializeHost(void) { inpMem = NULL; outMem = NULL; inpMem = (int16_t *) malloc(sizeInput * sizeof(int16_t)); if(inpMem == NULL) {std::cout << "Error: Failed to allocate inpMem memory on host" << std::endl; return;} outMem = (int16_t *) malloc(sizeOutput * sizeof(int16_t)); if(outMem == NULL) {std::cout << "Error: Failed to allocate inpMem memory on host" << std::endl; return;} matSize[0] = nbBlocksX; matSize[1] = nbBlocksY; srand(1); for(uint i = 0; i < sizeInput ; i++) { inpMem = (int16_t)rand(); } std::cout << "Initialize host." << std::endl; } /***************************************************** * Converts the contents of a file into a string ******************************************************/ std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL;} f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } return NULL; } /******************************************************** * Initialize OpenCL ressources: * Get Platform IDs and Infos of AMD Platform * Create contex and get devices * Create command queues * Create buffers * Create and build CL program * Create kernel object *********************************************************/ void initializeCL(void) { cl_int status = 0; size_t deviceListSize; /************************************************ * Get Platform info and recover AMD platform *************************************************/ cl_uint numPlatforms; cl_platform_id platform = NULL; /* Get the number of platforms --> numplatform */ status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platforms. (clGetPlatformsIDs)" << std::endl; return; } /* Get the list of platforms --> platforms */ if(numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platform Ids. (clGetPlatformsIDs)" << std::endl; return; } /* Get the platform vendors --> pbuff and break when AMD is found*/ for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms, CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platform Info. (clGetPlatformInfo)" << std::endl; return; } platform = platforms; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } delete platforms; } if(NULL == platform) { std::cout << "NULL platform found so Exiting Application." << std::endl; return;} /************************************************ * Get Device informations *************************************************/ cl_uint numDevices; /* Get number of devices */ status = clGetDeviceIDs(platform, devType, 0, NULL, &numDevices); if(status != CL_SUCCESS) { std::cout << "Error: Getting Devices. (clGetDeviceIDs)" << std::endl; return; } /* Get ID of the device */ status = clGetDeviceIDs(platform, devType, numDevices, &DeviceID, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Device ID. (clGetDeviceIDs)" << std::endl; return; } /******************************************************** * Create an OpenCL context for CPU and get device infos *********************************************************/ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, devType, NULL, NULL, &status); if(status != CL_SUCCESS) { std::cout << "Error: Creating Context. (clCreateContextFromType)" << std::endl; return; } std::cout << "Initialize CL." << std::endl; /************************************************** * Create an OpenCL command queue for device 0 ***************************************************/ commandQueue = clCreateCommandQueue( context, DeviceID, CL_QUEUE_PROFILING_ENABLE, &status); if(status != CL_SUCCESS) { std::cout << "Creating Command Queue. (clCreateCommandQueue)" << std::endl; return;} /***************************************************** * Create OpenCL memory input and output buffers ******************************************************/ inpBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY, sizeof(int16_t) * sizeInput, NULL, &status); if(status != CL_SUCCESS) { std::cout << "Error: clCreateBuffer (inpBuffer)" << std::endl; return; } outBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(int16_t) * sizeOutput, NULL, &status); if(status != CL_SUCCESS) { std::cout << "Error: clCreateBuffer (outBuffer)" << std::endl; return; } matSizeBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY, sizeof(uint) * 2 , NULL, &status); if(status != CL_SUCCESS) { std::cout << "Error: clCreateBuffer (matSizeBuffer)" << std::endl; return; } std::cout << "Buffer created " << std::endl; /***************************************** * Create a program object with source ******************************************/ const char * filename = "Test_Kernel.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(status != CL_SUCCESS){ std::cout << "Error: Loading Binary into cl_program (clCreateProgramWithBinary)" << std::endl; return; } std::cout << "Program created " << std::endl; /********************************* * Build the program created above **********************************/ status = clBuildProgram( program, 1, &DeviceID, NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Building Program (clBuildProgram)" << status; return; } std::cout << "Program build " << std::endl; /********************************** * Create a kernel object ***********************************/ kernelUsed = clCreateKernel( program, "H4", &status); if(status != CL_SUCCESS) { std::cout << "Error: Creating Kernel from program. (clCreateKernel)" << std::endl; return; } std::cout << "Kernel created " << std::endl; } /******************************************** * Run OpenCL program *********************************************/ void runCLKernels(void) { cl_int status; cl_event kern_ev[3]; /************************************** * Enqueue WriteBuffer: Input ***************************************/ status = clEnqueueWriteBuffer( commandQueue, inpBuffer, CL_TRUE, 0, sizeInput * sizeof(int16_t), inpMem, 0, NULL, &kern_ev[0]); if(status != CL_SUCCESS) { std::cout << "Error: clEnqueueInputBuffer failed (clEnqueueWriteBuffer)" << std::endl;} status = clEnqueueWriteBuffer( commandQueue, matSizeBuffer, CL_TRUE, 0, 2 *sizeof(uint), matSize, 0, NULL, &kern_ev[1]); if(status != CL_SUCCESS) { std::cout << "Error: clEnqueueBiaBuffer failed (clEnqueueWriteBuffer)" << std::endl;} /******************************************** * Set appropriate arguments to the kernel *********************************************/ /* 1st argument: input of the kernel */ status = clSetKernelArg( kernelUsed, 0, sizeof(cl_mem), (void *)&inpBuffer); if(status != CL_SUCCESS) { std::cout << "Error: Setting kernelUsed argument. (input)" << std::endl; return; } /* 2nd argument: output of the kernel */ status = clSetKernelArg( kernelUsed, 1, sizeof(cl_mem), (void *)&outBuffer); if(status != CL_SUCCESS) { std::cout << "Error: Setting kernelUsed argument. (output)" << std::endl; return; } /* 3rd argument: size of a row */ status = clSetKernelArg( kernelUsed, 2, sizeof(cl_mem), (void *)&matSizeBuffer); if(status != CL_SUCCESS) { std::cout << "Error: Setting kernelUsed argument. (row size)" << std::endl; return; } /************************************************** * wait for the Write buffer to finish execution ***************************************************/ status = clWaitForEvents(2, &kern_ev[0]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for kernelUsed run to finish (clWaitForEvents)" << std::endl; return;} status = clReleaseEvent(kern_ev[0]); if(status != CL_SUCCESS) {std::cout << "Error: clReleaseEvent. (kern_ev[0])" << std::endl; return;} status = clReleaseEvent(kern_ev[1]); if(status != CL_SUCCESS) { std::cout << "Error: clReleaseEvent. (kern_ev[1])" << std::endl; return;} /******************************** * Profiling information. *********************************/ cl_ulong timeWriteInQueue; cl_ulong timeWriteEnd; cl_ulong deltaWriteIn; status = clGetEventProfilingInfo( kern_ev[0], CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), (void *) &timeWriteInQueue, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time buffer_in queue) "<< status << std::endl; return;} status = clGetEventProfilingInfo( kern_ev[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), (void *) &timeWriteEnd, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time buffer_in process) "<< status << std::endl; return;} deltaWriteIn = (timeWriteEnd - timeWriteInQueue) / 1000; std::cout << "Write command queued to end: "<< deltaWriteIn << " Âµs"<< std::endl; /******************************** * Enqueue a kernel run call. *********************************/ size_t localThreads[2] = {dimBlock, dimBlock}; size_t globalThreads[2] = {nbBlocksX*dimBlock, nbBlocksY*dimBlock}; status = clEnqueueNDRangeKernel(commandQueue, kernelUsed, 2, NULL, globalThreads, localThreads, 0, NULL, &kern_ev[2]); if(status != CL_SUCCESS) { std::cout << "Error: Enqueueing kernelUsed onto command queue (clEnqueueNDRangeKernel)" << status << std::endl; return;} /************************************************** * wait for the kernel call to finish execution ***************************************************/ status = clWaitForEvents(1, &kern_ev[2]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for kernelUsed run to finish (clWaitForEvents)" << std::endl; return;} status = clReleaseEvent(kern_ev[2]); if(status != CL_SUCCESS) { std::cout << "Error: clReleaseEvent. (kern_ev[2])" << std::endl; return;} /******************************** * Profiling information. *********************************/ cl_ulong timeKerInQueue; cl_ulong timeKerSubmit; cl_ulong timeKerStart; cl_ulong timeKerEnd; cl_ulong deltaWrite2KerInQueue; cl_ulong deltaKerIn2Submit; cl_ulong deltaKerSubmit2Start; cl_ulong deltaKerIn2End; cl_ulong deltaIn; cl_ulong deltaKernel; status = clGetEventProfilingInfo( kern_ev[2], CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), (void *) &timeKerInQueue, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time command enqueued) "<< status << std::endl; return;} status = clGetEventProfilingInfo( kern_ev[2], CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), (void *) &timeKerSubmit, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time command submit) "<< status << std::endl; return;} status = clGetEventProfilingInfo( kern_ev[2], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), (void *) &timeKerStart, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time command start) "<< status << std::endl; return;} status = clGetEventProfilingInfo( kern_ev[2], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), (void *) &timeKerEnd, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time command end) "<< status << std::endl; return;} deltaWrite2KerInQueue = (timeKerInQueue - timeWriteInQueue) / 1000; deltaKerIn2Submit = (timeKerSubmit - timeKerInQueue) / 1000; deltaKerSubmit2Start = (timeKerStart - timeKerSubmit) / 1000; deltaKernel = (timeKerEnd - timeKerStart) / 1000; std::cout << "Write command in queue to kernel command in queue :"<< deltaWrite2KerInQueue<< " ÂµS "<< std::endl; std::cout << "Kernel command in queue to kernel submit :"<< deltaKerIn2Submit << " ÂµS "<< std::endl; std::cout << "Kernel submit to kernel start :"<< deltaKerSubmit2Start << " ÂµS "<< std::endl; std::cout << "Kernel start to kernel end :"<< deltaKernel << " ÂµS "<< std::endl; /************************************** * Enqueue readBuffer: Output ***************************************/ status = clEnqueueReadBuffer( commandQueue, outBuffer, CL_TRUE, 0, sizeOutput * sizeof(int16_t), outMem, 0, NULL, &kern_ev[3]); if(status != CL_SUCCESS) { std::cout << "Error: clEnqueueOutputBuffer failed (clEnqueueReadBuffer)" << std::endl;} /************************************************** * Wait for the read buffer to finish execution ***************************************************/ status = clWaitForEvents(1,&kern_ev[3]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for read buffer call to finish (clWaitForEvents)" << std::endl; return;} status = clReleaseEvent(kern_ev[3]); if(status != CL_SUCCESS) { std::cout << "Error: clReleaseEvent. (kern_ev[3])" << std::endl; return; } /******************************** * Profiling information. *********************************/ cl_ulong timeReadEnd; cl_ulong deltaOut; cl_ulong deltaAll; status = clGetEventProfilingInfo( kern_ev[3], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), (void *) &timeReadEnd, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time buffer outqueued) "<< status << std::endl; return;} deltaOut = (timeReadEnd - timeKerEnd) / 1000; deltaAll = (timeReadEnd - timeWriteInQueue) / 1000; std::cout << "Kernel end to Read command end: "<< deltaOut << " ÂµS "<< std::endl; std::cout << "Total duration: "<< deltaAll << " ÂµS "<< std::endl; } /**************************************************** * Release OpenCL resources (Context, Memory etc.) *****************************************************/ void cleanupCL(void) { cl_int status; status = clReleaseKernel(kernelUsed); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseKernel " << std::endl; return;} status = clReleaseProgram(program); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseProgram" << std::endl; return;} status = clReleaseMemObject(inpBuffer); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseMemObject (inpBuffer)" << std::endl; return;} status = clReleaseMemObject(outBuffer); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseMemObject (outBuffer)" << std::endl; return;} status = clReleaseMemObject(matSizeBuffer); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseMemObject (matSizeBuffer)" << std::endl; return;} status = clReleaseCommandQueue(commandQueue); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseCommandQueue" << std::endl; return;} status = clReleaseContext(context); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseContext " << std::endl; return;} } /************************************** * Releases program's resources ***************************************/ void cleanupHost(void) { if(inpMem != NULL) { free(inpMem); inpMem = NULL; } if(outMem != NULL) { free(outMem); outMem = NULL; } } /********************* * Main **********************/ int main(int argc, char * argv[]) { string typeProc = argv[1]; nbBlocksX = atoi(argv[2]); nbBlocksY = atoi(argv[3]); bool passed = true; // Parse arguments if (typeProc == "CPU") { devType = CL_DEVICE_TYPE_CPU; std::cout << "CPU device" << std::endl; } else { std::cout << "GPU device" << std::endl; } sizeInput = dimBlock * dimBlock * nbBlocksX * nbBlocksY; sizeOutput = dimBlock * dimBlock * nbBlocksX * nbBlocksY; // Initialize Host application initializeHost(); // Initialize OpenCL resources initializeCL(); // Run the CL program runCLKernels(); // Releases OpenCL resources cleanupCL(); // Release host resources cleanupHost(); return 0; }

genaganna · ‎02-17-2011

Originally posted by: Acen Thank you for your help. Here is my code.

You missed to give Test.hpp. I have looked at your code. Please read Section 4.3 in AMD_Accelerated_Parallel_Processing_OpenCL_Programming_Guide.pdf for information on estimating performance.

Always calculate average time.

Acen · ‎02-17-2011

I'm going to have a look in the Programming Guide. Here is the hpp and the kernel.

#ifndef Test_H_ #define Test_H_ #include <CL/cl.h> #include <string.h> #include <cstdlib> #include <iostream> #include <string> #include <fstream> #include <sys/time.h> #include <unistd.h> using namespace std; /*** GLOBALS ***/ // uint nbBlocksX; uint nbBlocksY; uint dimBlock = 4; uint matSize[2]; /* Memories */ int16_t *inpMem; int16_t *outMem; uint sizeInput; uint sizeOutput; /* The memory buffer that is used as input/output for OpenCL kernel */ cl_mem inpBuffer; cl_mem outBuffer; cl_mem matSizeBuffer; /* Open CL */ int devType = CL_DEVICE_TYPE_GPU; cl_context context; cl_device_id DeviceID; cl_command_queue commandQueue; cl_program program; /* Kernels */ cl_kernel kernelUsed; /*** FUNCTION DECLARATIONS ***/ void initializeHost(void); void initializeCL(void); std::string convertToString(const char * filename); void runCLKernels(void); void cleanupCL(void); void cleanupHost(void); #endif __kernel void H4(__global short * input, __global short * output, __global uint * size) { __local short hda[4][4]; __local short inter[4][4]; __local short block_in[4][4]; uint i = get_local_id(0); uint j = get_local_id(1); uint X = get_global_id(0); uint Y = get_global_id(1); uint nbBlockX = size[0]; uint nbBlockY = size[1]; hda[0][0]= 1; hda[0][1]= 1; hda[0][2]= 1; hda[0][3]= 1; hda[1][0]= 1; hda[1][1]= 1; hda[1][2]= -1;hda[1][3]= -1; hda[2][0]= 1; hda[2][1]= -1; hda[2][2]= -1;hda[2][3]= 1; hda[3][0]= 1; hda[3][1]= -1; hda[3][2]= 1;hda[3][3]= -1; // Local matrix block_in = input[Y*nbBlockX*4+X]; // Calculate A x XT int acc = 0; for (uint k = 0 ; k < 4 ; k++) { acc += hda * block_in; } inter = acc; // Ensure all is finished before proceeding barrier(CLK_LOCAL_MEM_FENCE); //Initialize accumulator acc = 0; // Calculate (A x XT) x A for (uint k = 0 ; k < 4 ; k++) { acc += inter * hda; } output[Y*nbBlockX*4+X] = (acc >> 1); }

genaganna · ‎02-17-2011

Originally posted by: Acen I'm going to have a look in the Programming Guide. Here is the hpp and the kernel.

Please modify code as per programming guide and get back with your questions.

Archives Discussions

Buffer transfer CPU to GPU very slow