cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

Acen
Journeyman III

Buffer transfer CPU to GPU very slow

I have measured the time to tranfer a buffer from the CPU to the GPU. It appears that there is a threshold of more than 1 ms whatever the buffer size is. The procedure I use is:

  1. clCreateBuffer
  2. clEnqueueWriteBuffer with option CL_FALSE
  3. clWaitForEvents
  4. clGetEventProfilingInfo

Do I missed something or is it normal to have a 1 ms latency?

0 Likes
5 Replies
genaganna
Journeyman III

Originally posted by: Acen I have measured the time to tranfer a buffer from the CPU to the GPU. It appears that there is a threshold of more than 1 ms whatever the buffer size is. The procedure I use is:

 

  1. clCreateBuffer
  2. clEnqueueWriteBuffer with option CL_FALSE
  3. clWaitForEvents
  4. clGetEventProfilingInfo
Do I missed something or is it normal to have a 1 ms latency?

 

Are you calculating average time or not?  please copy your code here.

0 Likes

Thank you for your help. Here is my code.

#include "Test.hpp" /******************************************************* * Host Initialization: Allocate and initialize memory ********************************************************/ void initializeHost(void) { inpMem = NULL; outMem = NULL; inpMem = (int16_t *) malloc(sizeInput * sizeof(int16_t)); if(inpMem == NULL) {std::cout << "Error: Failed to allocate inpMem memory on host" << std::endl; return;} outMem = (int16_t *) malloc(sizeOutput * sizeof(int16_t)); if(outMem == NULL) {std::cout << "Error: Failed to allocate inpMem memory on host" << std::endl; return;} matSize[0] = nbBlocksX; matSize[1] = nbBlocksY; srand(1); for(uint i = 0; i < sizeInput ; i++) { inpMem = (int16_t)rand(); } std::cout << "Initialize host." << std::endl; } /***************************************************** * Converts the contents of a file into a string ******************************************************/ std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL;} f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } return NULL; } /******************************************************** * Initialize OpenCL ressources: * Get Platform IDs and Infos of AMD Platform * Create contex and get devices * Create command queues * Create buffers * Create and build CL program * Create kernel object *********************************************************/ void initializeCL(void) { cl_int status = 0; size_t deviceListSize; /************************************************ * Get Platform info and recover AMD platform *************************************************/ cl_uint numPlatforms; cl_platform_id platform = NULL; /* Get the number of platforms --> numplatform */ status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platforms. (clGetPlatformsIDs)" << std::endl; return; } /* Get the list of platforms --> platforms */ if(numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platform Ids. (clGetPlatformsIDs)" << std::endl; return; } /* Get the platform vendors --> pbuff and break when AMD is found*/ for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms, CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platform Info. (clGetPlatformInfo)" << std::endl; return; } platform = platforms; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } delete platforms; } if(NULL == platform) { std::cout << "NULL platform found so Exiting Application." << std::endl; return;} /************************************************ * Get Device informations *************************************************/ cl_uint numDevices; /* Get number of devices */ status = clGetDeviceIDs(platform, devType, 0, NULL, &numDevices); if(status != CL_SUCCESS) { std::cout << "Error: Getting Devices. (clGetDeviceIDs)" << std::endl; return; } /* Get ID of the device */ status = clGetDeviceIDs(platform, devType, numDevices, &DeviceID, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Device ID. (clGetDeviceIDs)" << std::endl; return; } /******************************************************** * Create an OpenCL context for CPU and get device infos *********************************************************/ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, devType, NULL, NULL, &status); if(status != CL_SUCCESS) { std::cout << "Error: Creating Context. (clCreateContextFromType)" << std::endl; return; } std::cout << "Initialize CL." << std::endl; /************************************************** * Create an OpenCL command queue for device 0 ***************************************************/ commandQueue = clCreateCommandQueue( context, DeviceID, CL_QUEUE_PROFILING_ENABLE, &status); if(status != CL_SUCCESS) { std::cout << "Creating Command Queue. (clCreateCommandQueue)" << std::endl; return;} /***************************************************** * Create OpenCL memory input and output buffers ******************************************************/ inpBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY, sizeof(int16_t) * sizeInput, NULL, &status); if(status != CL_SUCCESS) { std::cout << "Error: clCreateBuffer (inpBuffer)" << std::endl; return; } outBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(int16_t) * sizeOutput, NULL, &status); if(status != CL_SUCCESS) { std::cout << "Error: clCreateBuffer (outBuffer)" << std::endl; return; } matSizeBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY, sizeof(uint) * 2 , NULL, &status); if(status != CL_SUCCESS) { std::cout << "Error: clCreateBuffer (matSizeBuffer)" << std::endl; return; } std::cout << "Buffer created " << std::endl; /***************************************** * Create a program object with source ******************************************/ const char * filename = "Test_Kernel.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(status != CL_SUCCESS){ std::cout << "Error: Loading Binary into cl_program (clCreateProgramWithBinary)" << std::endl; return; } std::cout << "Program created " << std::endl; /********************************* * Build the program created above **********************************/ status = clBuildProgram( program, 1, &DeviceID, NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Building Program (clBuildProgram)" << status; return; } std::cout << "Program build " << std::endl; /********************************** * Create a kernel object ***********************************/ kernelUsed = clCreateKernel( program, "H4", &status); if(status != CL_SUCCESS) { std::cout << "Error: Creating Kernel from program. (clCreateKernel)" << std::endl; return; } std::cout << "Kernel created " << std::endl; } /******************************************** * Run OpenCL program *********************************************/ void runCLKernels(void) { cl_int status; cl_event kern_ev[3]; /************************************** * Enqueue WriteBuffer: Input ***************************************/ status = clEnqueueWriteBuffer( commandQueue, inpBuffer, CL_TRUE, 0, sizeInput * sizeof(int16_t), inpMem, 0, NULL, &kern_ev[0]); if(status != CL_SUCCESS) { std::cout << "Error: clEnqueueInputBuffer failed (clEnqueueWriteBuffer)" << std::endl;} status = clEnqueueWriteBuffer( commandQueue, matSizeBuffer, CL_TRUE, 0, 2 *sizeof(uint), matSize, 0, NULL, &kern_ev[1]); if(status != CL_SUCCESS) { std::cout << "Error: clEnqueueBiaBuffer failed (clEnqueueWriteBuffer)" << std::endl;} /******************************************** * Set appropriate arguments to the kernel *********************************************/ /* 1st argument: input of the kernel */ status = clSetKernelArg( kernelUsed, 0, sizeof(cl_mem), (void *)&inpBuffer); if(status != CL_SUCCESS) { std::cout << "Error: Setting kernelUsed argument. (input)" << std::endl; return; } /* 2nd argument: output of the kernel */ status = clSetKernelArg( kernelUsed, 1, sizeof(cl_mem), (void *)&outBuffer); if(status != CL_SUCCESS) { std::cout << "Error: Setting kernelUsed argument. (output)" << std::endl; return; } /* 3rd argument: size of a row */ status = clSetKernelArg( kernelUsed, 2, sizeof(cl_mem), (void *)&matSizeBuffer); if(status != CL_SUCCESS) { std::cout << "Error: Setting kernelUsed argument. (row size)" << std::endl; return; } /************************************************** * wait for the Write buffer to finish execution ***************************************************/ status = clWaitForEvents(2, &kern_ev[0]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for kernelUsed run to finish (clWaitForEvents)" << std::endl; return;} status = clReleaseEvent(kern_ev[0]); if(status != CL_SUCCESS) {std::cout << "Error: clReleaseEvent. (kern_ev[0])" << std::endl; return;} status = clReleaseEvent(kern_ev[1]); if(status != CL_SUCCESS) { std::cout << "Error: clReleaseEvent. (kern_ev[1])" << std::endl; return;} /******************************** * Profiling information. *********************************/ cl_ulong timeWriteInQueue; cl_ulong timeWriteEnd; cl_ulong deltaWriteIn; status = clGetEventProfilingInfo( kern_ev[0], CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), (void *) &timeWriteInQueue, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time buffer_in queue) "<< status << std::endl; return;} status = clGetEventProfilingInfo( kern_ev[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), (void *) &timeWriteEnd, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time buffer_in process) "<< status << std::endl; return;} deltaWriteIn = (timeWriteEnd - timeWriteInQueue) / 1000; std::cout << "Write command queued to end: "<< deltaWriteIn << " µs"<< std::endl; /******************************** * Enqueue a kernel run call. *********************************/ size_t localThreads[2] = {dimBlock, dimBlock}; size_t globalThreads[2] = {nbBlocksX*dimBlock, nbBlocksY*dimBlock}; status = clEnqueueNDRangeKernel(commandQueue, kernelUsed, 2, NULL, globalThreads, localThreads, 0, NULL, &kern_ev[2]); if(status != CL_SUCCESS) { std::cout << "Error: Enqueueing kernelUsed onto command queue (clEnqueueNDRangeKernel)" << status << std::endl; return;} /************************************************** * wait for the kernel call to finish execution ***************************************************/ status = clWaitForEvents(1, &kern_ev[2]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for kernelUsed run to finish (clWaitForEvents)" << std::endl; return;} status = clReleaseEvent(kern_ev[2]); if(status != CL_SUCCESS) { std::cout << "Error: clReleaseEvent. (kern_ev[2])" << std::endl; return;} /******************************** * Profiling information. *********************************/ cl_ulong timeKerInQueue; cl_ulong timeKerSubmit; cl_ulong timeKerStart; cl_ulong timeKerEnd; cl_ulong deltaWrite2KerInQueue; cl_ulong deltaKerIn2Submit; cl_ulong deltaKerSubmit2Start; cl_ulong deltaKerIn2End; cl_ulong deltaIn; cl_ulong deltaKernel; status = clGetEventProfilingInfo( kern_ev[2], CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), (void *) &timeKerInQueue, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time command enqueued) "<< status << std::endl; return;} status = clGetEventProfilingInfo( kern_ev[2], CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), (void *) &timeKerSubmit, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time command submit) "<< status << std::endl; return;} status = clGetEventProfilingInfo( kern_ev[2], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), (void *) &timeKerStart, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time command start) "<< status << std::endl; return;} status = clGetEventProfilingInfo( kern_ev[2], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), (void *) &timeKerEnd, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time command end) "<< status << std::endl; return;} deltaWrite2KerInQueue = (timeKerInQueue - timeWriteInQueue) / 1000; deltaKerIn2Submit = (timeKerSubmit - timeKerInQueue) / 1000; deltaKerSubmit2Start = (timeKerStart - timeKerSubmit) / 1000; deltaKernel = (timeKerEnd - timeKerStart) / 1000; std::cout << "Write command in queue to kernel command in queue :"<< deltaWrite2KerInQueue<< " µS "<< std::endl; std::cout << "Kernel command in queue to kernel submit :"<< deltaKerIn2Submit << " µS "<< std::endl; std::cout << "Kernel submit to kernel start :"<< deltaKerSubmit2Start << " µS "<< std::endl; std::cout << "Kernel start to kernel end :"<< deltaKernel << " µS "<< std::endl; /************************************** * Enqueue readBuffer: Output ***************************************/ status = clEnqueueReadBuffer( commandQueue, outBuffer, CL_TRUE, 0, sizeOutput * sizeof(int16_t), outMem, 0, NULL, &kern_ev[3]); if(status != CL_SUCCESS) { std::cout << "Error: clEnqueueOutputBuffer failed (clEnqueueReadBuffer)" << std::endl;} /************************************************** * Wait for the read buffer to finish execution ***************************************************/ status = clWaitForEvents(1,&kern_ev[3]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for read buffer call to finish (clWaitForEvents)" << std::endl; return;} status = clReleaseEvent(kern_ev[3]); if(status != CL_SUCCESS) { std::cout << "Error: clReleaseEvent. (kern_ev[3])" << std::endl; return; } /******************************** * Profiling information. *********************************/ cl_ulong timeReadEnd; cl_ulong deltaOut; cl_ulong deltaAll; status = clGetEventProfilingInfo( kern_ev[3], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), (void *) &timeReadEnd, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Get profiling information (Time buffer outqueued) "<< status << std::endl; return;} deltaOut = (timeReadEnd - timeKerEnd) / 1000; deltaAll = (timeReadEnd - timeWriteInQueue) / 1000; std::cout << "Kernel end to Read command end: "<< deltaOut << " µS "<< std::endl; std::cout << "Total duration: "<< deltaAll << " µS "<< std::endl; } /**************************************************** * Release OpenCL resources (Context, Memory etc.) *****************************************************/ void cleanupCL(void) { cl_int status; status = clReleaseKernel(kernelUsed); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseKernel " << std::endl; return;} status = clReleaseProgram(program); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseProgram" << std::endl; return;} status = clReleaseMemObject(inpBuffer); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseMemObject (inpBuffer)" << std::endl; return;} status = clReleaseMemObject(outBuffer); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseMemObject (outBuffer)" << std::endl; return;} status = clReleaseMemObject(matSizeBuffer); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseMemObject (matSizeBuffer)" << std::endl; return;} status = clReleaseCommandQueue(commandQueue); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseCommandQueue" << std::endl; return;} status = clReleaseContext(context); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseContext " << std::endl; return;} } /************************************** * Releases program's resources ***************************************/ void cleanupHost(void) { if(inpMem != NULL) { free(inpMem); inpMem = NULL; } if(outMem != NULL) { free(outMem); outMem = NULL; } } /********************* * Main **********************/ int main(int argc, char * argv[]) { string typeProc = argv[1]; nbBlocksX = atoi(argv[2]); nbBlocksY = atoi(argv[3]); bool passed = true; // Parse arguments if (typeProc == "CPU") { devType = CL_DEVICE_TYPE_CPU; std::cout << "CPU device" << std::endl; } else { std::cout << "GPU device" << std::endl; } sizeInput = dimBlock * dimBlock * nbBlocksX * nbBlocksY; sizeOutput = dimBlock * dimBlock * nbBlocksX * nbBlocksY; // Initialize Host application initializeHost(); // Initialize OpenCL resources initializeCL(); // Run the CL program runCLKernels(); // Releases OpenCL resources cleanupCL(); // Release host resources cleanupHost(); return 0; }

0 Likes

Originally posted by: Acen Thank you for your help. Here is my code.

 

You missed to give Test.hpp.  I  have looked at your code. Please read Section 4.3 in AMD_Accelerated_Parallel_Processing_OpenCL_Programming_Guide.pdf for information on estimating performance.

Always calculate average time.

0 Likes

I'm going to have a look in the Programming Guide. Here is the hpp and the kernel.

#ifndef Test_H_ #define Test_H_ #include <CL/cl.h> #include <string.h> #include <cstdlib> #include <iostream> #include <string> #include <fstream> #include <sys/time.h> #include <unistd.h> using namespace std; /*** GLOBALS ***/ // uint nbBlocksX; uint nbBlocksY; uint dimBlock = 4; uint matSize[2]; /* Memories */ int16_t *inpMem; int16_t *outMem; uint sizeInput; uint sizeOutput; /* The memory buffer that is used as input/output for OpenCL kernel */ cl_mem inpBuffer; cl_mem outBuffer; cl_mem matSizeBuffer; /* Open CL */ int devType = CL_DEVICE_TYPE_GPU; cl_context context; cl_device_id DeviceID; cl_command_queue commandQueue; cl_program program; /* Kernels */ cl_kernel kernelUsed; /*** FUNCTION DECLARATIONS ***/ void initializeHost(void); void initializeCL(void); std::string convertToString(const char * filename); void runCLKernels(void); void cleanupCL(void); void cleanupHost(void); #endif __kernel void H4(__global short * input, __global short * output, __global uint * size) { __local short hda[4][4]; __local short inter[4][4]; __local short block_in[4][4]; uint i = get_local_id(0); uint j = get_local_id(1); uint X = get_global_id(0); uint Y = get_global_id(1); uint nbBlockX = size[0]; uint nbBlockY = size[1]; hda[0][0]= 1; hda[0][1]= 1; hda[0][2]= 1; hda[0][3]= 1; hda[1][0]= 1; hda[1][1]= 1; hda[1][2]= -1;hda[1][3]= -1; hda[2][0]= 1; hda[2][1]= -1; hda[2][2]= -1;hda[2][3]= 1; hda[3][0]= 1; hda[3][1]= -1; hda[3][2]= 1;hda[3][3]= -1; // Local matrix block_in = input[Y*nbBlockX*4+X]; // Calculate A x XT int acc = 0; for (uint k = 0 ; k < 4 ; k++) { acc += hda * block_in; } inter = acc; // Ensure all is finished before proceeding barrier(CLK_LOCAL_MEM_FENCE); //Initialize accumulator acc = 0; // Calculate (A x XT) x A for (uint k = 0 ; k < 4 ; k++) { acc += inter * hda; } output[Y*nbBlockX*4+X] = (acc >> 1); }

0 Likes

Originally posted by: Acen I'm going to have a look in the Programming Guide. Here is the hpp and the kernel.

 

Please modify code as per programming guide and get back with your questions.

0 Likes