9 Replies Latest reply on Oct 4, 2010 5:39 PM by himanshu.gautam

    Change in work per workitem doesn't change amount of fetches/writes

    Raistmer
      Why it can be?

      I have some big long kernel with rather small execution domain.
      When I do some of inner loop unrolls and use more workitems with smaller load on each performance greatly increases.
      But each workitem requires its own memory storage space. A single buffer is limited to 128MB in size, so I tried to use 2 temporary buffers of 128MB each to increase the unroll factor.
      But in this case performance did not increase; it hugely decreased instead.
      Looking into profiler data I see that number of read/write instructions per workitem remained almost the same as before (when number of workitems was 2 times lower and each performed 2 times more work).
      I use the listed code to alternate between memory buffers. But it looks like each thread passes through both paths. Can that be? Why doesn't the number of fetch and write instructions drop by a factor of 2?

      EDIT: I use third dimension in execution domain for unroll. It was 32 with single temp buffer and 64 with 2 temp buffers.

      ......... int z=get_global_id(2); int TOffset2 = (32*y+z) * AdvanceBy; if(z>=32){ TOffset2 = (32*y+(z-32)) * AdvanceBy; } .......... __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4); if(z>=32){//R: other half will use secondary bufer. It';s because of 128MB limit per buffer in current OpenCL ATi implementation tmp_pot=tmp2 + ul_PoT + TOffset2 * (fft_len4); }

        • Change in work per workitem doesn't change amount of fetches/writes
          jeff_golds

          When you say small domain, how small do you mean?  What GPU are you running on?

          Jeff

          • Change in work per workitem doesn't change amount of fetches/writes
            Raistmer
            HD4870 GPU.
            Typical values can be {4,53,32} and I am trying to expand that to {4,53,64} by using 2 buffers instead of one.
            The execution domain varies between kernel calls; the first number can be from 4 to something like 256. In the profiler I see the same behavior for different X-dimension sizes.
            That is, I am trying to increase the number of wavefronts from something like 3k to 6k.
                • Change in work per workitem doesn't change amount of fetches/writes
                  Raistmer
                  Originally posted by: himanshu.gautam

                  Please provide a test case



                  Ok, I written small test case kernel that illustrates this issue.
                  The kernel does some work in a loop. The loop is unrolled along the third execution domain dimension.
                  One case uses x32 unroll and single tmp buffer, second - x64 unroll and switches between 2 tmp buffers based on z value. Try to enable first or second kernels and comment/uncomment corresponding set argument and third execution domain dimension (32 or 64).

                  I see same behavior as with my app.
                  bigger unroll provides much worse performance.
                  But I can't reproduce fetch and write counters behavior here. I can provide full my kernel if needed.

                  #if 1 __kernel void test_kernel(__global float4* in, __global float4* tmp1, __global float4* tmp2){ uint x=get_global_id(0); uint y=get_global_id(1); uint z=get_global_id(2); int offset=(32*y+z)*64; if(z>=32) offset=(32*y+(z-32))*64; __global float4* t=tmp1+offset*4; if(z>=32) t=tmp2+offset*4; for(int i=z;i<1024;i+=64){ for(int j=0;j<16;j++){ t[i]=in[j]+in[j+1]; } } } #else __kernel void test_kernel(__global float4* in, __global float4* tmp1){ uint x=get_global_id(0); uint y=get_global_id(1); uint z=get_global_id(2); int offset=(32*y+z)*64; __global float4* t=tmp1+offset*4; for(int i=z;i<1024;i+=32){ for(int j=0;j<16;j++){ t[i]=in[j]+in[j+1];} } } #endif // test_case.cpp : Defines the entry point for the console application. // #include "stdafx.h" #include "ctype.h" #include <stdio.h> #include <string> #include <CL/cl.h> #include <string.h> #include <cstdlib> #include <iostream> #include <string> #include <fstream> #include <CL/cl.h> // #include <OpenCL_FFT/clFFT.h> cl_context context; cl_device_id *devices; cl_device_id device_id; cl_command_queue cq; cl_int err; cl_program program; std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } #undef read f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; return s; } return NULL; } void initializeCL(void) { cl_int status = 0; size_t deviceListSize; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { fprintf(stderr,"Error: Getting Platforms. 
(clGetPlatformsIDs)\n"); return; } if(numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(status != CL_SUCCESS) { fprintf(stderr,"Error: Getting Platform Ids. (clGetPlatformsIDs)\n"); return; } for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); platform = platforms[i]; fprintf(stderr,"OpenCL platform detected: %s\n",pbuff); #if USE_OPENCL_NV if(!strcmp(pbuff, "NVIDIA Corporation")) #else if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) #endif { break; } } delete platforms; } /* * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; cl_context_properties* cprops = (NULL == platform) ? NULL : cps; ///////////////////////////////////////////////////////////////// // Create an OpenCL context ///////////////////////////////////////////////////////////////// context = clCreateContextFromType(cprops, #if USE_OPENCL_CPU CL_DEVICE_TYPE_CPU, #elif USE_OPENCL_NV CL_DEVICE_TYPE_GPU, #else CL_DEVICE_TYPE_GPU/*CL_DEVICE_TYPE_CPU*/, #endif NULL, NULL, &status); if(status != CL_SUCCESS) { fprintf(stderr,"Error: Creating Context. 
(clCreateContextFromType)\n"); return; } /* First, get the size of device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(status != CL_SUCCESS) { fprintf(stderr, "Error: Getting Context Info \ (device list size, clGetContextInfo)\n"); return; } ///////////////////////////////////////////////////////////////// // Detect OpenCL devices ///////////////////////////////////////////////////////////////// devices = (cl_device_id *)malloc(deviceListSize); if(devices == 0) { fprintf(stderr,"Error: No devices found.\n"); return; }else fprintf(stderr,"Found %d OpenCL devices.\n",deviceListSize/sizeof(cl_device_id)); /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(status != CL_SUCCESS) { fprintf(stderr,"Error: Getting Context Info \ (device list, clGetContextInfo)\n"); return; } ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// device_id=devices[0]; cq = clCreateCommandQueue( context, device_id, CL_QUEUE_PROFILING_ENABLE, &status); if(status != CL_SUCCESS) { fprintf(stderr,"Creating Command Queue. 
(clCreateCommandQueue)\n"); return; } ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// const char * filename = "Test_case.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(status != CL_SUCCESS) { fprintf(stderr,"Error: Loading Binary into cl_program \ (clCreateProgramWithBinary)\n"); return; } // create a cl program executable for all the devices specified #if USE_OPENCL_NV //R: for ATI SDK 2.01 it generates waning - unsupported compiler option //Devaster: build options for cl compiler const char * buildoptions="-cl-fast-relaxed-math"; status = clBuildProgram(program, 1, devices, buildoptions, NULL, NULL); //Devaster: wanna see some info .... char cBuildLog[10240]; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL ); fprintf(stderr,"%s\n",cBuildLog); //Devaster #else status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); #endif if(status != CL_SUCCESS) { fprintf(stderr,"Error: Building Program (clBuildProgram)\n"); char cBuildLog[10240]; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL ); fprintf(stderr,"%s\n",cBuildLog); exit(0); } } int _tmain(int argc, _TCHAR* argv[]) { //R: OpenCL init initializeCL(); cl_kernel test_kernel=clCreateKernel(program, "test_kernel", &err); if(err != CL_SUCCESS) fprintf(stderr,"ERROR: Creating test_kernel from program: %d\n",err); cl_mem in=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (in): %d\n",err); cl_mem tmp1=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*32, NULL, &err); if(err != 
CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (tmp1): %d\n",err); cl_mem tmp2=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*32, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (tmp1): %d\n",err); err = clSetKernelArg(test_kernel,0,sizeof(cl_mem),(void *)&in); err |= clSetKernelArg(test_kernel,1,sizeof(cl_mem),(void *)&tmp1); err |= clSetKernelArg(test_kernel,2,sizeof(cl_mem),(void *)&tmp2); if(err != CL_SUCCESS)fprintf(stderr,"ERROR: Setting kernel argument:find_single_pulse_kernel: %d\n",err); size_t globalThreads[3]; globalThreads[0] = 4; globalThreads[1] = 1024*1024/4/64; globalThreads[2] =64; //globalThreads[2] =32; err = clEnqueueNDRangeKernel( cq, test_kernel, 3, NULL, globalThreads, NULL,//R: no workgroups requred 0, NULL,//R: synching between kernels not actually needed cause queue in-order one. NULL); if(err != CL_SUCCESS)fprintf(stderr, "ERROR: Enqueueing kernel onto command queue.(test_kernel): %d\n",err); return 0; }

                    • Change in work per workitem doesn't change amount of fetches/writes
                      himanshu.gautam

                      hi raistmer,

                      I checked the code on my 4870 device, and it do show the same problem.

                      I ran your kernel on SKA and found out that the bottleneck of the code was global write as attached.

                      When you are running the kernel for {4 4096 32} threads you fetch only one buffer (1024*1024*32), but when you use {4 4096 64} threads you actually fetch two buffers (2*1024*1024*32). Therefore the fetch size is the same in both cases.

                      So I tried to run the code with the size of the temp buffers halved in the {4 4096 64} case, and the kernel ran successfully with double the performance.

                      My system output

                      {4 4096 32}   1439.94ms

                      {4 4096 64}    725.94ms (with the buffer size halved).

                      I hope that explains your problem.

                      Name,GPR,Scratch Reg,Min,Max,Avg,ALU,Fetch,Write,Est Cycles,ALU:Fetch, BottleNeck,%s\Clock,Throughput FireStream 9170,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A FireStream 9250,8,0,3.40,4569.60,616.82,62,32,16,616.82,0.23, Global Write,0.03,16 M Threads\Sec FireStream 9270,8,0,3.40,4569.60,616.82,62,32,16,616.82,0.23, Global Write,0.03,19 M Threads\Sec Radeon HD 2400,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A Radeon HD 2600,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A Radeon HD 2900,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A Radeon HD 3870,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A Radeon HD 4550,8,0,17.00,11424.00,765.47,63,32,16,675.41,0.48, Global Fetch,0.01,7 M Threads\Sec Radeon HD 4670,8,0,4.25,5712.00,382.74,63,32,16,337.71,0.24, Global Write,0.02,18 M Threads\Sec Radeon HD 4770,8,0,4.25,5712.00,699.06,63,32,16,616.82,0.23, Global Write,0.03,19 M Threads\Sec Radeon HD 4870,8,0,3.40,4569.60,616.82,62,32,16,616.82,0.23, Global Write,0.03,19 M Threads\Sec Radeon HD 4890,8,0,3.40,4569.60,616.82,62,32,16,616.82,0.23, Global Write,0.03,22 M Threads\Sec Radeon HD 5450,10,0,5.00,5712.00,224.03,39,32,16,197.67,0.93, Global Write,0.02,13 M Threads\Sec Radeon HD 5670,10,0,2.00,4569.60,337.71,39,32,16,337.71,0.46, Global Write,0.02,18 M Threads\Sec Radeon HD 5770,10,0,1.00,4569.60,616.82,39,32,16,616.82,0.23, Global Write,0.03,22 M Threads\Sec Radeon HD 5870,10,0,1.00,4569.60,616.82,39,32,16,616.82,0.23, Global Write,0.05,44 M Threads\Sec

                        • Change in work per workitem doesn't change amount of fetches/writes
                          Raistmer
                          The kernel should fetch just the same amount of data. Also, it should write the same amount of data in both cases (but reads and writes spread differently among separate threads).
                          There are 2 different kernels - one for Z=64, second for Z=32.
                          One increment kernel's loop counter by 64, second - by 32.
                          That is, with Z=64 there are 64 different groups of threads (X*Y - number of threads in group, not connected with OpenCL workgroup term) each running 1/64 of whole workload. If Z=32 there are 32 different group each doing 1/32 of total work.

                          You probably forgot to adjust domain size for kernel.
                            • Change in work per workitem doesn't change amount of fetches/writes
                              himanshu.gautam

                              hi raistmer,

                              I analyzed the problem and would like to edit my explanation a little bit.

                              Your problem is not related to the bottleneck reported by SKA. Before the kernel launches, the runtime makes sure the buffers it needs are available in GPU memory. In the {4 4096 32} case you copy two buffers (1 in and 1 tmp1), but in the {4 4096 64} case you copy 3 buffers (1 in + tmp1 + tmp2). Just look at the fetch-size parameter in the profiler; it is the same in both cases.

                              Just try making the following change to the code and you will see my point. Please reply if there is still some confusion.

                               

                              ////After Change cl_mem in=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (in): %d\n",err); cl_mem tmp1=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*16, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (tmp1): %d\n",err); cl_mem tmp2=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*16, NULL, &err); //Before Change cl_mem in=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (in): %d\n",err); cl_mem tmp1=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*32, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (tmp1): %d\n",err); cl_mem tmp2=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*32, NULL, &err);

                    • Change in work per workitem doesn't change amount of fetches/writes
                      Raistmer
                      Yes, there is a lot of confusion:
                      both buffers are supposed to be on the GPU at the time the kernel launches. That is, kernel time should depend neither on the amount of GPU memory per buffer (if it reads/writes only the same amount and leaves the other part of the buffer untouched) nor on the number of buffers. The only possible issue I could see is some contention in memory with some particular access stride. But AFAIK ATI doesn't provide enough details about memory subsystem organization (unlike nVidia, which gives much more consideration to this topic). How can one check if there are bank conflicts?
                      I see only one related counter - Fetch Unit stalled.
                      In case of "x64" kernel it has big value, ~40% while in case of "x32" kernel it =0. But how this information can be used to improve memory access?
                        • Change in work per workitem doesn't change amount of fetches/writes
                          himanshu.gautam

                          Hi raistmer,

                          I looked at the code again. I feel you are right: it is the bank conflicts that are creating the contention. Each thread accesses the same elements, which results in bank conflicts.

                          When we double the work items the bank conflicts become even more intense.

                          So it can be concluded that dividing the work will not increase the speed of execution every time. We always need to select a value which most efficiently uses the processing resources present in the device.

                          I am supplying a simple sample code having a better access pattern which might be helpful.

                          For more information about better access pattern to avoid bank conflicts,refer to the optimization chapter of openCL Programming Guide

                          //KERNEL CODE: #define MoreThreads 0 #if MoreThreads __kernel void test_kernel(__global float4* in, __global float4* tmp1, __global float4* tmp2, int length){ int x=get_global_id(0); float4 a=in[x],b=tmp1[x]; for(int i=0;i<128;i++) { tmp2[x].x=a.x*a.y +a.z*a.w+ b.x*b.y + b.z*b.w +i; tmp2[x].y=a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w+i; tmp2[x].z=a.x/a.y +a.z/a.w+ b.x/b.y + b.z/b.w+i; tmp2[x].w=a.x/b.x + a.y/b.y + a.z/b.z + a.w/b.w+i; } } #else __kernel void test_kernel(__global float4* in, __global float4* tmp1,__global float4* tmp2, int length){ int x=get_global_id(0); float4 a=in[x],b=tmp1[x]; for(int i=0;i<256;i++) { tmp2[x].x=a.x*a.y +a.z*a.w+ b.x*b.y + b.z*b.w+i; tmp2[x].y=a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w+i; tmp2[x].z=a.x/a.y +a.z/a.w+ b.x/b.y + b.z/b.w+i; tmp2[x].w=a.x/b.x + a.y/b.y + a.z/b.z + a.w/b.w+i; } } #endif //HOST CODE: //Header Files #pragma region Header #include <CL\cl.h> #include <iostream> #include <cstdio> #include <fstream> #include <math.h> #include <string.h> #include <windows.h> #pragma endregion //Macro Definitions #pragma region Macro definitions #define MORETHREADS 0 #if MORETHREADS #define GLOBAL_SIZE 8192*1024 #else #define GLOBAL_SIZE 4096*1024 # endif //typedef int DataType; #define LENGTH 4096 #define DataType cl_float4 #define KERNEL0 "test_kernel" #define FORCED_EXIT 1 #define KERNEL_FILE "kernelcode.cl" #pragma endregion //OCL Parameters #pragma region OCLParams cl_platform_id Platform; cl_context Context; cl_command_queue CommandQueue0; cl_mem inputBuffer0,inputBuffer1,outputBuffer; cl_device_id *Devices; cl_program Program; cl_kernel Kernel0; cl_device_type DeviceType=CL_DEVICE_TYPE_GPU; #pragma endregion #pragma region Host Params DataType * Input0=NULL; DataType * Input1=NULL; DataType * Output=NULL; //Initialize host parameters void SetUpData() { Input0 = (DataType*)malloc(LENGTH*sizeof(DataType)); if(Input0==NULL) { printf("Memory allocation failed"); exit(FORCED_EXIT); } Input1 = 
(DataType*)malloc(LENGTH*sizeof(DataType)); if(Input1==NULL) { printf("Memory allocation failed"); exit(FORCED_EXIT); } Output = (DataType*)malloc(LENGTH*sizeof(DataType)); if(Output==NULL) { printf("Memory allocation failed"); exit(FORCED_EXIT); } for(int i=0;i<LENGTH;i++) { Input0[i].s[0] = (float)(rand()%10); Input0[i].s[1] = (float)(rand()%10); Input0[i].s[2] = (float)(rand()%10); Input0[i].s[3] = (float)(rand()%10); Input1[i].s[0] = (float)(rand()%10); Input1[i].s[1] = (float)(rand()%10); Input1[i].s[2] = (float)(rand()%10); Input1[i].s[3] = (float)(rand()%10); } } void ComputeCPU() { //not called anywhere. } void SetupOCL() { cl_int Status=0; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint NumPlatforms=0; //Get Number of available platforms Status=clGetPlatformIDs(0, NULL, &NumPlatforms); if(Status!= CL_SUCCESS) { printf("Error in fetching Number of platforms\n"); exit(FORCED_EXIT); } if(NumPlatforms>0) { //Array of objects to store platform ids of available platforms cl_platform_id* Platforms=new cl_platform_id[NumPlatforms]; //Get platform ids of available platforms Status=clGetPlatformIDs(NumPlatforms, Platforms, NULL); if(Status!= CL_SUCCESS) { printf("Error in fetching Number of platforms\n"); exit(FORCED_EXIT); } //Search Appropriate platform AMD preferably for(unsigned i=0;i<NumPlatforms;i++) { char pbuf[100]; //Get Platform vendor name Status=clGetPlatformInfo(Platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); Platform = Platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } delete[] Platforms;//free array of platform ids } /* * If we could find our platform, use it. Otherwise pass a NULL * and get whatever the * implementation thinks we should be using. 
*/ //Context Properties cl_context_properties CPS[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)Platform, 0 }; // Use NULL for backward compatibility cl_context_properties* ContextProperties = (NULL == Platform) ? NULL : CPS; Context = clCreateContextFromType(ContextProperties, DeviceType, NULL, NULL, &Status); if(Status!= CL_SUCCESS) { printf("Error in creating context\n"); exit(FORCED_EXIT); } //Creating Memory Buffers inputBuffer0 = clCreateBuffer(Context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, LENGTH*sizeof(DataType), Input0, &Status); if(Status != CL_SUCCESS) { printf("Error in creting buffer inputbuffer0: %d",Status); exit(FORCED_EXIT); } inputBuffer1 = clCreateBuffer(Context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, LENGTH*sizeof(DataType), Input1, &Status); if(Status != CL_SUCCESS) { printf("Error in creating buffer inputbuffer1: %d",Status); exit(FORCED_EXIT); } outputBuffer = clCreateBuffer(Context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, LENGTH*sizeof(DataType), Output, &Status); if(Status != CL_SUCCESS) { printf("Error in allocating output buffer : %d",Status); exit(FORCED_EXIT); } cl_uint NumDevices=0; Status=clGetDeviceIDs(Platform, DeviceType, 0, NULL, &NumDevices); if(Status!= CL_SUCCESS) { printf("Error in fetching Number of devices\n"); exit(FORCED_EXIT); } Devices=(cl_device_id*)malloc(sizeof(cl_device_id)*NumDevices); // Now, set the device list data Status = clGetContextInfo(Context, CL_CONTEXT_DEVICES, sizeof(cl_device_id)*NumDevices, Devices, NULL); if(Status!= CL_SUCCESS) { printf("Error in clGetContextInfo\n"); exit(FORCED_EXIT); } // create Command queues CommandQueue0 = clCreateCommandQueue(Context, Devices[0], CL_QUEUE_PROFILING_ENABLE, &Status); if(Status!= CL_SUCCESS) { printf("Error in creating command queue0\n"); exit(FORCED_EXIT); } // Read kernel file to a string size_t Size; char* Str; // Open file stream std::fstream f(KERNEL_FILE, (std::fstream::in | std::fstream::binary)); // Check if we have opened file stream if 
(f.is_open()) { f.seekg(0, std::fstream::end); Size = f.tellg(); f.seekg(0, std::fstream::beg); Str = new char[Size + 1]; if (!Str) { f.close(); return; } // Read file f.read(Str, Size); f.close(); Str[Size] = '\0'; } else { return; } // Create program object from source Program = clCreateProgramWithSource(Context, 1, (const char **)&Str, &Size, &Status); if(Status!= CL_SUCCESS) { printf("Error in creating Program Source\n"); exit(FORCED_EXIT); } // create a cl program executable for all the devices specified Status = clBuildProgram(Program, 1, Devices, NULL, NULL, NULL); if(Status != CL_SUCCESS) { if(Status == CL_BUILD_PROGRAM_FAILURE) { cl_int LogStatus; char * BuildLog = NULL; size_t BuildLogSize = 0; //Get size of Build Log LogStatus = clGetProgramBuildInfo (Program , Devices[0], CL_PROGRAM_BUILD_LOG, BuildLogSize, BuildLog, &BuildLogSize); BuildLog = (char*)calloc(BuildLogSize,sizeof(char)); LogStatus = clGetProgramBuildInfo (Program, Devices[0], CL_PROGRAM_BUILD_LOG, BuildLogSize, BuildLog, NULL); std::cout << " \n\t\t\tBUILD LOG for device[0]\n"; std::cout << " ************************************************\n"; std::cout << BuildLog << std::endl; std::cout << " ************************************************\n"; free(BuildLog); } } /* get a kernel object handle for a kernel with the given name */ Kernel0 = clCreateKernel(Program, KERNEL0, &Status); if(Status != CL_SUCCESS) { printf(" Error : clCreateKErnel failed!%d \n",Status); exit(FORCED_EXIT); } } void RunKernel() { cl_int Status=0; cl_uint len=LENGTH; //Set Kernel Arguments Status = clSetKernelArg(Kernel0, 0, sizeof(cl_mem), &inputBuffer0); if(Status!=CL_SUCCESS) { printf("Error in 0th Kernel Argument %d",Status); exit(FORCED_EXIT); } Status = clSetKernelArg(Kernel0, 1, sizeof(cl_mem), &inputBuffer1); if(Status != CL_SUCCESS) { printf("Error in Kernel arg 1: %d",Status); exit(FORCED_EXIT); } Status = clSetKernelArg(Kernel0, 2, sizeof(cl_mem), &outputBuffer); if(Status != CL_SUCCESS) { printf("Error 
in Kernel arg 2: %d",Status); exit(FORCED_EXIT); } Status=clSetKernelArg(Kernel0, 3, sizeof(cl_float), &len); if(Status != CL_SUCCESS) { printf("Error in Kernel arg 3: %d",Status); exit(FORCED_EXIT); } const size_t GlobalSize[1] = {GLOBAL_SIZE}; size_t LocalSizeDevice0[1] = {16}; Status=clGetKernelWorkGroupInfo(Kernel0, Devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void*)LocalSizeDevice0, NULL); if(Status!= CL_SUCCESS) { printf("Error in getting Kernel Work Group Info for kernel0 %d\n",Status); exit(FORCED_EXIT); } Status = clFinish(CommandQueue0); if(Status!= CL_SUCCESS) { printf("Error in clfinish before ndrange %d \n",Status); exit(FORCED_EXIT); } Status = clEnqueueNDRangeKernel(CommandQueue0, Kernel0, 1, 0, GlobalSize, LocalSizeDevice0, 0, 0, NULL); if(Status!= CL_SUCCESS) { printf("Error in ndrange kernel1 %d\n",Status); exit(FORCED_EXIT); } Status = clFinish(CommandQueue0); if(Status!= CL_SUCCESS) { printf("Error in clfinish after ndrange %d\n",Status); exit(FORCED_EXIT); } Status = clEnqueueReadBuffer(CommandQueue0, outputBuffer, CL_TRUE, 0, sizeof(DataType)*LENGTH, Output, 0, NULL, NULL); if(Status!= CL_SUCCESS) { printf("Error in clfinish after ndrange %d\n",Status); exit(FORCED_EXIT); } } int main() { SetUpData(); ComputeCPU(); SetupOCL(); RunKernel(); return 0; }