......... int z=get_global_id(2); int TOffset2 = (32*y+z) * AdvanceBy; if(z>=32){ TOffset2 = (32*y+(z-32)) * AdvanceBy; } .......... __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4); if(z>=32){//R: other half will use secondary bufer. It';s because of 128MB limit per buffer in current OpenCL ATi implementation tmp_pot=tmp2 + ul_PoT + TOffset2 * (fft_len4); }
When you say small domain, how small do you mean? What GPU are you running on?
Jeff
Please provide a test case
Originally posted by: himanshu.gautam
Please provide a test case
#if 1 __kernel void test_kernel(__global float4* in, __global float4* tmp1, __global float4* tmp2){ uint x=get_global_id(0); uint y=get_global_id(1); uint z=get_global_id(2); int offset=(32*y+z)*64; if(z>=32) offset=(32*y+(z-32))*64; __global float4* t=tmp1+offset*4; if(z>=32) t=tmp2+offset*4; for(int i=z;i<1024;i+=64){ for(int j=0;j<16;j++){ t=in
+in[j+1]; } } } #else __kernel void test_kernel(__global float4* in, __global float4* tmp1){ uint x=get_global_id(0); uint y=get_global_id(1); uint z=get_global_id(2); int offset=(32*y+z)*64; __global float4* t=tmp1+offset*4; for(int i=z;i<1024;i+=32){ for(int j=0;j<16;j++){ t=in +in[j+1];} } } #endif // test_case.cpp : Defines the entry point for the console application. // #include "stdafx.h" #include "ctype.h" #include <stdio.h> #include <string> #include <CL/cl.h> #include <string.h> #include <cstdlib> #include <iostream> #include <string> #include <fstream> #include <CL/cl.h> // #include <OpenCL_FFT/clFFT.h> cl_context context; cl_device_id *devices; cl_device_id device_id; cl_command_queue cq; cl_int err; cl_program program; std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } #undef read f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; return s; } return NULL; } void initializeCL(void) { cl_int status = 0; size_t deviceListSize; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { fprintf(stderr,"Error: Getting Platforms. (clGetPlatformsIDs)\n"); return; } if(numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(status != CL_SUCCESS) { fprintf(stderr,"Error: Getting Platform Ids. (clGetPlatformsIDs)\n"); return; } for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms, CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); platform = platforms; fprintf(stderr,"OpenCL platform detected: %s\n",pbuff); #if USE_OPENCL_NV if(!strcmp(pbuff, "NVIDIA Corporation")) #else if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) #endif { break; } } delete platforms; } /* * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; cl_context_properties* cprops = (NULL == platform) ? NULL : cps; ///////////////////////////////////////////////////////////////// // Create an OpenCL context ///////////////////////////////////////////////////////////////// context = clCreateContextFromType(cprops, #if USE_OPENCL_CPU CL_DEVICE_TYPE_CPU, #elif USE_OPENCL_NV CL_DEVICE_TYPE_GPU, #else CL_DEVICE_TYPE_GPU/*CL_DEVICE_TYPE_CPU*/, #endif NULL, NULL, &status); if(status != CL_SUCCESS) { fprintf(stderr,"Error: Creating Context. (clCreateContextFromType)\n"); return; } /* First, get the size of device list data */ status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(status != CL_SUCCESS) { fprintf(stderr, "Error: Getting Context Info \ (device list size, clGetContextInfo)\n"); return; } ///////////////////////////////////////////////////////////////// // Detect OpenCL devices ///////////////////////////////////////////////////////////////// devices = (cl_device_id *)malloc(deviceListSize); if(devices == 0) { fprintf(stderr,"Error: No devices found.\n"); return; }else fprintf(stderr,"Found %d OpenCL devices.\n",deviceListSize/sizeof(cl_device_id)); /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(status != CL_SUCCESS) { fprintf(stderr,"Error: Getting Context Info \ (device list, clGetContextInfo)\n"); return; } ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// device_id=devices[0]; cq = clCreateCommandQueue( context, device_id, CL_QUEUE_PROFILING_ENABLE, &status); if(status != CL_SUCCESS) { fprintf(stderr,"Creating Command Queue. (clCreateCommandQueue)\n"); return; } ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// const char * filename = "Test_case.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(status != CL_SUCCESS) { fprintf(stderr,"Error: Loading Binary into cl_program \ (clCreateProgramWithBinary)\n"); return; } // create a cl program executable for all the devices specified #if USE_OPENCL_NV //R: for ATI SDK 2.01 it generates waning - unsupported compiler option //Devaster: build options for cl compiler const char * buildoptions="-cl-fast-relaxed-math"; status = clBuildProgram(program, 1, devices, buildoptions, NULL, NULL); //Devaster: wanna see some info .... char cBuildLog[10240]; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL ); fprintf(stderr,"%s\n",cBuildLog); //Devaster #else status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); #endif if(status != CL_SUCCESS) { fprintf(stderr,"Error: Building Program (clBuildProgram)\n"); char cBuildLog[10240]; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL ); fprintf(stderr,"%s\n",cBuildLog); exit(0); } } int _tmain(int argc, _TCHAR* argv[]) { //R: OpenCL init initializeCL(); cl_kernel test_kernel=clCreateKernel(program, "test_kernel", &err); if(err != CL_SUCCESS) fprintf(stderr,"ERROR: Creating test_kernel from program: %d\n",err); cl_mem in=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (in): %d\n",err); cl_mem tmp1=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*32, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (tmp1): %d\n",err); cl_mem tmp2=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*32, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (tmp1): %d\n",err); err = clSetKernelArg(test_kernel,0,sizeof(cl_mem),(void *)&in); err |= clSetKernelArg(test_kernel,1,sizeof(cl_mem),(void *)&tmp1); err |= clSetKernelArg(test_kernel,2,sizeof(cl_mem),(void *)&tmp2); if(err != CL_SUCCESS)fprintf(stderr,"ERROR: Setting kernel argument:find_single_pulse_kernel: %d\n",err); size_t globalThreads[3]; globalThreads[0] = 4; globalThreads[1] = 1024*1024/4/64; globalThreads[2] =64; //globalThreads[2] =32; err = clEnqueueNDRangeKernel( cq, test_kernel, 3, NULL, globalThreads, NULL,//R: no workgroups requred 0, NULL,//R: synching between kernels not actually needed cause queue in-order one. NULL); if(err != CL_SUCCESS)fprintf(stderr, "ERROR: Enqueueing kernel onto command queue.(test_kernel): %d\n",err); return 0; }
hi raistmer,
I checked the code on my 4870 device, and it do show the same problem.
I ran your kernel on SKA and found out that the bottleneck of the code was global write as attached.
When you are running the kernel for {4 1096 32} threads you fetch only one buffer(1024*1024*32),but when you use {4 4096 64} threads you actually fetch two buffers(2*1024*1024*32).Therefore the fetchsize is same in both the cases.
So i tried to run the code by halving the size of temp buffers in {4 1096 64} case and kernel ran successfully with double the performance.
My system output
{4 1096 32} 1439.94ms
{4 1096 64} 725.94ms (with making the buffer size half).
I hope that explains your problem.
Name,GPR,Scratch Reg,Min,Max,Avg,ALU,Fetch,Write,Est Cycles,ALU:Fetch, BottleNeck,%s\Clock,Throughput FireStream 9170,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A FireStream 9250,8,0,3.40,4569.60,616.82,62,32,16,616.82,0.23, Global Write,0.03,16 M Threads\Sec FireStream 9270,8,0,3.40,4569.60,616.82,62,32,16,616.82,0.23, Global Write,0.03,19 M Threads\Sec Radeon HD 2400,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A Radeon HD 2600,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A Radeon HD 2900,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A Radeon HD 3870,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A Radeon HD 4550,8,0,17.00,11424.00,765.47,63,32,16,675.41,0.48, Global Fetch,0.01,7 M Threads\Sec Radeon HD 4670,8,0,4.25,5712.00,382.74,63,32,16,337.71,0.24, Global Write,0.02,18 M Threads\Sec Radeon HD 4770,8,0,4.25,5712.00,699.06,63,32,16,616.82,0.23, Global Write,0.03,19 M Threads\Sec Radeon HD 4870,8,0,3.40,4569.60,616.82,62,32,16,616.82,0.23, Global Write,0.03,19 M Threads\Sec Radeon HD 4890,8,0,3.40,4569.60,616.82,62,32,16,616.82,0.23, Global Write,0.03,22 M Threads\Sec Radeon HD 5450,10,0,5.00,5712.00,224.03,39,32,16,197.67,0.93, Global Write,0.02,13 M Threads\Sec Radeon HD 5670,10,0,2.00,4569.60,337.71,39,32,16,337.71,0.46, Global Write,0.02,18 M Threads\Sec Radeon HD 5770,10,0,1.00,4569.60,616.82,39,32,16,616.82,0.23, Global Write,0.03,22 M Threads\Sec Radeon HD 5870,10,0,1.00,4569.60,616.82,39,32,16,616.82,0.23, Global Write,0.05,44 M Threads\Sec
hi raistmer,
I analyzed the problem and would like to edit my explanation a little bit.
Your problem is not related to bottleneck in ska.But before the kernel launches it makes sure the buffers it need are available in GPU memory.In {4 1096 32} case you copy two buffers(1 In and 1 temp1),but in {4 1096 64} case you copy 3 buffers(1 In + temp1+temp2).Just see your fetchsize parameter in profiler.it is same in both the cases.
Just try doing the following change to the code and you will see my point.Please reply if there is still some confuion.
////After Change cl_mem in=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (in): %d\n",err); cl_mem tmp1=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*16, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (tmp1): %d\n",err); cl_mem tmp2=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*16, NULL, &err); //Before Change cl_mem in=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (in): %d\n",err); cl_mem tmp1=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*32, NULL, &err); if(err != CL_SUCCESS)fprintf(stderr,"Error: clCreateBuffer (tmp1): %d\n",err); cl_mem tmp2=clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof (cl_float)*1024*1024*32, NULL, &err);
Hi raistmer,
I relooked at the code.I feel you are right it is the bank conflicts that is creating contention.In each thread we are accessing the same elements which result in bank conflicts.
When we double the work items the bank conflicts become even more intense.
So it can be concluded that dividing the work would not increase the speed of execution everytime.We always need to select a value which most efficiently uses the processing resources present in the device.
I am supplying a simple sample code having a better access pattern which might be helpful.
For more information about better access pattern to avoid bank conflicts,refer to the optimization chapter of openCL Programming Guide
//KERNEL CODE: #define MoreThreads 0 #if MoreThreads __kernel void test_kernel(__global float4* in, __global float4* tmp1, __global float4* tmp2, int length){ int x=get_global_id(0); float4 a=in
,b=tmp1 ; for(int i=0;i<128;i++) { tmp2 .x=a.x*a.y +a.z*a.w+ b.x*b.y + b.z*b.w +i; tmp2 .y=a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w+i; tmp2 .z=a.x/a.y +a.z/a.w+ b.x/b.y + b.z/b.w+i; tmp2 .w=a.x/b.x + a.y/b.y + a.z/b.z + a.w/b.w+i; } } #else __kernel void test_kernel(__global float4* in, __global float4* tmp1,__global float4* tmp2, int length){ int x=get_global_id(0); float4 a=in ,b=tmp1 ; for(int i=0;i<256;i++) { tmp2 .x=a.x*a.y +a.z*a.w+ b.x*b.y + b.z*b.w+i; tmp2 .y=a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w+i; tmp2 .z=a.x/a.y +a.z/a.w+ b.x/b.y + b.z/b.w+i; tmp2 .w=a.x/b.x + a.y/b.y + a.z/b.z + a.w/b.w+i; } } #endif //HOST CODE: //Header Files #pragma region Header #include <CL\cl.h> #include <iostream> #include <cstdio> #include <fstream> #include <math.h> #include <string.h> #include <windows.h> #pragma endregion //Macro Definitions #pragma region Macro definitions #define MORETHREADS 0 #if MORETHREADS #define GLOBAL_SIZE 8192*1024 #else #define GLOBAL_SIZE 4096*1024 # endif //typedef int DataType; #define LENGTH 4096 #define DataType cl_float4 #define KERNEL0 "test_kernel" #define FORCED_EXIT 1 #define KERNEL_FILE "kernelcode.cl" #pragma endregion //OCL Parameters #pragma region OCLParams cl_platform_id Platform; cl_context Context; cl_command_queue CommandQueue0; cl_mem inputBuffer0,inputBuffer1,outputBuffer; cl_device_id *Devices; cl_program Program; cl_kernel Kernel0; cl_device_type DeviceType=CL_DEVICE_TYPE_GPU; #pragma endregion #pragma region Host Params DataType * Input0=NULL; DataType * Input1=NULL; DataType * Output=NULL; //Initialize host parameters void SetUpData() { Input0 = (DataType*)malloc(LENGTH*sizeof(DataType)); if(Input0==NULL) { printf("Memory allocation failed"); exit(FORCED_EXIT); } Input1 = (DataType*)malloc(LENGTH*sizeof(DataType)); if(Input1==NULL) { printf("Memory allocation failed"); exit(FORCED_EXIT); } Output = (DataType*)malloc(LENGTH*sizeof(DataType)); if(Output==NULL) { printf("Memory allocation failed"); exit(FORCED_EXIT); } for(int i=0;i<LENGTH;i++) { Input0.s[0] = (float)(rand()%10); Input0.s[1] = (float)(rand()%10); Input0.s[2] = (float)(rand()%10); Input0.s[3] = (float)(rand()%10); Input1.s[0] = (float)(rand()%10); Input1.s[1] = (float)(rand()%10); Input1.s[2] = (float)(rand()%10); Input1.s[3] = (float)(rand()%10); } } void ComputeCPU() { //not called anywhere. } void SetupOCL() { cl_int Status=0; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint NumPlatforms=0; //Get Number of available platforms Status=clGetPlatformIDs(0, NULL, &NumPlatforms); if(Status!= CL_SUCCESS) { printf("Error in fetching Number of platforms\n"); exit(FORCED_EXIT); } if(NumPlatforms>0) { //Array of objects to store platform ids of available platforms cl_platform_id* Platforms=new cl_platform_id[NumPlatforms]; //Get platform ids of available platforms Status=clGetPlatformIDs(NumPlatforms, Platforms, NULL); if(Status!= CL_SUCCESS) { printf("Error in fetching Number of platforms\n"); exit(FORCED_EXIT); } //Search Appropriate platform AMD preferably for(unsigned i=0;i<NumPlatforms;i++) { char pbuf[100]; //Get Platform vendor name Status=clGetPlatformInfo(Platforms, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); Platform = Platforms; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } delete[] Platforms;//free array of platform ids } /* * If we could find our platform, use it. Otherwise pass a NULL * and get whatever the * implementation thinks we should be using. */ //Context Properties cl_context_properties CPS[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)Platform, 0 }; // Use NULL for backward compatibility cl_context_properties* ContextProperties = (NULL == Platform) ? NULL : CPS; Context = clCreateContextFromType(ContextProperties, DeviceType, NULL, NULL, &Status); if(Status!= CL_SUCCESS) { printf("Error in creating context\n"); exit(FORCED_EXIT); } //Creating Memory Buffers inputBuffer0 = clCreateBuffer(Context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, LENGTH*sizeof(DataType), Input0, &Status); if(Status != CL_SUCCESS) { printf("Error in creting buffer inputbuffer0: %d",Status); exit(FORCED_EXIT); } inputBuffer1 = clCreateBuffer(Context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, LENGTH*sizeof(DataType), Input1, &Status); if(Status != CL_SUCCESS) { printf("Error in creating buffer inputbuffer1: %d",Status); exit(FORCED_EXIT); } outputBuffer = clCreateBuffer(Context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, LENGTH*sizeof(DataType), Output, &Status); if(Status != CL_SUCCESS) { printf("Error in allocating output buffer : %d",Status); exit(FORCED_EXIT); } cl_uint NumDevices=0; Status=clGetDeviceIDs(Platform, DeviceType, 0, NULL, &NumDevices); if(Status!= CL_SUCCESS) { printf("Error in fetching Number of devices\n"); exit(FORCED_EXIT); } Devices=(cl_device_id*)malloc(sizeof(cl_device_id)*NumDevices); // Now, set the device list data Status = clGetContextInfo(Context, CL_CONTEXT_DEVICES, sizeof(cl_device_id)*NumDevices, Devices, NULL); if(Status!= CL_SUCCESS) { printf("Error in clGetContextInfo\n"); exit(FORCED_EXIT); } // create Command queues CommandQueue0 = clCreateCommandQueue(Context, Devices[0], CL_QUEUE_PROFILING_ENABLE, &Status); if(Status!= CL_SUCCESS) { printf("Error in creating command queue0\n"); exit(FORCED_EXIT); } // Read kernel file to a string size_t Size; char* Str; // Open file stream std::fstream f(KERNEL_FILE, (std::fstream::in | std::fstream::binary)); // Check if we have opened file stream if (f.is_open()) { f.seekg(0, std::fstream::end); Size = f.tellg(); f.seekg(0, std::fstream::beg); Str = new char[Size + 1]; if (!Str) { f.close(); return; } // Read file f.read(Str, Size); f.close(); Str[Size] = '\0'; } else { return; } // Create program object from source Program = clCreateProgramWithSource(Context, 1, (const char **)&Str, &Size, &Status); if(Status!= CL_SUCCESS) { printf("Error in creating Program Source\n"); exit(FORCED_EXIT); } // create a cl program executable for all the devices specified Status = clBuildProgram(Program, 1, Devices, NULL, NULL, NULL); if(Status != CL_SUCCESS) { if(Status == CL_BUILD_PROGRAM_FAILURE) { cl_int LogStatus; char * BuildLog = NULL; size_t BuildLogSize = 0; //Get size of Build Log LogStatus = clGetProgramBuildInfo (Program , Devices[0], CL_PROGRAM_BUILD_LOG, BuildLogSize, BuildLog, &BuildLogSize); BuildLog = (char*)calloc(BuildLogSize,sizeof(char)); LogStatus = clGetProgramBuildInfo (Program, Devices[0], CL_PROGRAM_BUILD_LOG, BuildLogSize, BuildLog, NULL); std::cout << " \n\t\t\tBUILD LOG for device[0]\n"; std::cout << " ************************************************\n"; std::cout << BuildLog << std::endl; std::cout << " ************************************************\n"; free(BuildLog); } } /* get a kernel object handle for a kernel with the given name */ Kernel0 = clCreateKernel(Program, KERNEL0, &Status); if(Status != CL_SUCCESS) { printf(" Error : clCreateKErnel failed!%d \n",Status); exit(FORCED_EXIT); } } void RunKernel() { cl_int Status=0; cl_uint len=LENGTH; //Set Kernel Arguments Status = clSetKernelArg(Kernel0, 0, sizeof(cl_mem), &inputBuffer0); if(Status!=CL_SUCCESS) { printf("Error in 0th Kernel Argument %d",Status); exit(FORCED_EXIT); } Status = clSetKernelArg(Kernel0, 1, sizeof(cl_mem), &inputBuffer1); if(Status != CL_SUCCESS) { printf("Error in Kernel arg 1: %d",Status); exit(FORCED_EXIT); } Status = clSetKernelArg(Kernel0, 2, sizeof(cl_mem), &outputBuffer); if(Status != CL_SUCCESS) { printf("Error in Kernel arg 2: %d",Status); exit(FORCED_EXIT); } Status=clSetKernelArg(Kernel0, 3, sizeof(cl_float), &len); if(Status != CL_SUCCESS) { printf("Error in Kernel arg 3: %d",Status); exit(FORCED_EXIT); } const size_t GlobalSize[1] = {GLOBAL_SIZE}; size_t LocalSizeDevice0[1] = {16}; Status=clGetKernelWorkGroupInfo(Kernel0, Devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void*)LocalSizeDevice0, NULL); if(Status!= CL_SUCCESS) { printf("Error in getting Kernel Work Group Info for kernel0 %d\n",Status); exit(FORCED_EXIT); } Status = clFinish(CommandQueue0); if(Status!= CL_SUCCESS) { printf("Error in clfinish before ndrange %d \n",Status); exit(FORCED_EXIT); } Status = clEnqueueNDRangeKernel(CommandQueue0, Kernel0, 1, 0, GlobalSize, LocalSizeDevice0, 0, 0, NULL); if(Status!= CL_SUCCESS) { printf("Error in ndrange kernel1 %d\n",Status); exit(FORCED_EXIT); } Status = clFinish(CommandQueue0); if(Status!= CL_SUCCESS) { printf("Error in clfinish after ndrange %d\n",Status); exit(FORCED_EXIT); } Status = clEnqueueReadBuffer(CommandQueue0, outputBuffer, CL_TRUE, 0, sizeof(DataType)*LENGTH, Output, 0, NULL, NULL); if(Status!= CL_SUCCESS) { printf("Error in clfinish after ndrange %d\n",Status); exit(FORCED_EXIT); } } int main() { SetUpData(); ComputeCPU(); SetupOCL(); RunKernel(); return 0; }