14 Replies Latest reply on Dec 6, 2009 11:31 AM by Hill_Groove

    OpenCL

    Hill_Groove
      different calculation results

      Hello. The results (res[0]) of the following code are different. Maybe there is a problem with memory somewhere. It is interesting, that if 7 t = t + 0.01  strings are written instead of 8 (as below), the program works fine.

      #define N 20
      __kernel void interaction(global float A[], global float res[], global float C[])
      {
      __private int a;
      __private int b;
      __private int i;
      __private int j;
      __private float t;
      a = get_global_id(0);
      b = get_global_id(1);
      t = 0;
      for (i=0;i<10;i++)
      {
           t = t + 0.01;
           t = t + 0.01;
           t = t + 0.01;
           t = t + 0.01;
           t = t + 0.01;
           t = t + 0.01;
           t = t + 0.01;
           t = t + 0.01;
      }
      C[a*N+b] = t;
      barrier(CLK_GLOBAL_MEM_FENCE);

      if (a*b==1)
      {
          res[0] = 0;
          for (i=0 ; i <N ;i++)
          {
              for (j=0;j<N;j++)
              {
                  res[0] = res[0] + C[i*N+j];
              }
          }
      }
      }

      Intel Core i7, XFX 4890, Win7

       

        • OpenCL
          omkaranathan

          Hill_Groove,

          Your code seems to be incomplete, could you post the complete code?

            • OpenCL
              nou

              i think that you get memory conflict between threads. most likely you write to res[0] from 20 thread simultaneulsy.

              • OpenCL
                Hill_Groove

                Thank you for your answers, now i think it is a complete code. I don't use A[] array, C[] is a float[N*N] and res is a float[2]. Host runs N*N threads (dim=2), which count the sum and after this the result is collected in res[0] by only thread indexed (0;0).

                  • OpenCL
                    genaganna

                    Hill_Groove,

                      what is workGroup size?

                      • OpenCL
                        Hill_Groove

                        genaganna,

                        the workgroup size is default (NULL argument).

                          • OpenCL
                            omkaranathan

                            Hill_Groove,

                            Could you post the host side code too?

                              • OpenCL
                                Hill_Groove

                                Thank You For Replying, The Host Code :

                                // // EXECUTED FUNCTION // void CountEnergy() { int i, j; float a[8], d=0; GPUC gpu; gpu.gDim = 2; // OpenCL ADDITIONS gpu.gThreads[0] = (size_t) N; // gpu.gThreads[1] = (size_t) N; // a[0] = stick1.Root.x; a[1] = stick1.Root.y; a[2] = stick2.Root.x; a[3] = stick2.Root.y; a[4] = stick1.Shift.x; a[5] = stick1.Shift.y; a[6] = stick2.Shift.x; a[7] = stick2.Shift.y; Energy=0; gpu.LoadKernel("kernel.cl", "interaction"); gpu.SetKernelArg(0, a, 8*sizeof(float)); gpu.SetKernelArg(1, &d, sizeof(float)); gpu.AllocateMem(0, N*N*sizeof(float)); gpu.SetEmptyArg(2, 0); gpu.RunKernel(); gpu.ReadKernelArg(1, &d, sizeof(float)); gpu.ClearAll(); Energy =d; } // // GPUC.H // #include <iostream> #include <fstream> #include <conio.h> #include <CL/cl.hpp> #define gQUEUE_MODE CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE #define gWORKGROUPS NULL class GPUC { public: cl_uint gDim; size_t gThreads[3], gWorkgroupSize[3]; private: char buff[50]; cl_int gErr; cl_uint gNumadd, gNumout; cl_platform_id gPlatforms; cl_device_id gDevice, gLoaded; cl_context gContext; cl_command_queue gGPUqueue; cl_program gKernelProgram; cl_kernel gKernel; cl_event gEvents[2]; cl_mem gArg[10], gBuf[10]; std :: string FileToString(const char *filename); public: GPUC(); int LoadKernel(const char * filename, const char * kernelname); int SetKernelArg(int argnum, const void * arg, int size); int AllocateMem(int argnum, int size); int SetEmptyArg(int argnum, int memnum); int ReadKernelArg(int argnum, void * arg, int size); int RunKernel(); void ClearAll(); }; // // GPUC.CPP // std :: string GPUC :: FileToString(const char *filename) { size_t size; char * str; std :: string s; std :: fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; return s; } return NULL; } GPUC :: GPUC() { gNumadd = 1; gDim = 3; gThreads[0] = 2; gThreads[1] = 2; gThreads[2] = 2; gWorkgroupSize[0] = 2; gWorkgroupSize[1] = 2; gWorkgroupSize[2] = 2; if (clGetPlatformIDs(gNumadd, &gPlatforms, &gNumout) == CL_SUCCESS) { std :: cout << "Platform Found :: ("; clGetPlatformInfo(gPlatforms, CL_PLATFORM_VERSION, 100, buff, NULL); std :: cout << buff << ")" << std :: endl; } else { std :: cout << "\nOpenCL Not Found."; exit(1); } if (clGetDeviceIDs(gPlatforms, CL_DEVICE_TYPE_GPU, gNumadd, &gDevice, &gNumout) == CL_SUCCESS) { if (gNumout==0) { std :: cout << "\nNo OpenCL Capatible GPU Found."; exit(2); } } gContext = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &gErr); if (gErr!=CL_SUCCESS) { std :: cout << "\nError Creating A Context."; exit(3); } if (clGetContextInfo(gContext, CL_CONTEXT_DEVICES, 100, &gLoaded, NULL)!= CL_SUCCESS) { std :: cout << "\nError Getting ContextInfo."; exit(3); } gGPUqueue = clCreateCommandQueue(gContext, gLoaded, NULL, &gErr); // CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE possible if (gErr!=CL_SUCCESS) { std :: cout << "\nError Creating A GPU Queue : "; switch (gErr) { case CL_INVALID_CONTEXT: std :: cout << "CL_INVALID_CONTEXT.\n"; break; case CL_INVALID_DEVICE: std :: cout << "CL_INVALID_DEVICE.\n"; break; case CL_INVALID_QUEUE_PROPERTIES: std :: cout << "CL_INVALID_QUEUE_PROPERTIES.\n"; break; case CL_OUT_OF_HOST_MEMORY: std :: cout << "CL_OUT_OF_HOST_MEMORY.\n"; break; } _getch(); exit(4); } } int GPUC :: AllocateMem(int memnum, int size) { gBuf[memnum] = clCreateBuffer(gContext, CL_MEM_READ_WRITE, size, NULL, &gErr); if (gErr!=CL_SUCCESS) { std :: cout << "\nError Creating Buffer."; return 1; } return 0; } int GPUC :: SetEmptyArg(int argnum, int memnum) { gErr = clSetKernelArg(gKernel, argnum, sizeof(cl_mem), &gBuf[memnum]); if (gErr!=CL_SUCCESS) { std :: cout << "\nSetKernelArg 0 Error : "; switch (gErr) { case (CL_INVALID_KERNEL): std :: cout << "CL_INVALID_KERNEL\n"; break; case (CL_INVALID_ARG_INDEX): std :: cout << "CL_INVALID_ARG_INDEX\n"; break; case (CL_INVALID_ARG_VALUE): std :: cout << "CL_INVALID_ARG_VALUE\n"; break; case (CL_INVALID_MEM_OBJECT): std :: cout << "CL_INVALID_MEM_OBJECT\n"; break; case (CL_INVALID_SAMPLER): std :: cout << "CL_INVALID_SAMPLER\n"; break; case (CL_INVALID_ARG_SIZE): std :: cout << "CL_INVALID_ARG_SIZE\n"; break; } return 1; } return 0; } int GPUC :: SetKernelArg(int argnum, const void * arg, int size) { if (argnum>9) { std :: cout << "\nMore Than 10 Arguments Are Not Allowed."; return 1; } gArg[argnum] = clCreateBuffer(gContext, CL_MEM_READ_WRITE, size, NULL, &gErr); if (gErr!=CL_SUCCESS) { std :: cout << "\nError Creating Buffer."; return 1; } clEnqueueWriteBuffer(gGPUqueue, gArg[argnum], 0, 0, size, arg, NULL, 0, NULL); if (gErr!=CL_SUCCESS) { std :: cout << "\nBuffer Enqueue Error."; return 2; } gErr = clSetKernelArg(gKernel, argnum, sizeof(cl_mem), &gArg[argnum]); if (gErr!=CL_SUCCESS) { std :: cout << "\nSetKernelArg 0 Error : "; switch (gErr) { case (CL_INVALID_KERNEL): std :: cout << "CL_INVALID_KERNEL\n"; break; case (CL_INVALID_ARG_INDEX): std :: cout << "CL_INVALID_ARG_INDEX\n"; break; case (CL_INVALID_ARG_VALUE): std :: cout << "CL_INVALID_ARG_VALUE\n"; break; case (CL_INVALID_MEM_OBJECT): std :: cout << "CL_INVALID_MEM_OBJECT\n"; break; case (CL_INVALID_SAMPLER): std :: cout << "CL_INVALID_SAMPLER\n"; break; case (CL_INVALID_ARG_SIZE): std :: cout << "CL_INVALID_ARG_SIZE\n"; break; } return 3; } return 0; } int GPUC :: LoadKernel(const char * filename, const char * kernelname) { std :: string KernelSource = FileToString(filename); const char * KernelCode = KernelSource.c_str(); size_t gSourceSize[] = {strlen(KernelCode)}; gKernelProgram = clCreateProgramWithSource(gContext, 1, &KernelCode, gSourceSize, &gErr); if (gErr!=CL_SUCCESS) { std :: cout << "\nError Creating Program With Code."; } gErr = clBuildProgram (gKernelProgram, 1, &gLoaded, NULL, NULL, NULL); if (gErr!=CL_SUCCESS) { std :: cout << "\n Program Build Error : "; switch(gErr) { case CL_INVALID_PROGRAM: std::cout << "CL_INVALID_PROGRAM."; break; case CL_INVALID_VALUE: std::cout << "CL_INVALID_VALUE."; break; case CL_INVALID_DEVICE: std::cout << "CL_INVALID_DEVICE."; break; case CL_INVALID_BINARY: std::cout << "CL_INVALID_BINARY."; break; case CL_INVALID_BUILD_OPTIONS: std::cout << "CL_INVALID_BUILD_OPTIONS."; break; case CL_INVALID_OPERATION: std::cout << "CL_INVALID_OPERATION."; break; case CL_COMPILER_NOT_AVAILABLE: std::cout << "CL_COMPILER_NOT_AVAILABLE."; break; case CL_BUILD_PROGRAM_FAILURE: std::cout << "CL_BUILD_PROGRAM_FAILURE."; break; case CL_OUT_OF_HOST_MEMORY: std::cout << "CL_OUT_OF_HOST_MEMORY."; break; case CL_DEVICE_COMPILER_AVAILABLE: std::cout << "CL_DEVICE_COMPILER_AVAILABLE."; break; default: std::cout << "^_^"; } return 1; } gKernel = clCreateKernel (gKernelProgram, kernelname, &gErr); if (gErr!=CL_SUCCESS) { std :: cout << "\nError Kernel Creating."; return 1; } return 0; } int GPUC :: RunKernel() { gErr = clEnqueueNDRangeKernel(gGPUqueue, gKernel, gDim, NULL, gThreads, gWORKGROUPS, 0, NULL, &gEvents[0]); if(gErr != CL_SUCCESS) { std :: cout << "\nError Enqueueing Kernel."; return 1; } gErr = clWaitForEvents(1, &gEvents[0]); if(gErr != CL_SUCCESS) { std :: cout << "\nError Waiting Kernel End."; return 1; } clReleaseEvent(gEvents[0]); return 0; } int GPUC :: ReadKernelArg(int argnum, void * arg, int size) { gErr = clEnqueueReadBuffer(gGPUqueue, gArg[argnum], 0, 0, size, arg, 0, NULL, &gEvents[1]); if(gErr != CL_SUCCESS) { std :: cout << "\nReadBuffer Enqueue Error."; return 1; } gErr = clWaitForEvents(1, &gEvents[1]); if(gErr != CL_SUCCESS) { std :: cout << "\nError Waiting For Read Buffer Call To Finish."; return 1; } return 0; } void GPUC :: ClearAll() { clUnloadCompiler(); clReleaseEvent(gEvents[0]); clReleaseEvent(gEvents[1]); clReleaseKernel(gKernel); //clReleaseProgram(gKernelProgram); /*clReleaseMemObject(gArg[0]); clReleaseMemObject(gArg[1]); clReleaseMemObject(gArg[2]); clReleaseMemObject(gArg[3]); clReleaseMemObject(gArg[4]); clReleaseMemObject(gArg[5]); clReleaseMemObject(gArg[6]); clReleaseMemObject(gArg[7]); clReleaseMemObject(gArg[8]); clReleaseMemObject(gArg[9]);*/ }

                                  • OpenCL
                                    genaganna

                                     

                                    Originally posted by: Hill_Groove Thank You For Replying, The Host Code :

                                     

                                    Hill_Groove,

                                             Thank you for providing code.  we are not able to compile code. 

                                    If WorkGroupSize is NULL,  WorkGroupSize value calculated as follows

                                          1. it must be < workGroupSize returned by clGetKernelWorkGroupInfo with CL_KERNEL_WORK_GROUP_SIZE

                                          2. It must divide globalThreads equally

                                    This value you get inside kernel by get_local_size.

                                    Specifying NULL WorkGroupSize is not recommended for all types of Algoirthems. I feel it suits for data parallel algorithms.

                                    could you please run for CL_DEVICE_TYPE_CPU and see you are getting expected results?

                                    For your kernel, if workGroup are more then 1, you might get undefined results.  because C is not synchronized properly.

                        • OpenCL
                          bealto

                          The test a*b==1 does not select thread (0,0) but thread (1,1).

                          Why not use a==0 && b==0 ?

                          Did you try float sum = 0 ... sum += C[i*N+j] ... res[0] = sum. Here you read and write res[0] from global memory N*N times (unless optimized by compiler).

                          -- E

                           

                            • OpenCL
                              Hill_Groove

                              genaganna, 

                              thank you for replying, can you please explain "For your kernel, if workGroup are more then 1, you might get undefined results.  because C is not synchronized properly." and how it should be written correctly to count the sum without memory problems.

                              bealto,

                              thank you, for your comment, you're undoubtedly right. But what it is important for me is to find out why i am getting different answers everytime i run this program. I cannot see a syncronisation problem here, but i'm sure it exists.

                                • OpenCL
                                  genaganna

                                   

                                  Originally posted by: Hill_Groove genaganna, 

                                   

                                  thank you for replying, can you please explain "For your kernel, if workGroup are more then 1, you might get undefined results.  because C is not synchronized properly." and how it should be written correctly to count the sum without memory problems.

                                  Hill_Groove,

                                                 First calculate  C values and then calculate sum value in host by writing simple C code.

                                   

                                  /* Calculate C values */

                                  __kernel void interaction(global float A[], global float C[])
                                  {
                                  __private int a;
                                  __private int b;
                                  __private int i;
                                  __private int j;
                                  __private float t;
                                  a = get_global_id(0);
                                  b = get_global_id(1);
                                  t = 0;
                                  for (i=0;i<10;i++)
                                  {
                                       t = t + 0.01;
                                       t = t + 0.01;
                                       t = t + 0.01;
                                       t = t + 0.01;
                                       t = t + 0.01;
                                       t = t + 0.01;
                                       t = t + 0.01;
                                       t = t + 0.01;
                                  }
                                  C[a*N+b] = t;

                                  }