
Hill_Groove
Journeyman III

OpenCL

different calculation results

Hello. The results (res[0]) of the following code differ from run to run. Maybe there is a problem with memory somewhere. Interestingly, if only 7 of the t = t + 0.01 lines are written instead of 8 (as below), the program works fine.

#define N 20
__kernel void interaction(global float A[], global float res[], global float C[])
{
__private int a;
__private int b;
__private int i;
__private int j;
__private float t;
a = get_global_id(0);
b = get_global_id(1);
t = 0;
for (i=0;i<10;i++)
{
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
}
C[a*N+b] = t;
barrier(CLK_GLOBAL_MEM_FENCE);

if (a*b==1)
{
    res[0] = 0;
    for (i=0 ; i <N ;i++)
    {
        for (j=0;j<N;j++)
        {
            res[0] = res[0] + C[i*N+j];
        }
    }
}
}

Intel Core i7, XFX 4890, Win7

 

omkaranathan
Adept I

Hill_Groove,

Your code seems to be incomplete; could you post the complete code?


I think you are getting a memory conflict between threads: most likely you are writing to res[0] from 20 threads simultaneously.


Thank you for your answers; I now think this is the complete code. I don't use the A[] array, C[] is a float[N*N], and res is a float[2]. The host runs N*N threads (dim = 2), which compute the sums, and afterwards the result is collected into res[0] by the single thread indexed (0;0).


Hill_Groove,

What is the workgroup size?


genaganna,

the workgroup size is the default (a NULL argument).


Hill_Groove,

Could you post the host side code too?


Thank you for replying. The host code:

// EXECUTED FUNCTION

void CountEnergy()
{
    int i, j;
    float a[8], d = 0;
    GPUC gpu;

    gpu.gDim = 2;                       // OpenCL ADDITIONS
    gpu.gThreads[0] = (size_t) N;
    gpu.gThreads[1] = (size_t) N;

    a[0] = stick1.Root.x;  a[1] = stick1.Root.y;
    a[2] = stick2.Root.x;  a[3] = stick2.Root.y;
    a[4] = stick1.Shift.x; a[5] = stick1.Shift.y;
    a[6] = stick2.Shift.x; a[7] = stick2.Shift.y;

    Energy = 0;
    gpu.LoadKernel("kernel.cl", "interaction");
    gpu.SetKernelArg(0, a, 8*sizeof(float));
    gpu.SetKernelArg(1, &d, sizeof(float));
    gpu.AllocateMem(0, N*N*sizeof(float));
    gpu.SetEmptyArg(2, 0);
    gpu.RunKernel();
    gpu.ReadKernelArg(1, &d, sizeof(float));
    gpu.ClearAll();
    Energy = d;
}

// GPUC.H

#include <iostream>
#include <fstream>
#include <conio.h>
#include <CL/cl.hpp>

#define gQUEUE_MODE CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
#define gWORKGROUPS NULL

class GPUC
{
public:
    cl_uint gDim;
    size_t gThreads[3], gWorkgroupSize[3];
private:
    char buff[50];
    cl_int gErr;
    cl_uint gNumadd, gNumout;
    cl_platform_id gPlatforms;
    cl_device_id gDevice, gLoaded;
    cl_context gContext;
    cl_command_queue gGPUqueue;
    cl_program gKernelProgram;
    cl_kernel gKernel;
    cl_event gEvents[2];
    cl_mem gArg[10], gBuf[10];
    std::string FileToString(const char *filename);
public:
    GPUC();
    int LoadKernel(const char *filename, const char *kernelname);
    int SetKernelArg(int argnum, const void *arg, int size);
    int AllocateMem(int argnum, int size);
    int SetEmptyArg(int argnum, int memnum);
    int ReadKernelArg(int argnum, void *arg, int size);
    int RunKernel();
    void ClearAll();
};

// GPUC.CPP

std::string GPUC::FileToString(const char *filename)
{
    size_t size;
    char *str;
    std::string s;
    std::fstream f(filename, (std::fstream::in | std::fstream::binary));
    if (f.is_open())
    {
        size_t fileSize;
        f.seekg(0, std::fstream::end);
        size = fileSize = f.tellg();
        f.seekg(0, std::fstream::beg);
        str = new char[size+1];
        if (!str) { f.close(); return NULL; }
        f.read(str, fileSize);
        f.close();
        str[size] = '\0';
        s = str;
        return s;
    }
    return NULL;
}

GPUC::GPUC()
{
    gNumadd = 1;
    gDim = 3;
    gThreads[0] = 2; gThreads[1] = 2; gThreads[2] = 2;
    gWorkgroupSize[0] = 2; gWorkgroupSize[1] = 2; gWorkgroupSize[2] = 2;

    if (clGetPlatformIDs(gNumadd, &gPlatforms, &gNumout) == CL_SUCCESS)
    {
        std::cout << "Platform Found :: (";
        clGetPlatformInfo(gPlatforms, CL_PLATFORM_VERSION, 100, buff, NULL);
        std::cout << buff << ")" << std::endl;
    }
    else { std::cout << "\nOpenCL Not Found."; exit(1); }

    if (clGetDeviceIDs(gPlatforms, CL_DEVICE_TYPE_GPU, gNumadd, &gDevice, &gNumout) == CL_SUCCESS)
    {
        if (gNumout == 0) { std::cout << "\nNo OpenCL Capatible GPU Found."; exit(2); }
    }

    gContext = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &gErr);
    if (gErr != CL_SUCCESS) { std::cout << "\nError Creating A Context."; exit(3); }

    if (clGetContextInfo(gContext, CL_CONTEXT_DEVICES, 100, &gLoaded, NULL) != CL_SUCCESS)
    { std::cout << "\nError Getting ContextInfo."; exit(3); }

    gGPUqueue = clCreateCommandQueue(gContext, gLoaded, NULL, &gErr); // CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE possible
    if (gErr != CL_SUCCESS)
    {
        std::cout << "\nError Creating A GPU Queue : ";
        switch (gErr)
        {
            case CL_INVALID_CONTEXT:          std::cout << "CL_INVALID_CONTEXT.\n"; break;
            case CL_INVALID_DEVICE:           std::cout << "CL_INVALID_DEVICE.\n"; break;
            case CL_INVALID_QUEUE_PROPERTIES: std::cout << "CL_INVALID_QUEUE_PROPERTIES.\n"; break;
            case CL_OUT_OF_HOST_MEMORY:       std::cout << "CL_OUT_OF_HOST_MEMORY.\n"; break;
        }
        _getch();
        exit(4);
    }
}

int GPUC::AllocateMem(int memnum, int size)
{
    gBuf[memnum] = clCreateBuffer(gContext, CL_MEM_READ_WRITE, size, NULL, &gErr);
    if (gErr != CL_SUCCESS) { std::cout << "\nError Creating Buffer."; return 1; }
    return 0;
}

int GPUC::SetEmptyArg(int argnum, int memnum)
{
    gErr = clSetKernelArg(gKernel, argnum, sizeof(cl_mem), &gBuf[memnum]);
    if (gErr != CL_SUCCESS)
    {
        std::cout << "\nSetKernelArg 0 Error : ";
        switch (gErr)
        {
            case CL_INVALID_KERNEL:     std::cout << "CL_INVALID_KERNEL\n"; break;
            case CL_INVALID_ARG_INDEX:  std::cout << "CL_INVALID_ARG_INDEX\n"; break;
            case CL_INVALID_ARG_VALUE:  std::cout << "CL_INVALID_ARG_VALUE\n"; break;
            case CL_INVALID_MEM_OBJECT: std::cout << "CL_INVALID_MEM_OBJECT\n"; break;
            case CL_INVALID_SAMPLER:    std::cout << "CL_INVALID_SAMPLER\n"; break;
            case CL_INVALID_ARG_SIZE:   std::cout << "CL_INVALID_ARG_SIZE\n"; break;
        }
        return 1;
    }
    return 0;
}

int GPUC::SetKernelArg(int argnum, const void *arg, int size)
{
    if (argnum > 9) { std::cout << "\nMore Than 10 Arguments Are Not Allowed."; return 1; }

    gArg[argnum] = clCreateBuffer(gContext, CL_MEM_READ_WRITE, size, NULL, &gErr);
    if (gErr != CL_SUCCESS) { std::cout << "\nError Creating Buffer."; return 1; }

    clEnqueueWriteBuffer(gGPUqueue, gArg[argnum], 0, 0, size, arg, NULL, 0, NULL);
    if (gErr != CL_SUCCESS) { std::cout << "\nBuffer Enqueue Error."; return 2; }

    gErr = clSetKernelArg(gKernel, argnum, sizeof(cl_mem), &gArg[argnum]);
    if (gErr != CL_SUCCESS)
    {
        std::cout << "\nSetKernelArg 0 Error : ";
        switch (gErr)
        {
            case CL_INVALID_KERNEL:     std::cout << "CL_INVALID_KERNEL\n"; break;
            case CL_INVALID_ARG_INDEX:  std::cout << "CL_INVALID_ARG_INDEX\n"; break;
            case CL_INVALID_ARG_VALUE:  std::cout << "CL_INVALID_ARG_VALUE\n"; break;
            case CL_INVALID_MEM_OBJECT: std::cout << "CL_INVALID_MEM_OBJECT\n"; break;
            case CL_INVALID_SAMPLER:    std::cout << "CL_INVALID_SAMPLER\n"; break;
            case CL_INVALID_ARG_SIZE:   std::cout << "CL_INVALID_ARG_SIZE\n"; break;
        }
        return 3;
    }
    return 0;
}

int GPUC::LoadKernel(const char *filename, const char *kernelname)
{
    std::string KernelSource = FileToString(filename);
    const char *KernelCode = KernelSource.c_str();
    size_t gSourceSize[] = { strlen(KernelCode) };

    gKernelProgram = clCreateProgramWithSource(gContext, 1, &KernelCode, gSourceSize, &gErr);
    if (gErr != CL_SUCCESS) { std::cout << "\nError Creating Program With Code."; }

    gErr = clBuildProgram(gKernelProgram, 1, &gLoaded, NULL, NULL, NULL);
    if (gErr != CL_SUCCESS)
    {
        std::cout << "\n Program Build Error : ";
        switch (gErr)
        {
            case CL_INVALID_PROGRAM:           std::cout << "CL_INVALID_PROGRAM."; break;
            case CL_INVALID_VALUE:             std::cout << "CL_INVALID_VALUE."; break;
            case CL_INVALID_DEVICE:            std::cout << "CL_INVALID_DEVICE."; break;
            case CL_INVALID_BINARY:            std::cout << "CL_INVALID_BINARY."; break;
            case CL_INVALID_BUILD_OPTIONS:     std::cout << "CL_INVALID_BUILD_OPTIONS."; break;
            case CL_INVALID_OPERATION:         std::cout << "CL_INVALID_OPERATION."; break;
            case CL_COMPILER_NOT_AVAILABLE:    std::cout << "CL_COMPILER_NOT_AVAILABLE."; break;
            case CL_BUILD_PROGRAM_FAILURE:     std::cout << "CL_BUILD_PROGRAM_FAILURE."; break;
            case CL_OUT_OF_HOST_MEMORY:        std::cout << "CL_OUT_OF_HOST_MEMORY."; break;
            case CL_DEVICE_COMPILER_AVAILABLE: std::cout << "CL_DEVICE_COMPILER_AVAILABLE."; break;
            default:                           std::cout << "^_^";
        }
        return 1;
    }

    gKernel = clCreateKernel(gKernelProgram, kernelname, &gErr);
    if (gErr != CL_SUCCESS) { std::cout << "\nError Kernel Creating."; return 1; }
    return 0;
}

int GPUC::RunKernel()
{
    gErr = clEnqueueNDRangeKernel(gGPUqueue, gKernel, gDim, NULL, gThreads, gWORKGROUPS, 0, NULL, &gEvents[0]);
    if (gErr != CL_SUCCESS) { std::cout << "\nError Enqueueing Kernel."; return 1; }

    gErr = clWaitForEvents(1, &gEvents[0]);
    if (gErr != CL_SUCCESS) { std::cout << "\nError Waiting Kernel End."; return 1; }

    clReleaseEvent(gEvents[0]);
    return 0;
}

int GPUC::ReadKernelArg(int argnum, void *arg, int size)
{
    gErr = clEnqueueReadBuffer(gGPUqueue, gArg[argnum], 0, 0, size, arg, 0, NULL, &gEvents[1]);
    if (gErr != CL_SUCCESS) { std::cout << "\nReadBuffer Enqueue Error."; return 1; }

    gErr = clWaitForEvents(1, &gEvents[1]);
    if (gErr != CL_SUCCESS) { std::cout << "\nError Waiting For Read Buffer Call To Finish."; return 1; }
    return 0;
}

void GPUC::ClearAll()
{
    clUnloadCompiler();
    clReleaseEvent(gEvents[0]);
    clReleaseEvent(gEvents[1]);
    clReleaseKernel(gKernel);
    //clReleaseProgram(gKernelProgram);
    /*clReleaseMemObject(gArg[0]); clReleaseMemObject(gArg[1]); clReleaseMemObject(gArg[2]);
      clReleaseMemObject(gArg[3]); clReleaseMemObject(gArg[4]); clReleaseMemObject(gArg[5]);
      clReleaseMemObject(gArg[6]); clReleaseMemObject(gArg[7]); clReleaseMemObject(gArg[8]);
      clReleaseMemObject(gArg[9]);*/
}


Originally posted by Hill_Groove: Thank you for replying. The host code: [...]

 

Hill_Groove,

         Thank you for providing the code. We are not able to compile it.

If workGroupSize is NULL, the runtime chooses the workgroup size, and the chosen value must satisfy the following:

      1. It must not exceed the workGroupSize returned by clGetKernelWorkGroupInfo with CL_KERNEL_WORK_GROUP_SIZE.

      2. It must divide the global work size evenly.

You can read this value inside the kernel with get_local_size(); a host-side sketch of the query follows.
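For illustration, a minimal sketch of querying that limit on the host, reusing the gKernel and gLoaded handles from the posted GPUC code (the variable name maxWG is illustrative):

size_t maxWG = 0;
cl_int err = clGetKernelWorkGroupInfo(gKernel, gLoaded, CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(maxWG), &maxWG, NULL);
if (err == CL_SUCCESS)
    std::cout << "Max workgroup size for this kernel: " << maxWG << std::endl;
// Any explicit local size passed to clEnqueueNDRangeKernel must not exceed maxWG,
// and each dimension must divide the corresponding global size evenly.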

Specifying a NULL workgroup size is not recommended for every kind of algorithm; I feel it suits data-parallel algorithms.

Could you please run with CL_DEVICE_TYPE_CPU and see whether you get the expected results?

For your kernel, if there is more than one workgroup, you might get undefined results, because C is not synchronized properly across workgroups.

bealto
Journeyman III

The test a*b==1 does not select thread (0,0) but thread (1,1).

Why not use a==0 && b==0 ?

Did you try float sum = 0; ... sum += C[i*N+j]; ... res[0] = sum;? As written, you read and write res[0] in global memory N*N times (unless the compiler optimizes it away).
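For illustration, a sketch of that change applied to the final summation block of the original kernel (reusing the i and j already declared there, and using the a==0 && b==0 test suggested above):

if (a == 0 && b == 0)              // only the (0,0) thread does the final sum
{
    float sum = 0.0f;              // private accumulator: no global traffic per addition
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            sum += C[i*N + j];
    res[0] = sum;                  // single global write at the end
}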

-- E

 


genaganna, 

thank you for replying. Could you please explain "For your kernel, if there is more than one workgroup, you might get undefined results, because C is not synchronized properly across workgroups", and how it should be written correctly to compute the sum without memory problems?

bealto,

thank you for your comment; you're undoubtedly right. But what is important for me is to find out why I am getting different answers every time I run this program. I cannot see a synchronization problem here, but I'm sure one exists.


Originally posted by Hill_Groove: genaganna,

thank you for replying. Could you please explain "For your kernel, if there is more than one workgroup, you might get undefined results, because C is not synchronized properly across workgroups", and how it should be written correctly to compute the sum without memory problems?

Hill_Groove,

               First calculate the C values on the GPU, then compute the sum on the host with simple C code.

 

/* Calculate C values */

__kernel void interaction(global float A[], global float C[])
{
__private int a;
__private int b;
__private int i;
__private int j;
__private float t;
a = get_global_id(0);
b = get_global_id(1);
t = 0;
for (i=0;i<10;i++)
{
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
     t = t + 0.01;
}
C[a*N+b] = t;

}
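On the host side, a minimal sketch of the summation (assuming the N*N floats the kernel wrote into C have been read back into a host array c, e.g. with clEnqueueReadBuffer on the buffer created by AllocateMem; Energy is the variable from the original CountEnergy()):

float sum = 0.0f;
for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++)
        sum += c[i*N + j];   // c[] holds the values the kernel wrote to C
Energy = sum;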


genaganna,

Thank you very much. Is it right that all my problems were caused by the fact that barrier(...) works only within a workgroup, not globally (as I thought)?


Yes. A barrier fence synchronizes the work-items within a work-group only. CLK_LOCAL_MEM_FENCE orders reads and writes to local memory, and CLK_GLOBAL_MEM_FENCE orders global memory operations, but in both cases only among the work-items of the same work-group.
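For reference, a minimal sketch of a per-work-group partial reduction that uses barrier() within its valid scope; the kernel name, the WG size, and the buffer names are illustrative, not from the original code, and it assumes a 1D launch with local size WG:

#define WG 64   // illustrative work-group size

__kernel void partial_sums(__global const float *C, __global float *partial)
{
    __local float scratch[WG];
    int lid = get_local_id(0);
    int gid = get_global_id(0);

    scratch[lid] = C[gid];              // each work-item loads one element
    barrier(CLK_LOCAL_MEM_FENCE);       // valid: synchronizes this work-group only

    for (int s = WG / 2; s > 0; s >>= 1)
    {
        if (lid < s)
            scratch[lid] += scratch[lid + s];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0)
        partial[get_group_id(0)] = scratch[0];   // one partial sum per work-group
}

The per-work-group partial sums still have to be combined afterwards, on the host or in a second kernel, since nothing synchronizes across work-groups within a single kernel launch.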


Thanks everyone for your help.
