15 Replies Latest reply on Mar 1, 2013 6:27 AM by himanshu.gautam

    Problem with memory in OpenCL 1.2

    usachovandrii

      Hello, everyone!

      I have i great problem with memory on output from kernel.

      I use OpenCL 1.2 for parallel programming on CPU.

      As input I have an OpenCL buffer with sctructures ENNInput. Every ENNInput contains a static array of another structures (ENNHit) and also number of elements. Ouput(ENNOutput) is similar, but it contains array of another structures(ENNRing).

      Previously, I hadn't any problem when Buffer elements didn't contain an array as member of structure.

      Of course, I can't use containers instead static array. Also no idea, how to solve it with using 2D arrays.

      Below, you can read a program and kernel. My kernel does nothing except writing a one member of ENNOutput structure.

      #include <CbmL1RichENNRingFinderParallel.h>

      #include <ENNInput.h>

      #include <ENNInputArray.h>

      #include <CL/cl.h>

      #include <fstream>

      #include <iostream>

      #include <cstdio>

      #include <cstring>

      #include <vector>

      using namespace std;

       

       

       

       

      #define MAX_SOURCE_SIZE (0x1000000)

      inline void checkErr(cl_int err, const char * name)

      {

        if (err != CL_SUCCESS) {

          std::cerr << "ERROR: " << name

                    << " (" << err << ")" << std::endl;

        }

      }

       

       

       

       

      int main()

      {

        int firstEvent = 0;

        int lastEvent = 1000;

        int tasks = 1;

        int cores = 1;

       

        string filePrefix = "../input";

        string DatafilePrefix = "../input/ENNHitsDataEvent_";

        string MCPointsfilePrefix = "../input/MCPointsEvent_";

        string MCTracksfilePrefix = "../input/MCTracksEvent_";

        const int NEventsPerThread = lastEvent - firstEvent + 1;

       

       

        ENNInput* InputDataPerThread = new ENNInput[NEventsPerThread];

       

       

        int NEv = 0;

        for ( int kEvents = firstEvent; kEvents <= lastEvent; kEvents++ )

        {

          if (!ReadHitsFromFile(DatafilePrefix,kEvents, &InputDataPerThread[NEv]))

          {

            cout << "Hits Data for Event " << kEvents << " can't be read." << std::endl;

            break;

          }

          NEv++;

        }

       

       

        ENNInputArray ENNRingBufferInput;

        ENNOutputArray ENNRingBufferOutput;

       

       

        ENNRingBufferInput.fInput = new ENNInput[NEventsPerThread];

        ENNRingBufferOutput.fOutput = new ENNOutput[NEventsPerThread];

        for(int j=0; j<NEventsPerThread; j++)

        {

          ENNRingBufferInput.fInput[j] = InputDataPerThread[j];

        }

       

        FILE *fp;

        char *source_str;

        size_t source_size;

       

       

        fp = fopen("../DoFind.cl", "r");

       

        source_str = (char*)malloc(MAX_SOURCE_SIZE);

        source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);

        fclose( fp );

       

       

        // Get platform and device information

        cl_platform_id platform_id = NULL;

        cl_device_id device_id = NULL;  

        cl_uint ret_num_devices;

        cl_uint ret_num_platforms;

        cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);

            checkErr(ret, "clGetPlatformIDs");

       

        ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices);

            checkErr(ret, "clGetDeviceIDs");

       

       

        cl_uint num_devices_ret;

        cl_device_id  out_devices[cores];

        const cl_device_partition_property props[] = {CL_DEVICE_PARTITION_EQUALLY, tasks, 0};

        ret = clCreateSubDevices ( device_id, props, cores , out_devices , &num_devices_ret );

            checkErr(ret, "clCreateSubDevices");

       

       

        // Create an OpenCL context

        cl_context context = clCreateContext( NULL, 1, &out_devices[0], NULL, NULL, &ret);

       

        // Create a command queue

        cl_command_queue command_queue = clCreateCommandQueue(context, out_devices[0], CL_QUEUE_PROFILING_ENABLE, &ret);

       

        // Create memory buffers on the device for each vector

        cl_mem hits_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,NEventsPerThread* sizeof(ENNInput), NULL, &ret);

        cl_mem rings_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, NEventsPerThread* sizeof(ENNOutput), NULL, &ret);

       

        // Copy tracks and rings to their respective memory buffers

        ret = clEnqueueWriteBuffer(command_queue, hits_mem_obj, CL_TRUE, 0, NEventsPerThread* sizeof(ENNInput), ENNRingBufferInput.fInput, 0, NULL, NULL);

            checkErr(ret, "clEnqueueWriteBuffer");

        ret = clEnqueueWriteBuffer(command_queue, rings_mem_obj, CL_TRUE, 0, NEventsPerThread* sizeof(ENNOutput), ENNRingBufferOutput.fOutput, 0, NULL, NULL);

            checkErr(ret, "clEnqueueWriteBuffer");

       

       

        // Create a program from the kernel source

        cl_program program = clCreateProgramWithSource(context, 1,(const char **)&source_str, (const size_t *)&source_size, &ret);

            checkErr(ret, "clCreateProgram");

       

        // Build the program

        ret = clBuildProgram(program, 1, &out_devices[0], "-x clc++", NULL, NULL);

            checkErr(ret, "clBuildProgram");

           

        // Create the OpenCL kernel

        cl_kernel kernel = clCreateKernel(program, "DoFind", &ret);

       

       

        // Set the arguments of the kernel

        ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&hits_mem_obj);

        ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&rings_mem_obj);

       

       

        // Execute the OpenCL kernel on the list

        size_t global_item_size = NEventsPerThread; // Process the entire lists

        //    size_t local_item_size = NCopy/4; // Process in groups of 64

        size_t local_item_size = 1; // Process in groups of 64

       

       

        cl_event event;

        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &event);

             checkErr(ret, "clEnqueueNDRangeKernel");

        ret = clWaitForEvents(1 , &event);

             checkErr(ret, "clWaitForEvents");

        ret = clEnqueueReadBuffer(command_queue, rings_mem_obj, CL_TRUE, 0, NEventsPerThread * sizeof(ENNOutput), ENNRingBufferOutput.fOutput, 0, NULL, NULL);

             checkErr(ret, "clEnqueueReadBuffer");

       

       

        cl_ulong time_start, time_end;

        clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);

        clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);

       

       

      // Clean up

             ret = clFlush(command_queue);

             ret = clFinish(command_queue);

             ret = clReleaseKernel(kernel);

             ret = clReleaseProgram(program);

             ret = clReleaseMemObject(hits_mem_obj);

             ret = clReleaseMemObject(rings_mem_obj);

             ret = clReleaseCommandQueue(command_queue);

             ret = clReleaseContext(context);

        return 0;

      }

       

      KERNEL:

      struct ENNHit

      {

          float x, y, RefId;      // coordinates

          int quality;            // quality of the best ring with this hit

          int localIndex;   // index in local copy of Clone array

      };

       

       

      struct ENNRing

      {

          bool on;                   // is the ring selected?

          float x, y, r;            // parameters

          float chi2;               // chi^2

          // variables for the selection procedure:

          int NHits;                 // number of ring hits

          int NOwn;                // number of its own hits       

          bool skip;             // skip the ring during selection

          int localIHitsSize;

          int localIHits [100]; // indexes of hits in local array

       

       

      };

      struct ENNInput

      {

        int HitsArraySize;

        ENNHit HitsArray [2000];

      };

       

       

      struct ENNOutput

      {

        int RingsArraySize;

        ENNRing RingsArray [1000];

      };

       

       

       

       

       

       

      __kernel void DoFind( __global ENNInput *InBuffer,

                           __global ENNOutput *OutBuffer)

      {

        int igl = get_global_id(0);

        printf("Event %d\n",igl);

        int NRings = 1;

        printf("%d\n",NRings);

        OutBuffer[igl].RingsArraySize = NRings;

      }

       

      As result I have segmentation fault, when some element of buffer is processing. So program can't   process whole buffer.

      Of course I cut all calculations from kernel. Before it, I observed that program does right calculation only with first element of Cl Buffer. Every next calculations will be more and more wrong. So, because of unknown why, program mixes up memory from different executions (between different elements of buffer).

      If you have any idea about of this problem or about another way, how to do it, please help.

        • Re: Problem with memory in OpenCL 1.2
          himanshu.gautam

          Hi,

          I would suggest you to try attaching the source codes of your testcase as a zip flle. This way it looks very overwhelming.

          It looks like i will need #include <CbmL1RichENNRingFinderParallel.h> file to compile it.

           

          From the kernel point of view, i see you creating a ENNOutput object (having 1000 * 100 ints). Now that is a huge number per work-item of GPU. Does this code work for you, when you have smaller arrays?

          I will try compiling it and let you know anyways.

           

          EDIT: Tried compiling, but many header files are missing, in the source you have posted. Please attach a test case with required headers.

           

          Message was edited by: Himanshu Gautam

          • Re: Problem with memory in OpenCL 1.2
            himanshu.gautam

            1. Since you use CMAKE, this could work both on windows and linux? Are you seeing the problem on both?

             

            2. Bitness of your plaform - 32 or 64bit?

             

            3. Also, What is the APP SDK version that you have installed? Try 2.8 - Thats the latest.

             

            [edit]

             

            The C.cpp file in the package you attached above includes "CbmL1RichENNRingFinderParallel.h". I did not find this anywhere in the package.

            Can you please attach this file too?

              • Re: Problem with memory in OpenCL 1.2
                usachovandrii

                Ooo.Sorry)

                I work with 64-bit Ubuntu, but I had the same problem with another Linux earlier.

                I use the latest APP SDK

                    • Re: Problem with memory in OpenCL 1.2
                      himanshu.gautam

                      Looking into this....Will get back soon. Thanks for your time,

                      • Re: Problem with memory in OpenCL 1.2
                        himanshu.gautam

                        Hi,

                        Sorry it took time. Got stuck in some other issues.

                         

                        From my observation of the test case, I have see two problems:

                        1. When run on a multi-core CPU, clCreateSubDevices give CL_INVALID_VALUE error. Here the usage of the API looks a little faulty to me. The recommended usage would be like:

                        ret = clCreateSubDevices ( device_id, props, 0, NULL, &num_devices_ret );

                              checkErr(ret, "clCreateSubDevices");

                          std::cout << "num_devices_ret:" << num_devices_ret << std::endl;

                          cl_device_id * out_devices = malloc(num_devices_ret * sizeof(cl_device_id));

                          ret = clCreateSubDevices ( device_id, props, num_devices_ret , out_devices , NULL);

                              checkErr(ret, "clCreateSubDevices");

                        2. Once this was implemented, the kernel ran. The application ran properly for small number of global threads. For 1024 global threads, runtime gives a segmentation fault. This looks reasonable as the structs used here are very large. For 1024 threads: ENNOutput size = 419MB. But my card only shows 128MB as Max Memory allocatable. So IMHO segmentation fault is expected here.

                        I even ran for 1024 threads by reducing the struct sizes and that ran fine.

                          • Re: Problem with memory in OpenCL 1.2
                            usachovandrii

                            Thank you a lot for your reply. Sure, I haven't a segmentation fault after reduction the size of thread.

                            But problem with buffer still exists.

                            Program writes to global buffer only first element of array ENNRing, which is field of structure ENNOutput. Also next elements of buffer are not written.

                            Only filling some elemets of ENNOutput were added.

                            Se in attached archive.

                             

                            With best regards, Andrii

                              • Re: Problem with memory in OpenCL 1.2
                                himanshu.gautam

                                Hi,

                                Can you tell what particular file to look here.

                                Or are you expecting me to run the testcase and then check output file?

                                 

                                Also to confirm, you are saying that only a few of your ENNOutput variables are updated after kernel execution?

                                  • Re: Problem with memory in OpenCL 1.2
                                    usachovandrii

                                    Hello)

                                    As you see. I do the same with every element of OpenCl buffer

                                         __kernel void DoFind( __global ENNInput *InBuffer,

                                                         __global ENNOutput *OutBuffer)

                                         {

                                           int igl = get_global_id(0);

                                           printf("Event %d\n",igl);

                                           int NRings = 1;

                                           printf("%d\n",NRings);

                                           OutBuffer[igl].RingsArraySize = NRings;

                                           OutBuffer[igl].RingsArray[0].x = 1;

                                           OutBuffer[igl].RingsArray[1].x = 1;

                                         }

                                     

                                    So, I expect that for every element of OutBuffer :   .RingsArraySize=1

                                                                                                             .RingsArray[0].x = 1;

                                                                                                              RingsArray[1].x = 1;

                                     

                                    some fields are nonzero.

                                    But in    C.cpp      I added printing of fields.

                                           for(int jj=0;jj<NEventsPerThread;jj++)

                                           {

                                                          std::cout<<"Event " <<jj<< "  RingsArraySize = " << ENNRingBufferOutput.fOutput[jj].RingsArraySize << std::endl;

                                                          std::cout<<"  "<< "  RingsArray[0].x = " << ENNRingBufferOutput.fOutput[jj].RingsArray[0].x << std::endl;

                                                 std::cout<<"  "<< "  RingsArray[1].x = " << ENNRingBufferOutput.fOutput[jj].RingsArray[1].x << std::endl;

                                            }

                                    So, I expect to see the same for every element of buffer.

                                    Something like:

                                          Event 2           RingsArraySize = 1

                                                             RingsArray[0].x = 1

                                                             RingsArray[1].x = 1

                                          Event 3           RingsArraySize = 1

                                                             RingsArray[0].x = 1

                                                             RingsArray[1].x = 1


                                     

                                    But realy we have:


                                    Event 0  RingsArraySize = 1

                                        RingsArray[0].x = 1

                                        RingsArray[1].x = 0

                                    Event 1  RingsArraySize = 0

                                        RingsArray[0].x = 0

                                        RingsArray[1].x = 0

                                    Event 2  RingsArraySize = 0

                                        RingsArray[0].x = 0

                                        RingsArray[1].x = 0

                                     


                                     


                        • Re: Problem with memory in OpenCL 1.2
                          usachovandrii

                          As you see, it is some problem with transference of array of structures. But input includes an array of structures too and no problem there. Problem is only in output buffer

                          • Re: Problem with memory in OpenCL 1.2
                            usachovandrii

                            Does exist any restrictions for used stuctures?

                            • Re: Problem with memory in OpenCL 1.2
                              usachovandrii

                              doesn't matter. I found an error.

                              Anyway, thank you a lot for your replying.

                              I found another problem, but I will create a new topic for it