Archives Discussions

usachovandrii · ‎02-06-2013

Hello, everyone!

I have i great problem with memory on output from kernel.

I use OpenCL 1.2 for parallel programming on CPU.

As input I have an OpenCL buffer with sctructures ENNInput. Every ENNInput contains a static array of another structures (ENNHit) and also number of elements. Ouput(ENNOutput) is similar, but it contains array of another structures(ENNRing).

Previously, I hadn't any problem when Buffer elements didn't contain an array as member of structure.

Of course, I can't use containers instead static array. Also no idea, how to solve it with using 2D arrays.

Below, you can read a program and kernel. My kernel does nothing except writing a one member of ENNOutput structure.

#include <CbmL1RichENNRingFinderParallel.h>
#include <ENNInput.h>
#include <ENNInputArray.h>
#include <CL/cl.h>
#include <fstream>
#include <iostream>
#include <cstdio>
#include <cstring>
#include <vector>
using namespace std;
#define MAX_SOURCE_SIZE (0x1000000)
inline void checkErr(cl_int err, const char * name)
{
if (err != CL_SUCCESS) {
    std::cerr << "ERROR: " << name
              << " (" << err << ")" << std::endl;
}
}
int main()
{
int firstEvent = 0;
int lastEvent = 1000;
int tasks = 1;
int cores = 1;

string filePrefix = "../input";
string DatafilePrefix = "../input/ENNHitsDataEvent_";
string MCPointsfilePrefix = "../input/MCPointsEvent_";
string MCTracksfilePrefix = "../input/MCTracksEvent_";
const int NEventsPerThread = lastEvent - firstEvent + 1;
ENNInput* InputDataPerThread = new ENNInput[NEventsPerThread];
int NEv = 0;
for ( int kEvents = firstEvent; kEvents <= lastEvent; kEvents++ )
{
    if (!ReadHitsFromFile(DatafilePrefix,kEvents, &InputDataPerThread[NEv]))
    {
      cout << "Hits Data for Event " << kEvents << " can't be read." << std::endl;
      break;
    }
    NEv++;
}
ENNInputArray ENNRingBufferInput;
ENNOutputArray ENNRingBufferOutput;
ENNRingBufferInput.fInput = new ENNInput[NEventsPerThread];
ENNRingBufferOutput.fOutput = new ENNOutput[NEventsPerThread];
for(int j=0; j<NEventsPerThread; j++)
{
    ENNRingBufferInput.fInput = InputDataPerThread;
}

FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("../DoFind.cl", "r");

source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
      checkErr(ret, "clGetPlatformIDs");

ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices);
      checkErr(ret, "clGetDeviceIDs");
cl_uint num_devices_ret;
cl_device_id out_devices[cores];
const cl_device_partition_property props[] = {CL_DEVICE_PARTITION_EQUALLY, tasks, 0};
ret = clCreateSubDevices ( device_id, props, cores , out_devices , &num_devices_ret );
      checkErr(ret, "clCreateSubDevices");
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &out_devices[0], NULL, NULL, &ret);

// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, out_devices[0], CL_QUEUE_PROFILING_ENABLE, &ret);

// Create memory buffers on the device for each vector
cl_mem hits_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,NEventsPerThread* sizeof(ENNInput), NULL, &ret);
cl_mem rings_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, NEventsPerThread* sizeof(ENNOutput), NULL, &ret);

// Copy tracks and rings to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, hits_mem_obj, CL_TRUE, 0, NEventsPerThread* sizeof(ENNInput), ENNRingBufferInput.fInput, 0, NULL, NULL);
      checkErr(ret, "clEnqueueWriteBuffer");
ret = clEnqueueWriteBuffer(command_queue, rings_mem_obj, CL_TRUE, 0, NEventsPerThread* sizeof(ENNOutput), ENNRingBufferOutput.fOutput, 0, NULL, NULL);
      checkErr(ret, "clEnqueueWriteBuffer");
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,(const char **)&source_str, (const size_t *)&source_size, &ret);
      checkErr(ret, "clCreateProgram");

// Build the program
ret = clBuildProgram(program, 1, &out_devices[0], "-x clc++", NULL, NULL);
      checkErr(ret, "clBuildProgram");

// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "DoFind", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&hits_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&rings_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = NEventsPerThread; // Process the entire lists
//    size_t local_item_size = NCopy/4; // Process in groups of 64
size_t local_item_size = 1; // Process in groups of 64
cl_event event;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &event);
       checkErr(ret, "clEnqueueNDRangeKernel");
ret = clWaitForEvents(1 , &event);
       checkErr(ret, "clWaitForEvents");
ret = clEnqueueReadBuffer(command_queue, rings_mem_obj, CL_TRUE, 0, NEventsPerThread * sizeof(ENNOutput), ENNRingBufferOutput.fOutput, 0, NULL, NULL);
       checkErr(ret, "clEnqueueReadBuffer");
cl_ulong time_start, time_end;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
// Clean up
       ret = clFlush(command_queue);
       ret = clFinish(command_queue);
       ret = clReleaseKernel(kernel);
       ret = clReleaseProgram(program);
       ret = clReleaseMemObject(hits_mem_obj);
       ret = clReleaseMemObject(rings_mem_obj);
       ret = clReleaseCommandQueue(command_queue);
       ret = clReleaseContext(context);
return 0;
}

KERNEL:

struct ENNHit
{
    float x, y, RefId;      // coordinates
    int quality;            // quality of the best ring with this hit
    int localIndex;   // index in local copy of Clone array
};
struct ENNRing
{
    bool on;                   // is the ring selected?
    float x, y, r;            // parameters
    float chi2;               // chi^2
    // variables for the selection procedure:
    int NHits;                 // number of ring hits
    int NOwn;                // number of its own hits
    bool skip;             // skip the ring during selection
    int localIHitsSize;
    int localIHits [100]; // indexes of hits in local array
};
struct ENNInput
{
int HitsArraySize;
ENNHit HitsArray [2000];
};
struct ENNOutput
{
int RingsArraySize;
ENNRing RingsArray [1000];
};
__kernel void DoFind( __global ENNInput *InBuffer,
                     __global ENNOutput *OutBuffer)
{
int igl = get_global_id(0);
printf("Event %d\n",igl);
int NRings = 1;
printf("%d\n",NRings);
OutBuffer[igl].RingsArraySize = NRings;
}

As result I have segmentation fault, when some element of buffer is processing. So program can't process whole buffer.

Of course I cut all calculations from kernel. Before it, I observed that program does right calculation only with first element of Cl Buffer. Every next calculations will be more and more wrong. So, because of unknown why, program mixes up memory from different executions (between different elements of buffer).

If you have any idea about of this problem or about another way, how to do it, please help.

Archives Discussions

Problem with memory in OpenCL 1.2