AnsweredAssumed Answered

clEnqueueNDRangeKernel returns  CL_INVALID_PROGRAM_EXECUTABLE

Question asked by landlord on Sep 11, 2015
Latest reply on Sep 11, 2015 by landlord

I want to use two GPUs to perform a matrix calculation, so I create two command queues, one for each device.

Additionally, clBuildProgram() returns CL_SUCCESS. An abridged version of my code is as follows:

===============================================================================================

   //get platform

    status = clGetPlatformIDs(0, NULL, &numPlatforms);

    if(status != CL_SUCCESS)

    {

        printf("Error: Getting Platforms.\(clGetPlatformsIDs)\n");

        return EXIT_FAILURE;

    }

 

    cl_platform_id *platforms = NULL;

    if(numPlatforms > 0)

    {

        //Allocate enough space for each platform

 

 

        platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));

        //fill in the platforms

        status = clGetPlatformIDs(numPlatforms, platforms, NULL);

        if(status != CL_SUCCESS)

        {

            printf("Error: Getting Platform Ids.\(clGetPlatformsIDs)\n");

            return -1;

        }

     

     

    }

 

    cl_uint numDevices = 0;

    cl_device_id *devices = NULL;

    status = clGetDeviceIDs(

        platforms[0],

        CL_DEVICE_TYPE_GPU,

        0,

        NULL,

        &numDevices);

 

    // Allocate enough space for each device

 

    devices = (cl_device_id*)malloc(

            numDevices*sizeof(cl_device_id));

 

    // Fill in devices with clGetDeviceIDs()

 

    status = clGetDeviceIDs(

        platforms[0],

        CL_DEVICE_TYPE_GPU,     

        numDevices,

        devices,

        NULL);

    char devicename[128];

 

    //print the devices' names.

 

    for(int i=0;i<numDevices;i++){

    status=clGetDeviceInfo(devices[i],CL_DEVICE_NAME,128*sizeof(char),devicename,NULL);

    if (status != CL_SUCCESS) {

        printf("Error: Getting devices Info (device name, clGetDeviceInfo)\n");

     

    }

    printf("device[%d] name:\t%s\n",i,devicename);

    }

    cl_context context=NULL;

    context = clCreateContext(

        NULL,

        numDevices,

        devices,

        NULL,

        NULL,

        &status);

 

    const char* source = readSource("matrix.cl");

    cl_program program = clCreateProgramWithSource(context,1,&source,NULL,&status);

   if(status != CL_SUCCESS)

    {

printf("Error: Loading Binary into cl_program \(clCreateProgramWithBinary)\n");
return EXIT_FAILURE;

    }

 

 

    string buildflag="-cl-std=CL2.0";

    status = clBuildProgram(program, 1, devices, buildflag.c_str(), NULL, NULL); // It returns CL_SUCCESS when debug

    if(status != CL_SUCCESS)

    {

printf("Error: Building Program \(clBuildProgram)\n");
printf("%d",status);
char buildLog[16384];
clGetProgramBuildInfo(program,*devices,CL_PROGRAM_BUILD_LOG,sizeof(buildLog),buildLog,NULL);

 

cerr<<"in kernel"<<endl;
cerr<<buildLog;
clReleaseProgram(program);
return EXIT_FAILURE;

    }

//kernels of device[0]

  cl_kernel kernel = clCreateKernel(program, "hellocl", &status);

    cl_kernel kernelend =clCreateKernel(program, "helloclend", &status);

    cl_kernel kernelagain =clCreateKernel(program, "helloclagain", &status);

//kernels of device[1]

    cl_kernel kernel1 = clCreateKernel(program, "hellocl1", &status);

    cl_kernel kernelend1 =clCreateKernel(program, "helloclend1", &status);

    cl_kernel kernelagain1 =clCreateKernel(program, "helloclagain1", &status);

if(status != CL_SUCCESS)

    {

printf("Error: Creating Kernel from program.\(clCreateKernel)\n");
return EXIT_FAILURE;

    }

    cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);        //cmd queue for device[0]

    cl_command_queue commandQueue1 = clCreateCommandQueue(context, devices[1], 0, &status);      //cmd queue for device[1]

    if(status != CL_SUCCESS)

    {

        printf("Creating Command Queue.\(clCreatCommandQueue)\n");

        return EXIT_FAILURE;

    }

    cl_command_queue_properties devProp[] = {CL_QUEUE_PROPERTIES,

       CL_QUEUE_ON_DEVICE|

       CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|

       CL_QUEUE_ON_DEVICE_DEFAULT,

       CL_QUEUE_SIZE,

       524288,

       0};

    cl_command_queue devCommandQueue;                                                                                               //device-side queue for device[0]

    cl_command_queue devCommandQueue1;                                                                                             //device-side queue for device[1]

    devCommandQueue = clCreateCommandQueueWithProperties(

                                                       context,

                                                       devices[0],

                                                       devProp,

                                                       &status);

    devCommandQueue = clCreateCommandQueueWithProperties(

                                                       context,

                                                       devices[1],

                                                       devProp,

                                                       &status);

    if(status != CL_SUCCESS)

    {

        printf("Creating device-Command Queue.\(clCreatCommandQueuewithproperties)\n");

        return EXIT_FAILURE;

    }

 

    // PASS score-matrix in "outbuffer" in CPU to "outputBuffer" in GPU;commandQueue

    status = clEnqueueWriteBuffer(commandQueue,outputBuffer,CL_TRUE,0,

            (strlen(strA)+1) *(strlen(strB)+1) * 4,outbuffer,0, NULL, NULL);

    status = clEnqueueWriteBuffer(commandQueue1,outputBuffer,CL_TRUE,0,

            (strlen(strA)+1) *(strlen(strB)+1) * 4,outbuffer,0, NULL, NULL);

 

     // pass strA in CPU to bufferA in GPU

    status = clEnqueueWriteBuffer(commandQueue,bufferA,CL_TRUE,0,

            (strlen(strA)+1) * sizeof(char),strA,0, NULL, NULL);

    status = clEnqueueWriteBuffer(commandQueue1,bufferA,CL_TRUE,0,

            (strlen(strA)+1) * sizeof(char),strA,0, NULL, NULL);

    // pass strB in CPU to bufferB in GPU

    status = clEnqueueWriteBuffer(commandQueue,bufferB,CL_TRUE,0,

            (strlen(strB)+1) * sizeof(char),strB,0, NULL, NULL);

    status = clEnqueueWriteBuffer(commandQueue1,bufferB,CL_TRUE,0,

            (strlen(strB)+1) * sizeof(char),strB,0, NULL, NULL);

 

//set arguments of every kernels...

 

        size_t globalThreads[] = {(strlen(strB)+1)/2};                              // strlen(strB)+1=2246

        size_t globalThreads1[] = {((strlen(strB)+1)-(strlen(strB)+1)/2)};

        if(((strlen(strB)+1)/2)<256)

        {

            size_t localThreads[] = {(strlen(strB)+1)/2};

            status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalThreads,localThreads, 0, NULL,NULL);

            if(status != CL_SUCCESS)

            {

                printf("Error: Enqueueing kernel\n");

                return EXIT_FAILURE;

            }

        }

        else

        {

            size_t localThreads[] = {256};

            // clEnqueueNDRangeKernel: put "kernel" into "commandQueue"

            status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalThreads,localThreads, 0, NULL,NULL);   //This function returns CL_SUCCESS

 

            if(status != CL_SUCCESS)

            {

                printf("Error: Enqueueing kernel\n");

                return EXIT_FAILURE;

            }

 

 

        }

 

        if(((strlen(strB)+1)-(strlen(strB)+1)/2)<256)

        {

            size_t localThreads[] = {((strlen(strB)+1)-(strlen(strB)+1)/2)};

            // clEnqueueNDRangeKernel: put "kernel" into "commandQueue"

            status = clEnqueueNDRangeKernel(commandQueue1, kernel1, 1, NULL, globalThreads1,localThreads, 0, NULL,NULL);

            if(status != CL_SUCCESS)

            {

                printf("Error: Enqueueing kernel\n");

                return EXIT_FAILURE;

            }

        }

        else

        {

            size_t localThreads[] = {256};

            // clEnqueueNDRangeKernel: put "kernel" into "commandQueue"

            status = clEnqueueNDRangeKernel(commandQueue1, kernel1, 1, NULL, globalThreads1,localThreads, 0, NULL,NULL);   // This function returns CL_INVALID_PROGRAM_EXECUTABLE

 

            if(status != CL_SUCCESS)

            {

                printf("Error: Enqueueing kernel\n");

                return EXIT_FAILURE;

            }

 

 

        }

 

===================================================================================================================================

I can attach my program file if necessary.

Please help.

Outcomes