2 Replies Latest reply on Sep 11, 2015 7:13 AM by landlord

    clEnqueueNDRangeKernel returns CL_INVALID_PROGRAM_EXECUTABLE

    landlord

      I want to use two GPUs to perform a matrix calculation, and therefore I create two command queues, one for each device.

      Additionally, clBuildProgram() returns CL_SUCCESS. An abstract of my code follows:

      ===============================================================================================

         //get platform

          status = clGetPlatformIDs(0, NULL, &numPlatforms);

          if(status != CL_SUCCESS)

          {

              printf("Error: Getting Platforms.\(clGetPlatformsIDs)\n");

              return EXIT_FAILURE;

          }

       

          cl_platform_id *platforms = NULL;

          if(numPlatforms > 0)

          {

              //Allocate enough space for each platform

       

       

              platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));

              //fill in the platforms

              status = clGetPlatformIDs(numPlatforms, platforms, NULL);

              if(status != CL_SUCCESS)

              {

                  printf("Error: Getting Platform Ids.\(clGetPlatformsIDs)\n");

                  return -1;

              }

           

           

          }

       

          cl_uint numDevices = 0;

          cl_device_id *devices = NULL;

          status = clGetDeviceIDs(

              platforms[0],

              CL_DEVICE_TYPE_GPU,

              0,

              NULL,

              &numDevices);

       

          // Allocate enough space for each device

       

          devices = (cl_device_id*)malloc(

                  numDevices*sizeof(cl_device_id));

       

          // Fill in devices with clGetDeviceIDs()

       

          status = clGetDeviceIDs(

              platforms[0],

              CL_DEVICE_TYPE_GPU,     

              numDevices,

              devices,

              NULL);

          char devicename[128];

       

          //print the devices' names.

       

          for(int i=0;i<numDevices;i++){

          status=clGetDeviceInfo(devices[i],CL_DEVICE_NAME,128*sizeof(char),devicename,NULL);

          if (status != CL_SUCCESS) {

              printf("Error: Getting devices Info (device name, clGetDeviceInfo)\n");

           

          }

          printf("device[%d] name:\t%s\n",i,devicename);

          }

          cl_context context=NULL;

          context = clCreateContext(

              NULL,

              numDevices,

              devices,

              NULL,

              NULL,

              &status);

       

          const char* source = readSource("matrix.cl");

          cl_program program = clCreateProgramWithSource(context,1,&source,NULL,&status);

         if(status != CL_SUCCESS)

          {

      printf("Error: Loading Binary into cl_program \(clCreateProgramWithBinary)\n");
      return EXIT_FAILURE;

          }

       

       

          string buildflag="-cl-std=CL2.0";

          status = clBuildProgram(program, 1, devices, buildflag.c_str(), NULL, NULL); // It returns CL_SUCCESS when debug

          if(status != CL_SUCCESS)

          {

      printf("Error: Building Program \(clBuildProgram)\n");
      printf("%d",status);
      char buildLog[16384];
      clGetProgramBuildInfo(program,*devices,CL_PROGRAM_BUILD_LOG,sizeof(buildLog),buildLog,NULL);

       

      cerr<<"in kernel"<<endl;
      cerr<<buildLog;
      clReleaseProgram(program);
      return EXIT_FAILURE;

          }

      //kernels of device[0]

        cl_kernel kernel = clCreateKernel(program, "hellocl", &status);

          cl_kernel kernelend =clCreateKernel(program, "helloclend", &status);

          cl_kernel kernelagain =clCreateKernel(program, "helloclagain", &status);

      //kernels of device[1]

          cl_kernel kernel1 = clCreateKernel(program, "hellocl1", &status);

          cl_kernel kernelend1 =clCreateKernel(program, "helloclend1", &status);

          cl_kernel kernelagain1 =clCreateKernel(program, "helloclagain1", &status);

      if(status != CL_SUCCESS)

          {

      printf("Error: Creating Kernel from program.\(clCreateKernel)\n");
      return EXIT_FAILURE;

          }

          cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);        //cmd queue for device[0]

          cl_command_queue commandQueue1 = clCreateCommandQueue(context, devices[1], 0, &status);      //cmd queue for device[1]

          if(status != CL_SUCCESS)

          {

              printf("Creating Command Queue.\(clCreatCommandQueue)\n");

              return EXIT_FAILURE;

          }

          cl_command_queue_properties devProp[] = {CL_QUEUE_PROPERTIES,

             CL_QUEUE_ON_DEVICE|

             CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|

             CL_QUEUE_ON_DEVICE_DEFAULT,

             CL_QUEUE_SIZE,

             524288,

             0};

          cl_command_queue devCommandQueue;                                                                                               //device-side queue for device[0]

          cl_command_queue devCommandQueue1;                                                                                             //device-side queue for device[1]

          devCommandQueue = clCreateCommandQueueWithProperties(

                                                             context,

                                                             devices[0],

                                                             devProp,

                                                             &status);

          devCommandQueue = clCreateCommandQueueWithProperties(

                                                             context,

                                                             devices[1],

                                                             devProp,

                                                             &status);

          if(status != CL_SUCCESS)

          {

              printf("Creating device-Command Queue.\(clCreatCommandQueuewithproperties)\n");

              return EXIT_FAILURE;

          }

       

          // PASS score-matrix in "outbuffer" in CPU to "outputBuffer" in GPU;commandQueue

          status = clEnqueueWriteBuffer(commandQueue,outputBuffer,CL_TRUE,0,

                  (strlen(strA)+1) *(strlen(strB)+1) * 4,outbuffer,0, NULL, NULL);

          status = clEnqueueWriteBuffer(commandQueue1,outputBuffer,CL_TRUE,0,

                  (strlen(strA)+1) *(strlen(strB)+1) * 4,outbuffer,0, NULL, NULL);

       

           // pass strA in CPU to bufferA in GPU

          status = clEnqueueWriteBuffer(commandQueue,bufferA,CL_TRUE,0,

                  (strlen(strA)+1) * sizeof(char),strA,0, NULL, NULL);

          status = clEnqueueWriteBuffer(commandQueue1,bufferA,CL_TRUE,0,

                  (strlen(strA)+1) * sizeof(char),strA,0, NULL, NULL);

          // pass strB in CPU to bufferB in GPU

          status = clEnqueueWriteBuffer(commandQueue,bufferB,CL_TRUE,0,

                  (strlen(strB)+1) * sizeof(char),strB,0, NULL, NULL);

          status = clEnqueueWriteBuffer(commandQueue1,bufferB,CL_TRUE,0,

                  (strlen(strB)+1) * sizeof(char),strB,0, NULL, NULL);

       

      // set the arguments of every kernel...

       

              size_t globalThreads[] = {(strlen(strB)+1)/2};                              // strlen(strB)+1=2246

              size_t globalThreads1[] = {((strlen(strB)+1)-(strlen(strB)+1)/2)};

              if(((strlen(strB)+1)/2)<256)

              {

                  size_t localThreads[] = {(strlen(strB)+1)/2};

                  status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalThreads,localThreads, 0, NULL,NULL);

                  if(status != CL_SUCCESS)

                  {

                      printf("Error: Enqueueing kernel\n");

                      return EXIT_FAILURE;

                  }

              }

              else

              {

                  size_t localThreads[] = {256};

                  // clEnqueueNDRangeKernel: put "kernel" into "commandQueue"

                  status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalThreads,localThreads, 0, NULL,NULL);   //This function returns CL_SUCCESS

       

                  if(status != CL_SUCCESS)

                  {

                      printf("Error: Enqueueing kernel\n");

                      return EXIT_FAILURE;

                  }

       

       

              }

       

              if(((strlen(strB)+1)-(strlen(strB)+1)/2)<256)

              {

                  size_t localThreads[] = {((strlen(strB)+1)-(strlen(strB)+1)/2)};

                  // clEnqueueNDRangeKernel: put "kernel" into "commandQueue"

                  status = clEnqueueNDRangeKernel(commandQueue1, kernel1, 1, NULL, globalThreads1,localThreads, 0, NULL,NULL);

                  if(status != CL_SUCCESS)

                  {

                      printf("Error: Enqueueing kernel\n");

                      return EXIT_FAILURE;

                  }

              }

              else

              {

                  size_t localThreads[] = {256};

                  // clEnqueueNDRangeKernel: put "kernel" into "commandQueue"

                  status = clEnqueueNDRangeKernel(commandQueue1, kernel1, 1, NULL, globalThreads1,localThreads, 0, NULL,NULL);   // This function returns CL_INVALID_PROGRAM_EXECUTABLE

       

                  if(status != CL_SUCCESS)

                  {

                      printf("Error: Enqueueing kernel\n");

                      return EXIT_FAILURE;

                  }

       

       

              }

       

      ===================================================================================================================================

      I can attach my program file if necessary.

      Please help.