1 Reply Latest reply on Jan 24, 2017 5:19 AM by dipak

    OpenCL 2.0 Compiler Segmentation Fault or LLVM Error

    vrcomputing

      Hello,

       

      I am trying to get a simple dynamic parallelism code to run using OpenCL 2.0 on my system. Unfortunately I get an LLVM error:

      Error in hsa_code section, at offset 1572:

      Instruction has invalid segment (global), expected one of: group, private

      LLVM ERROR:

      Brig container validation has failed in BRIGAsmPrinter.cpp

       

      Sometimes the OpenCL compile process terminates with a Segmentation Fault...

      My system: Ubuntu 14.04 x64, AMD A10-7890K Radeon R7, 12 Compute Cores 4C+8G, dmesg: module loaded - fglrx 15.20.3 [Sep  8 2015] with 1 minors

       

      Here is my code:

      #include <CL/cl.h>
      #include <CL/cl_platform.h>
      #include <stddef.h>
      #include <cassert>
      #include <cstdio>
      #include <cstdlib>
      #include <iomanip>
      #include <iostream>
      
      using namespace std;
      
      // Use a static data size for simplicity
      //
      #define DATA_SIZE (16)
      const unsigned int data_size = DATA_SIZE;
      
      // Simple compute kernel
      //
      const char *KernelSource = "\n"
              "#define FRAG_SIZE (16 / 8)                                                \n"
              "__kernel void square(                                                     \n"
              "   __global void* data,                                                   \n"
              "   unsigned int size,                                                     \n"
              "   __global void* error)                                                   \n"
              "{                                                                         \n"
              "   size_t pgid = get_global_id(0);                                        \n"
              "                                                                          \n"
              "   // create 1 dim ndrange                                                \n"
              "    ndrange_t ndrange = ndrange_1D(FRAG_SIZE);                             \n"
              "                                                                          \n"
              "   // store the device's default queue                                    \n"
              "   queue_t default_queue = get_default_queue();                           \n"
              "                                                                          \n"
              "   // declare/define the child kernels code                               \n"
              "   void (^fun_blk)(void) = ^{((__global char*)data)[pgid * FRAG_SIZE + get_global_id(0)] = pgid;}; \n"
              "                                                                          \n"
              "   // enqueue child kernels                                               \n"
              "   *((__global int*)error) = enqueue_kernel(default_queue,                                 \n"
              "      CLK_ENQUEUE_FLAGS_NO_WAIT,                                           \n"
              "      ndrange,                                                             \n"
              "      fun_blk);                                                            \n"
              "}                                                                         \n"
              "\n";
      
      void print(char* data, unsigned int size) {
          for (unsigned int i = 0; i < size; ++i) {
              cout << "[";
              cout << setw(2) << setfill('0') << hex << (int) data[i];
              cout << "]";
          }
          cout << endl;
      }
      
      int main(void) {
      
          cl_int errCPU;                      // error code returned from api calls
          cl_int* errGPU;                      // error code returned from device CL C calls
      
          size_t global; // global domain size for our calculation
          size_t local;  // local domain size for our calculation
      
          cl_device_id device_id;  // compute device id
          cl_context context;  // compute context
          cl_command_queue queue_host;  // host's command queue
          cl_command_queue queue_device;  // device's command queue
          cl_program program;  // compute program
          cl_kernel kernel;  // compute kernel
      
          unsigned int count = 2;
          void* data;
      
      // Connect to a platform
      //
          cl_platform_id platforms[2];
          errCPU = clGetPlatformIDs(2, platforms, &count);
          assert(errCPU == CL_SUCCESS);
      
      // Connect to a compute device
      //
          errCPU = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
          assert(errCPU == CL_SUCCESS);
          if (errCPU != CL_SUCCESS) {
              printf("Error: Failed to create a device group!\n");
              exit(1);
          }
      
          const size_t size = 1024;
          char deviceName[size];
          size_t size2 = 0;
          errCPU = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(char) * size, (void*) deviceName, &size2);
          string strDeviceName(deviceName);
          cout << strDeviceName << endl;
      
          // Create a compute context
          //
          context = clCreateContext(0, 1, &device_id, NULL, NULL, &errCPU);
          assert(errCPU == CL_SUCCESS);
          if (!context) {
              printf("Error: Failed to create a compute context!\n");
              exit(1);
          }
      
          // Fill our data set with random float values
          //
          data = clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, DATA_SIZE, 0);
          assert(data != nullptr);
      
          int i = 0;
          for (i = 0; i < DATA_SIZE; i++)
              reinterpret_cast<char*>(data)[i] = 'A';
      
          // Allocate SVM error memory
          //
          errGPU = reinterpret_cast<cl_int*>(clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_int), 0));
      
          // Create host's command queues
          //
          cl_queue_properties props_host[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0, 0 };
          queue_host = clCreateCommandQueueWithProperties(context, device_id, props_host, &errCPU);
          assert(errCPU == CL_SUCCESS);
      
          // Create device's command queues
          //
          cl_queue_properties props_device[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT, 0, 0 };
          queue_device = clCreateCommandQueueWithProperties(context, device_id, props_device, &errCPU);
          assert(errCPU == CL_SUCCESS);
      
          if (!queue_host) {
              printf("Error: Failed to create a command commands!\n");
              exit(1);
          }
      
          // Create the compute program from the source buffer
          //
          program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &errCPU);
          assert(errCPU == CL_SUCCESS);
      
          if (!program) {
              printf("Error: Failed to create compute program!\n");
              exit(1);
          }
      
          // Build the program executable
          //
          errCPU = clBuildProgram(program, 0, NULL, "-cl-opt-disable -cl-std=CL2.0 -g -Werror", NULL, NULL);
      
          if (errCPU != CL_SUCCESS) {
              size_t len;
              char buffer[2048];
      
              printf("Error: Failed to build program executable!\n");
              clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
              printf("%s\n", buffer);
              exit(1);
          }
      
          // Create the compute kernel in the program we wish to run
          //
          kernel = clCreateKernel(program, "square", &errCPU);
          assert(errCPU == CL_SUCCESS);
      
          if (!kernel || errCPU != CL_SUCCESS) {
              printf("Error: Failed to create compute kernel!\n");
              exit(1);
          }
      
      // Set the arguments to our compute kernel
      //
          errCPU = 0;
          errCPU |= clSetKernelArgSVMPointer(kernel, 0, data);
          errCPU |= clSetKernelArg(kernel, 1, sizeof(data_size), &data_size);
          errCPU |= clSetKernelArgSVMPointer(kernel, 2, errGPU);
          if (errCPU != CL_SUCCESS) {
              printf("Error: Failed to set kernel arguments! %d\n", errCPU);
              exit(1);
          }
      
      // Get the maximum work group size for executing the kernel on the device
      //
          errCPU = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
          if (errCPU != CL_SUCCESS) {
              printf("Error: Failed to retrieve kernel work group info! %d\n", errCPU);
              exit(1);
          }
          cout << "CL_KERNEL_WORK_GROUP_SIZE: " << local << endl;
      
      // Execute the kernel over the entire range of our 1d input data set
      // using the maximum number of work group items for this device
      //
          global = 8;
          local = 8;
          errCPU = clEnqueueNDRangeKernel(queue_host, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
          if (errCPU) {
              printf("Error: Failed to execute kernel!\n");
              exit(1);
          }
      
      // Wait for the command commands to get serviced before reading back results
      //
          clFinish(queue_host);
      
      // Validate our results
      //
      // TODO
          print((char*) data, data_size);
      
      // Shutdown and cleanup
      //
          clSVMFree(context, data);
          clReleaseProgram(program);
          clReleaseKernel(kernel);
          clReleaseCommandQueue(queue_host);
          clReleaseContext(context);
          getchar();
          return 0;
      }
      
      

       

      has anyone an idea?

       

      Thanks for your help.