cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

vrcomputing
Adept I

OpenCL 2.0 Compiler Segmentation Fault or LLVM Error

Jump to solution

Hello,

I am trying to get a simple dynamic parallelism code to run using OpenCL 2.0 on my system. Unfortunately I get an LLVM error:

Error in hsa_code section, at offset 1572:

Instruction has invalid segment (global), expected one of: group, private

LLVM ERROR:

Brig container validation has failed in BRIGAsmPrinter.cpp

Sometimes the OpenCL compile process terminates with a Segmentation Fault...

My system: Ubuntu 14.04 x64, AMD A10-7890K Radeon R7, 12 Compute Cores 4C+8G, dmesg: module loaded - fglrx 15.20.3 [Sep  8 2015] with 1 minors

Here is my code:

#include <CL/cl.h>

#include <CL/cl_platform.h>

#include <stddef.h>

#include <cassert>

#include <cstdio>

#include <cstdlib>

#include <iomanip>

#include <iostream>

using namespace std;

// Use a static data size for simplicity

//

#define DATA_SIZE (16)

const unsigned int data_size = DATA_SIZE;

// Simple compute kernel

//

const char *KernelSource = "\n"

        "#define FRAG_SIZE (16 / 😎                                                \n"

        "__kernel void square(                                                     \n"

        "   __global void* data,                                                   \n"

        "   unsigned int size,                                                     \n"

        "   __global void* error)                                                   \n"

        "{                                                                         \n"

        "   size_t pgid = get_global_id(0);                                        \n"

        "                                                                          \n"

        "   // create 1 dim ndrange                                                \n"

        "    ndrange_t ndrange = ndrange_1D(FRAG_SIZE);                             \n"

        "                                                                          \n"

        "   // store the device's default queue                                    \n"

        "   queue_t default_queue = get_default_queue();                           \n"

        "                                                                          \n"

        "   // declare/define the child kernels code                               \n"

        "   void (^fun_blk)(void) = ^{((__global char*)data)[pgid * FRAG_SIZE + get_global_id(0)] = pgid;}; \n"

        "                                                                          \n"

        "   // enqueue child kernels                                               \n"

        "   *((__global int*)error) = enqueue_kernel(default_queue,                                 \n"

        "      CLK_ENQUEUE_FLAGS_NO_WAIT,                                           \n"

        "      ndrange,                                                             \n"

        "      fun_blk);                                                            \n"

        "}                                                                         \n"

        "\n";

void print(char* data, unsigned int size) {

    for (unsigned int i = 0; i < size; ++i) {

        cout << "[";

        cout << setw(2) << setfill('0') << hex << (int) data;

        cout << "]";

    }

    cout << endl;

}

int main(void) {

    cl_int errCPU;                      // error code returned from api calls

    cl_int* errGPU;                      // error code returned from device CL C calls

    size_t global; // global domain size for our calculation

    size_t local;  // local domain size for our calculation

    cl_device_id device_id;  // compute device id

    cl_context context;  // compute context

    cl_command_queue queue_host;  // host's command queue

    cl_command_queue queue_device;  // device's command queue

    cl_program program;  // compute program

    cl_kernel kernel;  // compute kernel

    unsigned int count = 2;

    void* data;

// Connect to a platform

//

    cl_platform_id platforms[2];

    errCPU = clGetPlatformIDs(2, platforms, &count);

    assert(errCPU == CL_SUCCESS);

// Connect to a compute device

//

    errCPU = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

    assert(errCPU == CL_SUCCESS);

    if (errCPU != CL_SUCCESS) {

        printf("Error: Failed to create a device group!\n");

        exit(1);

    }

    const size_t size = 1024;

    char deviceName[size];

    size_t size2 = 0;

    errCPU = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(char) * size, (void*) deviceName, &size2);

    string strDeviceName(deviceName);

    cout << strDeviceName << endl;

    // Create a compute context

    //

    context = clCreateContext(0, 1, &device_id, NULL, NULL, &errCPU);

    assert(errCPU == CL_SUCCESS);

    if (!context) {

        printf("Error: Failed to create a compute context!\n");

        exit(1);

    }

    // Fill our data set with random float values

    //

    data = clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, DATA_SIZE, 0);

    assert(data != nullptr);

    int i = 0;

    for (i = 0; i < DATA_SIZE; i++)

        reinterpret_cast<char*>(data) = 'A';

    // Allocate SVM error memory

    //

    errGPU = reinterpret_cast<cl_int*>(clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_int), 0));

    // Create host's command queues

    //

    cl_queue_properties props_host[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0, 0 };

    queue_host = clCreateCommandQueueWithProperties(context, device_id, props_host, &errCPU);

    assert(errCPU == CL_SUCCESS);

    // Create device's command queues

    //

    cl_queue_properties props_device[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT, 0, 0 };

    queue_device = clCreateCommandQueueWithProperties(context, device_id, props_device, &errCPU);

    assert(errCPU == CL_SUCCESS);

    if (!queue_host) {

        printf("Error: Failed to create a command commands!\n");

        exit(1);

    }

    // Create the compute program from the source buffer

    //

    program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &errCPU);

    assert(errCPU == CL_SUCCESS);

    if (!program) {

        printf("Error: Failed to create compute program!\n");

        exit(1);

    }

    // Build the program executable

    //

    errCPU = clBuildProgram(program, 0, NULL, "-cl-opt-disable -cl-std=CL2.0 -g -Werror", NULL, NULL);

    if (errCPU != CL_SUCCESS) {

        size_t len;

        char buffer[2048];

        printf("Error: Failed to build program executable!\n");

        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);

        printf("%s\n", buffer);

        exit(1);

    }

    // Create the compute kernel in the program we wish to run

    //

    kernel = clCreateKernel(program, "square", &errCPU);

    assert(errCPU == CL_SUCCESS);

    if (!kernel || errCPU != CL_SUCCESS) {

        printf("Error: Failed to create compute kernel!\n");

        exit(1);

    }

// Set the arguments to our compute kernel

//

    errCPU = 0;

    errCPU |= clSetKernelArgSVMPointer(kernel, 0, data);

    errCPU |= clSetKernelArg(kernel, 1, sizeof(data_size), &data_size);

    errCPU |= clSetKernelArgSVMPointer(kernel, 2, errGPU);

    if (errCPU != CL_SUCCESS) {

        printf("Error: Failed to set kernel arguments! %d\n", errCPU);

        exit(1);

    }

// Get the maximum work group size for executing the kernel on the device

//

    errCPU = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);

    if (errCPU != CL_SUCCESS) {

        printf("Error: Failed to retrieve kernel work group info! %d\n", errCPU);

        exit(1);

    }

    cout << "CL_KERNEL_WORK_GROUP_SIZE: " << local << endl;

// Execute the kernel over the entire range of our 1d input data set

// using the maximum number of work group items for this device

//

    global = 8;

    local = 8;

    errCPU = clEnqueueNDRangeKernel(queue_host, kernel, 1, NULL, &global, &local, 0, NULL, NULL);

    if (errCPU) {

        printf("Error: Failed to execute kernel!\n");

        exit(1);

    }

// Wait for the command commands to get serviced before reading back results

//

    clFinish(queue_host);

// Validate our results

//

// TODO

    print((char*) data, data_size);

// Shutdown and cleanup

//

    clSVMFree(context, data);

    clReleaseProgram(program);

    clReleaseKernel(kernel);

    clReleaseCommandQueue(queue_host);

    clReleaseContext(context);

    getchar();

    return 0;

}

has anyone an idea?

Thanks for your help.

Tags (2)
0 Likes
1 Solution

Accepted Solutions
dipak
Staff
Staff

Re: OpenCL 2.0 Compiler Segmentation Fault or LLVM Error

Jump to solution

Hi,

I couldn't get the error when I tried with Crimson 15.12. Please check with this driver from here: Desktop

Regards,

View solution in original post

0 Likes
1 Reply
dipak
Staff
Staff

Re: OpenCL 2.0 Compiler Segmentation Fault or LLVM Error

Jump to solution

Hi,

I couldn't get the error when I tried with Crimson 15.12. Please check with this driver from here: Desktop

Regards,

View solution in original post

0 Likes