Hello,
I am trying to get a simple dynamic parallelism code to run using OpenCL 2.0 on my system. Unfortunately I get an LLVM error:
Error in hsa_code section, at offset 1572:
Instruction has invalid segment (global), expected one of: group, private
LLVM ERROR:
Brig container validation has failed in BRIGAsmPrinter.cpp
Sometimes the OpenCL compile process terminates with a Segmentation Fault...
My system: Ubuntu 14.04 x64, AMD A10-7890K Radeon R7, 12 Compute Cores 4C+8G, dmesg: module loaded - fglrx 15.20.3 [Sep 8 2015] with 1 minors
Here is my code:
#include <CL/cl.h>
#include <CL/cl_platform.h>
#include <stddef.h>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
using namespace std;
// Use a static data size for simplicity
//
#define DATA_SIZE (16)
const unsigned int data_size = DATA_SIZE;
// Simple compute kernel
//
const char *KernelSource = "\n"
"#define FRAG_SIZE (16 / 😎 \n"
"__kernel void square( \n"
" __global void* data, \n"
" unsigned int size, \n"
" __global void* error) \n"
"{ \n"
" size_t pgid = get_global_id(0); \n"
" \n"
" // create 1 dim ndrange \n"
" ndrange_t ndrange = ndrange_1D(FRAG_SIZE); \n"
" \n"
" // store the device's default queue \n"
" queue_t default_queue = get_default_queue(); \n"
" \n"
" // declare/define the child kernels code \n"
" void (^fun_blk)(void) = ^{((__global char*)data)[pgid * FRAG_SIZE + get_global_id(0)] = pgid;}; \n"
" \n"
" // enqueue child kernels \n"
" *((__global int*)error) = enqueue_kernel(default_queue, \n"
" CLK_ENQUEUE_FLAGS_NO_WAIT, \n"
" ndrange, \n"
" fun_blk); \n"
"} \n"
"\n";
void print(char* data, unsigned int size) {
for (unsigned int i = 0; i < size; ++i) {
cout << "[";
cout << setw(2) << setfill('0') << hex << (int) data;
cout << "]";
}
cout << endl;
}
int main(void) {
cl_int errCPU; // error code returned from api calls
cl_int* errGPU; // error code returned from device CL C calls
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue queue_host; // host's command queue
cl_command_queue queue_device; // device's command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
unsigned int count = 2;
void* data;
// Connect to a platform
//
cl_platform_id platforms[2];
errCPU = clGetPlatformIDs(2, platforms, &count);
assert(errCPU == CL_SUCCESS);
// Connect to a compute device
//
errCPU = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
assert(errCPU == CL_SUCCESS);
if (errCPU != CL_SUCCESS) {
printf("Error: Failed to create a device group!\n");
exit(1);
}
const size_t size = 1024;
char deviceName[size];
size_t size2 = 0;
errCPU = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(char) * size, (void*) deviceName, &size2);
string strDeviceName(deviceName);
cout << strDeviceName << endl;
// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &errCPU);
assert(errCPU == CL_SUCCESS);
if (!context) {
printf("Error: Failed to create a compute context!\n");
exit(1);
}
// Fill our data set with random float values
//
data = clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, DATA_SIZE, 0);
assert(data != nullptr);
int i = 0;
for (i = 0; i < DATA_SIZE; i++)
reinterpret_cast<char*>(data) = 'A';
// Allocate SVM error memory
//
errGPU = reinterpret_cast<cl_int*>(clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_int), 0));
// Create host's command queues
//
cl_queue_properties props_host[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0, 0 };
queue_host = clCreateCommandQueueWithProperties(context, device_id, props_host, &errCPU);
assert(errCPU == CL_SUCCESS);
// Create device's command queues
//
cl_queue_properties props_device[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT, 0, 0 };
queue_device = clCreateCommandQueueWithProperties(context, device_id, props_device, &errCPU);
assert(errCPU == CL_SUCCESS);
if (!queue_host) {
printf("Error: Failed to create a command commands!\n");
exit(1);
}
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &errCPU);
assert(errCPU == CL_SUCCESS);
if (!program) {
printf("Error: Failed to create compute program!\n");
exit(1);
}
// Build the program executable
//
errCPU = clBuildProgram(program, 0, NULL, "-cl-opt-disable -cl-std=CL2.0 -g -Werror", NULL, NULL);
if (errCPU != CL_SUCCESS) {
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
kernel = clCreateKernel(program, "square", &errCPU);
assert(errCPU == CL_SUCCESS);
if (!kernel || errCPU != CL_SUCCESS) {
printf("Error: Failed to create compute kernel!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
errCPU = 0;
errCPU |= clSetKernelArgSVMPointer(kernel, 0, data);
errCPU |= clSetKernelArg(kernel, 1, sizeof(data_size), &data_size);
errCPU |= clSetKernelArgSVMPointer(kernel, 2, errGPU);
if (errCPU != CL_SUCCESS) {
printf("Error: Failed to set kernel arguments! %d\n", errCPU);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
errCPU = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (errCPU != CL_SUCCESS) {
printf("Error: Failed to retrieve kernel work group info! %d\n", errCPU);
exit(1);
}
cout << "CL_KERNEL_WORK_GROUP_SIZE: " << local << endl;
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = 8;
local = 8;
errCPU = clEnqueueNDRangeKernel(queue_host, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if (errCPU) {
printf("Error: Failed to execute kernel!\n");
exit(1);
}
// Wait for the command commands to get serviced before reading back results
//
clFinish(queue_host);
// Validate our results
//
// TODO
print((char*) data, data_size);
// Shutdown and cleanup
//
clSVMFree(context, data);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue_host);
clReleaseContext(context);
getchar();
return 0;
}
has anyone an idea?
Thanks for your help.
Solved! Go to Solution.
Hi,
I couldn't get the error when I tried with Crimson 15.12. Please check with this driver from here: Desktop
Regards,
Hi,
I couldn't get the error when I tried with Crimson 15.12. Please check with this driver from here: Desktop
Regards,