Answered (Assumed Answered)

OpenCL 2.0 - Shared Virtual Memory Programming

Question asked by patricklin on Jul 28, 2015
Latest reply on Jul 29, 2015 by dipak

Hi,

 

I am trying to make a simple tutorial on OpenCL 2.0 SVM, and I chose vector addition to illustrate two features:

A. Coarse-grain buffer

B. Fine-grain buffer

 

The APU I used is AMD A10-7850K (Kaveri), and I have tested FineGrainSVM and SVMBinaryTreeSearch in AMD APP SDK 3.0 successfully.

 

The following is the main part of my code:

 

 

=============================================================

 

// OpenCL C source of the vecAdd kernel, kept on the host as one C string
// (the adjacent string literals below are concatenated by the compiler).
// Each work item takes care of one element of c: the work item with global
// id `id` computes c[id] = a[id] + b[id]; work items with id >= n do nothing.
// NOTE(review): the kernel uses `double`, so the target device presumably has
// to support double precision -- confirm for the device in use.

const char *kernelSource =                                               "\n" \

"__kernel void vecAdd(  __global double *a,                      \n" \

"                                     __global double *b,                      \n" \

"                                     __global double *c,                      \n" \

"                                     const unsigned int n)                   \n" \

"{                                                                                        \n" \

"    //Get our global thread ID                                            \n" \

"    int id = get_global_id(0);                                              \n" \

"                                                                                         \n" \

"    //Make sure we do not go out of bounds                      \n" \

"    if (id < n)                                                                       \n" \

"        c[id] = a[id] + b[id];                                                   \n" \

"}                                                                                        \n" \

                                                                                         "\n" ;

 

int main( int argc, char* argv[] )

{

    // Length of vectors

    unsigned int n = 100000;

 

...

 

    // Size, in bytes, of each vector

    size_t bytes = n*sizeof(double);

 

...

 

// Bind to platform

err = clGetPlatformIDs(1, &cpPlatform, NULL);

 

// Get ID for the device

err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

 

// Create a context

cl_context_properties cps[3] =

{

CL_CONTEXT_PLATFORM,

(cl_context_properties)cpPlatform,

0

};

//context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);

 

// Create a command queue

queue = clCreateCommandQueueWithProperties(context, device_id, 0, &err);

 

// Create the compute program from the source buffer

program = clCreateProgramWithSource(context, 1,

                            (const char **) & kernelSource, NULL, &err);

 

// Build the program executable

clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

 

// Create the compute kernel in the program we wish to run

kernel = clCreateKernel(program, "vecAdd", &err);

 

double* h_a = (double*)clSVMAlloc(context, CL_MEM_READ_WRITE, bytes, 0);

double* h_b = (double*)clSVMAlloc(context, CL_MEM_READ_WRITE, bytes, 0);

double* h_c = (double*)clSVMAlloc(context, CL_MEM_READ_WRITE, bytes, 0);

 

clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, h_a, bytes, 0, NULL, NULL);

clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, h_b, bytes, 0, NULL, NULL);

clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, h_c, bytes, 0, NULL, NULL);

 

// Initialize vectors on host

    int i;

    for( i = 0; i < n; i++ )

    {

        h_a[i] = sinf(i)*sinf(i);

        h_b[i] = cosf(i)*cosf(i);

        h_c[i] = 0;

    }

 

clEnqueueSVMUnmap(queue, h_a, 0, NULL, NULL);

clEnqueueSVMUnmap(queue, h_b, 0, NULL, NULL);

clEnqueueSVMUnmap(queue, h_c, 0, NULL, NULL);

 

// Set the arguments to our compute kernel

err = clSetKernelArgSVMPointer(kernel, 0, (double*)h_a);

err |= clSetKernelArgSVMPointer(kernel, 1, (double*)h_b);

err |= clSetKernelArgSVMPointer(kernel, 2, (double*)h_c);

err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);

 

// Execute the kernel over the entire range of the data set

err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,

                                                              0, NULL, NULL);

// Wait for the command queue to get serviced before reading back results

clFlush(queue);

 

clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_READ, h_c, bytes, 0, NULL, NULL);

 

//Sum up vector c and print result divided by n, this should equal 1 within error

    double sum = 0;

    for(i=0; i<n; i++)

        sum += h_c[i];

    printf("final result: %f\n", sum/n);

 

clEnqueueSVMUnmap(queue, h_c, 0, NULL, NULL);

 

clSVMFree(context, h_a);

clSVMFree(context, h_b);

clSVMFree(context, h_c);

    clReleaseProgram(program);

    clReleaseKernel(kernel);

    clReleaseCommandQueue(queue);

    clReleaseContext(context);

 

return 0;

}

 

=============================================================

 

 

Every CL runtime API call reports success when I check its return value; however, the program prints "Segmentation fault (core dumped)" and never shows "final result: 1.00000".

 

Could someone guide me on how to fix it?

If needed, I can provide more details about the code.

 

Many thanks in advance!

 

 

Patrick Lin

Outcomes