1 Reply Latest reply on Jul 29, 2015 2:58 AM by dipak

    OpenCL 2.0 - Shared Virtual Memory Programming

    patricklin

      Hi,

       

      I am trying to write a simple tutorial on OpenCL 2.0 SVM, and I chose vector addition to illustrate two features:

      A. Coarse-grain buffer

      B. Fine-grain buffer

       

      The APU I used is AMD A10-7850K (Kaveri), and I have tested FineGrainSVM and SVMBinaryTreeSearch in AMD APP SDK 3.0 successfully.

       

      The following is the main part of my code:

       

       

      =============================================================

       

      // OpenCL kernel. Each work item takes care of one element of c.
      // The work-item index is kept as size_t (the type get_global_id returns) so
      // the bounds check against the unsigned length n never mixes signed and
      // unsigned operands and the index is not truncated to int.
      const char *kernelSource =
          "\n"
          "__kernel void vecAdd(__global const double *a,\n"
          "                     __global const double *b,\n"
          "                     __global double *c,\n"
          "                     const unsigned int n)\n"
          "{\n"
          "    // Get our global thread ID\n"
          "    size_t id = get_global_id(0);\n"
          "\n"
          "    // Make sure we do not go out of bounds\n"
          "    if (id < n)\n"
          "        c[id] = a[id] + b[id];\n"
          "}\n"
          "\n";

       

      int main( int argc, char* argv[] )

      {

          // Length of vectors

          unsigned int n = 100000;

       

      ...

       

          // Size, in bytes, of each vector

          size_t bytes = n*sizeof(double);

       

      ...

       

      // Bind to platform

      err = clGetPlatformIDs(1, &cpPlatform, NULL);

       

      // Get ID for the device

      err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

       

      // Create a context

      cl_context_properties cps[3] =

      {

      CL_CONTEXT_PLATFORM,

      (cl_context_properties)cpPlatform,

      0

      };

      //context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

      context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);

       

      // Create a command queue

      queue = clCreateCommandQueueWithProperties(context, device_id, 0, &err);

       

      // Create the compute program from the source buffer

      program = clCreateProgramWithSource(context, 1,

                                  (const char **) & kernelSource, NULL, &err);

       

      // Build the program executable

      clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

       

      // Create the compute kernel in the program we wish to run

      kernel = clCreateKernel(program, "vecAdd", &err);

       

      double* h_a = (double*)clSVMAlloc(context, CL_MEM_READ_WRITE, bytes, 0);

      double* h_b = (double*)clSVMAlloc(context, CL_MEM_READ_WRITE, bytes, 0);

      double* h_c = (double*)clSVMAlloc(context, CL_MEM_READ_WRITE, bytes, 0);

       

      clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, h_a, bytes, 0, NULL, NULL);

      clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, h_b, bytes, 0, NULL, NULL);

      clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, h_c, bytes, 0, NULL, NULL);

       

      // Initialize vectors on host

          int i;

          for( i = 0; i < n; i++ )

          {

              h_a[i] = sinf(i)*sinf(i);

              h_b[i] = cosf(i)*cosf(i);

              h_c[i] = 0;

          }

       

      clEnqueueSVMUnmap(queue, h_a, 0, NULL, NULL);

      clEnqueueSVMUnmap(queue, h_b, 0, NULL, NULL);

      clEnqueueSVMUnmap(queue, h_c, 0, NULL, NULL);

       

      // Set the arguments to our compute kernel

      err = clSetKernelArgSVMPointer(kernel, 0, (double*)h_a);

      err |= clSetKernelArgSVMPointer(kernel, 1, (double*)h_b);

      err |= clSetKernelArgSVMPointer(kernel, 2, (double*)h_c);

      err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);

       

      // Execute the kernel over the entire range of the data set

      err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,

                                                                    0, NULL, NULL);

      // Wait for the command queue to get serviced before reading back results

      clFlush(queue);

       

      clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_READ, h_c, bytes, 0, NULL, NULL);

       

      //Sum up vector c and print result divided by n, this should equal 1 within error

          double sum = 0;

          for(i=0; i<n; i++)

              sum += h_c[i];

          printf("final result: %f\n", sum/n);

       

      clEnqueueSVMUnmap(queue, h_c, 0, NULL, NULL);

       

      clSVMFree(context, h_a);

      clSVMFree(context, h_b);

      clSVMFree(context, h_c);

          clReleaseProgram(program);

          clReleaseKernel(kernel);

          clReleaseCommandQueue(queue);

          clReleaseContext(context);

       

      return 0;

      }

       

      =============================================================

       

       

      When I check the return value of each CL runtime API call, everything reports success; however, the program prints "Segmentation fault (core dumped)" and never shows "final result: 1.00000".

       

      Could someone guide me how to fix it?

      If needed, I can provide more details about the code.

       

      Many thanks in advance!

       

       

      Patrick Lin