1 Reply Latest reply on Mar 11, 2010 8:20 PM by omkaranathan

    Global __constant variables don't work on GPU


      I have found that global __constant variables don't seem to work in OpenCL when the code is executed on the GPU.  The same code works just fine on the CPU.

      The system on which the test was performed has the latest version of the ATI Stream SDK (v2.01, Linux) and an ATI RV710 video card.  The code was also tested with NVIDIA's OpenCL implementation, with which it worked as expected.

      I'm attaching a simple program that illustrates this problem.  The kernel in this program is supposed to add a constant number to all elements of a vector of floats.

      #include <CL/cl.h>
      #include <stdio.h>
      #include <stdlib.h>
      #include <string.h>

      #define SIZE 4096       /* number of floats in the test vector */
      #define LOCAL_SIZE 128  /* work-group size along dimension 0 */

      /*
       * Kernel under test: adds the program-scope __constant value c (4.0f)
       * to every element of a and stores the result in b.
       */
      const char *source =
          "__constant float c = 4.0f;\n"
          "__kernel void VectorAdd(__global const float* a, __global float* b) {\n"
          " int iGID = get_global_id(0);\n"
          " b[iGID] = a[iGID] + c; }\n";

      /* Report a failed OpenCL call on stderr; returns 1 on success, 0 on failure. */
      static int check(cl_int err, const char *what)
      {
          if (err != CL_SUCCESS) {
              fprintf(stderr, "%s failed with OpenCL error %d\n", what, (int)err);
              return 0;
          }
          return 1;
      }

      /* Print the program build log for the given device to stderr, if one is available. */
      static void print_build_log(cl_program program, cl_device_id device)
      {
          size_t log_size = 0;
          char *log;

          /* First query only the size so the buffer is always large enough. */
          if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                    0, NULL, &log_size) != CL_SUCCESS ||
              log_size == 0) {
              return;
          }

          log = malloc(log_size + 1);
          if (log == NULL) {
              return;
          }

          if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                                    log_size, log, NULL) == CL_SUCCESS) {
              log[log_size] = '\0';
              fprintf(stderr, "Build log:\n%s\n", log);
          }
          free(log);
      }

      /*
       * Runs the VectorAdd kernel over a[i] = i and prints every element of the
       * result that differs from i + 4.0f.
       *
       * Returns 0 when all OpenCL calls succeed (mismatches are only printed),
       * and 1 when any OpenCL call fails.
       */
      int main(int argc, char **argv)
      {
          size_t global[2];
          size_t local[2];
          cl_int err;
          int i;
          int rc = 1;
          int gpu = 1; /* set to 0 to run the same test on a CPU device */
          float a[SIZE];

          /* All handles start out NULL so the cleanup path can release
           * exactly what was created. */
          cl_platform_id platform = NULL;
          cl_device_id device_id = NULL;
          cl_context context = NULL;
          cl_command_queue command_queue = NULL;
          cl_program program = NULL;
          cl_kernel kernel = NULL;
          cl_mem dev_a = NULL;
          cl_mem dev_b = NULL;

          const size_t len = strlen(source);

          (void)argc;
          (void)argv;

          for (i = 0; i < SIZE; i++) {
              a[i] = (float)i;
          }

          if (!check(clGetPlatformIDs(1, &platform, NULL), "clGetPlatformIDs")) {
              goto cleanup;
          }

          err = clGetDeviceIDs(platform,
                               gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU,
                               1, &device_id, NULL);
          if (!check(err, "clGetDeviceIDs")) {
              goto cleanup;
          }

          context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
          if (!check(err, "clCreateContext")) {
              goto cleanup;
          }

          command_queue = clCreateCommandQueue(context, device_id, 0, &err);
          if (!check(err, "clCreateCommandQueue")) {
              goto cleanup;
          }

          program = clCreateProgramWithSource(context, 1, &source, &len, &err);
          if (!check(err, "clCreateProgramWithSource")) {
              goto cleanup;
          }

          err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
          if (err != CL_SUCCESS) {
              /* The compiler diagnostics are the most useful output when the
               * kernel is rejected on one device type but not another. */
              print_build_log(program, device_id);
              check(err, "clBuildProgram");
              goto cleanup;
          }

          kernel = clCreateKernel(program, "VectorAdd", &err);
          if (!check(err, "clCreateKernel")) {
              goto cleanup;
          }

          dev_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                 sizeof(cl_float) * SIZE, a, &err);
          if (!check(err, "clCreateBuffer(a)")) {
              goto cleanup;
          }

          /* The output buffer is also seeded from a; the kernel overwrites
           * every element. */
          dev_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                 sizeof(cl_float) * SIZE, a, &err);
          if (!check(err, "clCreateBuffer(b)")) {
              goto cleanup;
          }

          if (!check(clSetKernelArg(kernel, 0, sizeof(cl_mem), &dev_a),
                     "clSetKernelArg(0)") ||
              !check(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dev_b),
                     "clSetKernelArg(1)")) {
              goto cleanup;
          }

          global[0] = SIZE;
          global[1] = 1;
          local[0] = LOCAL_SIZE;
          local[1] = 1;

          err = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
                                       global, local, 0, NULL, NULL);
          if (!check(err, "clEnqueueNDRangeKernel")) {
              goto cleanup;
          }

          if (!check(clFinish(command_queue), "clFinish")) {
              goto cleanup;
          }

          /* Blocking read: a holds the kernel output once this call returns. */
          err = clEnqueueReadBuffer(command_queue, dev_b, CL_TRUE, 0,
                                    sizeof(float) * SIZE, a, 0, NULL, NULL);
          if (!check(err, "clEnqueueReadBuffer")) {
              goto cleanup;
          }

          for (i = 0; i < SIZE; i++) {
              if (a[i] != (float)i + 4.0f) {
                  printf("%d got %f\n", i, a[i]);
              }
          }

          rc = 0;

      cleanup:
          if (dev_a != NULL) {
              clReleaseMemObject(dev_a);
          }
          if (dev_b != NULL) {
              clReleaseMemObject(dev_b);
          }
          if (kernel != NULL) {
              clReleaseKernel(kernel);
          }
          if (program != NULL) {
              clReleaseProgram(program);
          }
          if (command_queue != NULL) {
              clReleaseCommandQueue(command_queue);
          }
          if (context != NULL) {
              clReleaseContext(context);
          }
          return rc;
      }