2 Replies Latest reply on Jul 29, 2010 7:03 PM by hduregger

    __constant float array indexed with get_global_id returns wrong value if gid = 0 on GPU

    hduregger

      Hello,

       

      The attached example program prints

      1.0 1.0 1.0 1.0

      when run on the CPU, but

      0.0 1.0 1.0 1.0

      when run on the GPU.

      When element zero of the array is indexed directly with a literal, as in

      out[id] = f[0];

      it correctly retrieves the array value.

      Using

      • AMD Stream SDK 2.1 64bit (from AMD site)
      • Ubuntu Linux 10.04
      • AMD Phenom II 920
      • AMD Radeon 4870
      • Catalyst Driver Packaging Version 8.723.1-100408a-098580C-ATI

       

      Any idea what could be wrong?

      #define __NO_STD_VECTOR #define __NO_STD_STRING #include <cstdio> #include <cstdlib> #include <fstream> #include <iostream> #include <string> #include <iterator> #if defined(__APPLE__) || defined(__MACOSX) #include <OpenCL/cl.hpp> #else #include <CL/cl.hpp> #endif inline void checkErr(cl_int err, const char * name) { if (err != CL_SUCCESS) { std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl; exit(EXIT_FAILURE); } } // // Prints "1.0f 1.0f 1.0f 1.0f" correctly on CPU but "0.0f 1.0f 1.0f 1.0f" on GPU // char prog[] = "__constant float f[] = { 1.0f, 1.0f, 1.0f, 1.0f };" "__kernel void hello(__global float* out)" "{" " size_t id = get_global_id(0);" " out[id] = f[id];" //" out[id] = id; // verify that id = 0 occurs (success)" "}" ; int main() { int length = 4; cl_int err; // Get platform list cl::vector< cl::Platform > platformList; cl::Platform::get(&platformList); checkErr(platformList.size()!=0 ? CL_SUCCESS : -1, "cl::Platform::get"); std::cout << "Platform number is: " << platformList.size() << std::endl; cl::string platformVendor; platformList[0].getInfo(CL_PLATFORM_VENDOR, &platformVendor); std::cout << "Platform is by: " << platformVendor.c_str() << "\n"; cl_context_properties cprops[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0}; // Create context cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL, &err); // cl::Context context(CL_DEVICE_TYPE_CPU, cprops, NULL, NULL, &err); cl::vector<cl::Device> devices; // Get devices devices = context.getInfo<CL_CONTEXT_DEVICES>(); checkErr(devices.size() > 0 ? 
CL_SUCCESS : -1, "devices.size() > 0"); std::cout << "Device count is: " << devices.size() << "\n"; // Create source cl::Program::Sources source( 1, std::make_pair(prog, sizeof(prog)) ); // Create program cl::Program program(context, source); err = program.build(devices, ""); cl_build_status bs = program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(devices[0]); std::cout << "Build status:\t"; switch (bs) { case CL_BUILD_NONE: { std::cout << "None\n"; break; } case CL_BUILD_ERROR: { std::cout << "Build error\n"; std::string str( program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]).c_str()); std::cout << "log:\n" << str << std::endl; break; } case CL_BUILD_SUCCESS: { std::cout << "Success\n"; break; } case CL_BUILD_IN_PROGRESS: { std::cout << "Build in progress\n"; break; } default: { std::cout << "ERROR\n"; break; } } checkErr( err, "Program::build()" ); // Create kernel cl::Kernel kernel(program, "hello", &err); checkErr(err, "Kernel::Kernel()"); cl_float* outH = new cl_float[length]; for (int i = 0; i < length; i++) { outH[i] = -1.0f; } cl::Buffer outCL( context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, length * sizeof(cl_float), outH, &err); checkErr(err, "Buffer::Buffer()"); err = kernel.setArg(0, outCL); checkErr(err, "Kernel::setArg()"); // Create queue cl::CommandQueue queue(context, devices[0], 0, &err); checkErr(err, "CommandQueue::CommandQueue()"); cl::Event event; // Enqueue kernel execution err = queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(length ), cl::NullRange, NULL, &event); checkErr(err, "CommandQueue::enqueueNDRangeKernel()"); event.wait(); // Enqueue map buffer cl_float* out = (cl_float*) queue.enqueueMapBuffer(outCL, CL_TRUE, CL_MAP_READ, 0, (length - 1) * sizeof(cl_float), NULL, &event, &err); checkErr(err, "CommandQueue::enqueueMapBuffer()"); // Print result for (int i = 0; i < length; i++) { //std::cout << outH[i] << std::endl; std::cout << out[i] << std::endl; } return EXIT_SUCCESS; }