The runtime code as requested:
I hope it is enough code, since I had to assemble it from multiple sourcefiles (OpenCl helper class) and I left out the Kernel calls that work. I have checked the intermediate results extensively and they match up.
#include<CL/cl.hpp> try { cl::Platform::get(&platforms); VECTOR_CLASS<cl::Platform>::iterator i; if(platforms.size()){ for(i = platforms.begin(); i != platforms.end(); ++i){ if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) { break; } } } else throw cl::Error(-32,"CL platforms not acquired"); /* * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 }; context = cl::Context( CL_DEVICE_TYPE_ALL, cps, NULL, NULL ); devices = context.getInfo<CL_CONTEXT_DEVICES>(); // load source files string ksourcestr; ifstream is( ocl_source.c_str() ); // ifstream is("../QEM.cl"); assert( is.is_open() ); getline( is, ksourcestr, '\0'); sources.push_back( std::make_pair( ksourcestr.c_str(), ksourcestr.size() ) ); // create program program = cl::Program(context, sources); cl::Device const device = this->gpuDevice(); // this->cpuDevice(); size_t const GROUP_SIZE = 64; //this->optWorkGroupSize(device,TEST_SIZE); size_t const CL_TEST_SIZE = this->clTaskSize(GROUP_SIZE,TEST_SIZE); // ((TEST_SIZE-1)/GROUP_SIZE)+1 )*GROUP_SIZE cl::CommandQueue queue( context, device, 0 ); // Results cl::Buffer qem_sum_buf( context, CL_MEM_READ_WRITE, CL_TEST_SIZE*sizeof(CL_QEM_T) ), // gets filled correctly by another CL kernel error_buf ( context, CL_MEM_READ_WRITE, CL_TEST_SIZE*sizeof(ERROR_T) ), p_opt_buf ( context, CL_MEM_READ_WRITE, CL_TEST_SIZE*sizeof(PNT_T) ), // gets filled correctly by another CL kernel a_opt_buf ( context, CL_MEM_READ_WRITE, CL_TEST_SIZE*sizeof(ATT_T) ); // gets filled correctly by another CL kernel // other kernels and related initializations ... cl::Kernel kernel( program, "evalQem_Bench" ); cl::KernelFunctor kfunc = kernel.bind( queue, cl::NDRange(CL_TEST_SIZE), cl::NDRange(GROUP_SIZE) ); kfunc( qem_sum_buf, error_buf, p_opt_buf, a_opt_buf ).wait(); // read results from device queue.enqueueReadBuffer( error_buf, CL_FALSE, 0, CL_TEST_SIZE*sizeof(float), &e_array.front() ); queue.enqueueReadBuffer( p_opt_buf, CL_FALSE, 0, CL_TEST_SIZE*sizeof(PNT_T), &points_opt.front() ); queue.enqueueReadBuffer( a_opt_buf, CL_FALSE, 0, CL_TEST_SIZE*sizeof(ATT_T), &attribs_opt.front() ); queue.enqueueReadBuffer( qem_sum_buf, CL_FALSE, 0, TEST_SIZE*sizeof(CL_QEM_T), &qem_sum_vector.front() ); queue.finish(); }