2 Replies Latest reply on Aug 25, 2010 3:47 PM by redditisgreat

    Differing results for GPU vs CPU kernels. (not rounding errors)

    redditisgreat

      I have a quadric error metric implementation that works fine except for one Kernel where my testbench reports results for the GPU Kernel orders of magnitude different from the CPU kernel and the C++ refernce implementation.

       

      Her is the code for the quadric structure an the the Kernel that is acting weird:

       

      // define ADIM as compile time constant typedef struct _qem { float4 C[3]; // 4th row holds corner weights of triangle (A,B,C) in tri qems otherwise c[0].w := lambda float4 b1_c; // holds (b1,c) in 4 vector float4 B[ADIM]; // 4th row is b2 } QEM; __kernel void evalQem_Bench( __global QEM* qem, __global float* error, __global float* points_opt, // pa|pb|pc __global float* attr_opt ) { unsigned int gid = get_global_id(0); QEM tmpq = qem[ gid ]; float av[ADIM]; float pnt[3]; for( size_t i=0; i<3; ++i ) pnt[i] = points_opt[3*gid+i]; for( size_t i=0; i<ADIM; ++i ) av[i] = attr_opt[ADIM*gid + i]; error[gid] = sqrQError( &tmpq, pnt, av ); } float sqrQError( QEM const* qem, float const * p, float const * av ) { float4 A_[3]; A_[0] = qem->C[0]; A_[1] = qem->C[1]; A_[2] = qem->C[2]; A_[0].w = qem->b1_c.x; A_[1].w = qem->b1_c.y; A_[2].w = qem->b1_c.z; float const lambda = qem->C[0].w; float4 tmp = (p[0]*A_[0]) + (p[1]*A_[1]) + (p[2]*A_[2]) + qem->b1_c; float4 pnt = vload4(0,p); pnt.w = 1.f; float errorsqr = 0.f; for(size_t i=0; i<ADIM; ++i) { tmp += av[i] * qem->B[i]; errorsqr += ( dot( pnt, qem->B[i] ) // I have isolated the problem, it concerns this expression +( av[i]*lambda ) ) * av[i] ; } errorsqr += dot( pnt, tmp ); return errorsqr ; }

        • Differing results for GPU vs CPU kernels. (not rounding errors)
          genaganna

           

          Originally posted by: redditisgreat I have a quadric error metric implementation that works fine except for one Kernel where my testbench reports results for the GPU Kernel orders of magnitude different from the CPU kernel and the C++ refernce implementation.

           

           Her is the code for the quadric structure an the the Kernel that is acting weird:

           

          Please send runtime code also which helps us to investigate quickly.

            • Differing results for GPU vs CPU kernels. (not rounding errors)
              redditisgreat

              The runtime code as requested:

               

              I hope it is enough code, since I had to assemble it from multiple sourcefiles (OpenCl helper class) and I left out the Kernel calls that work. I have checked the intermediate results extensively and they match up.

               

              #include<CL/cl.hpp> try { cl::Platform::get(&platforms); VECTOR_CLASS<cl::Platform>::iterator i; if(platforms.size()){ for(i = platforms.begin(); i != platforms.end(); ++i){ if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) { break; } } } else throw cl::Error(-32,"CL platforms not acquired"); /* * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 }; context = cl::Context( CL_DEVICE_TYPE_ALL, cps, NULL, NULL ); devices = context.getInfo<CL_CONTEXT_DEVICES>(); // load source files string ksourcestr; ifstream is( ocl_source.c_str() ); // ifstream is("../QEM.cl"); assert( is.is_open() ); getline( is, ksourcestr, '\0'); sources.push_back( std::make_pair( ksourcestr.c_str(), ksourcestr.size() ) ); // create program program = cl::Program(context, sources); cl::Device const device = this->gpuDevice(); // this->cpuDevice(); size_t const GROUP_SIZE = 64; //this->optWorkGroupSize(device,TEST_SIZE); size_t const CL_TEST_SIZE = this->clTaskSize(GROUP_SIZE,TEST_SIZE); // ((TEST_SIZE-1)/GROUP_SIZE)+1 )*GROUP_SIZE cl::CommandQueue queue( context, device, 0 ); // Results cl::Buffer qem_sum_buf( context, CL_MEM_READ_WRITE, CL_TEST_SIZE*sizeof(CL_QEM_T) ), // gets filled correctly by another CL kernel error_buf ( context, CL_MEM_READ_WRITE, CL_TEST_SIZE*sizeof(ERROR_T) ), p_opt_buf ( context, CL_MEM_READ_WRITE, CL_TEST_SIZE*sizeof(PNT_T) ), // gets filled correctly by another CL kernel a_opt_buf ( context, CL_MEM_READ_WRITE, CL_TEST_SIZE*sizeof(ATT_T) ); // gets filled correctly by another CL kernel // other kernels and related initializations ... cl::Kernel kernel( program, "evalQem_Bench" ); cl::KernelFunctor kfunc = kernel.bind( queue, cl::NDRange(CL_TEST_SIZE), cl::NDRange(GROUP_SIZE) ); kfunc( qem_sum_buf, error_buf, p_opt_buf, a_opt_buf ).wait(); // read results from device queue.enqueueReadBuffer( error_buf, CL_FALSE, 0, CL_TEST_SIZE*sizeof(float), &e_array.front() ); queue.enqueueReadBuffer( p_opt_buf, CL_FALSE, 0, CL_TEST_SIZE*sizeof(PNT_T), &points_opt.front() ); queue.enqueueReadBuffer( a_opt_buf, CL_FALSE, 0, CL_TEST_SIZE*sizeof(ATT_T), &attribs_opt.front() ); queue.enqueueReadBuffer( qem_sum_buf, CL_FALSE, 0, TEST_SIZE*sizeof(CL_QEM_T), &qem_sum_vector.front() ); queue.finish(); }