redditisgreat

Differing results for GPU vs CPU kernels. (not rounding errors)

Discussion created by redditisgreat on Aug 24, 2010
Latest reply on Aug 25, 2010 by redditisgreat

I have a quadric error metric implementation that works fine except for one kernel: my testbench reports results for the GPU kernel that are orders of magnitude different from those of the CPU kernel and the C++ reference implementation.

 

Here is the code for the quadric structure and the kernel that is acting weird:

 

// ADIM must be defined as a compile-time constant (e.g. via -DADIM=<n> in the
// program build options) before this source is compiled.

/*
 * Quadric error metric record.
 *
 * C[0..2] : three float4 rows; the 4th component (.w) holds the corner
 *           weights of the triangle (A,B,C) in tri qems, otherwise
 *           C[0].w := lambda.
 * b1_c    : holds (b1, c) packed in a 4-vector.
 * B[ADIM] : one float4 per attribute; the 4th component is b2.
 */
typedef struct _qem {
    float4 C[3];
    float4 b1_c;
    float4 B[ADIM];
} QEM;

/*
 * Evaluate the quadric `qem` at position `p` with attribute values `av`.
 *
 * qem : quadric to evaluate.
 * p   : pointer to exactly 3 floats (x, y, z).
 * av  : pointer to ADIM floats (attribute values).
 *
 * Returns the accumulated error value.
 *
 * Defined ahead of the kernel so that a declaration is visible at the call
 * site.
 */
float sqrQError( QEM const* qem, float const * p, float const * av )
{
    // Rows of C with the .w component replaced by the matching b1 component.
    float4 A_[3];
    A_[0] = qem->C[0];
    A_[1] = qem->C[1];
    A_[2] = qem->C[2];
    A_[0].w = qem->b1_c.x;
    A_[1].w = qem->b1_c.y;
    A_[2].w = qem->b1_c.z;

    float const lambda = qem->C[0].w;

    float4 tmp = (p[0]*A_[0]) + (p[1]*A_[1]) + (p[2]*A_[2]) + qem->b1_c;

    // Build the homogeneous point from the three scalars. The previous
    // vload4(0, p) read four floats through a pointer to a 3-element array,
    // i.e. one element past its end; overwriting .w afterwards does not make
    // that out-of-bounds read defined, and it is a likely source of
    // device-dependent results.
    float4 pnt = (float4)( p[0], p[1], p[2], 1.f );

    float errorsqr = 0.f;
    for( size_t i=0; i<ADIM; ++i )
    {
        tmp += av[i] * qem->B[i];
        errorsqr += ( dot( pnt, qem->B[i] ) + ( av[i]*lambda ) ) * av[i];
    }
    errorsqr += dot( pnt, tmp );

    return errorsqr;
}

/*
 * Benchmark kernel: one work-item per quadric.
 *
 * qem        : input quadrics, indexed by global id.
 * error      : output, one float per work-item.
 * points_opt : 3 floats per work-item (pa|pb|pc).
 * attr_opt   : ADIM floats per work-item.
 */
__kernel void evalQem_Bench( __global QEM* qem,
                             __global float* error,
                             __global float* points_opt, // pa|pb|pc
                             __global float* attr_opt )
{
    unsigned int gid = get_global_id(0);

    // Copy the inputs of this work-item into local variables so the helper
    // can take plain (non-__global) pointers.
    QEM tmpq = qem[ gid ];
    float av[ADIM];
    float pnt[3];

    for( size_t i=0; i<3; ++i )
    {
        pnt[i] = points_opt[3*gid+i];
    }
    for( size_t i=0; i<ADIM; ++i )
    {
        av[i] = attr_opt[ADIM*gid + i];
    }

    error[gid] = sqrQError( &tmpq, pnt, av );
}

Outcomes