Aland

Double-precision sqrt bug?

Discussion created by Aland on May 13, 2010
Latest reply on May 13, 2010 by Aland
Error when using sqrt function

 

When trying to run the following kernel on Ubuntu 10.04 x86 / Radeon HD5870 / StreamSDK v2.1 / fglrx 8.723, I get

$ opencldouble/nbody
BUILD LOG
************************************************
Error: Undeclared function index 1205

************************************************
Error: clBuildProgram failed. Error code : CL_BUILD_PROGRAM_FAILURE

 

This seems to be the issue with sqrt function in 42nd line: if cast its parameter to float,  kernel compiles successfully, but returns NANs. The same code (w/o #pragma OPENCL EXTENSION cl_amd_fp64 : enable) runs perfectly well on nVidia GTX480 (Ubuntu 9.10 x86_64 / SDK 3.0 / driver 195.36.24). The single-precision version of the same code (just changed all double to float) works ok on both platforms.

Samples (including MatrixMulDouble) runs fine.

#pragma OPENCL EXTENSION cl_khr_fp64 : enable #pragma OPENCL EXTENSION cl_amd_fp64 : enable __kernel void nbody_sim( __global double4* pos , __global double4* vel, int numBodies, float deltaTime, __local double4* localPos ) { unsigned int tid = get_local_id(0); unsigned int gid = get_global_id(0); unsigned int localSize = get_local_size(0); // Number of tiles we need to iterate unsigned int numTiles = numBodies / localSize; // position of this work-item //#pragma OPENCL EXTENSION cl_khr_fp64 : enable double4 myPos = pos[gid]; double4 acc = (double4)(0.0f, 0.0f, 0.0f, 0.0f); for(int i = 0; i < numTiles; ++i) { // load one tile into local memory int idx = i * localSize + tid; localPos[tid] = pos[idx]; // Synchronize to make sure data is available for processing barrier(CLK_LOCAL_MEM_FENCE); // calculate acceleration effect due to each body // a[i->j] = m[j] * r[i->j] / r^3 for(int j = 0; j < localSize; ++j) { // Calculate acceleartion caused by particle j on particle myPos (gid) double4 r = localPos[j] - myPos; double dist = sqrt(r.x * r.x + r.y * r.y + r.z * r.z); dist = dist*dist*dist; if (dist>1e-6){ double s = localPos[j].w / dist; // accumulate effect of all particles acc += s * r; } }; // Synchronize so that next tile can be loaded barrier(CLK_LOCAL_MEM_FENCE); }; double4 oldVel = vel[gid]; // updated position and velocity double4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime; newPos.w = myPos.w; double4 newVel = oldVel + acc * deltaTime; // write to global memory pos[gid] = newPos; vel[gid] = newVel; }

Outcomes