Error when using sqrt function
When I try to run the following kernel on Ubuntu 10.04 x86 / Radeon HD5870 / StreamSDK v2.1 / fglrx 8.723, I get the following build error:
$ opencldouble/nbody
BUILD LOG
************************************************
Error: Undeclared function index 1205
************************************************
Error: clBuildProgram failed. Error code : CL_BUILD_PROGRAM_FAILURE
The problem seems to be the sqrt call on line 42 of the kernel source: if I cast its argument to float, the kernel compiles successfully but returns NaNs. The same code (without #pragma OPENCL EXTENSION cl_amd_fp64 : enable) runs perfectly well on an nVidia GTX480 (Ubuntu 9.10 x86_64 / SDK 3.0 / driver 195.36.24). The single-precision version of the same code (with every double simply changed to float) works fine on both platforms.
The samples (including MatrixMulDouble) run fine.
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_amd_fp64 : enable

/*
 * Advance one body by one time step of an all-pairs N-body simulation,
 * using double-precision arithmetic.
 *
 * Each work-item handles the body at index get_global_id(0). The work-group
 * cooperatively stages positions into local memory one tile at a time
 * (a tile holds get_local_size(0) bodies), and every work-item accumulates
 * the acceleration contributed by each staged body.
 *
 * pos        positions, read and updated in place; the .w component is used
 *            as the weight m in  a = m * r / |r|^3  and is carried over
 *            unchanged to the new position
 * vel        velocities, read and updated in place
 * numBodies  number of bodies; only numBodies / get_local_size(0) full tiles
 *            are visited, so bodies in a trailing partial tile contribute
 *            nothing
 * deltaTime  time step
 * localPos   local-memory scratch buffer; indexed by local id, so it must
 *            hold at least get_local_size(0) elements
 */
__kernel void nbody_sim(
    __global double4* pos,
    __global double4* vel,
    int numBodies,
    float deltaTime,
    __local double4* localPos)
{
    unsigned int tid = get_local_id(0);
    unsigned int gid = get_global_id(0);
    unsigned int localSize = get_local_size(0);

    // Number of tiles we need to iterate
    unsigned int numTiles = numBodies / localSize;

    // Position of this work-item's body
    double4 myPos = pos[gid];
    double4 acc = (double4)(0.0, 0.0, 0.0, 0.0);

    for (unsigned int i = 0; i < numTiles; ++i) {
        // Load one tile into local memory: each work-item fetches one body
        unsigned int idx = i * localSize + tid;
        localPos[tid] = pos[idx];

        // Synchronize to make sure the whole tile is available for processing
        barrier(CLK_LOCAL_MEM_FENCE);

        // Calculate acceleration effect due to each body in the tile
        // a[i->j] = m * r[i->j] / r^3
        for (unsigned int j = 0; j < localSize; ++j) {
            // Acceleration caused by particle j of the tile on myPos (gid).
            // The tile must be indexed with [j]; subtracting the bare
            // localPos pointer from a vector is not valid.
            double4 r = localPos[j] - myPos;
            double dist = sqrt(r.x * r.x + r.y * r.y + r.z * r.z);
            dist = dist * dist * dist;

            // Skip (near-)coincident bodies, which includes this body
            // itself (r == 0), to avoid dividing by ~zero.
            if (dist > 1e-6) {
                double s = localPos[j].w / dist;
                // Accumulate effect of all particles
                acc += s * r;
            }
        }

        // Synchronize so that the next tile can be loaded without
        // overwriting data another work-item is still reading
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    double4 oldVel = vel[gid];

    // Updated position and velocity
    double4 newPos = myPos + oldVel * deltaTime + acc * 0.5 * deltaTime * deltaTime;
    newPos.w = myPos.w;  // .w is not a coordinate; keep it unchanged
    double4 newVel = oldVel + acc * deltaTime;

    // Write to global memory
    pos[gid] = newPos;
    vel[gid] = newVel;
}