I have a kernel which works fine on HD 6xxx and earlier GPUs with Catalyst 12.10 or earlier drivers under Windows 7 x64. The app is built with AMD APP SDK 2.8. When attempting to run the app on HD 7xxx GPUs, I get the following error when building the program:
Unhandled Exception Detected...
- Unhandled Exception Record -
Reason: Access Violation (0xc0000005) at address 0x0000000076F1E4B4 write attempt to address 0x00000024
According to the OpenCL specs, I would expect that it should never crash but should instead fail to compile and allow me to get the build log which would contain the reason for the errors. Instead, it crashes regardless of any try/catch that is in place. This same kernel works 100% on nVidia GPUs, OS X, and pre HD7xx GPUs. It seems that commenting out the mul_hi function will allow it to compile. But, since the output of mul_hi is required to get the correct output, that really isn't an option. I would tell the thousands of BOINC users to use the older application but since the new GPUs won't run the code compiled with Brook+ properly, that isn't an option either.
So.... anyone have any idea why this won't compile on a HD 7790 with 13.1 thru 13.4 drivers on Windows 7 x64?
__kernel void kernelSteps(__const __global uint4 *mosc, __const uint offset, __const uint4 start, __global uint4 *steps)
{
const uint lookahead = 20;
const uint4 sc = (uint4)(1048575,12,1048577,0);
const uint t_offset = get_global_id(sc.w) + get_global_size(sc.w) * get_global_id(1);\n
const uint totalOffset = offset + t_offset;\n
uint4 carry,lut,stepsOut,steps_in = steps[t_offset];
uint2 mul_r;
uint valh,valt,carryh,index,i;
uint cont=1u;
uint4 val = start;
uint overflow;
i = overflow = valh = valt = sc.w;
val.x += totalOffset;
carry.x = (val.x < totalOffset);
val.y += carry.x;
carry.y = (val.y < carry.x);
val.z += carry.y;
while(cont)
{
lut = mosc[val.x & sc.x];
mul_r = mul64((val.x >> lookahead) + (val.y << sc.y), lut.x);
val.x = mul_r.x + lut.y;
carry.x = mul_r.y + (val.x < mul_r.x);
mul_r = mul64((val.y >> lookahead) + (val.z << sc.y), lut.x);
val.y = mul_r.x + carry.x;
carry.y = mul_r.y + (val.y < mul_r.x);
mul_r = mul64((val.z >> lookahead) + (val.w << sc.y), lut.x);
val.z = mul_r.x + carry.y;
carry.z = mul_r.y + (val.z < mul_r.x);
mul_r = mul64((val.w >> lookahead) + (valh << sc.y), lut.x);
val.w = mul_r.x + carry.z;
carry.w = mul_r.y + (val.w < mul_r.x);
mul_r = mul64((valh >> lookahead) + (valt << sc.y), lut.x);
valh = mul_r.x + carry.w;
carryh = mul_r.y + (valh < mul_r.x);
mul_r = mul64(valt >> lookahead, lut.x);
valt = mul_r.x + carryh;
overflow = (overflow | mul_r.y) | (valt < mul_r.x);
i += lut.z;
cont = ((((val.x > (uint)sc.z) | val.y) | (val.z | val.w)) | (valh | valt)) && (i<0x1000000u);
}
index=val.x-2u;
i += mosc[index & sc.x].w;
if(overflow) i = 0x1000000u;
carry.x = steps_in.z + i;
stepsOut.z = carry.x;
stepsOut.w = steps_in.w + (carry.x < i);,
if (i > steps_in.x)
{
stepsOut.x = i;
stepsOut.y = totalOffset;
}
else
{
stepsOut.x = steps_in.x;
stepsOut.y = steps_in.y;
}
steps[t_offset] = stepsOut;
}
I can compile the above kernel using 13.4 driver on Win7-64 with HD 7770 GPU. Build log reports an error about mul64 function, apart from some simple syntax errors.
LOL, I forgot to include the mul64 function in my post. When I comment it out, it compiles OK for me too.
inline uint2 mul64(__const uint a, __const uint b){
return (uint2)(a*b,mul_hi(a,
}
The entire kernel code (with an alternate to mul_hi which doesn't work for me either) is as follows:
__kernel void kernelZeroBuffer(__global uint4 *steps){
steps[get_global_id(0)] = (uint4)(0,0,0,0);
}
inline uint2 mul64(__const uint a, __const uint b){
ulong x = (ulong)a * (ulong)b;
return (uint2)((uint)x,(uint)(x>>32));
}
__kernel void kernelSteps(__const __global uint4 *mosc, __const uint offset, __const uint4 start, __global uint4 *steps)
{
const uint lookahead = 20;
const uint4 sc = (uint4)(1048575,12,1048577,0);
const uint t_offset = get_global_id(sc.w) + get_global_size(sc.w) * get_global_id(1);
const uint totalOffset = offset + t_offset;
uint4 carry,lut,stepsOut,steps_in = steps[t_offset];
uint2 mul_r;
uint valh,valt,carryh,index,i;
uint cont=1u;
uint4 val = start;
uint overflow;
i = overflow = valh = valt = sc.w;
val.x += totalOffset;
carry.x = (val.x < totalOffset);
val.y += carry.x;
carry.y = (val.y < carry.x);
val.z += carry.y;
while(cont)
{
lut = mosc[val.x & sc.x];
mul_r = mul64((val.x >> lookahead) + (val.y << sc.y), lut.x);
val.x = mul_r.x + lut.y;
carry.x = mul_r.y + (val.x < mul_r.x);
mul_r = mul64((val.y >> lookahead) + (val.z << sc.y), lut.x);
val.y = mul_r.x + carry.x;
carry.y = mul_r.y + (val.y < mul_r.x);
mul_r = mul64((val.z >> lookahead) + (val.w << sc.y), lut.x);
val.z = mul_r.x + carry.y;
carry.z = mul_r.y + (val.z < mul_r.x);
mul_r = mul64((val.w >> lookahead) + (valh << sc.y), lut.x);
val.w = mul_r.x + carry.z;
carry.w = mul_r.y + (val.w < mul_r.x);
mul_r = mul64((valh >> lookahead) + (valt << sc.y), lut.x);
valh = mul_r.x + carry.w;
carryh = mul_r.y + (valh < mul_r.x);
mul_r = mul64(valt >> lookahead, lut.x);
valt = mul_r.x + carryh;
overflow = (overflow | mul_r.y) | (valt < mul_r.x);
i += lut.z;
cont = ((((val.x > (uint)sc.z) | val.y) | (val.z | val.w)) | (valh | valt)) && (i<0x1000000u);
}
index=val.x-2u;
i += mosc[index & sc.x].w;
if(overflow) i = 0x1000000u;
carry.x = steps_in.z + i;
stepsOut.z = carry.x;
stepsOut.w = steps_in.w + (carry.x < i);,
if (i > steps_in.x)
{
stepsOut.x = i;
stepsOut.y = totalOffset;
}
else
{
stepsOut.x = steps_in.x;
stepsOut.y = steps_in.y;
}
steps[t_offset] = stepsOut;
}
...but commenting out the mul_hi line is not an option.
If I use "-cl-opt-disable" it no longer crashes in the call to clBuildProgram. It appears the issue is due to the optimization it is trying to do with the newer GPUs. I hate to disable optimization since the Brook+ version of the same application is 50% faster than the OpenCL version and this may make the OpenCL version even slower.
Hi jstong,
It is a critical issue if OCL compiler is wrongly optimizing the code. But I am still not able to reproduce the issue. There seems to be some copy paste isssue with mul64 code above.
I had tried the remaining code and it still compiled for me. No crashes. Can you attach the kernel as a zip file.
I'll do one better. Here's the entire VS2010 solution. Using -cl-opt-disable seems to make it work whereas using no options for the clBuildProgram call causes an LLVM error below.
LLVM ERROR: Cannot select: 0x48b5a80: i32 = setcc 0x48a9ac0, 0x48b3f60, 0x48b457
0 [ID=59]
0x48a9ac0: i64 = AMDILISD::ADD 0x48b3f60, 0x48b4060 [ID=57]
0x48b3f60: i64 = or 0x48b3e60, 0x48b3960 [ORD=26] [ID=49]
0x48b3e60: i64 = shl 0x48b4a70, 0x48b3d60 [ORD=25] [ID=46]
0x48b4a70: i64 = any_extend 0x48b3860 [ID=42]
0x48b3860: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48abde0 [ORD=23] [ID=
35]
0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=
26]
0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2]
0x48abde0: i32 = TargetConstant<2> [ORD=14] [ID=22]
0x48b3d60: i32 = Constant<32> [ORD=25] [ID=12]
0x48b3960: i64 = zero_extend 0x48a99c0 [ORD=22] [ID=43]
0x48a99c0: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48a94c0 [ORD=21] [ID=36
]
0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=26
]
0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2]
0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21]
0x48b4060: i64 = zero_extend 0x48ac1e0 [ORD=27] [ID=55]
0x48ac1e0: i32 = AMDILISD::ADD 0x48ab5e0, 0x48b4870 [ORD=18] [ID=52]
0x48ab5e0: i32 = AMDILISD::ADD 0x48aa0c0, 0x48a97c0 [ORD=16] [ID=47]
0x48aa0c0: i32 = mul 0x48b5d80, 0x48b5e80 [ORD=15] [ID=44]
0x48b5d80: i32 = AMDILISD::VEXTRACT 0x48a9dc0, 0x48a94c0 [ORD=13] [I
D=39]
0x48a9dc0: v4i32,ch = llvm.AMDIL.get.global.size 0x483dd10, 0x48a9
bc0 [ORD=12] [ID=29]
0x48a9bc0: i32 = TargetConstant<2563> [ORD=12] [ID=6]
0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21]
0x48b5e80: i32 = AMDILISD::VEXTRACT 0x48abce0, 0x48abde0 [ORD=14] [I
D=37]
0x48abce0: v4i32,ch = llvm.AMDIL.get.global.id 0x483dd10, 0x48ab6e
0 [ORD=10] [ID=28]
0x48ab6e0: i32 = TargetConstant<2561> [ORD=10] [ID=4]
0x48abde0: i32 = TargetConstant<2> [ORD=14] [ID=22]
0x48a97c0: i32 = AMDILISD::VEXTRACT 0x48abce0, 0x48a94c0 [ORD=11] [ID=
38]
0x48abce0: v4i32,ch = llvm.AMDIL.get.global.id 0x483dd10, 0x48ab6e0
[ORD=10] [ID=28]
0x48ab6e0: i32 = TargetConstant<2561> [ORD=10] [ID=4] | |
0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21] | |
0x48b4870: i32 = AMDILISD::VEXTRACT 0x48ab8e0, 0x48a95c0 [ORD=17] [ID=32 |
]
0x48ab8e0: v4i32,ch = CopyFromReg 0x483dd10, 0x48ab9e0 [ORD=17] [ID=25 |
]
0x48ab9e0: v4i32 = Register %vreg26 [ORD=17] [ID=1] | |
0x48a95c0: i32 = TargetConstant<3> [ORD=32] [ID=23] |
0x48b3f60: i64 = or 0x48b3e60, 0x48b3960 [ORD=26] [ID=49]
0x48b3e60: i64 = shl 0x48b4a70, 0x48b3d60 [ORD=25] [ID=46] | |
0x48b4a70: i64 = any_extend 0x48b3860 [ID=42] | |
0x48b3860: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48abde0 [ORD=23] [ID=35 |
]
0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=26 |
]
0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2] | |
0x48abde0: i32 = TargetConstant<2> [ORD=14] [ID=22] | |
0x48b3d60: i32 = Constant<32> [ORD=25] [ID=12] | |
0x48b3960: i64 = zero_extend 0x48a99c0 [ORD=22] [ID=43] | |
0x48a99c0: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48a94c0 [ORD=21] [ID=36] | |
0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=26] | |
0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2] | |
0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21] |
Here's the VS2010 source code:
http://boinc.thesonntags.com/collatz_test/download/CatalystDriverTest.zip
I took your kernel and ran using APP Kernel Analyzer: I get following errors:
========= Build: started ==========
OpenCL Compile Error: clBuildProgram failed (CL_BUILD_PROGRAM_FAILURE).
--------
OpenCL Compile Error: Compiling for device: Tahiti
Line 32: error: identifier
"vall" is undefined
lut = mosc[vall & scx];
^
Line 33: error: function
"mul64" declared implicitly
mul_r = mul64((val.x >> lookahead) + (val.y << scy), lut.x);
^
Line 62: error: a value of
type "uint4" cannot be assigned to an entity of type "uint"
i += mosc[(val.x-2)&(scx)];
^
Line 63: error: identifier
"maxsteps" is undefined
if (overflow) i = maxsteps;
^
Line 12: warning: variable
"index" was set but never used
uint index, cont, valh, valt, carryh, overflow = 0u;
^
4 errors detected.
Frontend phase failed compilation.
--------
========== Build: 0 of 1 succeeded ==========
Just chiming in here to provide more data. I've been struggling with this exact same issue. Windows 7, Catalyst 13.4 and 13.5_beta with APP SDK 2.8. The kernel worked on a HD 6xxx card but crashes during compilation on my new HD 7970. I also found that -cl-opt-disable prevents it from crashing. Here is the code:
#define CHOLESKY_TOL 1e-30f
#define NPAR 5
#define SX 16
#define SY 16
#define NY 256 // 16*16
float evaluate(float *p,
__constant float *cutout,
__global float *fvec)
{
// pull out optimization variables
float xoffset = p[0];
float yoffset = p[1];
float stdev = p[2];
float amp = p[3];
float background = p[4];
float scale = sqrt(2.0f)*stdev;
// precompute the x and y erf factors
float xerf[SX];
float yerf[SY];
for (int x=0; x<SX; x++) {
float fx = x - SX*0.5f + xoffset;
float e = 0.5f * (erf((fx+0.5f)/scale) - erf((fx-0.5f)/scale));
xerf
}
for (int y=0; y<SY; y++) {
float fy = y - SY*0.5f + yoffset;
float e = 0.5f * (erf((fy+0.5f)/scale) - erf((fy-0.5f)/scale));
yerf
}
float sqerror=0;
int c=0;
for (int y=0; y<SY; y++) {
for (int x=0; x<SX; x++) {
float model = amp * xerf
float sample = cutout
float err = (model - sample) / sqrt(max(1.0f, sample));
fvec
sqerror += err*err;
c++;
}
}
return sqerror;
}
float model_photon_count(float *p)
{
float xoffset = p[0];
float yoffset = p[1];
float stdev = p[2];
float amp = p[3];
float scale = sqrt(2.0f)*stdev;
float xerf[SX];
float yerf[SY];
for (int x=0; x<SX; x++) {
float fx = x - SX*0.5f + xoffset;
float e = 0.5f * (erf((fx+0.5f)/scale) - erf((fx-0.5f)/scale));
xerf
}
for (int y=0; y<SY; y++) {
float fy = y - SY*0.5f + yoffset;
float e = 0.5 * (erf((fy+0.5f)/scale) - erf((fy-0.5f)/scale));
yerf
}
float count = 0;
for (int y=0; y<SY; y++) {
for (int x=0; x<SX; x++) {
count += amp * xerf
}
}
return count;
}
/* solve the equation Ax=b for a symmetric positive-definite matrix A,
using the Cholesky decomposition A=LL^T. The matrix L is passed in "l".
Elements above the diagonal are ignored.
*/
void solve_axb_cholesky(float *l,
float *x,
float *b)
{
int i,j;
float sum;
/* solve L*y = b for y (where x[] is used to store y) */
for (i=0; i<NPAR; i++) {
sum = 0;
for (j=0; j<i; j++)
sum += l[i*NPAR+j] * x
x = (b - sum)/l[i*NPAR+i];
}
/* solve L^T*x = y for x (where x[] is used to store both y and x) */
for (i=NPAR-1; i>=0; i--) {
sum = 0;
for (j=i+1; j<NPAR; j++)
sum += l[j*NPAR+i] * x
x = (x - sum)/l[i*NPAR+i];
}
}
/* This function takes a symmetric, positive-definite matrix "a" and returns
its (lower-triangular) Cholesky factor in "l". Elements above the
diagonal are neither used nor modified. The same array may be passed
as both l and a, in which case the decomposition is performed in place.
*/
int cholesky_decomp(float *l,
float *a)
{
int i,j,k;
float sum;
for (i=0; i<NPAR; i++) {
for (j=0; j<i; j++) {
sum = 0;
for (k=0; k<j; k++)
sum += l[i*NPAR+k] * l[j*NPAR+k];
l[i*NPAR+j] = (a[i*NPAR+j] - sum)/l[j*NPAR+j];
}
sum = 0;
for (k=0; k<i; k++)
sum += l[i*NPAR+k] * l[i*NPAR+k];
sum = a[i*NPAR+i] - sum;
if (sum<CHOLESKY_TOL) return 1; /* not positive-definite */
l[i*NPAR+i] = sqrt(sum);
}
return 0;
}
__kernel void fitParticleGaussian2D(__constant float *cutout_block,
__global float *ferr_block,
__global float *ferr2_block,
__global float *jac_block,
__global float *params_block,
const int num_particles,
const int max_iters)
{
if (get_global_id(0) >= num_particles)
return;
__constant float *cutout = &cutout_block[get_global_id(0)*NY];
__global float *ferr = &ferr_block[get_global_id(0)*NY];
__global float *ferr2 = &ferr2_block[get_global_id(0)*NY];
__global float *jac = &jac_block[get_global_id(0)*NY*NPAR];
__global float *params = ¶ms_block[get_global_id(0)*8];
int x, i, j, it, nit, ill;
float lambda, up, down, grad_eps, mult, err, newerr, derr, xnorm, xdelta, ftol, xtol;
float h[NPAR*NPAR];
float ch[NPAR*NPAR];
float d[NPAR];
float delta[NPAR];
float newpar[NPAR];
nit = max_iters;
lambda = 0.0001f;
up = 10.0f;
down = 0.1f;
grad_eps = 0.0001f;
ftol = 1.0e-3f;
xtol = 1.0e-3f;
derr = newerr = xnorm = xdelta = mult = 0.0f;
ill = 0;
/* initial guesses on parameters */
float par[NPAR];
par[0] = 0.0f; // x
par[1] = 0.0f; // y
par[2] = 3.0f; // stdev
par[3] = 400.0f; // amp
par[4] = 0.0f; // background
/* calculate the initial error */
evaluate(par, cutout, ferr);
err = 0;
for (x=0; x<NY; x++)
err += ferr
/* main iteration */
for (it=0; it<nit; it++) {
/* use finite differences */
for (i=0; i<NPAR; i++) {
float temp = par;
float step = sqrt(grad_eps) * fabs(temp);
if (step == 0.0f)
step = sqrt(grad_eps);
par += step;
evaluate(par, cutout, ferr2);
for (x=0; x<NY; x++)
jac[x*NPAR+i] = (ferr2
par = temp;
}
/* calculate the approximation to the Hessian and the "derivative" d */
for (i=0; i<NPAR; i++) {
d = 0.0f;
for (j=0; j<=i; j++)
h[i*NPAR+j] = 0;
}
for (x=0; x<NY; x++) {
for (i=0; i<NPAR; i++) {
d += -ferr
for (j=0; j<=i; j++)
h[i*NPAR+j] += jac[x*NPAR+i]*jac[x*NPAR+j];
}
}
/* make a step "delta." If the step is rejected, increase
lambda and try again */
mult = 1.0f + lambda;
ill = 1; /* ill-conditioned? */
xnorm = 0.0f;
xdelta = 0.0f;
while (ill && (it<nit)) {
for (i=0; i<NPAR; i++)
h[i*NPAR+i] = h[i*NPAR+i]*mult;
ill = cholesky_decomp(ch, h);
if (!ill) {
solve_axb_cholesky(ch, delta, d);
for (i=0; i<NPAR; i++) {
newpar = par + delta;
xdelta += delta*delta;
xnorm += newpar*newpar;
}
xdelta = sqrt(xdelta);
xnorm = sqrt(xnorm);
evaluate(newpar, cutout, ferr);
newerr = 0.0f;
for (x=0; x<NY; x++)
newerr += ferr
derr = 1.0f - newerr/err;
ill = (derr < 0.0f);
}
if (ill) {
mult = (1.0f + lambda*up)/(1.0f + lambda);
lambda *= up;
it++;
}
}
for (i=0; i<NPAR; i++)
par = newpar;
err = newerr;
lambda *= down;
if (!ill) {
// terminate on small change in error
if (derr<ftol) {
break;
}
// terminate on small step size
if ((xdelta/xnorm)<xtol) {
break;
}
}
}
for (i=0; i<NPAR; i++) {
params = par;
}
params[5] = model_photon_count(par);
params[6] = evaluate(par, cutout, ferr);
params[7] = it;
}
Thanks jstong and stevec. Will look into the testcase and let you know my findings.
I could see your kernel crashing the Kernel Analyzer when run without disabling optimizations. When using -cl-opt-disable, the kernel compiles fine. Forwarding it to relevant engineering team.