Archives Discussions

jsonntag · ‎05-08-2013

I have a kernel which works fine on HD 6xxx and earlier GPUs with Catalyst 12.10 or earlier drivers under Windows 7 x64. The app is built with AMD APP SDK 2.8. When attempting to run the app on HD 7xxx GPUs, I get the following error when building the program:

Unhandled Exception Detected...

- Unhandled Exception Record -

Reason: Access Violation (0xc0000005) at address 0x0000000076F1E4B4 write attempt to address 0x00000024

According to the OpenCL specs, I would expect that it should never crash but should instead fail to compile and allow me to get the build log which would contain the reason for the errors. Instead, it crashes regardless of any try/catch that is in place. This same kernel works 100% on nVidia GPUs, OS X, and pre HD7xx GPUs. It seems that commenting out the mul_hi function will allow it to compile. But, since the output of mul_hi is required to get the correct output, that really isn't an option. I would tell the thousands of BOINC users to use the older application but since the new GPUs won't run the code compiled with Brook+ properly, that isn't an option either.

So.... anyone have any idea why this won't compile on a HD 7790 with 13.1 thru 13.4 drivers on Windows 7 x64?

__kernel void kernelSteps(__const __global uint4 *mosc, __const uint offset, __const uint4 start, __global uint4 *steps)

{

const uint lookahead = 20;

const uint4 sc = (uint4)(1048575,12,1048577,0);

const uint t_offset = get_global_id(sc.w) + get_global_size(sc.w) * get_global_id(1);\n

const uint totalOffset = offset + t_offset;\n

uint4 carry,lut,stepsOut,steps_in = steps[t_offset];

uint2 mul_r;

uint valh,valt,carryh,index,i;

uint cont=1u;

uint4 val = start;

uint overflow;

i = overflow = valh = valt = sc.w;

val.x += totalOffset;

carry.x = (val.x < totalOffset);

val.y += carry.x;

carry.y = (val.y < carry.x);

val.z += carry.y;

while(cont)

{

lut = mosc[val.x & sc.x];

mul_r = mul64((val.x >> lookahead) + (val.y << sc.y), lut.x);

val.x = mul_r.x + lut.y;

carry.x = mul_r.y + (val.x < mul_r.x);

mul_r = mul64((val.y >> lookahead) + (val.z << sc.y), lut.x);

val.y = mul_r.x + carry.x;

carry.y = mul_r.y + (val.y < mul_r.x);

mul_r = mul64((val.z >> lookahead) + (val.w << sc.y), lut.x);

val.z = mul_r.x + carry.y;

carry.z = mul_r.y + (val.z < mul_r.x);

mul_r = mul64((val.w >> lookahead) + (valh << sc.y), lut.x);

val.w = mul_r.x + carry.z;

carry.w = mul_r.y + (val.w < mul_r.x);

mul_r = mul64((valh >> lookahead) + (valt << sc.y), lut.x);

valh = mul_r.x + carry.w;

carryh = mul_r.y + (valh < mul_r.x);

mul_r = mul64(valt >> lookahead, lut.x);

valt = mul_r.x + carryh;

overflow = (overflow | mul_r.y) | (valt < mul_r.x);

i += lut.z;

}

index=val.x-2u;

i += mosc[index & sc.x].w;

if(overflow) i = 0x1000000u;

carry.x = steps_in.z + i;

stepsOut.z = carry.x;

stepsOut.w = steps_in.w + (carry.x < i);,

if (i > steps_in.x)

{

stepsOut.x = i;

stepsOut.y = totalOffset;

}

else

{

stepsOut.x = steps_in.x;

stepsOut.y = steps_in.y;

}

steps[t_offset] = stepsOut;

}

himanshu_gautam · ‎05-09-2013

I can compile the above kernel using 13.4 driver on Win7-64 with HD 7770 GPU. Build log reports an error about mul64 function, apart from some simple syntax errors.

jsonntag · ‎05-09-2013

LOL, I forgot to include the mul64 function in my post. When I comment it out, it compiles OK for me too.

inline uint2 mul64(__const uint a, __const uint b){

return (uint2)(a*b,mul_hi(a,

}

The entire kernel code (with an alternate to mul_hi which doesn't work for me either) is as follows:

__kernel void kernelZeroBuffer(__global uint4 *steps){

steps[get_global_id(0)] = (uint4)(0,0,0,0);

}

inline uint2 mul64(__const uint a, __const uint b){

ulong x = (ulong)a * (ulong)b;

return (uint2)((uint)x,(uint)(x>>32));

}

__kernel void kernelSteps(__const __global uint4 *mosc, __const uint offset, __const uint4 start, __global uint4 *steps)

{

const uint lookahead = 20;

const uint4 sc = (uint4)(1048575,12,1048577,0);

const uint t_offset = get_global_id(sc.w) + get_global_size(sc.w) * get_global_id(1);

const uint totalOffset = offset + t_offset;

uint4 carry,lut,stepsOut,steps_in = steps[t_offset];

uint2 mul_r;

uint valh,valt,carryh,index,i;

uint cont=1u;

uint4 val = start;

uint overflow;

i = overflow = valh = valt = sc.w;

val.x += totalOffset;

carry.x = (val.x < totalOffset);

val.y += carry.x;

carry.y = (val.y < carry.x);

val.z += carry.y;

while(cont)

{

lut = mosc[val.x & sc.x];

mul_r = mul64((val.x >> lookahead) + (val.y << sc.y), lut.x);

val.x = mul_r.x + lut.y;

carry.x = mul_r.y + (val.x < mul_r.x);

mul_r = mul64((val.y >> lookahead) + (val.z << sc.y), lut.x);

val.y = mul_r.x + carry.x;

carry.y = mul_r.y + (val.y < mul_r.x);

mul_r = mul64((val.z >> lookahead) + (val.w << sc.y), lut.x);

val.z = mul_r.x + carry.y;

carry.z = mul_r.y + (val.z < mul_r.x);

mul_r = mul64((val.w >> lookahead) + (valh << sc.y), lut.x);

val.w = mul_r.x + carry.z;

carry.w = mul_r.y + (val.w < mul_r.x);

mul_r = mul64((valh >> lookahead) + (valt << sc.y), lut.x);

valh = mul_r.x + carry.w;

carryh = mul_r.y + (valh < mul_r.x);

mul_r = mul64(valt >> lookahead, lut.x);

valt = mul_r.x + carryh;

overflow = (overflow | mul_r.y) | (valt < mul_r.x);

i += lut.z;

}

index=val.x-2u;

i += mosc[index & sc.x].w;

if(overflow) i = 0x1000000u;

carry.x = steps_in.z + i;

stepsOut.z = carry.x;

stepsOut.w = steps_in.w + (carry.x < i);,

if (i > steps_in.x)

{

stepsOut.x = i;

stepsOut.y = totalOffset;

}

else

{

stepsOut.x = steps_in.x;

stepsOut.y = steps_in.y;

}

steps[t_offset] = stepsOut;

}

jsonntag · ‎05-09-2013

...but commenting out the mul_hi line is not an option.

jsonntag · ‎05-09-2013

If I use "-cl-opt-disable" it no longer crashes in the call to clBuildProgram. It appears the issue is due to the optimization it is trying to do with the newer GPUs. I hate to disable optimization since the Brook+ version of the same application is 50% faster than the OpenCL version and this may make the OpenCL version even slower.

himanshu_gautam · ‎05-10-2013

Hi jstong,

It is a critical issue if OCL compiler is wrongly optimizing the code. But I am still not able to reproduce the issue. There seems to be some copy paste isssue with mul64 code above.

I had tried the remaining code and it still compiled for me. No crashes. Can you attach the kernel as a zip file.

jsonntag · ‎05-16-2013

I'll do one better. Here's the entire VS2010 solution. Using -cl-opt-disable seems to make it work whereas using no options for the clBuildProgram call causes an LLVM error below.

LLVM ERROR: Cannot select: 0x48b5a80: i32 = setcc 0x48a9ac0, 0x48b3f60, 0x48b457

0 [ID=59]

0x48a9ac0: i64 = AMDILISD::ADD 0x48b3f60, 0x48b4060 [ID=57]

0x48b3f60: i64 = or 0x48b3e60, 0x48b3960 [ORD=26] [ID=49]

0x48b3e60: i64 = shl 0x48b4a70, 0x48b3d60 [ORD=25] [ID=46]

0x48b4a70: i64 = any_extend 0x48b3860 [ID=42]

0x48b3860: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48abde0 [ORD=23] [ID=

35]

0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=

26]

0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2]

0x48abde0: i32 = TargetConstant<2> [ORD=14] [ID=22]

0x48b3d60: i32 = Constant<32> [ORD=25] [ID=12]

0x48b3960: i64 = zero_extend 0x48a99c0 [ORD=22] [ID=43]

0x48a99c0: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48a94c0 [ORD=21] [ID=36

]

0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=26

]

0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2]

0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21]

0x48b4060: i64 = zero_extend 0x48ac1e0 [ORD=27] [ID=55]

0x48ac1e0: i32 = AMDILISD::ADD 0x48ab5e0, 0x48b4870 [ORD=18] [ID=52]

0x48ab5e0: i32 = AMDILISD::ADD 0x48aa0c0, 0x48a97c0 [ORD=16] [ID=47]

0x48aa0c0: i32 = mul 0x48b5d80, 0x48b5e80 [ORD=15] [ID=44]

0x48b5d80: i32 = AMDILISD::VEXTRACT 0x48a9dc0, 0x48a94c0 [ORD=13] [I

D=39]

0x48a9dc0: v4i32,ch = llvm.AMDIL.get.global.size 0x483dd10, 0x48a9

bc0 [ORD=12] [ID=29]

0x48a9bc0: i32 = TargetConstant<2563> [ORD=12] [ID=6]

0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21]

0x48b5e80: i32 = AMDILISD::VEXTRACT 0x48abce0, 0x48abde0 [ORD=14] [I

D=37]

0x48abce0: v4i32,ch = llvm.AMDIL.get.global.id 0x483dd10, 0x48ab6e

0 [ORD=10] [ID=28]

0x48ab6e0: i32 = TargetConstant<2561> [ORD=10] [ID=4]

0x48abde0: i32 = TargetConstant<2> [ORD=14] [ID=22]

0x48a97c0: i32 = AMDILISD::VEXTRACT 0x48abce0, 0x48a94c0 [ORD=11] [ID=

38]

0x48abce0: v4i32,ch = llvm.AMDIL.get.global.id 0x483dd10, 0x48ab6e0

[ORD=10] [ID=28]

	0x48ab6e0: i32 = TargetConstant<2561> [ORD=10] [ID=4]
	0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21]
	0x48b4870: i32 = AMDILISD::VEXTRACT 0x48ab8e0, 0x48a95c0 [ORD=17] [ID=32

]

0x48ab8e0: v4i32,ch = CopyFromReg 0x483dd10, 0x48ab9e0 [ORD=17] [ID=25

]

	0x48ab9e0: v4i32 = Register %vreg26 [ORD=17] [ID=1]
	0x48a95c0: i32 = TargetConstant<3> [ORD=32] [ID=23]

0x48b3f60: i64 = or 0x48b3e60, 0x48b3960 [ORD=26] [ID=49]

	0x48b3e60: i64 = shl 0x48b4a70, 0x48b3d60 [ORD=25] [ID=46]
	0x48b4a70: i64 = any_extend 0x48b3860 [ID=42]
	0x48b3860: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48abde0 [ORD=23] [ID=35

]

0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=26

]

	0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2]
	0x48abde0: i32 = TargetConstant<2> [ORD=14] [ID=22]
	0x48b3d60: i32 = Constant<32> [ORD=25] [ID=12]
	0x48b3960: i64 = zero_extend 0x48a99c0 [ORD=22] [ID=43]
	0x48a99c0: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48a94c0 [ORD=21] [ID=36]
	0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=26]
	0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2]
	0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21]

jsonntag · ‎05-16-2013

Here's the VS2010 source code:

http://boinc.thesonntags.com/collatz_test/download/CatalystDriverTest.zip

himanshu_gautam · ‎05-17-2013

I took your kernel and ran using APP Kernel Analyzer: I get following errors:

========= Build: started ==========
OpenCL Compile Error: clBuildProgram failed (CL_BUILD_PROGRAM_FAILURE).
--------
OpenCL Compile Error: Compiling for device: Tahiti
Line 32: error: identifier
          "vall" is undefined
    lut = mosc[vall & scx];
               ^

Line 33: error: function
          "mul64" declared implicitly
    mul_r = mul64((val.x >> lookahead) + (val.y << scy), lut.x);
            ^

Line 62: error: a value of
          type "uint4" cannot be assigned to an entity of type "uint"
   i += mosc[(val.x-2)&(scx)];
        ^

Line 63: error: identifier
          "maxsteps" is undefined
   if (overflow) i = maxsteps;
                     ^

Line 12: warning: variable
          "index" was set but never used
   uint index, cont, valh, valt, carryh, overflow = 0u;
        ^

4 errors detected.

Frontend phase failed compilation.
--------

========== Build: 0 of 1 succeeded ==========

stevec · ‎05-16-2013

Just chiming in here to provide more data. I've been struggling with this exact same issue. Windows 7, Catalyst 13.4 and 13.5_beta with APP SDK 2.8. The kernel worked on a HD 6xxx card but crashes during compilation on my new HD 7970. I also found that -cl-opt-disable prevents it from crashing. Here is the code:

#define CHOLESKY_TOL 1e-30f

#define NPAR 5

#define SX 16

#define SY 16

#define NY 256 // 16*16

float evaluate(float *p,

__constant float *cutout,

__global float *fvec)

{

// pull out optimization variables

float xoffset = p[0];

float yoffset = p[1];

float stdev = p[2];

float amp = p[3];

float background = p[4];

float scale = sqrt(2.0f)*stdev;

// precompute the x and y erf factors

float xerf[SX];

float yerf[SY];

for (int x=0; x<SX; x++) {

float fx = x - SX*0.5f + xoffset;

float e = 0.5f * (erf((fx+0.5f)/scale) - erf((fx-0.5f)/scale));

xerf = e;

}

for (int y=0; y<SY; y++) {

float fy = y - SY*0.5f + yoffset;

float e = 0.5f * (erf((fy+0.5f)/scale) - erf((fy-0.5f)/scale));

yerf = e;

}

float sqerror=0;

int c=0;

for (int y=0; y<SY; y++) {

for (int x=0; x<SX; x++) {

float model = amp * xerf*yerf + background;

float sample = cutout;

float err = (model - sample) / sqrt(max(1.0f, sample));

fvec = err;

sqerror += err*err;

c++;

}

return sqerror;

}

float model_photon_count(float *p)

{

float xoffset = p[0];

float yoffset = p[1];

float stdev = p[2];

float amp = p[3];

float scale = sqrt(2.0f)*stdev;

float xerf[SX];

float yerf[SY];

for (int x=0; x<SX; x++) {

float fx = x - SX*0.5f + xoffset;

float e = 0.5f * (erf((fx+0.5f)/scale) - erf((fx-0.5f)/scale));

xerf = e;

}

for (int y=0; y<SY; y++) {

float fy = y - SY*0.5f + yoffset;

float e = 0.5 * (erf((fy+0.5f)/scale) - erf((fy-0.5f)/scale));

yerf = e;

}

float count = 0;

for (int y=0; y<SY; y++) {

for (int x=0; x<SX; x++) {

count += amp * xerf*yerf;

}

return count;

}

/* solve the equation Ax=b for a symmetric positive-definite matrix A,

using the Cholesky decomposition A=LL^T. The matrix L is passed in "l".

Elements above the diagonal are ignored.

*/

void solve_axb_cholesky(float *l,

float *x,

float *b)

{

int i,j;

float sum;

/* solve L*y = b for y (where x[] is used to store y) */

for (i=0; i<NPAR; i++) {

sum = 0;

for (j=0; j<i; j++)

sum += l[i*NPAR+j] * x;

x = (b - sum)/l[i*NPAR+i];

}

/* solve L^T*x = y for x (where x[] is used to store both y and x) */

for (i=NPAR-1; i>=0; i--) {

sum = 0;

for (j=i+1; j<NPAR; j++)

sum += l[j*NPAR+i] * x;

x = (x - sum)/l[i*NPAR+i];

}

/* This function takes a symmetric, positive-definite matrix "a" and returns

its (lower-triangular) Cholesky factor in "l". Elements above the

diagonal are neither used nor modified. The same array may be passed

as both l and a, in which case the decomposition is performed in place.

*/

int cholesky_decomp(float *l,

float *a)

{

int i,j,k;

float sum;

for (i=0; i<NPAR; i++) {

for (j=0; j<i; j++) {

sum = 0;

for (k=0; k<j; k++)

sum += l[i*NPAR+k] * l[j*NPAR+k];

l[i*NPAR+j] = (a[i*NPAR+j] - sum)/l[j*NPAR+j];

}

sum = 0;

for (k=0; k<i; k++)

sum += l[i*NPAR+k] * l[i*NPAR+k];

sum = a[i*NPAR+i] - sum;

if (sum<CHOLESKY_TOL) return 1; /* not positive-definite */

l[i*NPAR+i] = sqrt(sum);

}

return 0;

}

__kernel void fitParticleGaussian2D(__constant float *cutout_block,

__global float *ferr_block,

__global float *ferr2_block,

__global float *jac_block,

__global float *params_block,

const int num_particles,

const int max_iters)

{

if (get_global_id(0) >= num_particles)

return;

__constant float *cutout = &cutout_block[get_global_id(0)*NY];

__global float *ferr = &ferr_block[get_global_id(0)*NY];

__global float *ferr2 = &ferr2_block[get_global_id(0)*NY];

__global float *jac = &jac_block[get_global_id(0)*NY*NPAR];

__global float *params = &params_block[get_global_id(0)*8];

int x, i, j, it, nit, ill;

float lambda, up, down, grad_eps, mult, err, newerr, derr, xnorm, xdelta, ftol, xtol;

float h[NPAR*NPAR];

float ch[NPAR*NPAR];

float d[NPAR];

float delta[NPAR];

float newpar[NPAR];

nit = max_iters;

lambda = 0.0001f;

up = 10.0f;

down = 0.1f;

grad_eps = 0.0001f;

ftol = 1.0e-3f;

xtol = 1.0e-3f;

derr = newerr = xnorm = xdelta = mult = 0.0f;

ill = 0;

/* initial guesses on parameters */

float par[NPAR];

par[0] = 0.0f; // x

par[1] = 0.0f; // y

par[2] = 3.0f; // stdev

par[3] = 400.0f; // amp

par[4] = 0.0f; // background

/* calculate the initial error */

evaluate(par, cutout, ferr);

err = 0;

for (x=0; x<NY; x++)

err += ferr*ferr;

/* main iteration */

for (it=0; it<nit; it++) {

/* use finite differences */

for (i=0; i<NPAR; i++) {

float temp = par;

float step = sqrt(grad_eps) * fabs(temp);

if (step == 0.0f)

step = sqrt(grad_eps);

par += step;

evaluate(par, cutout, ferr2);

for (x=0; x<NY; x++)

jac[x*NPAR+i] = (ferr2 - ferr) / (par - temp);

par = temp;

}

/* calculate the approximation to the Hessian and the "derivative" d */

for (i=0; i<NPAR; i++) {

d = 0.0f;

for (j=0; j<=i; j++)

h[i*NPAR+j] = 0;

}

for (x=0; x<NY; x++) {

for (i=0; i<NPAR; i++) {

d += -ferr*jac[x*NPAR+i];

for (j=0; j<=i; j++)

h[i*NPAR+j] += jac[x*NPAR+i]*jac[x*NPAR+j];

}

/* make a step "delta." If the step is rejected, increase

lambda and try again */

mult = 1.0f + lambda;

ill = 1; /* ill-conditioned? */

xnorm = 0.0f;

xdelta = 0.0f;

while (ill && (it<nit)) {

for (i=0; i<NPAR; i++)

h[i*NPAR+i] = h[i*NPAR+i]*mult;

ill = cholesky_decomp(ch, h);

if (!ill) {

solve_axb_cholesky(ch, delta, d);

for (i=0; i<NPAR; i++) {

newpar = par + delta;

xdelta += delta*delta;

xnorm += newpar*newpar;

}

xdelta = sqrt(xdelta);

xnorm = sqrt(xnorm);

evaluate(newpar, cutout, ferr);

newerr = 0.0f;

for (x=0; x<NY; x++)

newerr += ferr*ferr;

derr = 1.0f - newerr/err;

ill = (derr < 0.0f);

}

if (ill) {

mult = (1.0f + lambda*up)/(1.0f + lambda);

lambda *= up;

it++;

}

for (i=0; i<NPAR; i++)

par = newpar;

err = newerr;

lambda *= down;

if (!ill) {

// terminate on small change in error

if (derr<ftol) {

break;

}

// terminate on small step size

if ((xdelta/xnorm)<xtol) {

break;

}

for (i=0; i<NPAR; i++) {

params = par;

}

params[5] = model_photon_count(par);

params[6] = evaluate(par, cutout, ferr);

params[7] = it;

}

himanshu_gautam · ‎05-17-2013

Thanks jstong and stevec. Will look into the testcase and let you know my findings.

himanshu_gautam · ‎05-17-2013

I could see your kernel crashing the Kernel Analyzer when run without disabling optimizations. When using -cl-opt-disable, the kernel compiles fine. Forwarding it to relevant engineering team.

Archives Discussions

OpenCL Compiler Crashes