cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

jsonntag
Journeyman III

OpenCL Compiler Crashes

I have a kernel which works fine on HD 6xxx and earlier GPUs with Catalyst 12.10 or earlier drivers under Windows 7 x64.  The app is built with AMD APP SDK 2.8.  When attempting to run the app on HD 7xxx GPUs, I get the following error when building the program:

Unhandled Exception Detected...

- Unhandled Exception Record -

Reason: Access Violation (0xc0000005) at address 0x0000000076F1E4B4 write attempt to address 0x00000024

According to the OpenCL specs, I would expect that it should never crash but should instead fail to compile and allow me to get the build log which would contain the reason for the errors.  Instead, it crashes regardless of any try/catch that is in place.  This same kernel works 100% on nVidia GPUs, OS X, and pre HD7xx GPUs. It seems that commenting out the mul_hi function will allow it to compile.  But, since the output of mul_hi is required to get the correct output, that really isn't an option.  I would tell the thousands of BOINC users to use the older application but since the new GPUs won't run the code compiled with Brook+ properly, that isn't an option either.

So.... anyone have any idea why this won't compile on a HD 7790 with 13.1 thru 13.4 drivers on Windows 7 x64?

__kernel void kernelSteps(__const __global uint4 *mosc, __const uint offset, __const uint4 start, __global uint4 *steps)

{

  const uint lookahead = 20;

  const uint4 sc = (uint4)(1048575,12,1048577,0);

  const uint t_offset = get_global_id(sc.w) + get_global_size(sc.w) * get_global_id(1);\n

  const uint totalOffset = offset + t_offset;\n

  uint4 carry,lut,stepsOut,steps_in = steps[t_offset];

  uint2 mul_r;

  uint valh,valt,carryh,index,i;

  uint cont=1u;

  uint4 val = start; 

  uint overflow; 

  i = overflow = valh = valt = sc.w;   

  val.x += totalOffset;

  carry.x = (val.x < totalOffset);

  val.y += carry.x;

  carry.y = (val.y < carry.x);

  val.z += carry.y;

  while(cont)

  {

    lut = mosc[val.x & sc.x];

    mul_r = mul64((val.x >> lookahead) + (val.y << sc.y), lut.x);

    val.x = mul_r.x + lut.y;

    carry.x = mul_r.y + (val.x < mul_r.x);

    mul_r = mul64((val.y >> lookahead) + (val.z << sc.y), lut.x);

    val.y = mul_r.x + carry.x;

    carry.y = mul_r.y + (val.y < mul_r.x);

    mul_r = mul64((val.z >> lookahead) + (val.w << sc.y), lut.x);

    val.z = mul_r.x + carry.y;

    carry.z = mul_r.y + (val.z < mul_r.x);

    mul_r = mul64((val.w >> lookahead) + (valh << sc.y), lut.x);

    val.w = mul_r.x + carry.z;

    carry.w = mul_r.y + (val.w < mul_r.x);

    mul_r = mul64((valh >> lookahead) + (valt << sc.y), lut.x);

    valh = mul_r.x + carry.w;

    carryh = mul_r.y + (valh < mul_r.x);

    mul_r = mul64(valt >> lookahead, lut.x);

    valt = mul_r.x + carryh;

    overflow = (overflow | mul_r.y) | (valt < mul_r.x);

    i += lut.z;   

    cont = ((((val.x > (uint)sc.z) | val.y) | (val.z | val.w)) | (valh | valt)) && (i<0x1000000u);

  }

  index=val.x-2u;

  i += mosc[index & sc.x].w;

  if(overflow) i = 0x1000000u;  

  carry.x = steps_in.z + i;

  stepsOut.z = carry.x;

  stepsOut.w = steps_in.w + (carry.x < i);,

  if (i > steps_in.x)

  {   

    stepsOut.x = i;

    stepsOut.y = totalOffset;

  }

  else

  {

    stepsOut.x = steps_in.x;

    stepsOut.y = steps_in.y;

  }

  steps[t_offset] = stepsOut;

}

0 Likes
11 Replies