Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

Journeyman III

Unhandled exception compiling OpenCL kernel unless optimization is disabled

The following code fails to compile on an HD 6970 (Cayman) with an unhandled exception unless I disable all optimization. It actually crashes CodeXL when building. Yet, it compiles OK and runs successfully on an nVidia GPU.  I'm hoping someone can point out the error or what needs to be changed so that it can be compiled with optimization enabled? Yes, I'd also like the optimization bug fixed so it will either provide the error or optimize correctly, but I realize it takes time to get fixes added to releases, so I'd appreciate a work around if there is one in the mean time.

__constant unsigned int LOOKAHEAD=20;

__constant unsigned int BITS=32;

__constant unsigned long maxStep = 0x1000000ul;

inline ulong2 mul128(const unsigned long a, const unsigned long b) {

  return (ulong2)(a*b,mul_hi(a,b));


__kernel void kernelSteps64(__global const uint *sieve, const unsigned long offset, const ulong2 start, __global ulong4 *steps, __global const uint4 *mosc) {

   const uint lookahead = LOOKAHEAD;

  const uint4 sc = (uint4)((1<<lookahead)-1,BITS-lookahead,(1<<lookahead)+1,0);

  const uint t_offset = get_global_id(sc.w);

  const unsigned long totalOffset = offset + sieve[t_offset];

  ulong2 carry,mul_r;

  uint4 lut;

  ulong2 icont;

  ulong4 stepsOut,val;

   val.x = start.x;

   val.y = start.y;

  icont.x = sc.w;

  val.x += totalOffset;

  carry.x = (val.x < totalOffset);

  val.y += carry.x;

  val.z = val.w = sc.w;

  icont.y = 1;



   lut = mosc[val.x & sc.x];

   mul_r = mul128((val.x >> lookahead) + (val.y << sc.y), (unsigned long)lut.x);

   val.x = mul_r.x + lut.y;

   carry.x = mul_r.y + (val.x < mul_r.x);

   mul_r = mul128((val.y >> lookahead) + (val.z << sc.y), (unsigned long)lut.x);

   val.y = mul_r.x + carry.x;

   carry.y = mul_r.y + (val.y < mul_r.x);

   mul_r = mul128((val.z >> lookahead) + (val.w << sc.y), (unsigned long)lut.x);

   val.z = mul_r.x + carry.y;

   carry.x = mul_r.y + (val.z < mul_r.x);

   mul_r = mul128((val.w >> lookahead), (unsigned long)lut.x);

   val.w = mul_r.x + carry.x;

   carry.y = mul_r.y + (val.w < mul_r.x);

   icont.x += lut.z;

   icont.y = ((val.x > (unsigned long)sc.z) | val.y | val.z | val.w | carry.y) && (icont.x<maxStep);


  icont.x += (unsigned long)mosc[(val.x-2u) & sc.x].w;


  icont.x = 0x1000000u;

  if (offset == 0)

   val = (ulong4)(0,0,0,0);


   val = steps[t_offset];

  stepsOut.z = carry.x = val.z + icont.x;

  stepsOut.w = val.w + (carry.x < icont.x);

  if (icont.x > val.x)


   stepsOut.x = icont.x;

   stepsOut.y = totalOffset;

  } else {

   stepsOut.x = val.x;

   stepsOut.y = val.y;


  steps[t_offset] = stepsOut;


6 Replies
Journeyman III

This variation of the kernel does the same thing in that it compiles only if optimization is disabled when building but works fine on nVidia GPUs.

The CodeXL output, if compiling with -O1 -Werror is:

Compiling device: Barts... ...Failed!

OpenCL Compile Error: clBuildProgram had an unhanded exception.

#ifdef cl_amd_media_ops

  #pragma OPENCL EXTENSION cl_amd_media_ops : enable


__constant unsigned int LOOKAHEAD=20;

__constant unsigned int BITS=64;

#define mul64(a,b) (uint2)(a*b,mul_hi(a,b))

__constant uint maxsteps = 0x1000000u;

__constant uint scw = 0;

__kernel void Kernel3(__global const uint *sieve, __const ulong offset, __const ulong2 start, __global ulong4 *steps, __global const uint4 *mosc) {

const uint lookahead = LOOKAHEAD;

const uint scx = (1<<LOOKAHEAD)-1;

const uint scz = (1<<LOOKAHEAD)+1;

const uint t_offset = get_global_id(scw);

const ulong totalOffset = offset + (ulong)sieve[t_offset];

uint i=0;

union {ulong u; uint2 hl;} vall,valm,valh,carry,temp;

uint4 lut;

ulong4 steps_out = select((ulong4)(scw,scw,scw,scw),steps[t_offset],(ulong4)(t_offset,t_offset,t_offset,t_offset));

vall.u = start.x + totalOffset;

carry.hl.x = (vall.u < totalOffset);

valm.u = start.y + carry.hl.x;

valh.u = 0;

do {

  lut = mosc[vall.hl.x & scx];

#ifdef cl_amd_media_ops

  temp.hl.x = amd_bitalign(vall.hl.y, vall.hl.x, lookahead);

  temp.hl.y = amd_bitalign(valm.hl.x, vall.hl.y, lookahead);


  const uint scy = BITS-LOOKAHEAD;

  temp.u = (vall.u >> lookahead) | (valm.u << scy);


  vall.hl.x = temp.hl.x * lut.x;

  carry.hl.y = mul_hi(temp.hl.x, lut.x);

  carry.hl.x = lut.y;

  vall.hl.y = temp.hl.y * lut.x;

  vall.u += carry.u;

  carry.hl.x = mul_hi(temp.hl.y, lut.x) + (uint)(vall.u < carry.u);

  #ifdef cl_amd_media_ops

  temp.hl.x = amd_bitalign(valm.hl.y, valm.hl.x,lookahead);

  temp.hl.y = amd_bitalign(valh.hl.x, valm.hl.y,lookahead);


  temp.u = (valm.u >> lookahead) | (valh.u << scy);


  valm.hl.x = temp.hl.x * lut.x;

  carry.hl.y = mul_hi(temp.hl.x, lut.x);

  valm.hl.y = temp.hl.y * lut.x;

  valm.u += carry.u;

  carry.hl.x = mul_hi(temp.hl.y, lut.x) + (uint)(valm.u < carry.u);

#ifdef cl_amd_media_ops

  temp.hl.x = amd_bitalign(valh.hl.y, valh.hl.x,lookahead);

  temp.hl.y = valh.hl.y >> lookahead;


  temp.u = (valh.u >> lookahead);


  valh.hl.x = temp.hl.x * lut.x;

  carry.hl.y = mul_hi(temp.hl.x, lut.x);

  valh.hl.y = temp.hl.y * lut.x;

  valh.u += carry.u;

  carry.hl.x = mul_hi(temp.hl.y, lut.x) + (uint)(valh.u < carry.u);


} while (((vall.u > scz)||valm.u||valh.u) && (i < maxsteps) && (!carry.hl.x));


if(carry.hl.x) i = maxsteps;

steps_out.z += i;

steps_out.w += (steps_out.z < (ulong)i);

if ((i > steps_out.x) || ((i == steps_out.x) && (totalOffset < steps_out.y)))


  steps_out.xy = (ulong2)((ulong)i,totalOffset);


steps[t_offset] = steps_out;





Thanks for the code and we would like to investigate it further.

Would it be possible for you to provide host side code as well as optimization flags that are causing problem.


I don't see how the host code makes any difference.  If you copy and paste either of the kernels into CodeXL and try and build them, you should be able to duplicate the error.

The kernels are stored as constants in a header file.  The host code is:

  size_t sourceSize[] = { src.length() };

  const char *sourceCode = src.c_str();

  program = clCreateProgramWithSource(context, 1 ,(const char**) &sourceCode, sourceSize, &retval);

  if (retval != CL_SUCCESS) {

    std::cerr<<"clCreateProgramWithSource() failed ("<<endl<<") "<<print_cl_errstring(retval)<<endl;


    return retval;



  if (write_kernel) {

    ofstream fout("kernel_source.ocl",ios::out);  //used to export the kernel which was pastest in the above entries




  std::cerr<<"OpenCL program created."<<endl;


  retval = clBuildProgram(program, 1, &device, NULL, NULL, NULL);

  if (retval != CL_SUCCESS) {

    size_t logSize = 0;

    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL , &logSize);

    char *cBuildLog = new char[logSize+1];

    size_t maxSize = logSize;

    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, maxSize, cBuildLog, NULL);

    cBuildLog[logSize] = '\0';

    std::cerr<<"BUILD LOG"<<endl<<cBuildLog<<endl<<flush;

    std::cerr<<"clBuildProgram() failed ("<<retval<<") "<<print_cl_errstring(retval)<<endl;


    delete[] cBuildLog;

    return retval;


  std::cerr<<"OpenCL program compiled successfully."<<endl;



Thanks. We will look into it.


We tried to compile 2nd version of your code (i.e. having "Kernel3") but faced LLVM compilation error (see below) when optimization was enable.
LLVM ERROR: Cannot select: 0x1af5570: i32 = setcc 0x1af6d90, 0x1acf4b0, 0x1af4e70 [ORD=49] [ID=67]
  0x1af6d90: i64 = add 0x1acf4b0, 0x1b203b0 [ORD=45] [ID=63]
    0x1acf4b0: i64 = AMDILISD::VEXTRACT 0x1acecb0, 0x1af7290 [ORD=44] [ID=46]
      0x1acecb0: v2i64,ch = CopyFromReg 0x16daad0, 0x1acebb0 [ORD=44] [ID=38]
        0x1acebb0: v2i64 = Register %vreg27 [ORD=44] [ID=3]
      0x1af7290: i32 = TargetConstant<1> [ORD=16] [ID=27]
    0x1b203b0: i64 = add 0x1b219c0, 0x1aceab0 [ORD=6] [ID=60]
      0x1b219c0: i64,ch = load 0x16daad0, 0x1b1ffb0, 0x1b200b0<LD4[%arrayidx], zext from i32> [ORD=5] [ID=56]
        0x1b1ffb0: i32 = add 0x1ace8b0, 0x1b1feb0 [ORD=3] [ID=53]
          0x1ace8b0: i32,ch = CopyFromReg 0x16daad0, 0x1ace7b0 [ORD=3] [ID=36]
            0x1ace7b0: i32 = Register %vreg25 [ORD=3] [ID=1]
          0x1b1feb0: i32 = shl 0x1b209b0, 0x1b1fdb0 [ORD=3] [ID=50]
            0x1b209b0: i32 = AMDILISD::VEXTRACT 0x1acf2b0, 0x1af7290 [ORD=2] [ID=47]
              0x1acf2b0: v4i32 = 0x1acf1b0 [ORD=1] [ID=40]
                0x1acf1b0: i32 = TargetConstant<2976> [ORD=1] [ID=5]
              0x1af7290: i32 = TargetConstant<1> [ORD=16] [ID=27]
            0x1b1fdb0: i32 = Constant<2> [ORD=3] [ID=8]
        0x1b200b0: i32 = undef [ORD=4] [ID=9]
      0x1aceab0: i64,ch = CopyFromReg 0x16daad0, 0x1ace9b0 [ORD=6] [ID=37]
        0x1ace9b0: i64 = Register %vreg26 [ORD=6] [ID=2]
  0x1acf4b0: i64 = AMDILISD::VEXTRACT 0x1acecb0, 0x1af7290 [ORD=44] [ID=46]
    0x1acecb0: v2i64,ch = CopyFromReg 0x16daad0, 0x1acebb0 [ORD=44] [ID=38]
      0x1acebb0: v2i64 = Register %vreg27 [ORD=44] [ID=3]
    0x1af7290: i32 = TargetConstant<1> [ORD=16] [ID=27]
In function: __OpenCL_Kernel3_kernel

Please follow this forum post where we faced similar type of LLVM compilation error for optimization and we've filed an internal bug to the compilation team. I suspect it is similar for your case also. Please try to catch details of the compilation error and check whether your observation matches with us or not (at least any LLVM compiler error or not).



Can you confirm whether you are getting the LLVM compiler error or some other error?