cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

inducer77
Adept II

Miscompiled CPU kernel

Hi there,


it appears that the 13.6 beta available from [1] miscompiles the kernel below. I should add that this does not happen entirely repeatably. Never mind.

Thanks,

Andreas


[1] http://support.amd.com/us/gpudownload/linux/Pages/radeon_linux.aspx?type=2.4.1&product=2.4.1.3.42&la...


#pragma OPENCL EXTENSION cl_khr_fp64: enable

#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) layerpot(__global double const *restrict src, __global double const *restrict tgt, __global double const *restrict center_s0, __global double const *restrict center_s1, int const nsrc, int const ntgt, __global double const *restrict strength_0, __global double *restrict result_0, __global double *restrict result_1, __global double const *restrict src_derivative_dir_s0, __global double const *restrict src_derivative_dir_s1)
{
  double var_2;
  double pair_result_0;
  double pair_result_1;
  double acc_isrc_outer_isrc_inner;
  double expr_16;
  double knl_1_scaling;
  double var_0;
  double acc_isrc_outer_isrc_inner_0;
  double expr_8;
  double expr_9;
  double var;
  double expr_7;
  double var_5;
  double expr_5;
  double expr_2;
  double expr_3;
  double expr_0;
  double expr_1;
  double expn0coeff4;
  double expn0coeff3;
  double expn0coeff2;
  double expr_6;
  double expr_18;
  double expr_19;
  double var_6;
  double expr_14;
  double expr_15;
  double expn1_result;
  double expr_17;
  double expr_10;
  double expr_11;
  double expr_12;
  double expr_13;
  double a[2];
  double var_4;
  double b[2];
  double expn0_result;
  double expn1coeff4;
  double var_3;
  double expn1coeff2;
  double expn1coeff3;
  double var_1;
  double knl_0_scaling;
  double expr_4;

  if ((-1 + -16 * gid(0) + -1 * lid(0) + ntgt) >= 0)
  {
    b[0] = tgt[ntgt * 0 + lid(0) + gid(0) * 16] + -1.0 * center_s0[lid(0) + gid(0) * 16];
    b[1] = tgt[ntgt * 1 + lid(0) + gid(0) * 16] + -1.0 * center_s1[lid(0) + gid(0) * 16];
    var_1 = b[0] * b[0];
    var_2 = b[1] * b[1];
    acc_isrc_outer_isrc_inner = 0.0;
    knl_0_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
    acc_isrc_outer_isrc_inner_0 = 0.0;
    knl_1_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
    expr_3 = var_2;
    expr_2 = var_1;
    expr_5 = expr_2 + expr_3;
    for (int isrc_outer = 0; isrc_outer <= (-1 + ((255 + nsrc) / 256)); ++isrc_outer)
      for (int isrc_inner = 0; isrc_inner <= 255; ++isrc_inner)
        if ((-1 + -1 * isrc_inner + -256 * isrc_outer + nsrc) >= 0)
        {
          a[0] = center_s0[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 0 + isrc_inner + isrc_outer * 256];
          a[1] = center_s1[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 1 + isrc_inner + isrc_outer * 256];
          var_6 = src_derivative_dir_s1[isrc_inner + isrc_outer * 256];
          expr_13 = a[0] * a[1];
          var_3 = a[0] * a[0];
          expr_0 = var_3;
          var_5 = src_derivative_dir_s0[isrc_inner + isrc_outer * 256];
          var_4 = a[1] * a[1];
          expr_1 = var_4;
          expr_4 = expr_1 + expr_0;
          expr_17 = expr_1 * 1.0 / expr_4;
          var_0 = expr_4 * expr_4;
          expr_15 = -2.0 * 1.0 / expr_4;
          expn1coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_6 + -2.0 * var_5 * (-1.0 + -1.0 * expr_0 * expr_15));
          expn0coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_5 + -2.0 * var_6 * (-1.0 + -1.0 * expr_1 * expr_15));
          expr_7 = var_0;
          expr_14 = 24.0 * 1.0 / expr_7;
          expr_6 = a[0] * b[0] + a[1] * b[1];
          expr_18 = 16.0 * expr_6 * 1.0 / expr_4;
          var = expr_6 * expr_6;
          expr_10 = 4.0 * a[1] * expr_6 * 1.0 / expr_4 + -3.0 * b[1];
          expr_16 = expr_0 * 1.0 / expr_4;
          expr_9 = 4.0 * a[0] * expr_6 * 1.0 / expr_4 + -3.0 * b[0];
          expr_11 = 4.0 * a[0] * a[1] * expr_6 * 1.0 / expr_4 + a[1] * expr_9 + a[0] * expr_10;
          expn1coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_6 + 4.0 * var_5 * (2.0 * a[0] * expr_9 + expr_6 * (-3.0 + 4.0 * expr_16)));
          expr_8 = var;
          expr_12 = 4.0 * a[1] * b[0] * expr_6 * 1.0 / expr_4 + 2.0 * a[0] * a[1] * expr_5 * 1.0 / expr_4 + 4.0 * a[0] * b[1] * expr_6 * 1.0 / expr_4 + -12.0 * a[0] * a[1] * expr_8 * 1.0 / expr_7 + -1.0 * b[0] * b[1];
          expr_19 = 4.0 * expr_8 * 1.0 / expr_4;
          expn1coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_6 + 24.0 * var_5 * (-3.0 * expr_2 + -1.0 * expr_3 + 4.0 * expr_16 * expr_5 + expr_19 + a[0] * b[0] * expr_18 + -1.0 * expr_0 * expr_14 * expr_8));
          expn0coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_5 + 4.0 * var_6 * (expr_6 * (-3.0 + 4.0 * expr_17) + 2.0 * a[1] * expr_10));
          expn0coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_5 + 24.0 * var_6 * (4.0 * expr_17 * expr_5 + -1.0 * expr_2 + a[1] * b[1] * expr_18 + -1.0 * expr_1 * expr_14 * expr_8 + expr_19 + -3.0 * expr_3));
          expn0_result = 0.5 * expn0coeff2 + 0.16666666666666666 * expn0coeff3 + 0.041666666666666664 * expn0coeff4;
          expn1_result = 0.041666666666666664 * expn1coeff4 + 0.16666666666666666 * expn1coeff3 + 0.5 * expn1coeff2;
          pair_result_1 = expn1_result * strength_0[isrc_inner + isrc_outer * 256];
          pair_result_0 = expn0_result * strength_0[isrc_inner + isrc_outer * 256];
          acc_isrc_outer_isrc_inner = acc_isrc_outer_isrc_inner + pair_result_0;
          acc_isrc_outer_isrc_inner_0 = acc_isrc_outer_isrc_inner_0 + pair_result_1;
        }
    result_1[lid(0) + gid(0) * 16] = knl_1_scaling * acc_isrc_outer_isrc_inner_0;
    result_0[lid(0) + gid(0) * 16] = knl_0_scaling * acc_isrc_outer_isrc_inner;
  }
}




0 Likes
5 Replies
himanshu_gautam
Grandmaster


inducer77 wrote:



Hi there,




I should add that this does not happen entirely repeatably.




Is the issue not reproducible? Can you confirm? Also share details about the GPU, OS, APP SDK,and Driver.

I can compile the kernel for 13.6 driver, HD 7870, Win7-64, APP SDK 2.8.

0 Likes

It appears reproducible. What made it appear non-reproducible is that the code that generates that kernel somewhat randomly generates one of two (mathematically equivalent) versions. One breaks, the other works.

  • CPU: Intel i7-2620M
  • Debian Linux, kernel 3.9, libc 2.17.
  • No APP SDK installed, just the OpenCL bits from the driver package.

The kernel compiles fine, but it computes the wrong result. FWIW, this is the equivalent kernel that compiles correctly


The kernel compiles fine, but it computes the wrong result. FWIW, this is the equivalent kernel that compiles correctly:#pragma OPENCL EXTENSION cl_khr_fp64: enable

#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) layerpot(__global double const *restrict src, __global double const *restrict tgt, __global double const *restrict center_s0, __global double const *restrict center_s1, int const nsrc, int const ntgt, __global double const *restrict strength_0, __global double *restrict result_0, __global double *restrict result_1, __global double const *restrict src_derivative_dir_s0, __global double const *restrict src_derivative_dir_s1)
{
  double var_2;
  double pair_result_0;
  double pair_result_1;
  double acc_isrc_outer_isrc_inner;
  double expr_16;
  double knl_1_scaling;
  double var_0;
  double acc_isrc_outer_isrc_inner_0;
  double expr_8;
  double expr_9;
  double var;
  double expr_7;
  double var_5;
  double expr_5;
  double expr_2;
  double expr_3;
  double expr_0;
  double expr_1;
  double expn0coeff4;
  double expn0coeff3;
  double expn0coeff2;
  double expr_6;
  double expr_18;
  double expr_19;
  double var_6;
  double expr_14;
  double expr_15;
  double expn1_result;
  double expr_17;
  double expr_10;
  double expr_11;
  double expr_12;
  double expr_13;
  double a[2];
  double var_4;
  double b[2];
  double expn0_result;
  double expn1coeff4;
  double var_3;
  double expn1coeff2;
  double expn1coeff3;
  double var_1;
  double knl_0_scaling;
  double expr_4;

  if ((-1 + -16 * gid(0) + -1 * lid(0) + ntgt) >= 0)
  {
    b[0] = tgt[ntgt * 0 + lid(0) + gid(0) * 16] + -1.0 * center_s0[lid(0) + gid(0) * 16];
    b[1] = tgt[ntgt * 1 + lid(0) + gid(0) * 16] + -1.0 * center_s1[lid(0) + gid(0) * 16];
    var_1 = b[0] * b[0];
    var_2 = b[1] * b[1];
    acc_isrc_outer_isrc_inner = 0.0;
    knl_0_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
    acc_isrc_outer_isrc_inner_0 = 0.0;
    knl_1_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
    expr_3 = var_2;
    expr_2 = var_1;
    expr_5 = expr_2 + expr_3;
    for (int isrc_outer = 0; isrc_outer <= (-1 + ((255 + nsrc) / 256)); ++isrc_outer)
      for (int isrc_inner = 0; isrc_inner <= 255; ++isrc_inner)
        if ((-1 + -1 * isrc_inner + -256 * isrc_outer + nsrc) >= 0)
        {
          a[0] = center_s0[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 0 + isrc_inner + isrc_outer * 256];
          a[1] = center_s1[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 1 + isrc_inner + isrc_outer * 256];
          var_6 = src_derivative_dir_s1[isrc_inner + isrc_outer * 256];
          expr_13 = a[0] * a[1];
          var_3 = a[0] * a[0];
          expr_0 = var_3;
          var_5 = src_derivative_dir_s0[isrc_inner + isrc_outer * 256];
          var_4 = a[1] * a[1];
          expr_1 = var_4;
          expr_4 = expr_1 + expr_0;
          expr_17 = expr_1 * 1.0 / expr_4;
          var_0 = expr_4 * expr_4;
          expr_15 = -2.0 * 1.0 / expr_4;
          expn1coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_6 + -2.0 * var_5 * (-1.0 + -1.0 * expr_0 * expr_15));
          expn0coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_5 + -2.0 * var_6 * (-1.0 + -1.0 * expr_1 * expr_15));
          expr_7 = var_0;
          expr_14 = 24.0 * 1.0 / expr_7;
          expr_6 = a[0] * b[0] + a[1] * b[1];
          expr_18 = 16.0 * expr_6 * 1.0 / expr_4;
          var = expr_6 * expr_6;
          expr_10 = 4.0 * a[1] * expr_6 * 1.0 / expr_4 + -3.0 * b[1];
          expr_16 = expr_0 * 1.0 / expr_4;
          expr_9 = 4.0 * a[0] * expr_6 * 1.0 / expr_4 + -3.0 * b[0];
          expr_11 = 4.0 * a[0] * a[1] * expr_6 * 1.0 / expr_4 + a[1] * expr_9 + a[0] * expr_10;
          expn1coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_6 + 4.0 * var_5 * (2.0 * a[0] * expr_9 + expr_6 * (-3.0 + 4.0 * expr_16)));
          expr_8 = var;
          expr_12 = 4.0 * a[1] * b[0] * expr_6 * 1.0 / expr_4 + 2.0 * a[0] * a[1] * expr_5 * 1.0 / expr_4 + 4.0 * a[0] * b[1] * expr_6 * 1.0 / expr_4 + -12.0 * a[0] * a[1] * expr_8 * 1.0 / expr_7 + -1.0 * b[0] * b[1];
          expr_19 = 4.0 * expr_8 * 1.0 / expr_4;
          expn1coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_6 + 24.0 * var_5 * (-3.0 * expr_2 + -1.0 * expr_3 + 4.0 * expr_16 * expr_5 + expr_19 + a[0] * b[0] * expr_18 + -1.0 * expr_0 * expr_14 * expr_8));
          expn0coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_5 + 4.0 * var_6 * (expr_6 * (-3.0 + 4.0 * expr_17) + 2.0 * a[1] * expr_10));
          expn0coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_5 + 24.0 * var_6 * (4.0 * expr_17 * expr_5 + -1.0 * expr_2 + a[1] * b[1] * expr_18 + -1.0 * expr_1 * expr_14 * expr_8 + expr_19 + -3.0 * expr_3));
          expn0_result = 0.5 * expn0coeff2 + 0.16666666666666666 * expn0coeff3 + 0.041666666666666664 * expn0coeff4;
          expn1_result = 0.041666666666666664 * expn1coeff4 + 0.16666666666666666 * expn1coeff3 + 0.5 * expn1coeff2;
          pair_result_1 = expn1_result * strength_0[isrc_inner + isrc_outer * 256];
          pair_result_0 = expn0_result * strength_0[isrc_inner + isrc_outer * 256];
          acc_isrc_outer_isrc_inner = acc_isrc_outer_isrc_inner + pair_result_0;
          acc_isrc_outer_isrc_inner_0 = acc_isrc_outer_isrc_inner_0 + pair_result_1;
        }
    result_1[lid(0) + gid(0) * 16] = knl_1_scaling * acc_isrc_outer_isrc_inner_0;
    result_0[lid(0) + gid(0) * 16] = knl_0_scaling * acc_isrc_outer_isrc_inner;
  }
}


0 Likes

can you give the working and notworking kernels as attached zip files. This format is not copy friendly.

0 Likes

Certainly-- here you go: http://tiker.net/tmp/amd-miscompiled.zip

0 Likes

Does your kernel compile fine with the latest drivers?

0 Likes