AnsweredAssumed Answered

Miscompiled CPU kernel

Question asked by inducer77 on Jun 11, 2013
Latest reply on Jun 17, 2014 by pinform

Hi there,


it appears that the 13.6 beta available from [1] miscompiles the kernel below. I should add that this does not happen entirely repeatably. Never mind.

Thanks,

Andreas


[1] http://support.amd.com/us/gpudownload/linux/Pages/radeon_linux.aspx?type=2.4.1&product=2.4.1.3.42&lang=English

 

#pragma OPENCL EXTENSION cl_khr_fp64: enable

#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) layerpot(__global double const *restrict src, __global double const *restrict tgt, __global double const *restrict center_s0, __global double const *restrict center_s1, int const nsrc, int const ntgt, __global double const *restrict strength_0, __global double *restrict result_0, __global double *restrict result_1, __global double const *restrict src_derivative_dir_s0, __global double const *restrict src_derivative_dir_s1)
{
  double var_2;
  double pair_result_0;
  double pair_result_1;
  double acc_isrc_outer_isrc_inner;
  double expr_16;
  double knl_1_scaling;
  double var_0;
  double acc_isrc_outer_isrc_inner_0;
  double expr_8;
  double expr_9;
  double var;
  double expr_7;
  double var_5;
  double expr_5;
  double expr_2;
  double expr_3;
  double expr_0;
  double expr_1;
  double expn0coeff4;
  double expn0coeff3;
  double expn0coeff2;
  double expr_6;
  double expr_18;
  double expr_19;
  double var_6;
  double expr_14;
  double expr_15;
  double expn1_result;
  double expr_17;
  double expr_10;
  double expr_11;
  double expr_12;
  double expr_13;
  double a[2];
  double var_4;
  double b[2];
  double expn0_result;
  double expn1coeff4;
  double var_3;
  double expn1coeff2;
  double expn1coeff3;
  double var_1;
  double knl_0_scaling;
  double expr_4;

  if ((-1 + -16 * gid(0) + -1 * lid(0) + ntgt) >= 0)
  {
    b[0] = tgt[ntgt * 0 + lid(0) + gid(0) * 16] + -1.0 * center_s0[lid(0) + gid(0) * 16];
    b[1] = tgt[ntgt * 1 + lid(0) + gid(0) * 16] + -1.0 * center_s1[lid(0) + gid(0) * 16];
    var_1 = b[0] * b[0];
    var_2 = b[1] * b[1];
    acc_isrc_outer_isrc_inner = 0.0;
    knl_0_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
    acc_isrc_outer_isrc_inner_0 = 0.0;
    knl_1_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
    expr_3 = var_2;
    expr_2 = var_1;
    expr_5 = expr_2 + expr_3;
    for (int isrc_outer = 0; isrc_outer <= (-1 + ((255 + nsrc) / 256)); ++isrc_outer)
      for (int isrc_inner = 0; isrc_inner <= 255; ++isrc_inner)
        if ((-1 + -1 * isrc_inner + -256 * isrc_outer + nsrc) >= 0)
        {
          a[0] = center_s0[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 0 + isrc_inner + isrc_outer * 256];
          a[1] = center_s1[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 1 + isrc_inner + isrc_outer * 256];
          var_6 = src_derivative_dir_s1[isrc_inner + isrc_outer * 256];
          expr_13 = a[0] * a[1];
          var_3 = a[0] * a[0];
          expr_0 = var_3;
          var_5 = src_derivative_dir_s0[isrc_inner + isrc_outer * 256];
          var_4 = a[1] * a[1];
          expr_1 = var_4;
          expr_4 = expr_1 + expr_0;
          expr_17 = expr_1 * 1.0 / expr_4;
          var_0 = expr_4 * expr_4;
          expr_15 = -2.0 * 1.0 / expr_4;
          expn1coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_6 + -2.0 * var_5 * (-1.0 + -1.0 * expr_0 * expr_15));
          expn0coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_5 + -2.0 * var_6 * (-1.0 + -1.0 * expr_1 * expr_15));
          expr_7 = var_0;
          expr_14 = 24.0 * 1.0 / expr_7;
          expr_6 = a[0] * b[0] + a[1] * b[1];
          expr_18 = 16.0 * expr_6 * 1.0 / expr_4;
          var = expr_6 * expr_6;
          expr_10 = 4.0 * a[1] * expr_6 * 1.0 / expr_4 + -3.0 * b[1];
          expr_16 = expr_0 * 1.0 / expr_4;
          expr_9 = 4.0 * a[0] * expr_6 * 1.0 / expr_4 + -3.0 * b[0];
          expr_11 = 4.0 * a[0] * a[1] * expr_6 * 1.0 / expr_4 + a[1] * expr_9 + a[0] * expr_10;
          expn1coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_6 + 4.0 * var_5 * (2.0 * a[0] * expr_9 + expr_6 * (-3.0 + 4.0 * expr_16)));
          expr_8 = var;
          expr_12 = 4.0 * a[1] * b[0] * expr_6 * 1.0 / expr_4 + 2.0 * a[0] * a[1] * expr_5 * 1.0 / expr_4 + 4.0 * a[0] * b[1] * expr_6 * 1.0 / expr_4 + -12.0 * a[0] * a[1] * expr_8 * 1.0 / expr_7 + -1.0 * b[0] * b[1];
          expr_19 = 4.0 * expr_8 * 1.0 / expr_4;
          expn1coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_6 + 24.0 * var_5 * (-3.0 * expr_2 + -1.0 * expr_3 + 4.0 * expr_16 * expr_5 + expr_19 + a[0] * b[0] * expr_18 + -1.0 * expr_0 * expr_14 * expr_8));
          expn0coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_5 + 4.0 * var_6 * (expr_6 * (-3.0 + 4.0 * expr_17) + 2.0 * a[1] * expr_10));
          expn0coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_5 + 24.0 * var_6 * (4.0 * expr_17 * expr_5 + -1.0 * expr_2 + a[1] * b[1] * expr_18 + -1.0 * expr_1 * expr_14 * expr_8 + expr_19 + -3.0 * expr_3));
          expn0_result = 0.5 * expn0coeff2 + 0.16666666666666666 * expn0coeff3 + 0.041666666666666664 * expn0coeff4;
          expn1_result = 0.041666666666666664 * expn1coeff4 + 0.16666666666666666 * expn1coeff3 + 0.5 * expn1coeff2;
          pair_result_1 = expn1_result * strength_0[isrc_inner + isrc_outer * 256];
          pair_result_0 = expn0_result * strength_0[isrc_inner + isrc_outer * 256];
          acc_isrc_outer_isrc_inner = acc_isrc_outer_isrc_inner + pair_result_0;
          acc_isrc_outer_isrc_inner_0 = acc_isrc_outer_isrc_inner_0 + pair_result_1;
        }
    result_1[lid(0) + gid(0) * 16] = knl_1_scaling * acc_isrc_outer_isrc_inner_0;
    result_0[lid(0) + gid(0) * 16] = knl_0_scaling * acc_isrc_outer_isrc_inner;
  }
}


Outcomes