5 Replies Latest reply on Jun 17, 2014 4:14 AM by pinform

    Miscompiled CPU kernel

    inducer77

      Hi there,


      it appears that the 13.6 beta available from [1] miscompiles the kernel below. I should add that this does not happen entirely repeatably. Never mind.

      Thanks,

      Andreas


      [1] http://support.amd.com/us/gpudownload/linux/Pages/radeon_linux.aspx?type=2.4.1&product=2.4.1.3.42&lang=English

       

      #pragma OPENCL EXTENSION cl_khr_fp64: enable
      
      #define lid(N) ((int) get_local_id(N))
      #define gid(N) ((int) get_group_id(N))
      
      __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) layerpot(__global double const *restrict src, __global double const *restrict tgt, __global double const *restrict center_s0, __global double const *restrict center_s1, int const nsrc, int const ntgt, __global double const *restrict strength_0, __global double *restrict result_0, __global double *restrict result_1, __global double const *restrict src_derivative_dir_s0, __global double const *restrict src_derivative_dir_s1)
      {
        double var_2;
        double pair_result_0;
        double pair_result_1;
        double acc_isrc_outer_isrc_inner;
        double expr_16;
        double knl_1_scaling;
        double var_0;
        double acc_isrc_outer_isrc_inner_0;
        double expr_8;
        double expr_9;
        double var;
        double expr_7;
        double var_5;
        double expr_5;
        double expr_2;
        double expr_3;
        double expr_0;
        double expr_1;
        double expn0coeff4;
        double expn0coeff3;
        double expn0coeff2;
        double expr_6;
        double expr_18;
        double expr_19;
        double var_6;
        double expr_14;
        double expr_15;
        double expn1_result;
        double expr_17;
        double expr_10;
        double expr_11;
        double expr_12;
        double expr_13;
        double a[2];
        double var_4;
        double b[2];
        double expn0_result;
        double expn1coeff4;
        double var_3;
        double expn1coeff2;
        double expn1coeff3;
        double var_1;
        double knl_0_scaling;
        double expr_4;
      
        if ((-1 + -16 * gid(0) + -1 * lid(0) + ntgt) >= 0)
        {
          b[0] = tgt[ntgt * 0 + lid(0) + gid(0) * 16] + -1.0 * center_s0[lid(0) + gid(0) * 16];
          b[1] = tgt[ntgt * 1 + lid(0) + gid(0) * 16] + -1.0 * center_s1[lid(0) + gid(0) * 16];
          var_1 = b[0] * b[0];
          var_2 = b[1] * b[1];
          acc_isrc_outer_isrc_inner = 0.0;
          knl_0_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
          acc_isrc_outer_isrc_inner_0 = 0.0;
          knl_1_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
          expr_3 = var_2;
          expr_2 = var_1;
          expr_5 = expr_2 + expr_3;
          for (int isrc_outer = 0; isrc_outer <= (-1 + ((255 + nsrc) / 256)); ++isrc_outer)
            for (int isrc_inner = 0; isrc_inner <= 255; ++isrc_inner)
              if ((-1 + -1 * isrc_inner + -256 * isrc_outer + nsrc) >= 0)
              {
                a[0] = center_s0[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 0 + isrc_inner + isrc_outer * 256];
                a[1] = center_s1[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 1 + isrc_inner + isrc_outer * 256];
                var_6 = src_derivative_dir_s1[isrc_inner + isrc_outer * 256];
                expr_13 = a[0] * a[1];
                var_3 = a[0] * a[0];
                expr_0 = var_3;
                var_5 = src_derivative_dir_s0[isrc_inner + isrc_outer * 256];
                var_4 = a[1] * a[1];
                expr_1 = var_4;
                expr_4 = expr_1 + expr_0;
                expr_17 = expr_1 * 1.0 / expr_4;
                var_0 = expr_4 * expr_4;
                expr_15 = -2.0 * 1.0 / expr_4;
                expn1coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_6 + -2.0 * var_5 * (-1.0 + -1.0 * expr_0 * expr_15));
                expn0coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_5 + -2.0 * var_6 * (-1.0 + -1.0 * expr_1 * expr_15));
                expr_7 = var_0;
                expr_14 = 24.0 * 1.0 / expr_7;
                expr_6 = a[0] * b[0] + a[1] * b[1];
                expr_18 = 16.0 * expr_6 * 1.0 / expr_4;
                var = expr_6 * expr_6;
                expr_10 = 4.0 * a[1] * expr_6 * 1.0 / expr_4 + -3.0 * b[1];
                expr_16 = expr_0 * 1.0 / expr_4;
                expr_9 = 4.0 * a[0] * expr_6 * 1.0 / expr_4 + -3.0 * b[0];
                expr_11 = 4.0 * a[0] * a[1] * expr_6 * 1.0 / expr_4 + a[1] * expr_9 + a[0] * expr_10;
                expn1coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_6 + 4.0 * var_5 * (2.0 * a[0] * expr_9 + expr_6 * (-3.0 + 4.0 * expr_16)));
                expr_8 = var;
                expr_12 = 4.0 * a[1] * b[0] * expr_6 * 1.0 / expr_4 + 2.0 * a[0] * a[1] * expr_5 * 1.0 / expr_4 + 4.0 * a[0] * b[1] * expr_6 * 1.0 / expr_4 + -12.0 * a[0] * a[1] * expr_8 * 1.0 / expr_7 + -1.0 * b[0] * b[1];
                expr_19 = 4.0 * expr_8 * 1.0 / expr_4;
                expn1coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_6 + 24.0 * var_5 * (-3.0 * expr_2 + -1.0 * expr_3 + 4.0 * expr_16 * expr_5 + expr_19 + a[0] * b[0] * expr_18 + -1.0 * expr_0 * expr_14 * expr_8));
                expn0coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_5 + 4.0 * var_6 * (expr_6 * (-3.0 + 4.0 * expr_17) + 2.0 * a[1] * expr_10));
                expn0coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_5 + 24.0 * var_6 * (4.0 * expr_17 * expr_5 + -1.0 * expr_2 + a[1] * b[1] * expr_18 + -1.0 * expr_1 * expr_14 * expr_8 + expr_19 + -3.0 * expr_3));
                expn0_result = 0.5 * expn0coeff2 + 0.16666666666666666 * expn0coeff3 + 0.041666666666666664 * expn0coeff4;
                expn1_result = 0.041666666666666664 * expn1coeff4 + 0.16666666666666666 * expn1coeff3 + 0.5 * expn1coeff2;
                pair_result_1 = expn1_result * strength_0[isrc_inner + isrc_outer * 256];
                pair_result_0 = expn0_result * strength_0[isrc_inner + isrc_outer * 256];
                acc_isrc_outer_isrc_inner = acc_isrc_outer_isrc_inner + pair_result_0;
                acc_isrc_outer_isrc_inner_0 = acc_isrc_outer_isrc_inner_0 + pair_result_1;
              }
          result_1[lid(0) + gid(0) * 16] = knl_1_scaling * acc_isrc_outer_isrc_inner_0;
          result_0[lid(0) + gid(0) * 16] = knl_0_scaling * acc_isrc_outer_isrc_inner;
        }
      }
      
      
      
        • Re: Miscompiled CPU kernel
          himanshu.gautam

          inducer77 wrote:

           

          Hi there,


          I should add that this does not happen entirely repeatably.

           

          Is the issue not reproducible? Can you confirm? Also share details about the GPU, OS, APP SDK,and Driver.

          I can compile the kernel for 13.6 driver, HD 7870, Win7-64, APP SDK 2.8.

            • Re: Re: Miscompiled CPU kernel
              inducer77

              It appears reproducible. What made it appear non-reproducible is that the code that generates that kernel somewhat randomly generates one of two (mathematically equivalent) versions. One breaks, the other works.

              • CPU: Intel i7-2620M
              • Debian Linux, kernel 3.9, libc 2.17.
              • No APP SDK installed, just the OpenCL bits from the driver package.

              The kernel compiles fine, but it computes the wrong result. FWIW, this is the equivalent kernel that compiles correctly

              The kernel compiles fine, but it computes the wrong result. FWIW, this is the equivalent kernel that compiles correctly:#pragma OPENCL EXTENSION cl_khr_fp64: enable
              
              #define lid(N) ((int) get_local_id(N))
              #define gid(N) ((int) get_group_id(N))
              
              __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) layerpot(__global double const *restrict src, __global double const *restrict tgt, __global double const *restrict center_s0, __global double const *restrict center_s1, int const nsrc, int const ntgt, __global double const *restrict strength_0, __global double *restrict result_0, __global double *restrict result_1, __global double const *restrict src_derivative_dir_s0, __global double const *restrict src_derivative_dir_s1)
              {
                double var_2;
                double pair_result_0;
                double pair_result_1;
                double acc_isrc_outer_isrc_inner;
                double expr_16;
                double knl_1_scaling;
                double var_0;
                double acc_isrc_outer_isrc_inner_0;
                double expr_8;
                double expr_9;
                double var;
                double expr_7;
                double var_5;
                double expr_5;
                double expr_2;
                double expr_3;
                double expr_0;
                double expr_1;
                double expn0coeff4;
                double expn0coeff3;
                double expn0coeff2;
                double expr_6;
                double expr_18;
                double expr_19;
                double var_6;
                double expr_14;
                double expr_15;
                double expn1_result;
                double expr_17;
                double expr_10;
                double expr_11;
                double expr_12;
                double expr_13;
                double a[2];
                double var_4;
                double b[2];
                double expn0_result;
                double expn1coeff4;
                double var_3;
                double expn1coeff2;
                double expn1coeff3;
                double var_1;
                double knl_0_scaling;
                double expr_4;
              
                if ((-1 + -16 * gid(0) + -1 * lid(0) + ntgt) >= 0)
                {
                  b[0] = tgt[ntgt * 0 + lid(0) + gid(0) * 16] + -1.0 * center_s0[lid(0) + gid(0) * 16];
                  b[1] = tgt[ntgt * 1 + lid(0) + gid(0) * 16] + -1.0 * center_s1[lid(0) + gid(0) * 16];
                  var_1 = b[0] * b[0];
                  var_2 = b[1] * b[1];
                  acc_isrc_outer_isrc_inner = 0.0;
                  knl_0_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
                  acc_isrc_outer_isrc_inner_0 = 0.0;
                  knl_1_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
                  expr_3 = var_2;
                  expr_2 = var_1;
                  expr_5 = expr_2 + expr_3;
                  for (int isrc_outer = 0; isrc_outer <= (-1 + ((255 + nsrc) / 256)); ++isrc_outer)
                    for (int isrc_inner = 0; isrc_inner <= 255; ++isrc_inner)
                      if ((-1 + -1 * isrc_inner + -256 * isrc_outer + nsrc) >= 0)
                      {
                        a[0] = center_s0[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 0 + isrc_inner + isrc_outer * 256];
                        a[1] = center_s1[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 1 + isrc_inner + isrc_outer * 256];
                        var_6 = src_derivative_dir_s1[isrc_inner + isrc_outer * 256];
                        expr_13 = a[0] * a[1];
                        var_3 = a[0] * a[0];
                        expr_0 = var_3;
                        var_5 = src_derivative_dir_s0[isrc_inner + isrc_outer * 256];
                        var_4 = a[1] * a[1];
                        expr_1 = var_4;
                        expr_4 = expr_1 + expr_0;
                        expr_17 = expr_1 * 1.0 / expr_4;
                        var_0 = expr_4 * expr_4;
                        expr_15 = -2.0 * 1.0 / expr_4;
                        expn1coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_6 + -2.0 * var_5 * (-1.0 + -1.0 * expr_0 * expr_15));
                        expn0coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_5 + -2.0 * var_6 * (-1.0 + -1.0 * expr_1 * expr_15));
                        expr_7 = var_0;
                        expr_14 = 24.0 * 1.0 / expr_7;
                        expr_6 = a[0] * b[0] + a[1] * b[1];
                        expr_18 = 16.0 * expr_6 * 1.0 / expr_4;
                        var = expr_6 * expr_6;
                        expr_10 = 4.0 * a[1] * expr_6 * 1.0 / expr_4 + -3.0 * b[1];
                        expr_16 = expr_0 * 1.0 / expr_4;
                        expr_9 = 4.0 * a[0] * expr_6 * 1.0 / expr_4 + -3.0 * b[0];
                        expr_11 = 4.0 * a[0] * a[1] * expr_6 * 1.0 / expr_4 + a[1] * expr_9 + a[0] * expr_10;
                        expn1coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_6 + 4.0 * var_5 * (2.0 * a[0] * expr_9 + expr_6 * (-3.0 + 4.0 * expr_16)));
                        expr_8 = var;
                        expr_12 = 4.0 * a[1] * b[0] * expr_6 * 1.0 / expr_4 + 2.0 * a[0] * a[1] * expr_5 * 1.0 / expr_4 + 4.0 * a[0] * b[1] * expr_6 * 1.0 / expr_4 + -12.0 * a[0] * a[1] * expr_8 * 1.0 / expr_7 + -1.0 * b[0] * b[1];
                        expr_19 = 4.0 * expr_8 * 1.0 / expr_4;
                        expn1coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_6 + 24.0 * var_5 * (-3.0 * expr_2 + -1.0 * expr_3 + 4.0 * expr_16 * expr_5 + expr_19 + a[0] * b[0] * expr_18 + -1.0 * expr_0 * expr_14 * expr_8));
                        expn0coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_5 + 4.0 * var_6 * (expr_6 * (-3.0 + 4.0 * expr_17) + 2.0 * a[1] * expr_10));
                        expn0coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_5 + 24.0 * var_6 * (4.0 * expr_17 * expr_5 + -1.0 * expr_2 + a[1] * b[1] * expr_18 + -1.0 * expr_1 * expr_14 * expr_8 + expr_19 + -3.0 * expr_3));
                        expn0_result = 0.5 * expn0coeff2 + 0.16666666666666666 * expn0coeff3 + 0.041666666666666664 * expn0coeff4;
                        expn1_result = 0.041666666666666664 * expn1coeff4 + 0.16666666666666666 * expn1coeff3 + 0.5 * expn1coeff2;
                        pair_result_1 = expn1_result * strength_0[isrc_inner + isrc_outer * 256];
                        pair_result_0 = expn0_result * strength_0[isrc_inner + isrc_outer * 256];
                        acc_isrc_outer_isrc_inner = acc_isrc_outer_isrc_inner + pair_result_0;
                        acc_isrc_outer_isrc_inner_0 = acc_isrc_outer_isrc_inner_0 + pair_result_1;
                      }
                  result_1[lid(0) + gid(0) * 16] = knl_1_scaling * acc_isrc_outer_isrc_inner_0;
                  result_0[lid(0) + gid(0) * 16] = knl_0_scaling * acc_isrc_outer_isrc_inner;
                }
              }