Hi there,
it appears that the 13.6 beta available from [1] miscompiles the kernel below. I should add that this does not happen entirely repeatably. Never mind.
Thanks,
Andreas
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) layerpot(__global double const *restrict src, __global double const *restrict tgt, __global double const *restrict center_s0, __global double const *restrict center_s1, int const nsrc, int const ntgt, __global double const *restrict strength_0, __global double *restrict result_0, __global double *restrict result_1, __global double const *restrict src_derivative_dir_s0, __global double const *restrict src_derivative_dir_s1)
{
double var_2;
double pair_result_0;
double pair_result_1;
double acc_isrc_outer_isrc_inner;
double expr_16;
double knl_1_scaling;
double var_0;
double acc_isrc_outer_isrc_inner_0;
double expr_8;
double expr_9;
double var;
double expr_7;
double var_5;
double expr_5;
double expr_2;
double expr_3;
double expr_0;
double expr_1;
double expn0coeff4;
double expn0coeff3;
double expn0coeff2;
double expr_6;
double expr_18;
double expr_19;
double var_6;
double expr_14;
double expr_15;
double expn1_result;
double expr_17;
double expr_10;
double expr_11;
double expr_12;
double expr_13;
double a[2];
double var_4;
double b[2];
double expn0_result;
double expn1coeff4;
double var_3;
double expn1coeff2;
double expn1coeff3;
double var_1;
double knl_0_scaling;
double expr_4;
if ((-1 + -16 * gid(0) + -1 * lid(0) + ntgt) >= 0)
{
b[0] = tgt[ntgt * 0 + lid(0) + gid(0) * 16] + -1.0 * center_s0[lid(0) + gid(0) * 16];
b[1] = tgt[ntgt * 1 + lid(0) + gid(0) * 16] + -1.0 * center_s1[lid(0) + gid(0) * 16];
var_1 = b[0] * b[0];
var_2 = b[1] * b[1];
acc_isrc_outer_isrc_inner = 0.0;
knl_0_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
acc_isrc_outer_isrc_inner_0 = 0.0;
knl_1_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
expr_3 = var_2;
expr_2 = var_1;
expr_5 = expr_2 + expr_3;
for (int isrc_outer = 0; isrc_outer <= (-1 + ((255 + nsrc) / 256)); ++isrc_outer)
for (int isrc_inner = 0; isrc_inner <= 255; ++isrc_inner)
if ((-1 + -1 * isrc_inner + -256 * isrc_outer + nsrc) >= 0)
{
a[0] = center_s0[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 0 + isrc_inner + isrc_outer * 256];
a[1] = center_s1[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 1 + isrc_inner + isrc_outer * 256];
var_6 = src_derivative_dir_s1[isrc_inner + isrc_outer * 256];
expr_13 = a[0] * a[1];
var_3 = a[0] * a[0];
expr_0 = var_3;
var_5 = src_derivative_dir_s0[isrc_inner + isrc_outer * 256];
var_4 = a[1] * a[1];
expr_1 = var_4;
expr_4 = expr_1 + expr_0;
expr_17 = expr_1 * 1.0 / expr_4;
var_0 = expr_4 * expr_4;
expr_15 = -2.0 * 1.0 / expr_4;
expn1coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_6 + -2.0 * var_5 * (-1.0 + -1.0 * expr_0 * expr_15));
expn0coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_5 + -2.0 * var_6 * (-1.0 + -1.0 * expr_1 * expr_15));
expr_7 = var_0;
expr_14 = 24.0 * 1.0 / expr_7;
expr_6 = a[0] * b[0] + a[1] * b[1];
expr_18 = 16.0 * expr_6 * 1.0 / expr_4;
var = expr_6 * expr_6;
expr_10 = 4.0 * a[1] * expr_6 * 1.0 / expr_4 + -3.0 * b[1];
expr_16 = expr_0 * 1.0 / expr_4;
expr_9 = 4.0 * a[0] * expr_6 * 1.0 / expr_4 + -3.0 * b[0];
expr_11 = 4.0 * a[0] * a[1] * expr_6 * 1.0 / expr_4 + a[1] * expr_9 + a[0] * expr_10;
expn1coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_6 + 4.0 * var_5 * (2.0 * a[0] * expr_9 + expr_6 * (-3.0 + 4.0 * expr_16)));
expr_8 = var;
expr_12 = 4.0 * a[1] * b[0] * expr_6 * 1.0 / expr_4 + 2.0 * a[0] * a[1] * expr_5 * 1.0 / expr_4 + 4.0 * a[0] * b[1] * expr_6 * 1.0 / expr_4 + -12.0 * a[0] * a[1] * expr_8 * 1.0 / expr_7 + -1.0 * b[0] * b[1];
expr_19 = 4.0 * expr_8 * 1.0 / expr_4;
expn1coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_6 + 24.0 * var_5 * (-3.0 * expr_2 + -1.0 * expr_3 + 4.0 * expr_16 * expr_5 + expr_19 + a[0] * b[0] * expr_18 + -1.0 * expr_0 * expr_14 * expr_8));
expn0coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_5 + 4.0 * var_6 * (expr_6 * (-3.0 + 4.0 * expr_17) + 2.0 * a[1] * expr_10));
expn0coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_5 + 24.0 * var_6 * (4.0 * expr_17 * expr_5 + -1.0 * expr_2 + a[1] * b[1] * expr_18 + -1.0 * expr_1 * expr_14 * expr_8 + expr_19 + -3.0 * expr_3));
expn0_result = 0.5 * expn0coeff2 + 0.16666666666666666 * expn0coeff3 + 0.041666666666666664 * expn0coeff4;
expn1_result = 0.041666666666666664 * expn1coeff4 + 0.16666666666666666 * expn1coeff3 + 0.5 * expn1coeff2;
pair_result_1 = expn1_result * strength_0[isrc_inner + isrc_outer * 256];
pair_result_0 = expn0_result * strength_0[isrc_inner + isrc_outer * 256];
acc_isrc_outer_isrc_inner = acc_isrc_outer_isrc_inner + pair_result_0;
acc_isrc_outer_isrc_inner_0 = acc_isrc_outer_isrc_inner_0 + pair_result_1;
}
result_1[lid(0) + gid(0) * 16] = knl_1_scaling * acc_isrc_outer_isrc_inner_0;
result_0[lid(0) + gid(0) * 16] = knl_0_scaling * acc_isrc_outer_isrc_inner;
}
}
inducer77 wrote:
Hi there,
I should add that this does not happen entirely repeatably.
Is the issue not reproducible? Can you confirm? Also share details about the GPU, OS, APP SDK,and Driver.
I can compile the kernel for 13.6 driver, HD 7870, Win7-64, APP SDK 2.8.
It appears reproducible. What made it appear non-reproducible is that the code that generates that kernel somewhat randomly generates one of two (mathematically equivalent) versions. One breaks, the other works.
The kernel compiles fine, but it computes the wrong result. FWIW, this is the equivalent kernel that compiles correctly
The kernel compiles fine, but it computes the wrong result. FWIW, this is the equivalent kernel that compiles correctly:#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
__kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) layerpot(__global double const *restrict src, __global double const *restrict tgt, __global double const *restrict center_s0, __global double const *restrict center_s1, int const nsrc, int const ntgt, __global double const *restrict strength_0, __global double *restrict result_0, __global double *restrict result_1, __global double const *restrict src_derivative_dir_s0, __global double const *restrict src_derivative_dir_s1)
{
double var_2;
double pair_result_0;
double pair_result_1;
double acc_isrc_outer_isrc_inner;
double expr_16;
double knl_1_scaling;
double var_0;
double acc_isrc_outer_isrc_inner_0;
double expr_8;
double expr_9;
double var;
double expr_7;
double var_5;
double expr_5;
double expr_2;
double expr_3;
double expr_0;
double expr_1;
double expn0coeff4;
double expn0coeff3;
double expn0coeff2;
double expr_6;
double expr_18;
double expr_19;
double var_6;
double expr_14;
double expr_15;
double expn1_result;
double expr_17;
double expr_10;
double expr_11;
double expr_12;
double expr_13;
double a[2];
double var_4;
double b[2];
double expn0_result;
double expn1coeff4;
double var_3;
double expn1coeff2;
double expn1coeff3;
double var_1;
double knl_0_scaling;
double expr_4;
if ((-1 + -16 * gid(0) + -1 * lid(0) + ntgt) >= 0)
{
b[0] = tgt[ntgt * 0 + lid(0) + gid(0) * 16] + -1.0 * center_s0[lid(0) + gid(0) * 16];
b[1] = tgt[ntgt * 1 + lid(0) + gid(0) * 16] + -1.0 * center_s1[lid(0) + gid(0) * 16];
var_1 = b[0] * b[0];
var_2 = b[1] * b[1];
acc_isrc_outer_isrc_inner = 0.0;
knl_0_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
acc_isrc_outer_isrc_inner_0 = 0.0;
knl_1_scaling = -1.0 / 2.0 * pow(3.141592653589793, -1.0);
expr_3 = var_2;
expr_2 = var_1;
expr_5 = expr_2 + expr_3;
for (int isrc_outer = 0; isrc_outer <= (-1 + ((255 + nsrc) / 256)); ++isrc_outer)
for (int isrc_inner = 0; isrc_inner <= 255; ++isrc_inner)
if ((-1 + -1 * isrc_inner + -256 * isrc_outer + nsrc) >= 0)
{
a[0] = center_s0[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 0 + isrc_inner + isrc_outer * 256];
a[1] = center_s1[lid(0) + gid(0) * 16] + -1.0 * src[nsrc * 1 + isrc_inner + isrc_outer * 256];
var_6 = src_derivative_dir_s1[isrc_inner + isrc_outer * 256];
expr_13 = a[0] * a[1];
var_3 = a[0] * a[0];
expr_0 = var_3;
var_5 = src_derivative_dir_s0[isrc_inner + isrc_outer * 256];
var_4 = a[1] * a[1];
expr_1 = var_4;
expr_4 = expr_1 + expr_0;
expr_17 = expr_1 * 1.0 / expr_4;
var_0 = expr_4 * expr_4;
expr_15 = -2.0 * 1.0 / expr_4;
expn1coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_6 + -2.0 * var_5 * (-1.0 + -1.0 * expr_0 * expr_15));
expn0coeff2 = 1.0 / expr_4 * (2.0 * expr_13 * expr_15 * var_5 + -2.0 * var_6 * (-1.0 + -1.0 * expr_1 * expr_15));
expr_7 = var_0;
expr_14 = 24.0 * 1.0 / expr_7;
expr_6 = a[0] * b[0] + a[1] * b[1];
expr_18 = 16.0 * expr_6 * 1.0 / expr_4;
var = expr_6 * expr_6;
expr_10 = 4.0 * a[1] * expr_6 * 1.0 / expr_4 + -3.0 * b[1];
expr_16 = expr_0 * 1.0 / expr_4;
expr_9 = 4.0 * a[0] * expr_6 * 1.0 / expr_4 + -3.0 * b[0];
expr_11 = 4.0 * a[0] * a[1] * expr_6 * 1.0 / expr_4 + a[1] * expr_9 + a[0] * expr_10;
expn1coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_6 + 4.0 * var_5 * (2.0 * a[0] * expr_9 + expr_6 * (-3.0 + 4.0 * expr_16)));
expr_8 = var;
expr_12 = 4.0 * a[1] * b[0] * expr_6 * 1.0 / expr_4 + 2.0 * a[0] * a[1] * expr_5 * 1.0 / expr_4 + 4.0 * a[0] * b[1] * expr_6 * 1.0 / expr_4 + -12.0 * a[0] * a[1] * expr_8 * 1.0 / expr_7 + -1.0 * b[0] * b[1];
expr_19 = 4.0 * expr_8 * 1.0 / expr_4;
expn1coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_6 + 24.0 * var_5 * (-3.0 * expr_2 + -1.0 * expr_3 + 4.0 * expr_16 * expr_5 + expr_19 + a[0] * b[0] * expr_18 + -1.0 * expr_0 * expr_14 * expr_8));
expn0coeff3 = 1.0 / expr_7 * (4.0 * expr_11 * var_5 + 4.0 * var_6 * (expr_6 * (-3.0 + 4.0 * expr_17) + 2.0 * a[1] * expr_10));
expn0coeff4 = 1.0 / expr_7 * (48.0 * expr_12 * var_5 + 24.0 * var_6 * (4.0 * expr_17 * expr_5 + -1.0 * expr_2 + a[1] * b[1] * expr_18 + -1.0 * expr_1 * expr_14 * expr_8 + expr_19 + -3.0 * expr_3));
expn0_result = 0.5 * expn0coeff2 + 0.16666666666666666 * expn0coeff3 + 0.041666666666666664 * expn0coeff4;
expn1_result = 0.041666666666666664 * expn1coeff4 + 0.16666666666666666 * expn1coeff3 + 0.5 * expn1coeff2;
pair_result_1 = expn1_result * strength_0[isrc_inner + isrc_outer * 256];
pair_result_0 = expn0_result * strength_0[isrc_inner + isrc_outer * 256];
acc_isrc_outer_isrc_inner = acc_isrc_outer_isrc_inner + pair_result_0;
acc_isrc_outer_isrc_inner_0 = acc_isrc_outer_isrc_inner_0 + pair_result_1;
}
result_1[lid(0) + gid(0) * 16] = knl_1_scaling * acc_isrc_outer_isrc_inner_0;
result_0[lid(0) + gid(0) * 16] = knl_0_scaling * acc_isrc_outer_isrc_inner;
}
}
can you give the working and notworking kernels as attached zip files. This format is not copy friendly.
Certainly-- here you go: http://tiker.net/tmp/amd-miscompiled.zip
Does your kernel compile fine with the latest drivers?