Simplified example.br file:
/*
Should print:
----------------------------
x0 y0 : 0.00 0.10 0.20 0.30
x1 y0 : 0.00 0.00 0.00 0.00
x2 y0 : 0.00 0.00 0.00 0.00
x3 y0 : 0.00 0.00 0.00 0.00
x0 y1 : 0.00 0.10 0.20 0.30
x1 y1 : 0.00 0.00 0.00 0.00
x2 y1 : 0.00 0.00 0.00 0.00
x3 y1 : 0.00 0.00 0.00 0.00
x0 y2 : 0.00 0.10 0.20 0.30
x1 y2 : 0.00 0.00 0.00 0.00
x2 y2 : 0.00 0.00 0.00 0.00
x3 y2 : 0.00 0.00 0.00 0.00
x0 y3 : 0.00 0.10 0.20 0.30
x1 y3 : 0.00 0.00 0.00 0.00
x2 y3 : 0.00 0.00 0.00 0.00
x3 y3 : 0.00 0.00 0.00 0.00
But prints (the second value of every x0 row is wrong):
----------------------------
x0 y0 : 0.00 -1.00 0.20 0.30
x1 y0 : 0.00 0.00 0.00 0.00
x2 y0 : 0.00 0.00 0.00 0.00
x3 y0 : 0.00 0.00 0.00 0.00
x0 y1 : 0.00 0.00 0.20 0.30
x1 y1 : 0.00 0.00 0.00 0.00
x2 y1 : 0.00 0.00 0.00 0.00
x3 y1 : 0.00 0.00 0.00 0.00
x0 y2 : 0.00 1.00 0.20 0.30
x1 y2 : 0.00 0.00 0.00 0.00
x2 y2 : 0.00 0.00 0.00 0.00
x3 y2 : 0.00 0.00 0.00 0.00
x0 y3 : 0.00 2.00 0.20 0.30
x1 y3 : 0.00 0.00 0.00 0.00
x2 y3 : 0.00 0.00 0.00 0.00
x3 y3 : 0.00 0.00 0.00 0.00
*/
#include <stdio.h>
#include <stdlib.h>
// Helper kernel of the reproduction case: stores the product of f0 and an
// all-zero float4 into f.
//   f0 - input value
//   f  - output, receives f0 * (0, 0, 0, 0)
// Do not simplify this kernel: the wrong output is only observed with the
// multiplication form below (see Option 1 / Option 2).
kernel void GPU_flujoDiff(float4 f0<>, out float4 f<> ) {
float4 t1 = float4(0.0f, 0.0f, 0.0f, 0.0f);
// Option 1 (active): the product with the zero vector is expected to be
// 0.0f in every component. With this form the bug shows up.
f = f0 * t1;
// Option 2: assigning the zero vector directly hides the bug.
// f = t1;
}
// Kernel under test in this reproduction case.
//   var0      - gather array, read at (pos.y-1, pos.x-1), (pos.y-1, pos.x)
//               and (pos.y, pos.x-1)
//   flC       - output, zeroed first and finally set to -flC1 - flC2
//   flR       - output, zeroed first; receives var0C when pos.x == 0
//               (this is the stream main() prints)
//   flD       - output, zeroed first; written through GPU_flujoDiff when
//               pos.y == 0
//   datos_npx - value compared against pos.x to select the flC1 branch
//   datos_npy - value compared against pos.y to select the flC2 branch
// The statements are intentionally left exactly as reported: small edits
// (see the comments below) make the wrong result disappear.
kernel void GPU_step1(float4 var0[ ][ ],
out float4 flC<>, out float4 flR<>, out float4 flD<>,
int datos_npx, int datos_npy) {
// Position of the element this kernel instance is computing.
int2 pos = instance().xy;
// Optional: every var0x local is initialized from the gather array.
// NOTE(review): for pos.x == 0 or pos.y == 0 these indices are -1; what such
// a gather returns depends on the Brook runtime -- confirm.
float4 var0C = var0[ pos.y - 1][ pos.x - 1];
float4 var0R = var0[ pos.y - 1][ pos.x ];
float4 var0D = var0[ pos.y ][ pos.x - 1];
// NOTE(review): flC1 and flC2 are assigned only inside the else-if branches
// below, yet both are read unconditionally in the last statement.
float4 flC1;
float4 flC2;
// Optional: every output is set to 0.0f before the conditional writes.
flC = float4(0.0f, 0.0f, 0.0f, 0.0f);
flR = float4(0.0f, 0.0f, 0.0f, 0.0f);
flD = float4(0.0f, 0.0f, 0.0f, 0.0f);
// HERE IS THE BUG: when pos.x == 0, var0C.y should become 0.1f, but the
// printed flR shows a different value in that component.
// (If this line is pasted twice, the result is correct.)
if(pos.x == 0) var0C = float4(var0R.x, 0.1f, 0.2f, 0.3f);
// Assigning only the .y component instead hides the bug:
//if(pos.x == 0) var0C.y = 0.1f;
if(pos.x == 0) {
flR = var0C; // Here var0C should be float4(var0R.x, 0.1f, 0.2f, 0.3f);
} else if(pos.x == datos_npx) {
GPU_flujoDiff(var0R, flC1);
}
if(pos.y == 0) {
GPU_flujoDiff(var0D, flD);
} else if(pos.y == datos_npy) {
GPU_flujoDiff(var0D, flC2);
}
// Combine the two partial results (see the note on flC1 / flC2 above).
flC = - flC1 - flC2;
}
/*
 * Host driver of the reproduction case.
 *
 * Allocates a zero-filled host buffer of ysize2 * xsize2 float4 values,
 * uploads it into the var0 stream, runs GPU_step1 over the
 * ysize2 x xsize2 output streams, downloads flR and prints one line per
 * element ("x<i> y<j> : <four components>").
 *
 * Returns 0 on success, 1 if the host buffer cannot be allocated.
 */
int main(int argc, char** argv) {
    const int xsize1 = 3;   /* extent of the var0 gather array */
    const int ysize1 = 3;
    const int xsize2 = 4;   /* extent of the output streams */
    const int ysize2 = 4;
    int i, j, pos;
    /* Cast kept from the original; it is required if the host code is
       compiled as C++. */
    float *output = (float*)malloc(ysize2 * xsize2 * sizeof(float4));

    if (output == NULL) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }

    /* Four floats per float4 element. */
    for (pos = 0; pos < ysize2 * xsize2 * 4; pos++) {
        output[pos] = 0.0f;
    }

    {
        float4 var0<ysize1, xsize1>;
        float4 flC <ysize2, xsize2>;
        float4 flR <ysize2, xsize2>;
        float4 flD <ysize2, xsize2>;
        int datos_npx = xsize1;
        int datos_npy = ysize1;

        /* 1. Set the input stream to 0.0f (the buffer is larger than var0,
              so only its leading part is consumed). */
        streamRead(var0, output);

        /* 2. Call the problematic kernel. */
        GPU_step1(var0,
                  flC, flR, flD,
                  datos_npx, datos_npy);

        /* 3. Copy output stream flR back to host memory. */
        streamWrite(flR, output);
    }

    /* 4. Print flR row by row. The inner loop walks the x extent; the
          original bounded it by ysize2, which only worked because both
          extents are equal. */
    for (j = 0, pos = 0; j < ysize2; j++) {
        for (i = 0; i < xsize2; i++, pos += 4) {
            printf("x%i y%i : ", i, j);
            printf("%4.2f %4.2f %4.2f %4.2f\n",
                   output[pos], output[pos + 1], output[pos + 2], output[pos + 3]);
        }
    }

    free(output);
    return 0;
}
----------------------------
WinXP 64, MSVC 2005, Brook 1.3, Radeon 4850