Archives Discussions

Ceq · ‎02-28-2009

Simplified example.br file:

/*
Should print:
----------------------------
x0 y0 : 0.00 0.10 0.20 0.30
x1 y0 : 0.00 0.00 0.00 0.00
x2 y0 : 0.00 0.00 0.00 0.00
x3 y0 : 0.00 0.00 0.00 0.00
x0 y1 : 0.00 0.10 0.20 0.30
x1 y1 : 0.00 0.00 0.00 0.00
x2 y1 : 0.00 0.00 0.00 0.00
x3 y1 : 0.00 0.00 0.00 0.00
x0 y2 : 0.00 0.10 0.20 0.30
x1 y2 : 0.00 0.00 0.00 0.00
x2 y2 : 0.00 0.00 0.00 0.00
x3 y2 : 0.00 0.00 0.00 0.00
x0 y3 : 0.00 0.10 0.20 0.30
x1 y3 : 0.00 0.00 0.00 0.00
x2 y3 : 0.00 0.00 0.00 0.00
x3 y3 : 0.00 0.00 0.00 0.00

But prints:
----------------------------
x0 y0 : 0.00 -1.00 0.20 0.30
x1 y0 : 0.00 0.00 0.00 0.00
x2 y0 : 0.00 0.00 0.00 0.00
x3 y0 : 0.00 0.00 0.00 0.00
x0 y1 : 0.00 0.00 0.20 0.30
x1 y1 : 0.00 0.00 0.00 0.00
x2 y1 : 0.00 0.00 0.00 0.00
x3 y1 : 0.00 0.00 0.00 0.00
x0 y2 : 0.00 1.00 0.20 0.30
x1 y2 : 0.00 0.00 0.00 0.00
x2 y2 : 0.00 0.00 0.00 0.00
x3 y2 : 0.00 0.00 0.00 0.00
x0 y3 : 0.00 2.00 0.20 0.30
x1 y3 : 0.00 0.00 0.00 0.00
x2 y3 : 0.00 0.00 0.00 0.00
x3 y3 : 0.00 0.00 0.00 0.00
*/

#include <stdio.h>
#include <stdlib.h>

/*
 * Helper kernel used by GPU_step1 in this bug reproduction.
 *
 * f0 : input stream element.
 * f  : output stream element; receives f0 multiplied component-wise by an
 *      all-zero float4.
 *
 * The body is kept deliberately trivial: per the notes below, the shape of
 * the assignment decides whether the wrong output appears.
 */
kernel void GPU_flujoDiff(float4 f0<>, out float4 f<> ) {
    float4 t1 = float4(0.0f, 0.0f, 0.0f, 0.0f);
    // Option 1: multiplying by the zero vector is expected to yield 0.0f in
    // every component; the wrong output is observed with this form.
    f = f0 * t1;

    // Option 2: assigning the zero vector directly hides the bug.
    // f = t1;

}

/*
 * Kernel that triggers the reported wrong result.
 *
 * var0      : 2D gather input, indexed as var0[row][column].
 * flC       : output; ends up as -flC1 - flC2 (see the review note at the
 *             bottom of the kernel).
 * flR       : output; this is the stream main() copies back and prints. It is
 *             zero unless pos.x == 0, in which case it receives var0C.
 * flD       : output; zero unless pos.y == 0, in which case it receives the
 *             result of GPU_flujoDiff(var0D, ...).
 * datos_npx : x position compared against pos.x to select the second x branch.
 * datos_npy : y position compared against pos.y to select the second y branch.
 *
 * Expected flR for pos.x == 0 is (var0R.x, 0.1, 0.2, 0.3); the reported GPU
 * output has a wrong .y component instead of 0.1.
 */
kernel void GPU_step1(float4 var0[ ][ ],
                      out float4 flC<>, out float4 flR<>, out float4 flD<>,
                      int datos_npx, int datos_npy) {

    // x/y components of instance(); presumably the position of the output
    // element this kernel invocation is producing -- TODO confirm.
    int2 pos = instance().xy;

    // Optional : all var0x are initialized
    // NOTE(review): when pos.x == 0 or pos.y == 0 the index expressions below
    // evaluate to -1. What an out-of-range gather returns is not shown here;
    // verify against the Brook+ documentation.
    float4 var0C = var0[ pos.y - 1][ pos.x - 1];
    float4 var0R = var0[ pos.y - 1][ pos.x    ];
    float4 var0D = var0[ pos.y    ][ pos.x - 1];

    // Scratch results for the two "== datos_np*" branches; declared without
    // an initial value.
    float4 flC1;
    float4 flC2;

    // Optional : we set every output to 0.0f
    flC = float4(0.0f, 0.0f, 0.0f, 0.0f);
    flR = float4(0.0f, 0.0f, 0.0f, 0.0f);
    flD = float4(0.0f, 0.0f, 0.0f, 0.0f);

    // HERE IS THE BUG, when pos.x == 0 var0C.y should be 0.1f
    // (If you paste this line twice it works right)
    if(pos.x == 0) var0C = float4(var0R.x, 0.1f, 0.2f, 0.3f);

    // This hides the bug
    //if(pos.x == 0) var0C.y = 0.1f;

    // x direction: the first column copies var0C to flR; the column at
    // datos_npx computes flC1 instead.
    if(pos.x == 0) {
        flR = var0C; // Here var0C should be float4(var0R.x, 0.1f, 0.2f, 0.3f);
    } else if(pos.x == datos_npx) {
        GPU_flujoDiff(var0R, flC1);
    }
    // y direction: the first row writes flD; the row at datos_npy computes
    // flC2 instead.
    if(pos.y == 0) {
        GPU_flujoDiff(var0D, flD);
    } else if(pos.y == datos_npy) {
        GPU_flujoDiff(var0D, flC2);
    }

    // NOTE(review): flC1 is assigned only when pos.x == datos_npx and flC2
    // only when pos.y == datos_npy, yet both are read here for every element.
    // flC is not the stream that gets printed, so this does not explain the
    // reported flR values, but it should be confirmed as intentional.
    flC = - flC1 - flC2;
}

/*
 * Host-side driver for the reproduction.
 *
 * Allocates a zero-filled host buffer of ysize2 * xsize2 float4 values, uses
 * it to fill the input stream var0, runs GPU_step1, copies the flR output
 * stream back into the same buffer and prints one line per element in the
 * form "x<i> y<j> : <4 floats>".
 *
 * argc, argv : unused.
 * Returns 0 on success, 1 if the host buffer cannot be allocated.
 */
int main(int argc, char** argv) {
    const int xsize1 = 3;
    const int ysize1 = 3;
    const int xsize2 = 4;
    const int ysize2 = 4;
    int i, j, pos;

    float *output = (float*)malloc(ysize2 * xsize2 * sizeof(float4));
    if(output == NULL) {
        fprintf(stderr, "Failed to allocate host buffer\n");
        return 1;
    }

    /* Four floats per float4 element. */
    for(pos = 0; pos < ysize2 * xsize2 * 4; pos++)
        output[pos] = 0.0f;

    /* Streams are declared in their own scope so they do not outlive the
       kernel call and the copy back to host memory. */
    {
        float4 var0< ysize1, xsize1>;

        float4 flC < ysize2, xsize2>;
        float4 flR < ysize2, xsize2>;
        float4 flD < ysize2, xsize2>;

        int datos_npx      = xsize1;
        int datos_npy      = ysize1;

        // 1. Sets input streams to 0.0f (the host buffer is all zeros and is
        //    larger than var0, so it covers the whole stream)
        streamRead(var0, output);

        // 2. Calls problematic kernel
        GPU_step1(var0,
            flC, flR, flD,
            datos_npx, datos_npy);

        // 3. Writes output stream flR to memory
        streamWrite(flR, output);
    }

    // 4. Prints stream output flR: rows are y, columns are x, and pos
    //    advances by the four floats of each float4.
    for(j = 0, pos = 0; j < ysize2; j++) {
        // Columns are bounded by the x extent (the original used ysize2,
        // which only worked because both extents are 4).
        for(i = 0; i < xsize2; i++, pos+=4) {
            printf("x%i y%i : ", i, j);
            printf("%4.2f %4.2f %4.2f %4.2f\n",
                output[ pos], output[ pos+1], output[ pos+2], output[ pos+3]);
        }
    }

    free(output);
    return 0;
}

----------------------------

WinXP 64, MSVC 2005, Brook 1.3, Radeon 4850

gaurav_garg · ‎03-01-2009

Does the CPU backend give the correct result?

Ceq · ‎03-01-2009

Yes, CPU backend gives the correct result:

----------------------------

Failed to initialize CAL runtime, falling back to CPU
x0 y0 : 0.00 0.10 0.20 0.30
x1 y0 : 0.00 0.00 0.00 0.00
x2 y0 : 0.00 0.00 0.00 0.00
x3 y0 : 0.00 0.00 0.00 0.00
x0 y1 : 0.00 0.10 0.20 0.30
x1 y1 : 0.00 0.00 0.00 0.00
x2 y1 : 0.00 0.00 0.00 0.00
x3 y1 : 0.00 0.00 0.00 0.00
x0 y2 : 0.00 0.10 0.20 0.30
x1 y2 : 0.00 0.00 0.00 0.00
x2 y2 : 0.00 0.00 0.00 0.00
x3 y2 : 0.00 0.00 0.00 0.00
x0 y3 : 0.00 0.10 0.20 0.30
x1 y3 : 0.00 0.00 0.00 0.00
x2 y3 : 0.00 0.00 0.00 0.00
x3 y3 : 0.00 0.00 0.00 0.00

----------------------------

From this behaviour, it looks like the GPU backend is reading the register that holds "pos.y - 1" instead of using var0C.y, which was initialized to 0.1f.

Archives Discussions

Brook+ compiler bug?