2 Replies Latest reply on Mar 1, 2009 2:07 PM by Ceq

    Brook+ compiler bug?

    Ceq

      Simplified example.br file:

      /*
      Should print:
      ----------------------------
      x0 y0 : 0.00 0.10 0.20 0.30
      x1 y0 : 0.00 0.00 0.00 0.00
      x2 y0 : 0.00 0.00 0.00 0.00
      x3 y0 : 0.00 0.00 0.00 0.00
      x0 y1 : 0.00 0.10 0.20 0.30
      x1 y1 : 0.00 0.00 0.00 0.00
      x2 y1 : 0.00 0.00 0.00 0.00
      x3 y1 : 0.00 0.00 0.00 0.00
      x0 y2 : 0.00 0.10 0.20 0.30
      x1 y2 : 0.00 0.00 0.00 0.00
      x2 y2 : 0.00 0.00 0.00 0.00
      x3 y2 : 0.00 0.00 0.00 0.00
      x0 y3 : 0.00 0.10 0.20 0.30
      x1 y3 : 0.00 0.00 0.00 0.00
      x2 y3 : 0.00 0.00 0.00 0.00
      x3 y3 : 0.00 0.00 0.00 0.00

      But prints:
      ----------------------------
      x0 y0 : 0.00 -1.00 0.20 0.30
      x1 y0 : 0.00 0.00 0.00 0.00
      x2 y0 : 0.00 0.00 0.00 0.00
      x3 y0 : 0.00 0.00 0.00 0.00
      x0 y1 : 0.00 0.00 0.20 0.30
      x1 y1 : 0.00 0.00 0.00 0.00
      x2 y1 : 0.00 0.00 0.00 0.00
      x3 y1 : 0.00 0.00 0.00 0.00
      x0 y2 : 0.00 1.00 0.20 0.30
      x1 y2 : 0.00 0.00 0.00 0.00
      x2 y2 : 0.00 0.00 0.00 0.00
      x3 y2 : 0.00 0.00 0.00 0.00
      x0 y3 : 0.00 2.00 0.20 0.30
      x1 y3 : 0.00 0.00 0.00 0.00
      x2 y3 : 0.00 0.00 0.00 0.00
      x3 y3 : 0.00 0.00 0.00 0.00
      */

       

      #include <stdio.h>
      #include <stdlib.h>

      /*
       * Helper kernel called from GPU_step1.
       *
       * f0 : input value
       * f  : output value, assigned f0 multiplied component-wise by an
       *      all-zero float4.
       *
       * The exact form of the assignment matters for this repro: per the
       * original author's notes, writing the multiply (Option 1) exposes the
       * wrong result, while assigning t1 directly (Option 2) hides it.
       */
      kernel void GPU_flujoDiff(float4 f0<>, out float4 f<> ) {
          // All-zero vector used as the multiplier below.
          float4 t1 = float4(0.0f, 0.0f, 0.0f, 0.0f);
          // Option 1: This should return 0.0f
          f = f0 * t1;
        
          // Option 2: Hides the bug
          // f = t1;

      }

      /*
       * GPU_step1: minimal kernel reproducing a wrong value in flR.
       *
       * Parameters:
       *   var0        gather input, indexed as var0[row][column].
       *   flC         output; set to -flC1 - flC2 at the end of the kernel.
       *   flR         output; this is the stream main() prints. For pos.x == 0
       *               it should be float4(var0R.x, 0.1f, 0.2f, 0.3f).
       *   flD         output; written through GPU_flujoDiff when pos.y == 0.
       *   datos_npx   compared with pos.x to select the flC1 branch.
       *   datos_npy   compared with pos.y to select the flC2 branch.
       *
       * Change from the original listing: flC1 and flC2 are zero-initialized.
       * Previously they were only assigned inside the pos.x == datos_npx and
       * pos.y == datos_npy branches, yet were read unconditionally when
       * computing flC, so most instances read indeterminate values.
       *
       * NOTE(review): for pos.x == 0 or pos.y == 0 the gather indices below are
       * -1. What an out-of-range gather returns is backend-specific - confirm
       * against the Brook+ documentation before relying on it.
       */
      kernel void GPU_step1(float4 var0[ ][ ],
                            out float4 flC<>, out float4 flR<>, out float4 flD<>,
                            int datos_npx, int datos_npy) {

          int2 pos = instance().xy;

          // Optional : all var0x are initialized
          float4 var0C = var0[ pos.y - 1][ pos.x - 1];
          float4 var0R = var0[ pos.y - 1][ pos.x    ];
          float4 var0D = var0[ pos.y    ][ pos.x - 1];

          // Zero-initialized so the final flC expression is well defined on
          // every path, not only when the corresponding branch below runs.
          float4 flC1 = float4(0.0f, 0.0f, 0.0f, 0.0f);
          float4 flC2 = float4(0.0f, 0.0f, 0.0f, 0.0f);

          // Optional : we set every output to 0.0f
          flC = float4(0.0f, 0.0f, 0.0f, 0.0f);
          flR = float4(0.0f, 0.0f, 0.0f, 0.0f);
          flD = float4(0.0f, 0.0f, 0.0f, 0.0f);

          // HERE IS THE BUG, when pos.x == 0 var0C.y should be 0.1f
          // (If you paste this line twice it works right)
          if(pos.x == 0) var0C = float4(var0R.x, 0.1f, 0.2f, 0.3f);

          // This hides the bug
          //if(pos.x == 0) var0C.y = 0.1f;

          if(pos.x == 0) {
              flR = var0C; // Here var0C should be float4(var0R.x, 0.1f, 0.2f, 0.3f);
          } else if(pos.x == datos_npx) {
              GPU_flujoDiff(var0R, flC1);
          }
          if(pos.y == 0) {
              GPU_flujoDiff(var0D, flD);
          } else if(pos.y == datos_npy) {
              GPU_flujoDiff(var0D, flC2);
          }

          flC = - flC1 - flC2;
      }


      int main(int argc, char** argv) {
          const int xsize1 = 3;
          const int ysize1 = 3;
          const int xsize2 = 4;
          const int ysize2 = 4;
          int i, j, pos;

          float *output = (float*)malloc(ysize2 * xsize2 * sizeof(float4));
          for(pos = 0; pos < ysize2 * xsize2 * 4; pos++)
              output[pos] = 0.0f;
            
          {
              float4 var0< ysize1, xsize1>;

              float4 flC < ysize2, xsize2>;
              float4 flR < ysize2, xsize2>;
              float4 flD < ysize2, xsize2>;

              int datos_npx      = xsize1;
              int datos_npy      = ysize1;

              // 1. Sets input streams to 0.0f
              streamRead(var0, output);

              // 2. Calls problematic kernel
              GPU_step1(var0,
                  flC, flR, flD,
                  datos_npx, datos_npy);

              // 3. Writes output stream flR to memory
              streamWrite(flR, output);
          }

          // 4. Prints stream output flR
          for(j = 0, pos = 0; j < ysize2; j++) {
              for(i = 0; i < ysize2; i++, pos+=4) {
                  printf("x%i y%i : ", i, j);
                  printf("%4.2f %4.2f %4.2f %4.2f\n",
                      output[ pos], output[ pos+1], output[ pos+2], output[ pos+3]);
              }
          }
          return 0;
      }

       

      ----------------------------

      WinXP 64, MSVC 2005, Brook 1.3, Radeon 4850

        • Brook+ compiler bug?
          gaurav.garg

          Does the CPU backend give the correct result?

            • Brook+ compiler bug?
              Ceq

              Yes, the CPU backend gives the correct result:

               

              ----------------------------

              Failed to initialize CAL runtime, falling back to CPU
              x0 y0 : 0.00 0.10 0.20 0.30
              x1 y0 : 0.00 0.00 0.00 0.00
              x2 y0 : 0.00 0.00 0.00 0.00
              x3 y0 : 0.00 0.00 0.00 0.00
              x0 y1 : 0.00 0.10 0.20 0.30
              x1 y1 : 0.00 0.00 0.00 0.00
              x2 y1 : 0.00 0.00 0.00 0.00
              x3 y1 : 0.00 0.00 0.00 0.00
              x0 y2 : 0.00 0.10 0.20 0.30
              x1 y2 : 0.00 0.00 0.00 0.00
              x2 y2 : 0.00 0.00 0.00 0.00
              x3 y2 : 0.00 0.00 0.00 0.00
              x0 y3 : 0.00 0.10 0.20 0.30
              x1 y3 : 0.00 0.00 0.00 0.00
              x2 y3 : 0.00 0.00 0.00 0.00
              x3 y3 : 0.00 0.00 0.00 0.00

              ----------------------------

              From this behaviour, it looks like the GPU backend is reading the register that holds "pos.y - 1" instead of var0C.y, which was initialized to 0.1f.