3 Replies Latest reply on Aug 29, 2008 11:40 PM by michael.chu

    Brook+ with HD4850

    livuxman

      I have tested the sample apps of Brook+ with a 4850 and encountered problems.

      An example is NLM_Denoise. With the GPU the resulting image is black; with the CPU the denoise works OK.

      The problem looks like a numeric precision problem.

      Is this a known problem? Can it be solved? Is it a hardware problem?

      Thanks

        • Brook+ with HD4850
          ryta1203
          livux,

          I just wanted to verify this. I am also running a 4850 and I also get an image output that is almost entirely black (~99% is black with some red and other color pixels in the upper left and bottom right corners).

          I understand that Firestream does not currently support the 4850 (hopefully it will with the next release, along with the 8.8 drivers), but is anyone else having this problem? Can anyone with a 3850/3870 verify this problem?

          I'm using VS2005 and haven't had any problems so far with Brook+ samples other than this one (once I got VS2005 stable and running Brook+).
            • Brook+ with HD4850
              livuxman

              I have done more testing and modified the code to make it work.

              The changes are a crazy combination to make the numeric problem disappear.

              The problem is really disconcerting. It looks like the single precision is far from being real single precision and accumulates a lot of error, with results tending to zero.

              If the problem is software I will wait for a newer version; if it is hardware... I will be very disappointed.

              The modified code:

              // Interpolation helper, written out step by step as a numeric-precision
              // workaround. Returns a + ((b * 1000) - a) * c, with each intermediate
              // result stored in a double temporary. Note that b is multiplied by
              // 1000 before it is blended with a.
              //   a - start value
              //   b - end value (scaled by 1000 inside this function)
              //   c - blend factor applied to the difference (b * 1000 - a)
              kernel float lerpf(float a, float b, float c)
              {
                 double p,q,r;
                  p=b*1000;   // b scaled by 1000
                   q=p-a;     // difference between scaled b and a
                  r=q*c;      // difference weighted by the blend factor
                  return a+r;
              // Single-expression form of the same computation, left disabled:
              //    return a + ((b*1000) - a) * c;
              }

              // Second denoise pass. For the current output element it runs an 8x8 set of
              // loop iterations that read `weights` and `input`, accumulates a weighted
              // colour sum, normalizes it by the summed weights, blends it with input[s]
              // through lerpf(), and writes the three colour channels to `output` with
              // 0.0f as the fourth component.
              //
              // This is the numeric-precision workaround version: the weighted colour sum
              // is kept in per-channel double accumulators, and factors of 1000 are
              // multiplied in and divided out around the normalization.
              //
              //   output  - element written by this invocation (declared <408, 320>)
              //   weights - array read at idx inside the window loops
              //   input   - array read at idx2 + s in the loops and at s for the centre value
              //   inv_win - amount added to fcount whenever the weight test (> 0.1f) passes
              kernel void NLM_Denoise_Pass2(out float4 output<408, 320>, float4 weights[408][320], float4 input[408][320], float inv_win)
              {
                  const float2 si = indexof(output);
                  const float2 s = si;
                  // Offset of this element inside its 8x8 tile (position modulo 8).
                  float j = fmod(si.y, 8.0f);
                  float i = fmod(si.x, 8.0f);
                  // Disabled loop header; the braces below are kept as a plain scope.
                  //for (j = 0; j < 1; j += 1)
                  {
                      if ((j + si.y) >= 408)
                      {
                          return;
                      }
                      // Disabled loop header; the braces below are kept as a plain scope.
                      //for (i = 0; i < 1; i += 1)
                      {
                          float2 xy = {i, j};
                          float fcount = 0;
                          float4 sumWeights = 0;
                          float3 clr = {0, 0, 0}, clr00 = {0, 0, 0};
                      float3 peclr = {0, 0, 0};
              // Double accumulators for the weighted colour sum (one per channel).
              double dclrx = 0,dclry = 0,dclrz = 0;
              // The *2 and *3 variables below are declared but never used in this kernel.
              double dclrx2 = 0,dclry2 = 0,dclrz2 = 0;
              double dclrx3 = 0,dclry3 = 0,dclrz3 = 0;
                     
                          float lerpQ = 0;
                          float wi = 0;
                          float2 idx = {0, 0};
                          // NOTE(review): this tests j + si.x, whereas NLM_Denoise_Pass1
                          // tests i + si.x at the corresponding place - possibly `i` was
                          // intended here; confirm.
                          if ((j + si.x) >= 320)
                          {
                              return;
                          }
                          // xy becomes the position minus its modulo-8 offset; idx starts
                          // there and is advanced by 1.0f per loop iteration.
                          xy = s - xy;
                          idx.y = xy.y;
                          // Cycle through NLM window, surrounding (x, y) texel
                          for (wi = -3.0f; wi <= 4.0f; wi += 1.0f)
                          {
                              float wj;
                              idx.x = xy.x;
                              for (wj = -3.0f; wj <= 4.0f; wj += 1.0f)
                              {
                                  float2 idx2 = {wj, wi};
                                  float4 weightIJ = weights[idx]; // how do I index this correctly?

                                  // Accumulate (x + j, y + i) texel color with computed weight
                                  float4 clrIJ = input[idx2 + s];
              // Original single-precision accumulation, replaced by the lines below:
              //                    clr += clrIJ * weightIJ;
                          // Workaround: scale the weight by 1000, then accumulate each
                          // channel product into its double accumulator.
                          peclr = weightIJ*1000;
                          dclrx += (peclr.x*clrIJ.x)/1;
                          dclry += (peclr.y*clrIJ.y)/1;
                          dclrz += (peclr.z*clrIJ.z)/1;           
                         
                         
                         
              // Disabled experiment (ddclr is not declared in this kernel):
              //            ddclr += clrIJ * weightIJ;

                                  // Sum of weights for color normalization to [0..1] range
                                  sumWeights += weightIJ;

                                  // Update weight counter, if NLM weight for current window texel
                                  // exceeds the weight threshold
                                  fcount     += (weightIJ > 0.1f) ? inv_win : 0;
                                  idx.x += 1.0f;
                              }
                              idx.y += 1.0f;
                     
                     
                          }
                     
                          // Copy the double accumulators back into the float colour.
                          clr.x=dclrx/1;
                          clr.y=dclry/1;
                          clr.z=dclrz/1;
                     
                      // Workaround: scale up by 1000 before normalizing and back down
                      // afterwards. The factor of 1000 introduced through peclr is still
                      // present in clr after these steps; lerpf() scales its second
                      // argument by 1000 as well.
                      clr *=1000;
                          // Normalize result color by sum of weights
                          sumWeights = 1.0f / sumWeights;
                          clr *= sumWeights;

              clr /= 1000;
                          // Choose LERP quotient based on how many texels
                          // within the NLM window exceeded the weight threshold
                          lerpQ = (fcount > 0.1f) ? 0.20f : 0.80f;

                          // Write final results to global memory
                          clr00 = input[s];
                          clr.x = lerpf(clr.x, clr00.x, lerpQ);
                          clr.y = lerpf(clr.y, clr00.y, lerpQ);
                          clr.z = lerpf(clr.z, clr00.z, lerpQ);
                          output = float4(clr.x, clr.y, clr.z, 0.0f);
                      }
                  }
              }

              // First denoise pass. For the current output element it sums
              // vecLen(input[c + index], input[s + index]) over a 7x7 set of offsets
              // (m, n in -3..3), where c is the element's position minus its modulo-8
              // offset, plus 3. It then writes
              //     exp(-(weight * noise + dist * inv_win))
              // to `output`, with dist = (i - 3)^2 + (j - 3)^2.
              //
              // This is the numeric-precision workaround version: weight * noise is
              // computed via a double temporary as ((weight / 2) * noise) * 2.
              //
              //   output  - element written by this invocation (declared <408,320>)
              //   input   - array read at c + index and s + index
              //   noise   - factor applied to the accumulated weight
              //   inv_win - factor applied to dist
              // vecLen() is defined outside this block.
              kernel void NLM_Denoise_Pass1(out float4 output<408,320>, float4 input[408][320], float noise, float inv_win)
              {
              // Temporaries for the workaround at the end of the kernel
              // ("prueba" is Spanish for "test").
              double prueba;   
              float prueba2;
                  const float2 si = indexof(output);
                  const float2 s = si;
                  float2 c = si + 3.0f;
                  float n, dist, i = 0.0f, j = 0.0f;
                  // Center for current pixel
                  float weight = 0;
                  // Offset of this element inside its 8x8 tile (position modulo 8).
                  j = fmod(si.y,8.0f);
                  i = fmod(si.x,8.0f);
                  // Disabled loop header; the braces below are kept as a plain scope.
                  //for (j = 0.0f; j < 1.0f; j += 1.0f)
                  {
                      if ((j + si.y) >= 408)
                      {
                          return;
                      }
                      // Disabled loop header; the braces below are kept as a plain scope.
                      //   for (i = 0.0f; i < 1.0f; i += 1.0f)
                      {
                          float2 xy = {i, j};
                          if ((i + si.x) >= 320)
                          {
                              return;
                          }
                          // c = position minus its modulo-8 offset, shifted by 3.
                          c = (si - xy) + 3.0f;
                          weight = 0;
                          // Accumulate vecLen over the 7x7 offsets around c and s.
                          for (n = -3.0f; n <= 3.0f; n += 1.0f)
                          {
                              float m;
                              for (m = -3.0f; m <= 3.0f; m += 1.0f)
                              {  
                                  float2 index = {m, n};
                                  float4 val1 = input[c + index];
                                  float4 val2 = input[s + index];
                                  weight += vecLen(val1, val2);
                              }
                          }
                          // Squared distance of the tile offset (i, j) from (3, 3).
                          dist = (i - 3.0f) * (i - 3.0f) +
                                 (j - 3.0f) * (j - 3.0f);
              // Original single-expression form, replaced by the three lines below:
              //            output = exp( -(weight * noise + dist * inv_win));
                      // Workaround: halve the weight into a double, multiply by noise,
                      // then double the product again inside the exponent.
                      prueba=weight/2;
                      prueba2=prueba*noise;
                      output=exp( -((prueba2*2)+ dist * inv_win));


                      }
                  }
              }

               

                • Brook+ with HD4850
                  michael.chu
                  I'm having the engineers look at this. I know that they have the NLM_Denoise sample tested in v1.2-beta so it should work, but I'll check up with them just to be sure.

                  Thanks for bringing this to my attention!

                  Michael.