1 Reply Latest reply on Feb 20, 2012 1:03 PM by MicahVillmow

    kernel crashes compiler and causes compiler to spew out text on my debug console

    antzrhere

      Hi.

       

      This kernel below crashes the compiler.

      Not only only that but during the process it causes some sort of internal compiler debug text to be printed on my own program console window (!!) hundreds of lines of myserious internal debug code...not sure how...

       

      The obvious cause of the error is the lines "BSV0 &= InsideMask"   and    "BSV1 &= InsideMask;"  (which I know are incorrect)...the compiler doesn't crash with these removed.

       

      The program crashes after printing this text, so I can't copy all of it, but it's along the lines of:

       

      "...

      0x6758540: v2f32,ch,glue = CopyFromReg 0x6757300, 0x675adc0, 0x6757308:1 [ ORD=1877] [ID=290]

      0x675aba0: i32 = TargetConstant<1> [ORD=1761] [ID=38]

      ....

      ...."

       

       

      Windows 7 64bit, ATI 5870, SDK 2.6, Catalyst 12.2 preview

       

       

       

      Here's the code

      #define InfinityCorrectfloat3(val) as_float3(as_uint3(val) - (((as_uint3(val) & 0x7F800000) + 1)>>31))
      #define BACKGROUND_FILL_COLOUR      (float4)(1.0f,1.0f,1.0f,0.0f)
      #define FLOAT_MAX_POS               3.402823466e+38f
      #define FLOAT_MAX_NEG               -3.402823466e+38f

       

       

      inline float3 safe_recip_float3(const float3 Val)
      {
         float3 valinv = isequal(fabs(Val),0.0f) ? FLOAT_MAX_POS : 1.0f / Val;
         return InfinityCorrectfloat3(valinv);
      };   

       

      __kernel void  Test(__constant float* TraceParamsD, __global const int* TraceParamsI,__global const uint *ATree,__write_only image2d_t ScreenBuffer)
      {
        float4 VertX,VertY,VertZ,OVertX,OVertY,OVertZ;
        float3 RayPosition,RayDirection,RayDirectionInv,VWeight,ClosestVWeight;
        int2 PixelPos;
        float xangle,yangle,val,val2,xzlen,sinX,cosX,sinY,cosY,IntersectZ,ClosestZ,PixelWidth;
        float BSV0,BSV1,MAX_TRACE_DISTANCE;
        int TriID,ClosestID,NodeSelect,NodesToCheck,NodeDepth;
        uint NodeOffsetStack[32],NodeSwitchSelect,NodeCompletionStack,NodeOffset,VDATAOFFSET,IDATAOFFSET,InBoundsMask,ActiveRayMask;
        float3 T1B,T2B,T1,T2,BoxMin,BoxMax;
        float TNEAR,TFAR;
        int3 OutsideTest;
        uint InsideMask;
        int3 RayDirectionParallelTest;

        PixelPos.x = TraceParamsI[32+get_global_id(0)];
        PixelPos = (int2)(PixelPos.x&65535,PixelPos.x>>16);
        PixelWidth = TraceParamsD[0];
       

        val = TraceParamsD[1] * PixelPos.y;
        RayDirection = (float3)(TraceParamsD[5],TraceParamsD[6],TraceParamsD[7]) + (float3)(TraceParamsD[8],TraceParamsD[9],TraceParamsD[10])  * val;
        RayDirection+= (((float3)(TraceParamsD[11],TraceParamsD[12],TraceParamsD[13]) + (float3)(TraceParamsD[14],TraceParamsD[15],TraceParamsD[16]) * val) - RayDirection) *(PixelWidth * PixelPos.x);
        RayDirection = normalize(RayDirection);
        RayPosition = (float3)(TraceParamsD[17],TraceParamsD[18],TraceParamsD[19])+ RayDirection;
        RayDirectionInv = -safe_recip_float3(RayDirection);  
        RayDirectionParallelTest = isequal(fabs(RayDirection), 0.0f);
       
        xzlen = length((float2)(RayDirection.s0,RayDirection.s2)); 
        xangle = acos(xzlen) * as_float((as_uint(RayDirection.s1)&0x80000000) | 0x3F800000);
        val = RayDirection.s2 / ((xzlen<=0.0f) ? 0.0000000001f : xzlen);
        yangle = -acos(fmin(1.0f, fmax(-1.0f,val)))  * as_float((as_uint(RayDirection.s0)&0x80000000) | 0x3F800000);
        sinX = sincos(xangle,&cosX);
        sinY = sincos(yangle,&cosY);


        MAX_TRACE_DISTANCE =10000000.0f;
        NodeDepth = 0;
        NodeOffset = 0;
        NodeOffsetStack[0] = 0;
        NodeCompletionStack = 0;
        NodeSwitchSelect = 0;
        ClosestID=-1;
        InBoundsMask=0xFFFFFFFF;
        ActiveRayMask = 0xFFFFFFFF;


        while(ActiveRayMask)
        {
               
                
                while((ATree[NodeOffset+3] & InBoundsMask) != 0)
                {
                         BoxMin = *(__global const float3*)(&ATree[NodeOffset]);
                          BoxMax = *(__global const float3*)(&ATree[NodeOffset+4]);
                          OutsideTest = signbit(RayPosition-BoxMin) |  signbit(BoxMax-RayPosition);
                          InsideMask   = as_uint(OutsideTest.s0 | OutsideTest.s1 | OutsideTest.s2);
                          OutsideTest &= RayDirectionParallelTest;
                          T1 = (RayPosition -  BoxMin )  * RayDirectionInv;
                          T1 = InfinityCorrectfloat3(T1);
                          T2 = (RayPosition -   BoxMax ) * RayDirectionInv;
                          T2 = InfinityCorrectfloat3(T2);
                          T1B = fmin(T1,T2);
                          T2B = fmax(T1,T2);
                          TNEAR = fmax(T1B.x, fmax(T1B.y, fmax(T1B.z, FLOAT_MAX_NEG) ));
                          TFAR  = fmin(T2B.x, fmin(T2B.y, fmin(T2B.z, FLOAT_MAX_POS) ));
                          BSV0 =  (OutsideTest.s0 | OutsideTest.s1 | OutsideTest.s2 | isgreater(TNEAR,TFAR) | isless(TNEAR,0.0f))  == 0 ? TNEAR : FLOAT_MAX_POS;    
                          BSV0 &= InsideMask; //bad code
                         
                          BoxMin = *(__global const float3*)(&ATree[NodeOffset+8]);
                          BoxMax = *(__global const float3*)(&ATree[NodeOffset+12]);
                          OutsideTest = signbit(RayPosition-BoxMin) |  signbit(BoxMax-RayPosition);
                          InsideMask   = as_uint(OutsideTest.s0 | OutsideTest.s1 | OutsideTest.s2);
                          OutsideTest &= RayDirectionParallelTest;
                          T1 = (RayPosition -  BoxMin )  * RayDirectionInv;
                          T1 = InfinityCorrectfloat3(T1);
                          T2 = (RayPosition -   BoxMax ) * RayDirectionInv;
                          T2 = InfinityCorrectfloat3(T2);
                          T1B = fmin(T1,T2);
                          T2B = fmax(T1,T2);
                          TNEAR = fmax(T1B.x, fmax(T1B.y, fmax(T1B.z, FLOAT_MAX_NEG) ));
                          TFAR  = fmin(T2B.x, fmin(T2B.y, fmin(T2B.z, FLOAT_MAX_POS) ));
                          BSV1 =  (OutsideTest.s0 | OutsideTest.s1 | OutsideTest.s2 | isgreater(TNEAR,TFAR) | isless(TNEAR,0.0f))  == 0 ? TNEAR : FLOAT_MAX_POS;    
                          BSV1 &= InsideMask; //bad code
                         
                          NodesToCheck  = isless(BSV0 , FLOAT_MAX_POS) + isless(BSV1 , FLOAT_MAX_POS);
                          NodeSelect    = isless(BSV1,BSV0);
                         
                         NodeCompletionStack  |= as_uint(max(NodesToCheck-1, 0)) << NodeDepth;

                          if(NodesToCheck == 0)
                          {         
                                NodeDepth = 31 - clz(NodeCompletionStack);
                                InBoundsMask = 0xFFFFFFFF + (as_uint(NodeDepth)>>31);
                                NodeCompletionStack &= ((1u<<NodeDepth)-1);
                                NodeSelect = ((NodeSwitchSelect >> NodeDepth) &1) ^ 1;                            
                          }
              
                         NodeSwitchSelect = (NodeSwitchSelect & ((1u<<NodeDepth)-1)) | (NodeSelect << NodeDepth);        
                          NodeOffset = ATree[NodeOffsetStack[as_uint(NodeDepth)&InBoundsMask] + 3 + (NodeSelect<<2) ];
                          NodeDepth++;
                          NodeOffsetStack[NodeDepth] = NodeOffset;       
                }
               
               
                BoxMin = *(__global const float3*)(&ATree[NodeOffset]);
                BoxMax = *(__global const float3*)(&ATree[NodeOffset+4]);
                T1 = (RayPosition -  BoxMin )  * RayDirectionInv;
                T1 = InfinityCorrectfloat3(T1);
                T2 = (RayPosition -   BoxMax ) * RayDirectionInv;
                T2 = InfinityCorrectfloat3(T2);        
                T1B = fmin(T1,T2);
                T2B = fmax(T1,T2);
                TNEAR = fmax(T1B.x, fmax(T1B.y, T1B.z));
                TFAR  = fmin(T2B.x, fmin(T2B.y, T2B.z )) * 1.000001f;


                if((isless(TNEAR,MAX_TRACE_DISTANCE) & InBoundsMask) == 0)break; 
                ClosestZ = fmin(MAX_TRACE_DISTANCE,TFAR);
                IDATAOFFSET  = ATree[NodeOffset+15];

                for(TriID = ATree[NodeOffset+11]-1; TriID >= 0; TriID--)
                {
                          VDATAOFFSET = ATree[IDATAOFFSET + TriID];                
                          OVertX = *((__global float4*)&ATree[VDATAOFFSET    ]) - RayPosition.s0;
                       OVertY = *((__global float4*)&ATree[VDATAOFFSET +4 ]) - RayPosition.s1;
                       OVertZ = *((__global float4*)&ATree[VDATAOFFSET +8 ]) - RayPosition.s2;

                       VertX = mad(OVertZ,sinY,OVertX*cosY);
                       VertZ = mad(OVertZ,cosY,-(OVertX*sinY));
                       VertY = mad(OVertY,cosX,-(VertZ*sinX));
                       VertZ = mad(OVertY,sinX,VertZ*cosX);
         
                       val2 = (VertX.s1 * VertY.s2) - (VertX.s2 * VertY.s1);
                          val = (VertX.s0 * VertY.s1) - (VertX.s0 * VertY.s2) - (VertX.s1 * VertY.s0)  + (VertX.s2 * VertY.s0) + val2;
                       VWeight  =  cross((float3)(VertX.s0,VertX.s1,VertX.s2),(float3)(VertY.s0,VertY.s1,VertY.s2)) * native_recip(val);

                       IntersectZ = dot((float3)(VertZ.s0,VertZ.s1,VertZ.s2),VWeight);
                          if( (isequal(val,0.0f) | any(as_int3(VWeight * (1.0f - VWeight))) | islessequal(ClosestZ,IntersectZ) | isless(IntersectZ,1.0f)) == 0)
                    {
                                 ClosestZ=IntersectZ;
                              ClosestID = TriID;
                         ClosestVWeight = VWeight;
                          } 

                       val = (VertX.s3 * VertY.s1) - (VertX.s3 * VertY.s2) - (VertX.s1 * VertY.s3) + (VertX.s2 * VertY.s3) + val2;
                          VWeight  =  cross((float3)(VertX.s3,VertX.s1,VertX.s2),(float3)(VertY.s3,VertY.s1,VertY.s2)) * native_recip(val);

                          IntersectZ = dot((float3)(VertZ.s3,VertZ.s1,VertZ.s2),VWeight);
                          if( (isequal(val,0.0f) | any(as_int3(VWeight * (1.0f - VWeight))) | islessequal(ClosestZ,IntersectZ) | isless(IntersectZ,1.0f)) == 0)
                    {
                                 ClosestZ=IntersectZ;
                              ClosestID = TriID | 0x40000000;
                         ClosestVWeight = VWeight;
                          }
                         
                }

        
                 NodeDepth = 31 - as_int(clz(NodeCompletionStack));
                 if(ClosestID!=-1 || NodeDepth<0)ActiveRayMask=0;     
                 NodeCompletionStack &= ((1<<NodeDepth)-1);
                 NodeSelect = ((NodeSwitchSelect >> NodeDepth) &1)  ^ 1;     
                 NodeOffset = ATree[NodeOffsetStack[as_uint(NodeDepth)&InBoundsMask] + 3 + (NodeSelect<<2) ];
                 NodeDepth++;
                 NodeOffsetStack[NodeDepth] = NodeOffset;
               
        }
       
        float4 PixelColour = BACKGROUND_FILL_COLOUR;
        if(ClosestID>=0)PixelColour = (float4)(ClosestVWeight,0.0f)*(float4)(1.0f,1.0f,1.0f,0.0f);
        write_imagef(ScreenBuffer,PixelPos,PixelColour);


      }