4 Replies Latest reply on Feb 3, 2012 6:34 PM by antzrhere

    OpenCL crashes when compiling this kernel

    antzrhere

      I'm using Catalyst 12.2 preview, Windows 7 64bit, AMD SDK 2.6, ATI Radeon 5870.

       

      The following kernel crashes my program upon compilation every time without fail. It also crashes Kernel App.

      The problem appears to lie with control flow as if I remove/add some break/return points in the code  it compiles (although I'm pretty sure it stiill doesn't run correctly when it does compile).

       

      Any help? The problem is I haven't even began to develop the code and it won't compile at this stage which doesn't bode well... 

       

      Here is the code (apologies for the poor formatting):

       

       

       

      Code

      #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
      #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable

       

      #define FLOAT_MAX_POS               3.40282346638528860e+38f
      #define FLOAT_MAX_NEG               -3.40282346638528860e+38f

      __constant sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP   |  CLK_FILTER_NEAREST;

       

      inline float3 safe_recip_float3(const float3 Val)
      {
         const float3 valinv = 1.0f / Val;
         return as_float3(as_int3(valinv) + isinf(valinv)) ;
      };   


      inline float BoxLineIntersect(const float3 BoxMin,const float3 BoxMax,const float3 LineStartingPos,const float3 LineVector,const float3 LineVectorInv)
      {
            
           uint InsideMask = (any(as_int3(LineStartingPos-BoxMin)) |  any(as_int3(BoxMax-LineStartingPos)))  * 0xFFFFFFFF;
          
          // calculates t1 and t2 intersection positions for each component
           float3 T1 = (LineStartingPos - BoxMin) * LineVectorInv;
           float3 T2 = (LineStartingPos - BoxMax) * LineVectorInv;

           //gets rid of infinites    
           T1 = as_float3( as_int3(T1) + isinf(T1)  );
           T2 = as_float3( as_int3(T2) + isinf(T2)  );
          
           //swaps components so T1<=T2
           float3 T1B = fmin(T1,T2);
           float3 T2B = fmax(T1,T2);
          
           float TNEAR = fmax(T1B.x, fmax(T1B.y, fmax(T1B.z, FLOAT_MAX_NEG) ));
           float TFAR  = fmin(T2B.x, fmin(T2B.y, fmin(T2B.z, FLOAT_MAX_POS) ));
          
           //parallel but not inside test
           int3 ParallelTest = 0;//isequal(LineVector, 0.0f) & (isless(LineStartingPos, BoxMin) | isgreater(LineStartingPos, BoxMin));
         
           //returns max float pos value if no intersection otherwise returns intersection distance
           return  ((ParallelTest.x | ParallelTest.y | ParallelTest.z | isgreater(TNEAR,TFAR) | isless(TNEAR,0.0f))&InsideMask) == 0 ? TNEAR : FLOAT_MAX_POS;

      }


              
                  

      __kernel void  AceTraceRender(__constant float* TraceParamsD, __global const int* TraceParamsI,const __global const float *ATree,const unsigned int VertDataOffset,__read_only image2d_t DepthBuffer,__write_only image2d_t ScreenBuffer)
      {

        float3 RayPosition,RayDirection,RayDirectionInv,VWeight,TriX,TriY,ClosestVWeight;
        float4 VertX,VertY,VertZ,OVertX,OVertY,OVertZ;
        int2 PixelPos;
        float xangle,yangle,val,xzlen,sinX,cosX,sinY,cosY,IntersectZ,ClosestZ,PixelWidth;
        int TriID,ClosestID;
        int NumPolygons;
        uint NodeOffsetStack[32];
        uint NodeSwitchSelect;
        uint NodeCompletionStack;
        uint Tmp;
        uint NodeOffset;
        uint VDATAOFFSET;
        int NodeSelect;
        int NodesToCheck;
        float BSV0,BSV1;
        float MAX_TRACE_DISTANCE;
        int NodeDepth;

        const __global float4 *TriangleVert = (const __global float4*)&ATree[VertDataOffset];


        PixelPos.x = TraceParamsI[32+get_global_id(0)];
        PixelPos = (int2)(PixelPos.x&65535,PixelPos.x>>16);
        PixelWidth = TraceParamsD[0];
       
        
        val = TraceParamsD[1] * PixelPos.y;
        RayDirection = (float3)(TraceParamsD[5],TraceParamsD[6],TraceParamsD[7]) + (float3)(TraceParamsD[8],TraceParamsD[9],TraceParamsD[10])  * val;
        RayDirection+= (((float3)(TraceParamsD[11],TraceParamsD[12],TraceParamsD[13]) + (float3)(TraceParamsD[14],TraceParamsD[15],TraceParamsD[16]) * val) - RayDirection) *(PixelWidth * PixelPos.x);
        RayDirection = fast_normalize(RayDirection);
        RayPosition = (float3)(convert_float(TraceParamsI[8]),convert_float(TraceParamsI[9]),convert_float(TraceParamsI[10]))+ RayDirection;
        RayDirectionInv = safe_recip_float3(RayDirection);  

        xzlen = length((float2)(RayDirection.s0,RayDirection.s2));
        val = (xzlen==0.0f) ? 1.0f : RayDirection.s2/xzlen;
        xangle = (xzlen<=-0.99999994f) ? xangle = -3.1415926535897932384626433832795f : acos(xzlen);
        xangle *= sign(RayDirection.s1);
        if(xzlen>=0.99999994f)xangle= 0.0f; 
       
        yangle = (val<=-0.99999994f) ? -3.1415926535897932384626433832795f : -acos(val)* sign(RayDirection.s0);
        if(val>=0.99999994f)yangle = 0.0f;
       
        sinX = sincos(xangle,&cosX);
        sinY = sincos(yangle,&cosY);

        VertZ = read_imagef(DepthBuffer,samplerA,PixelPos);
        MAX_TRACE_DISTANCE = VertZ.s0 - 10.0f;
       
        NodeDepth = 0;
        NodeOffset = 0;
        NodeOffsetStack[0] = 0;
        NodeCompletionStack = 0;
        NodeSwitchSelect = 0;

        while(1)
        {
                while(1)
                {
                       if(as_uint(ATree[NodeOffset]) == 0)break;
              
                       BSV0 = BoxLineIntersect(*(__global const float3*)(&ATree[NodeOffset+4]),*(__global const float3*)(&ATree[NodeOffset+8]), RayPosition,-RayDirection,-RayDirectionInv);
                       BSV1 = BoxLineIntersect(*(__global const float3*)(&ATree[NodeOffset+12]),*(__global const float3*)(&ATree[NodeOffset+16]),RayPosition,-RayDirection,-RayDirectionInv);

                       NodesToCheck  = 2 - isequal(BSV0 , FLOAT_MAX_POS);
                       NodesToCheck -= isequal(BSV1 , FLOAT_MAX_POS);
                       NodeSelect    = isless(BSV1,BSV0);
                         
                       NodeCompletionStack  |= as_uint(max(NodesToCheck-1, 0)) << NodeDepth;

                       if(NodesToCheck == 0)
                       {
                                NodeDepth = 31 - clz(NodeCompletionStack);
                                if(NodeDepth<0)break; //all nodes/levels explored,quit
                                NodeCompletionStack &= ((1<<NodeDepth)-1);
                                NodeSelect = ((NodeSwitchSelect >> NodeDepth) &1) +1;     
                       }
              
                       NodeSwitchSelect = (NodeSwitchSelect & ((1<<NodeDepth)-1)) | ((NodeSelect&1) << NodeDepth);       
                       NodeOffset = as_uint(ATree[NodeOffsetStack[NodeDepth] + (NodeSelect&1) ]) * 20;
                       NodeDepth++;
                       NodeOffsetStack[NodeDepth] = NodeOffset;
               }
               
                if(NodeDepth <0 )return;               
               
          NumPolygons  = as_uint(ATree[NodeOffset+2]) * 3;
                VDATAOFFSET  = as_uint(ATree[NodeOffset+3]);
       
                ClosestZ = MAX_TRACE_DISTANCE;
                VertZ.s3=0.0f;
                ClosestID = -1;

                for(TriID = 0; TriID < NumPolygons; TriID+=3)
                {
                       OVertX = TriangleVert[VDATAOFFSET + TriID  ] - RayPosition.s0;
                       OVertY = TriangleVert[VDATAOFFSET + TriID+1] - RayPosition.s1;
                       OVertZ = TriangleVert[VDATAOFFSET + TriID+2] - RayPosition.s2;

                       VertX = mad(OVertZ,sinY,OVertX*cosY);
                       VertZ = mad(OVertZ,cosY,-(OVertX*sinY));
                       VertY = mad(OVertY,cosX,-(VertZ*sinX));
                       VertZ = mad(OVertY,sinX,VertZ*cosX);
         
                       TriX = (float3)(VertX.s0,VertX.s1,VertX.s2);
                       TriY = (float3)(VertY.s0,VertY.s1,VertY.s2);

                       val = (TriX.s0 * TriY.s1) - (TriX.s0 * TriY.s2) - (TriX.s1 * TriY.s0) + (TriX.s1 * TriY.s2) + (TriX.s2 * TriY.s0) - (TriX.s2 * TriY.s1);
                       VWeight  =  cross(TriX,TriY) * native_recip(val);

                       if(isequal(val,0.0f)+any(as_int3(VWeight * (1.0f - VWeight)))==0)
                          {
                          IntersectZ = dot((float3)(VertZ.s0,VertZ.s1,VertZ.s2),VWeight);

                             if(isless(IntersectZ,ClosestZ) & isgreaterequal(IntersectZ,1.0f))
                            {
                                     ClosestZ=IntersectZ;
                             ClosestID = TriID<<1;
                             ClosestVWeight = VWeight;
                            }
                          }

                       TriX.s0 = VertX.s3;
                       TriY.s0 = VertY.s3;

                       val =  (TriX.s0 * TriY.s1) - (TriX.s0 * TriY.s2) - (TriX.s1 * TriY.s0) + (TriX.s1 * TriY.s2) + (TriX.s2 * TriY.s0) - (TriX.s2 * TriY.s1);
                       VWeight  =  cross(TriX,TriY) * native_recip(val);

                       if(isequal(val,0.0f)+any(as_int3(VWeight * (1.0f - VWeight)))==0)
                          {
                            IntersectZ = dot((float3)(VertZ.s3,VertZ.s1,VertZ.s2),VWeight);
                              if(isless(IntersectZ,ClosestZ) & isgreaterequal(IntersectZ,1.0f))
                              {
                                          ClosestZ=IntersectZ;
                                  ClosestID = (TriID<<1)+1;
                                  ClosestVWeight = VWeight;
                                  }
                          }
                }

                if(ClosestID!=-1)break;
               
                 NodeDepth = 31 - as_int(clz(NodeCompletionStack));
                 if(NodeDepth<0)break; //all nodes/levels explored,quit
                 NodeCompletionStack &= ((1<<NodeDepth)-1);
                 NodeSelect = ((NodeSwitchSelect >> NodeDepth) &1) +1;             
                 NodeOffset = as_uint(ATree[NodeOffsetStack[NodeDepth] + (NodeSelect&1) ]) * 20;
                 NodeDepth++;
                 NodeOffsetStack[NodeDepth] = NodeOffset;
               
        }
       
        if(ClosestID!=-1)
        {
              write_imagef(ScreenBuffer,PixelPos,(float4)(1.0f,0.0f,0.0f,0.0f));   
        }
      }

        • Re: OpenCL crashes when compiling this kernel
          antzrhere

          I've managed to re-structure the code to take away some of the break/return values, so it now compiles & runs correctly, but my concern is (apart from it shouldn't crash anyway) is that when I expand the code and add more conditional control flow it's going to break again....

            • Re: OpenCL crashes when compiling this kernel
              MicahVillmow

              Thanks for reporting this, I've been able to reproduce the crash and will be letting the correct engineer fix the problem.

                • Re: OpenCL crashes when compiling this kernel
                  antzrhere

                  I've encountered another *different* problem, compiling for both the GPU and CPU (one hangs in infinte loop, one cannot compile)

                   

                  I managed to modify the previous code (remove some breakpoints)..  everything was working correctly, then...

                   

                  I added a single line that "if(somelocalvariable ==0)return"   - this should prematurely end the kernel (for my debug purposes). Strangely, while everything was OK before, adding this line causes the kernel to get stuck in some infinite loop causing the display driver to reset - (completely the opposite of what should happen!).

                   

                  All I can say is the compiler is taking a bad step  - in this case I can confidently say premature termination of the kernel cannot influence any other thread (i.e. there are not dependencies/comm. between threads that could cause one to infinitely loop by ending another).

                   

                  Furthermore....when I compile for the CPU device, the compiler fails with the error: "8Internal Error:  ld failed"

                   

                  Sorry you won't be able to replicate the gpu hang as there is alot of dependent data and host code, but you can test the CPU-device failing to compile...

                  code:

                   

                   

                  Code

                  #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
                  #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable

                  #define FLOAT_MAX_POS               3.40282346638528860e+38f

                  #define FLOAT_MAX_NEG               -3.40282346638528860e+38f

                  __constant sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP   |  CLK_FILTER_NEAREST;

                   

                  __kernel void  test(__constant float* TraceParamsD, __global const int* TraceParamsI,__global const float *ATree,uint const VertDataOffset,__read_only image2d_t DepthBuffer,__write_only image2d_t ScreenBuffer)
                  {
                    float3 RayPosition,RayDirection,RayDirectionInv,VWeight,TriX,TriY,ClosestVWeight;
                    float4 VertX,VertY,VertZ,OVertX,OVertY,OVertZ;
                    int2 PixelPos;
                    float xangle,yangle,val,xzlen,sinX,cosX,sinY,cosY,IntersectZ,ClosestZ,PixelWidth;
                    int TriID,ClosestID;
                    int NumPolygons;
                    uint NodeOffsetStack[32];
                    uint NodeSwitchSelect;
                    uint NodeCompletionStack;
                    uint NodeOffset;
                    uint VDATAOFFSET;
                    int NodeSelect;
                    int NodesToCheck;
                    float BSV0,BSV1;
                    float MAX_TRACE_DISTANCE;
                    int NodeDepth;
                    uint InBoundsMask;
                    uint ActiveRayMask;
                   
                     __global const float4 *TriangleVert = (const __global float4*)&ATree[VertDataOffset];
                   
                      PixelPos.x = TraceParamsI[32+get_global_id(0)];
                    PixelPos = (int2)(PixelPos.x&65535,PixelPos.x>>16);
                    PixelWidth = TraceParamsD[0];
                   

                    val = TraceParamsD[1] * PixelPos.y;
                    RayDirection = (float3)(TraceParamsD[5],TraceParamsD[6],TraceParamsD[7]) + (float3)(TraceParamsD[8],TraceParamsD[9],TraceParamsD[10])  * val;
                    RayDirection+= (((float3)(TraceParamsD[11],TraceParamsD[12],TraceParamsD[13]) + (float3)(TraceParamsD[14],TraceParamsD[15],TraceParamsD[16]) * val) - RayDirection) *(PixelWidth * PixelPos.x);
                    RayDirection = fast_normalize(RayDirection);
                    RayPosition = (float3)(convert_float(TraceParamsI[8]),convert_float(TraceParamsI[9]),convert_float(TraceParamsI[10]))+ RayDirection;
                    RayDirectionInv = safe_recip_float3(RayDirection);  

                   

                    xzlen = length((float2)(RayDirection.s0,RayDirection.s2));
                    val = (xzlen==0.0f) ? 1.0f : RayDirection.s2/xzlen;
                    //gets X angle of ray  (if xzlen >= fabs(0.99999994f) then sets manually to fix bug
                    xangle = (xzlen<=-0.99999994f) ? xangle = -3.1415926535897932384626433832795f : acos(xzlen);
                    xangle *= sign(RayDirection.s1);
                    if(xzlen>=0.99999994f)xangle= 0.0f; 
                    //gets Y angle of ray  (if val >= fabs(0.99999994f) then sets manually to fix bug
                    yangle = (val<=-0.99999994f) ? -3.1415926535897932384626433832795f : -acos(val)* sign(RayDirection.s0);
                    if(val>=0.99999994f)yangle = 0.0f;
                    //precomputed sin/cos values
                    sinX = sincos(xangle,&cosX);
                    sinY = sincos(yangle,&cosY);

                    //voxel distance
                    VertZ = read_imagef(DepthBuffer,samplerA,PixelPos);
                    MAX_TRACE_DISTANCE = VertZ.s0 - 10.0f;
                   
                    NodeDepth = 0;
                    NodeOffset = 0;
                    NodeOffsetStack[0] = 0;
                    NodeCompletionStack = 0;
                    NodeSwitchSelect = 0;
                    ClosestID=-1;
                    InBoundsMask=0xFFFFFFFF;
                    ActiveRayMask = 0xFFFFFFFF;
                   
                    while(ActiveRayMask)
                    {
                           
                            while((as_uint(ATree[NodeOffset]) & InBoundsMask) != 0)
                            {
                                   
                                   BSV0 = BoxLineIntersect(*(__global const float3*)(&ATree[NodeOffset+4]),*(__global const float3*)(&ATree[NodeOffset+8]), RayPosition,-RayDirection,-RayDirectionInv);
                                   BSV1 = BoxLineIntersect(*(__global const float3*)(&ATree[NodeOffset+12]),*(__global const float3*)(&ATree[NodeOffset+16]),RayPosition,-RayDirection,-RayDirectionInv);

                   

                                   NodesToCheck  = 2 - isequal(BSV0 , FLOAT_MAX_POS);
                                   NodesToCheck -= isequal(BSV1 , FLOAT_MAX_POS);
                                   NodeSelect    = isless(BSV1,BSV0);
                                     

                                   NodeCompletionStack  |= as_uint(max(NodesToCheck-1, 0)) << NodeDepth;

                   

                      
                                   if(NodesToCheck == 0)
                                   {         
                                            NodeDepth = 31 - clz(NodeCompletionStack);
                                            InBoundsMask = 0xFFFFFFFF + (NodeDepth<0); //all nodes/levels explored,quit
                                            NodeCompletionStack &= ((1<<NodeDepth)-1);
                                            NodeSelect = ((NodeSwitchSelect >> NodeDepth) &1) + 1;     
                                   }
                              
                                  
                                   NodeSwitchSelect = (NodeSwitchSelect & ((1<<NodeDepth)-1)) | ((NodeSelect&1) << NodeDepth);       
                                   NodeOffset = as_uint(ATree[NodeOffsetStack[NodeDepth] + (NodeSelect&1) ]);
                                   NodeDepth++;
                                   NodeOffsetStack[NodeDepth] = NodeOffset;
                                  
                                 
                           }
                           
                    
                               if(NodeDepth <0 || (NodeDepth>0 && fmin(BSV0,BSV1)>=MAX_TRACE_DISTANCE))break;        
                         
                            NumPolygons  = as_uint(ATree[NodeOffset+2]) * 3;
                            VDATAOFFSET  = as_uint(ATree[NodeOffset+3]);


                   
                             if(NumPolygons==0){  return; }   // !!!!!!!!!!! THIS IS THE LINE THAT CAUSES GOU TO HANG...should not make any difference to outcome
                   
                            ClosestZ = MAX_TRACE_DISTANCE;
                            VertZ.s3=0.0f;
                            ClosestID = -1;

                            for(TriID = 0; TriID < NumPolygons; TriID+=3)
                            {
                                   OVertX = TriangleVert[VDATAOFFSET + TriID  ] - RayPosition.s0;
                                   OVertY = TriangleVert[VDATAOFFSET + TriID+1] - RayPosition.s1;
                                   OVertZ = TriangleVert[VDATAOFFSET + TriID+2] - RayPosition.s2;

                                   VertX = mad(OVertZ,sinY,OVertX*cosY);
                                   VertZ = mad(OVertZ,cosY,-(OVertX*sinY));
                                   VertY = mad(OVertY,cosX,-(VertZ*sinX));
                                   VertZ = mad(OVertY,sinX,VertZ*cosX);
                     
                                   TriX = (float3)(VertX.s0,VertX.s1,VertX.s2);
                                   TriY = (float3)(VertY.s0,VertY.s1,VertY.s2);

                                   val = (TriX.s0 * TriY.s1) - (TriX.s0 * TriY.s2) - (TriX.s1 * TriY.s0) + (TriX.s1 * TriY.s2) + (TriX.s2 * TriY.s0) - (TriX.s2 * TriY.s1);
                                   VWeight  =  cross(TriX,TriY) * native_recip(val);

                                   if(isequal(val,0.0f)+any(as_int3(VWeight * (1.0f - VWeight)))==0)
                                      {
                                      IntersectZ = dot((float3)(VertZ.s0,VertZ.s1,VertZ.s2),VWeight);

                                         if(isless(IntersectZ,ClosestZ) & isgreaterequal(IntersectZ,1.0f))
                                        {
                                                 ClosestZ=IntersectZ;
                                                 ClosestID = TriID<<1;
                                                 ClosestVWeight = VWeight;
                                        }
                                      }

                   

                                   TriX.s0 = VertX.s3;
                                   TriY.s0 = VertY.s3;

                                   val =  (TriX.s0 * TriY.s1) - (TriX.s0 * TriY.s2) - (TriX.s1 * TriY.s0) + (TriX.s1 * TriY.s2) + (TriX.s2 * TriY.s0) - (TriX.s2 * TriY.s1);
                                   VWeight  =  cross(TriX,TriY) * native_recip(val);

                   

                                   if(isequal(val,0.0f)+any(as_int3(VWeight * (1.0f - VWeight)))==0)
                                      {
                                        IntersectZ = dot((float3)(VertZ.s3,VertZ.s1,VertZ.s2),VWeight);
                                          if(isless(IntersectZ,ClosestZ) & isgreaterequal(IntersectZ,1.0f))
                                          {
                                                      ClosestZ=IntersectZ;
                                              ClosestID = (TriID<<1)+1;
                                              ClosestVWeight = VWeight;
                                              }
                                      }

                            }

                    
                                 
                             NodeDepth = 31 - as_int(clz(NodeCompletionStack));
                             if(ClosestID!=-1 || NodeDepth<0)ActiveRayMask=0;
                             NodeCompletionStack &= ((1<<NodeDepth)-1);
                             NodeSelect = ((NodeSwitchSelect >> NodeDepth) &1) +1;     
                             //traverses further along tree        
                             NodeOffset = as_uint(ATree[NodeOffsetStack[NodeDepth&ActiveRayMask] + (NodeSelect&1) ]);
                             NodeDepth++;
                             NodeOffsetStack[NodeDepth] = NodeOffset;
                           
                    }
                   
                    if(ClosestID>=0)
                    {
                          write_imagef(ScreenBuffer,PixelPos,(float4)(1.0f,0.0f,0.0f,0.0f));   
                    }
                   
                  }

                  • Re: OpenCL crashes when compiling this kernel
                    antzrhere

                    Just to confirm the CPU compilation fails even without the   "if(NumPolygons==0){return;}" statement where the GPU compiles and runs fine when this is deleted.