AnsweredAssumed Answered

OpenCL crashes when compiling this kernel

Question asked by antzrhere on Feb 2, 2012
Latest reply on Feb 3, 2012 by antzrhere

I'm using Catalyst 12.2 preview, Windows 7 64bit, AMD SDK 2.6, ATI Radeon 5870.

 

The following kernel crashes my program upon compilation every time without fail. It also crashes Kernel App.

The problem appears to lie with control flow as if I remove/add some break/return points in the code  it compiles (although I'm pretty sure it stiill doesn't run correctly when it does compile).

 

Any help? The problem is I haven't even began to develop the code and it won't compile at this stage which doesn't bode well... 

 

Here is the code (apologies for the poor formatting):

 

 

 

Code

#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable

 

#define FLOAT_MAX_POS               3.40282346638528860e+38f
#define FLOAT_MAX_NEG               -3.40282346638528860e+38f

__constant sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP   |  CLK_FILTER_NEAREST;

 

inline float3 safe_recip_float3(const float3 Val)
{
   const float3 valinv = 1.0f / Val;
   return as_float3(as_int3(valinv) + isinf(valinv)) ;
};   


inline float BoxLineIntersect(const float3 BoxMin,const float3 BoxMax,const float3 LineStartingPos,const float3 LineVector,const float3 LineVectorInv)
{
      
     uint InsideMask = (any(as_int3(LineStartingPos-BoxMin)) |  any(as_int3(BoxMax-LineStartingPos)))  * 0xFFFFFFFF;
    
    // calculates t1 and t2 intersection positions for each component
     float3 T1 = (LineStartingPos - BoxMin) * LineVectorInv;
     float3 T2 = (LineStartingPos - BoxMax) * LineVectorInv;

     //gets rid of infinites    
     T1 = as_float3( as_int3(T1) + isinf(T1)  );
     T2 = as_float3( as_int3(T2) + isinf(T2)  );
    
     //swaps components so T1<=T2
     float3 T1B = fmin(T1,T2);
     float3 T2B = fmax(T1,T2);
    
     float TNEAR = fmax(T1B.x, fmax(T1B.y, fmax(T1B.z, FLOAT_MAX_NEG) ));
     float TFAR  = fmin(T2B.x, fmin(T2B.y, fmin(T2B.z, FLOAT_MAX_POS) ));
    
     //parallel but not inside test
     int3 ParallelTest = 0;//isequal(LineVector, 0.0f) & (isless(LineStartingPos, BoxMin) | isgreater(LineStartingPos, BoxMin));
   
     //returns max float pos value if no intersection otherwise returns intersection distance
     return  ((ParallelTest.x | ParallelTest.y | ParallelTest.z | isgreater(TNEAR,TFAR) | isless(TNEAR,0.0f))&InsideMask) == 0 ? TNEAR : FLOAT_MAX_POS;

}


        
            

__kernel void  AceTraceRender(__constant float* TraceParamsD, __global const int* TraceParamsI,const __global const float *ATree,const unsigned int VertDataOffset,__read_only image2d_t DepthBuffer,__write_only image2d_t ScreenBuffer)
{

  float3 RayPosition,RayDirection,RayDirectionInv,VWeight,TriX,TriY,ClosestVWeight;
  float4 VertX,VertY,VertZ,OVertX,OVertY,OVertZ;
  int2 PixelPos;
  float xangle,yangle,val,xzlen,sinX,cosX,sinY,cosY,IntersectZ,ClosestZ,PixelWidth;
  int TriID,ClosestID;
  int NumPolygons;
  uint NodeOffsetStack[32];
  uint NodeSwitchSelect;
  uint NodeCompletionStack;
  uint Tmp;
  uint NodeOffset;
  uint VDATAOFFSET;
  int NodeSelect;
  int NodesToCheck;
  float BSV0,BSV1;
  float MAX_TRACE_DISTANCE;
  int NodeDepth;

  const __global float4 *TriangleVert = (const __global float4*)&ATree[VertDataOffset];


  PixelPos.x = TraceParamsI[32+get_global_id(0)];
  PixelPos = (int2)(PixelPos.x&65535,PixelPos.x>>16);
  PixelWidth = TraceParamsD[0];
 
  
  val = TraceParamsD[1] * PixelPos.y;
  RayDirection = (float3)(TraceParamsD[5],TraceParamsD[6],TraceParamsD[7]) + (float3)(TraceParamsD[8],TraceParamsD[9],TraceParamsD[10])  * val;
  RayDirection+= (((float3)(TraceParamsD[11],TraceParamsD[12],TraceParamsD[13]) + (float3)(TraceParamsD[14],TraceParamsD[15],TraceParamsD[16]) * val) - RayDirection) *(PixelWidth * PixelPos.x);
  RayDirection = fast_normalize(RayDirection);
  RayPosition = (float3)(convert_float(TraceParamsI[8]),convert_float(TraceParamsI[9]),convert_float(TraceParamsI[10]))+ RayDirection;
  RayDirectionInv = safe_recip_float3(RayDirection);  

  xzlen = length((float2)(RayDirection.s0,RayDirection.s2));
  val = (xzlen==0.0f) ? 1.0f : RayDirection.s2/xzlen;
  xangle = (xzlen<=-0.99999994f) ? xangle = -3.1415926535897932384626433832795f : acos(xzlen);
  xangle *= sign(RayDirection.s1);
  if(xzlen>=0.99999994f)xangle= 0.0f; 
 
  yangle = (val<=-0.99999994f) ? -3.1415926535897932384626433832795f : -acos(val)* sign(RayDirection.s0);
  if(val>=0.99999994f)yangle = 0.0f;
 
  sinX = sincos(xangle,&cosX);
  sinY = sincos(yangle,&cosY);

  VertZ = read_imagef(DepthBuffer,samplerA,PixelPos);
  MAX_TRACE_DISTANCE = VertZ.s0 - 10.0f;
 
  NodeDepth = 0;
  NodeOffset = 0;
  NodeOffsetStack[0] = 0;
  NodeCompletionStack = 0;
  NodeSwitchSelect = 0;

  while(1)
  {
          while(1)
          {
                 if(as_uint(ATree[NodeOffset]) == 0)break;
        
                 BSV0 = BoxLineIntersect(*(__global const float3*)(&ATree[NodeOffset+4]),*(__global const float3*)(&ATree[NodeOffset+8]), RayPosition,-RayDirection,-RayDirectionInv);
                 BSV1 = BoxLineIntersect(*(__global const float3*)(&ATree[NodeOffset+12]),*(__global const float3*)(&ATree[NodeOffset+16]),RayPosition,-RayDirection,-RayDirectionInv);

                 NodesToCheck  = 2 - isequal(BSV0 , FLOAT_MAX_POS);
                 NodesToCheck -= isequal(BSV1 , FLOAT_MAX_POS);
                 NodeSelect    = isless(BSV1,BSV0);
                   
                 NodeCompletionStack  |= as_uint(max(NodesToCheck-1, 0)) << NodeDepth;

                 if(NodesToCheck == 0)
                 {
                          NodeDepth = 31 - clz(NodeCompletionStack);
                          if(NodeDepth<0)break; //all nodes/levels explored,quit
                          NodeCompletionStack &= ((1<<NodeDepth)-1);
                          NodeSelect = ((NodeSwitchSelect >> NodeDepth) &1) +1;     
                 }
        
                 NodeSwitchSelect = (NodeSwitchSelect & ((1<<NodeDepth)-1)) | ((NodeSelect&1) << NodeDepth);       
                 NodeOffset = as_uint(ATree[NodeOffsetStack[NodeDepth] + (NodeSelect&1) ]) * 20;
                 NodeDepth++;
                 NodeOffsetStack[NodeDepth] = NodeOffset;
         }
         
          if(NodeDepth <0 )return;               
         
    NumPolygons  = as_uint(ATree[NodeOffset+2]) * 3;
          VDATAOFFSET  = as_uint(ATree[NodeOffset+3]);
 
          ClosestZ = MAX_TRACE_DISTANCE;
          VertZ.s3=0.0f;
          ClosestID = -1;

          for(TriID = 0; TriID < NumPolygons; TriID+=3)
          {
                 OVertX = TriangleVert[VDATAOFFSET + TriID  ] - RayPosition.s0;
                 OVertY = TriangleVert[VDATAOFFSET + TriID+1] - RayPosition.s1;
                 OVertZ = TriangleVert[VDATAOFFSET + TriID+2] - RayPosition.s2;

                 VertX = mad(OVertZ,sinY,OVertX*cosY);
                 VertZ = mad(OVertZ,cosY,-(OVertX*sinY));
                 VertY = mad(OVertY,cosX,-(VertZ*sinX));
                 VertZ = mad(OVertY,sinX,VertZ*cosX);
   
                 TriX = (float3)(VertX.s0,VertX.s1,VertX.s2);
                 TriY = (float3)(VertY.s0,VertY.s1,VertY.s2);

                 val = (TriX.s0 * TriY.s1) - (TriX.s0 * TriY.s2) - (TriX.s1 * TriY.s0) + (TriX.s1 * TriY.s2) + (TriX.s2 * TriY.s0) - (TriX.s2 * TriY.s1);
                 VWeight  =  cross(TriX,TriY) * native_recip(val);

                 if(isequal(val,0.0f)+any(as_int3(VWeight * (1.0f - VWeight)))==0)
                    {
                    IntersectZ = dot((float3)(VertZ.s0,VertZ.s1,VertZ.s2),VWeight);

                       if(isless(IntersectZ,ClosestZ) & isgreaterequal(IntersectZ,1.0f))
                      {
                               ClosestZ=IntersectZ;
                       ClosestID = TriID<<1;
                       ClosestVWeight = VWeight;
                      }
                    }

                 TriX.s0 = VertX.s3;
                 TriY.s0 = VertY.s3;

                 val =  (TriX.s0 * TriY.s1) - (TriX.s0 * TriY.s2) - (TriX.s1 * TriY.s0) + (TriX.s1 * TriY.s2) + (TriX.s2 * TriY.s0) - (TriX.s2 * TriY.s1);
                 VWeight  =  cross(TriX,TriY) * native_recip(val);

                 if(isequal(val,0.0f)+any(as_int3(VWeight * (1.0f - VWeight)))==0)
                    {
                      IntersectZ = dot((float3)(VertZ.s3,VertZ.s1,VertZ.s2),VWeight);
                        if(isless(IntersectZ,ClosestZ) & isgreaterequal(IntersectZ,1.0f))
                        {
                                    ClosestZ=IntersectZ;
                            ClosestID = (TriID<<1)+1;
                            ClosestVWeight = VWeight;
                            }
                    }
          }

          if(ClosestID!=-1)break;
         
           NodeDepth = 31 - as_int(clz(NodeCompletionStack));
           if(NodeDepth<0)break; //all nodes/levels explored,quit
           NodeCompletionStack &= ((1<<NodeDepth)-1);
           NodeSelect = ((NodeSwitchSelect >> NodeDepth) &1) +1;             
           NodeOffset = as_uint(ATree[NodeOffsetStack[NodeDepth] + (NodeSelect&1) ]) * 20;
           NodeDepth++;
           NodeOffsetStack[NodeDepth] = NodeOffset;
         
  }
 
  if(ClosestID!=-1)
  {
        write_imagef(ScreenBuffer,PixelPos,(float4)(1.0f,0.0f,0.0f,0.0f));   
  }
}

Outcomes