1 Reply Latest reply on Feb 20, 2012 1:05 PM by MicahVillmow

    Kernel crashes compiler only for Tahiti (7970), loop unroll problem?

    drallan

      This looks to be a new loop unroll problem that crashes the installed compiler and Kernel Analyzer.  The crash only occurs when compiling the kernel for the GCN Tahiti architecture. I have included the full kernel below in the code box. notes:

       

         1. Crash can be prevented by using a very small loop counter.

         2. Crash did not occur until the write_mem_fence() statement was added, removing it prevents crash.

         3. Compiler version is Installed Driver (12.1 - 8.921-111202a-129903E-ATI) CAL(1.4.1658)

       

      I installed preview 12.1a drivers before installing the 7970 card however, the 12.1a driver refused to recognize any GPUs in my system when the 7970 card was plugged in. I then installed the drivers that came with the card, which recognized all GPUs (Barts and Cayman and the Tahiti). I assume the installed driver (above) came from the Tahiti installation disk.

       

      Also, this does not appear to be the same as an earlier kernel that crashed in a similar way. That kernel crashed all architectures in CAL 11.11 and 11.12 and was fixed as of 12.1.

       

      For reference the earlier kernel is. 

            _kernel void systest(__global int *A){ int i; for(i=0;i<0x8000;i++)A[i<<9]=5;} 

       

      The larger attached kernel is the one crashing under Tahiti.

      Can anyone compile the below kernel for Tahiti? (the question!)

       

      /*
      Crashes in KernelAnalyzer using:
          Installed Driver (12.1 - 8.921-111202a-129903E-ATI) CAL(1.4.1658)
          GPU:Tahiti (only)
      Crashes in applcation with same driver
      Does not crash with other GPUs selected
      
      Note 1. Using a very small loop counter prevents the crash
      Note 2. The kernel only crashed after adding the write_mem_fence() statement
      
      */
      //------------------------------------------------------------------------------------------
      #pragma OPENCL EXTENSION cl_amd_popcnt : enable
      #define __FAST_RELAXED_MATH__
      #define __MAD_ENABLE__
      
      #define LOC_6970
      #define BAR_6970
      #define DAMP0 0.999975
      #define LSIZE 64
      #define LMASK (LSIZE-1)
      #define H6(_a,_b,_c,_d,_e,_f) \
              X.x=_b.w+_c.y;    \
              X.y=_c.x+_c.z;    \
              X.z=_c.y+_c.w;    \
              X.w=_c.z+_d.x;    \
              X= fac*X-_c;      \
              X= X+_c*_f;       \
             _a=_a _e dt*X;   \
             _a=_a*damp;
      
      //------------------------------------------------------------------------------------------
      __kernel void longline(
          __global float4 *restrict rin,
          __global float4 *restrict iin,
          __global float4 *restrict rout,
          __global float4 *restrict iout,
          __global float4 *restrict pot,
            const int   ops,
            const float gdt)
      {
          int gx = get_global_id(0);
          int lid = get_local_id(0);
          int gy = get_global_id(1);
          int i,pid,mid,oad,rad;
          float last;
      
          float4 fac=0.5f,damp;
          float4 A0,A1,A2,A3,L,R,X,dt;
          float4 B0,B1,B2,B3;
          float4 V0,V1,V2,V3;
          float4 A4,A5,A6,A7;
          float4 B4,B5,B6,B7;
          float4 V4,V5,V6,V7;
          __local float4 lr[LSIZE];
      
          damp=(lid<7||lid>56)?DAMP0:1.0f;
          dt=gdt;
          oad=rad=(lid<<3)+gy*(2048/4);
          A0=rin[rad  ];A1=rin[rad+1];A2=rin[rad+2];A3=rin[rad+3];
          B0=iin[rad  ];B1=iin[rad+1];B2=iin[rad+2];B3=iin[rad+3];
          V0=pot[rad  ];V1=pot[rad+1];V2=pot[rad+2];V3=pot[rad+3];
          A4=rin[rad+4];A5=rin[rad+5];A6=rin[rad+6];A7=rin[rad+7];
          B4=iin[rad+4];B5=iin[rad+5];B6=iin[rad+6];B7=iin[rad+7];
          V4=pot[rad+4];V5=pot[rad+5];V6=pot[rad+6];V7=pot[rad+7];
      
          lr[lid].x=A0.x;
          lr[lid].w=A7.w;
          pid=(lid+1)&LMASK;
          mid=(lid-1)&LMASK;
      
          for(i=0;i<2000;i++){
      
              L.w=lr[mid].w;
              R.x=lr[pid].x;
              H6(B0,L ,A0,A1,+,V0)
              H6(B1,A0,A1,A2,+,V1)
              H6(B2,A1,A2,A3,+,V2)
              H6(B3,A2,A3,A4,+,V3)
              H6(B4,A3,A4,A5,+,V4)
              H6(B5,A4,A5,A6,+,V5)
              H6(B6,A5,A6,A7,+,V6)
              H6(B7,A6,A7,R ,+,V7)
              last=B7.w;
              lr[lid].x=B0.x;
              lr[lid].w=last;
              write_mem_fence(CLK_LOCAL_MEM_FENCE);                  // <------- adding this statement causes crash
      
      
              L.w=lr[mid].w;
              R.x=lr[pid].x;
              H6(A0,L ,B0,B1,-,V0)
              H6(A1,B0,B1,B2,-,V1)
              H6(A2,B1,B2,B3,-,V2)
              H6(A3,B2,B3,B4,-,V3)
              H6(A4,B3,B4,B5,-,V4)
              H6(A5,B4,B5,B6,-,V5)
              H6(A6,B5,B6,B7,-,V6)
              H6(A7,B6,B7,R ,-,V7)
              last=A7.w;
              lr[lid].x=A0.x;
              lr[lid].w=last;
         //     write_mem_fence(CLK_LOCAL_MEM_FENCE);
              }
      
          rout[oad  ]=A0;rout[oad+1]=A1;rout[oad+2]=A2;rout[oad+3]=A3;
          iout[oad  ]=B0;iout[oad+1]=B1;iout[oad+2]=B2;iout[oad+3]=B3;
          rout[oad+4]=A4;rout[oad+5]=A5;rout[oad+6]=A6;rout[oad+7]=A7;
          iout[oad+4]=B4;iout[oad+5]=B5;iout[oad+6]=B6;iout[oad+7]=B7;
      }
      //-----------------------------------------------------------------------------------