AnsweredAssumed Answered

Kernel crashes compiler only for Tahiti (7970), loop unroll problem?

Question asked by drallan on Feb 20, 2012
Latest reply on Feb 20, 2012 by MicahVillmow

This looks to be a new loop unroll problem that crashes the installed compiler and Kernel Analyzer.  The crash only occurs when compiling the kernel for the GCN Tahiti architecture. I have included the full kernel below in the code box. notes:

 

   1. Crash can be prevented by using a very small loop counter.

   2. Crash did not occur until the write_mem_fence() statement was added, removing it prevents crash.

   3. Compiler version is Installed Driver (12.1 - 8.921-111202a-129903E-ATI) CAL(1.4.1658)

 

I installed preview 12.1a drivers before installing the 7970 card however, the 12.1a driver refused to recognize any GPUs in my system when the 7970 card was plugged in. I then installed the drivers that came with the card, which recognized all GPUs (Barts and Cayman and the Tahiti). I assume the installed driver (above) came from the Tahiti installation disk.

 

Also, this does not appear to be the same as an earlier kernel that crashed in a similar way. That kernel crashed all architectures in CAL 11.11 and 11.12 and was fixed as of 12.1.

 

For reference the earlier kernel is. 

      _kernel void systest(__global int *A){ int i; for(i=0;i<0x8000;i++)A[i<<9]=5;} 

 

The larger attached kernel is the one crashing under Tahiti.

Can anyone compile the below kernel for Tahiti? (the question!)

 

/*
Crashes in KernelAnalyzer using:
    Installed Driver (12.1 - 8.921-111202a-129903E-ATI) CAL(1.4.1658)
    GPU:Tahiti (only)
Crashes in applcation with same driver
Does not crash with other GPUs selected

Note 1. Using a very small loop counter prevents the crash
Note 2. The kernel only crashed after adding the write_mem_fence() statement

*/
//------------------------------------------------------------------------------------------
#pragma OPENCL EXTENSION cl_amd_popcnt : enable
#define __FAST_RELAXED_MATH__
#define __MAD_ENABLE__

#define LOC_6970
#define BAR_6970
#define DAMP0 0.999975
#define LSIZE 64
#define LMASK (LSIZE-1)
#define H6(_a,_b,_c,_d,_e,_f) \
        X.x=_b.w+_c.y;    \
        X.y=_c.x+_c.z;    \
        X.z=_c.y+_c.w;    \
        X.w=_c.z+_d.x;    \
        X= fac*X-_c;      \
        X= X+_c*_f;       \
       _a=_a _e dt*X;   \
       _a=_a*damp;

//------------------------------------------------------------------------------------------
__kernel void longline(
    __global float4 *restrict rin,
    __global float4 *restrict iin,
    __global float4 *restrict rout,
    __global float4 *restrict iout,
    __global float4 *restrict pot,
      const int   ops,
      const float gdt)
{
    int gx = get_global_id(0);
    int lid = get_local_id(0);
    int gy = get_global_id(1);
    int i,pid,mid,oad,rad;
    float last;

    float4 fac=0.5f,damp;
    float4 A0,A1,A2,A3,L,R,X,dt;
    float4 B0,B1,B2,B3;
    float4 V0,V1,V2,V3;
    float4 A4,A5,A6,A7;
    float4 B4,B5,B6,B7;
    float4 V4,V5,V6,V7;
    __local float4 lr[LSIZE];

    damp=(lid<7||lid>56)?DAMP0:1.0f;
    dt=gdt;
    oad=rad=(lid<<3)+gy*(2048/4);
    A0=rin[rad  ];A1=rin[rad+1];A2=rin[rad+2];A3=rin[rad+3];
    B0=iin[rad  ];B1=iin[rad+1];B2=iin[rad+2];B3=iin[rad+3];
    V0=pot[rad  ];V1=pot[rad+1];V2=pot[rad+2];V3=pot[rad+3];
    A4=rin[rad+4];A5=rin[rad+5];A6=rin[rad+6];A7=rin[rad+7];
    B4=iin[rad+4];B5=iin[rad+5];B6=iin[rad+6];B7=iin[rad+7];
    V4=pot[rad+4];V5=pot[rad+5];V6=pot[rad+6];V7=pot[rad+7];

    lr[lid].x=A0.x;
    lr[lid].w=A7.w;
    pid=(lid+1)&LMASK;
    mid=(lid-1)&LMASK;

    for(i=0;i<2000;i++){

        L.w=lr[mid].w;
        R.x=lr[pid].x;
        H6(B0,L ,A0,A1,+,V0)
        H6(B1,A0,A1,A2,+,V1)
        H6(B2,A1,A2,A3,+,V2)
        H6(B3,A2,A3,A4,+,V3)
        H6(B4,A3,A4,A5,+,V4)
        H6(B5,A4,A5,A6,+,V5)
        H6(B6,A5,A6,A7,+,V6)
        H6(B7,A6,A7,R ,+,V7)
        last=B7.w;
        lr[lid].x=B0.x;
        lr[lid].w=last;
        write_mem_fence(CLK_LOCAL_MEM_FENCE);                  // <------- adding this statement causes crash


        L.w=lr[mid].w;
        R.x=lr[pid].x;
        H6(A0,L ,B0,B1,-,V0)
        H6(A1,B0,B1,B2,-,V1)
        H6(A2,B1,B2,B3,-,V2)
        H6(A3,B2,B3,B4,-,V3)
        H6(A4,B3,B4,B5,-,V4)
        H6(A5,B4,B5,B6,-,V5)
        H6(A6,B5,B6,B7,-,V6)
        H6(A7,B6,B7,R ,-,V7)
        last=A7.w;
        lr[lid].x=A0.x;
        lr[lid].w=last;
   //     write_mem_fence(CLK_LOCAL_MEM_FENCE);
        }

    rout[oad  ]=A0;rout[oad+1]=A1;rout[oad+2]=A2;rout[oad+3]=A3;
    iout[oad  ]=B0;iout[oad+1]=B1;iout[oad+2]=B2;iout[oad+3]=B3;
    rout[oad+4]=A4;rout[oad+5]=A5;rout[oad+6]=A6;rout[oad+7]=A7;
    iout[oad+4]=B4;iout[oad+5]=B5;iout[oad+6]=B6;iout[oad+7]=B7;
}
//-----------------------------------------------------------------------------------

Attachments

Outcomes