This looks to be a new loop unroll problem that crashes the installed compiler and Kernel Analyzer. The crash only occurs when compiling the kernel for the GCN Tahiti architecture. I have included the full kernel below in the code box. notes:
1. Crash can be prevented by using a very small loop counter.
2. Crash did not occur until the write_mem_fence() statement was added, removing it prevents crash.
3. Compiler version is Installed Driver (12.1 - 8.921-111202a-129903E-ATI) CAL(1.4.1658)
I installed preview 12.1a drivers before installing the 7970 card however, the 12.1a driver refused to recognize any GPUs in my system when the 7970 card was plugged in. I then installed the drivers that came with the card, which recognized all GPUs (Barts and Cayman and the Tahiti). I assume the installed driver (above) came from the Tahiti installation disk.
Also, this does not appear to be the same as an earlier kernel that crashed in a similar way. That kernel crashed all architectures in CAL 11.11 and 11.12 and was fixed as of 12.1.
For reference the earlier kernel is.
_kernel void systest(__global int *A){ int i; for(i=0;i<0x8000;i++)A[i<<9]=5;}
The larger attached kernel is the one crashing under Tahiti.
Can anyone compile the below kernel for Tahiti? (the question!)
/*
Crashes in KernelAnalyzer using:
Installed Driver (12.1 - 8.921-111202a-129903E-ATI) CAL(1.4.1658)
GPU:Tahiti (only)
Crashes in applcation with same driver
Does not crash with other GPUs selected
Note 1. Using a very small loop counter prevents the crash
Note 2. The kernel only crashed after adding the write_mem_fence() statement
*/
//------------------------------------------------------------------------------------------
#pragma OPENCL EXTENSION cl_amd_popcnt : enable
#define __FAST_RELAXED_MATH__
#define __MAD_ENABLE__
#define LOC_6970
#define BAR_6970
#define DAMP0 0.999975
#define LSIZE 64
#define LMASK (LSIZE-1)
#define H6(_a,_b,_c,_d,_e,_f) \
X.x=_b.w+_c.y; \
X.y=_c.x+_c.z; \
X.z=_c.y+_c.w; \
X.w=_c.z+_d.x; \
X= fac*X-_c; \
X= X+_c*_f; \
_a=_a _e dt*X; \
_a=_a*damp;
//------------------------------------------------------------------------------------------
__kernel void longline(
__global float4 *restrict rin,
__global float4 *restrict iin,
__global float4 *restrict rout,
__global float4 *restrict iout,
__global float4 *restrict pot,
const int ops,
const float gdt)
{
int gx = get_global_id(0);
int lid = get_local_id(0);
int gy = get_global_id(1);
int i,pid,mid,oad,rad;
float last;
float4 fac=0.5f,damp;
float4 A0,A1,A2,A3,L,R,X,dt;
float4 B0,B1,B2,B3;
float4 V0,V1,V2,V3;
float4 A4,A5,A6,A7;
float4 B4,B5,B6,B7;
float4 V4,V5,V6,V7;
__local float4 lr[LSIZE];
damp=(lid<7||lid>56)?DAMP0:1.0f;
dt=gdt;
oad=rad=(lid<<3)+gy*(2048/4);
A0=rin[rad ];A1=rin[rad+1];A2=rin[rad+2];A3=rin[rad+3];
B0=iin[rad ];B1=iin[rad+1];B2=iin[rad+2];B3=iin[rad+3];
V0=pot[rad ];V1=pot[rad+1];V2=pot[rad+2];V3=pot[rad+3];
A4=rin[rad+4];A5=rin[rad+5];A6=rin[rad+6];A7=rin[rad+7];
B4=iin[rad+4];B5=iin[rad+5];B6=iin[rad+6];B7=iin[rad+7];
V4=pot[rad+4];V5=pot[rad+5];V6=pot[rad+6];V7=pot[rad+7];
lr[lid].x=A0.x;
lr[lid].w=A7.w;
pid=(lid+1)&LMASK;
mid=(lid-1)&LMASK;
for(i=0;i<2000;i++){
L.w=lr[mid].w;
R.x=lr[pid].x;
H6(B0,L ,A0,A1,+,V0)
H6(B1,A0,A1,A2,+,V1)
H6(B2,A1,A2,A3,+,V2)
H6(B3,A2,A3,A4,+,V3)
H6(B4,A3,A4,A5,+,V4)
H6(B5,A4,A5,A6,+,V5)
H6(B6,A5,A6,A7,+,V6)
H6(B7,A6,A7,R ,+,V7)
last=B7.w;
lr[lid].x=B0.x;
lr[lid].w=last;
write_mem_fence(CLK_LOCAL_MEM_FENCE); // <------- adding this statement causes crash
L.w=lr[mid].w;
R.x=lr[pid].x;
H6(A0,L ,B0,B1,-,V0)
H6(A1,B0,B1,B2,-,V1)
H6(A2,B1,B2,B3,-,V2)
H6(A3,B2,B3,B4,-,V3)
H6(A4,B3,B4,B5,-,V4)
H6(A5,B4,B5,B6,-,V5)
H6(A6,B5,B6,B7,-,V6)
H6(A7,B6,B7,R ,-,V7)
last=A7.w;
lr[lid].x=A0.x;
lr[lid].w=last;
// write_mem_fence(CLK_LOCAL_MEM_FENCE);
}
rout[oad ]=A0;rout[oad+1]=A1;rout[oad+2]=A2;rout[oad+3]=A3;
iout[oad ]=B0;iout[oad+1]=B1;iout[oad+2]=B2;iout[oad+3]=B3;
rout[oad+4]=A4;rout[oad+5]=A5;rout[oad+6]=A6;rout[oad+7]=A7;
iout[oad+4]=B4;iout[oad+5]=B5;iout[oad+6]=B6;iout[oad+7]=B7;
}
//-----------------------------------------------------------------------------------