Compiles code that runs at only 50% the speed it did on 13.12. Running 4x 5870 class GPU cores on one system. Uninstalled the driver for now.
Hi,
Would it be possible for you to upload the code which exhibits it? We are not witnessing this in the code we are running.
Okay. Give me a day so I can image the system in question and that will
allow me to switch back and forth quickly from one image to the other
(with 13.12 on one and 14.4 on the other image).
The program is for my own professional development, and plays a card
game. It's going to be (c) 2014 Jeff Kesner Consulting Services. I'm
sure it'll be fine all around to send it to you.
Basically the 13.12 OpenCL plays 33,000,000 games per hour per shader on
the 5870/5970s and the 14.4 plays 17,000,000 games per hour per shader.
Maybe I should compare the assembly code from one version to the other
to see if it is the OpenCL compiler and not the boilerplate that is
doing this. If you can wait, I will do this comparison on my own after
I make the CCC driver 14.4 system image.
Okay I ran a couple of tests. It has 50% of the 13.12 driver's OpenCL performance both on the R9 270 and the 5870/5970 cards. Here is the main part of the kernel that slows down under 14.4: I can send you the whole program if you want, all it does is play a child's card game called "Beggar My Neighbor" looking for unusually long games. The core part of the kernel loop is near the declaration of "alu".
enum eParms { PARM_START_POS, PARM_CARDSS0, PARM_BARRIER_SIZE, PARM_LOCAL_THREADS, PARM_THREADS_PER_DEV, PARM_PREV_MAX, PARM_COUNT};
enum eMaxes { MAXES_TURNS, MAXES_START_POS, MAXES_SHIFT_COUNT, MAXES_INNER_LOOPS, MAXES_SHIFTS_DONE, MAXES_COUNT};
#define __ULONG2 ulong2
#define __ULONG3 ulong3
#define __ULONG4 ulong4
#define __ULONG8 ulong8
#define __LONG4 long4
#define vlp0 control.s0
#define vlp1 control.s2
#define vsp0 control.s3
#define vsp1 control.s1
__kernel void beggar_kernel_play_vector (
__global ulong *gParms,
__global ulong *gDeck,
__global uint *gMaxes)
// __global uint *gDebugArea)
{
uint threadId = (uint) get_global_id(0);
ushort localId = (ushort)get_local_id(0);
uint startPos=0;
ushort turns;
uint innerLoops=0;
uchar maxesShiftCount=0;
uint maxesStartPos=0;
#ifdef EMULATOR_REFERENCE
ushort player=0;
#endif
ushort maxCount=0;
char4 control, defControl;
__local uint startOffset;
__ULONG4 cards0, cards1, cardBase;
// 1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000
ulong bitCounts = 0x0102010301020104;
uint game;
ushort cardShift;
int totalLoops = (uint)(gParms[PARM_BARRIER_SIZE]*gParms[PARM_LOCAL_THREADS]);
if (threadId == 0)
{
gParms[PARM_PREV_MAX]=0;
}
barrier(CLK_GLOBAL_MEM_FENCE);
// 12c4 = 495, 8c4 = 70 70*495 = 34650 16c4 = 1820
if (localId==0)
{
startOffset=atomic_add((__global int *)&gParms[PARM_PREV_MAX],totalLoops);
}
barrier(CLK_LOCAL_MEM_FENCE);
turns=0;
vlp0=vlp1=26;
vsp0=vsp1=-26;
defControl = control;
uchar sturns;
// Run until we run out of work units
game = atomic_add(&startOffset,1); // This provides uniformity of completion time across the 1500ish threads.
totalLoops += game;
while (game<totalLoops)
{
cardBase.s0 = gParms[PARM_CARDSS0];
cardBase.hi = vload2(game,gDeck);
cardBase.s1=cardBase.s2;
cardBase.s2=cardBase.s3;
cards0 = cardBase;
cards1 = cards0>>26;
cards0&=0x3ffffff;
// 7225 Reference solution @ startPos=29184574
// cards0 = (long3)(0x1164494, 0x24014, 0x1120004)
// cards1 = (long3)(0x1c90202, 0x490200, 0x1890000)
startPos = (uint)gParms[PARM_START_POS]+game;
cardShift=1;
while (cardShift<=52) // This will be about 52 * 250 loops usually
{
++innerLoops;
#if 0
cards0=0x780413240e601; // This crashes vlp1 to a negative number.
cards1=0x1e0;
vlp0=43;
vsp0=51;
vlp1=1;
vsp1=9;
//617: 0 ? ? 7c02099207300 3e 55 55 -3 -3
#endif
#if 0
sturns = min((uchar)vlp1,(uchar)vlp0);
sturns = min(
(uchar)sturns,
min( (uchar)((bitCounts>>((cards0.s0<<2)&31))&3), (uchar)((bitCounts>>((cards1.s0<<2)&31))&3)) // This section adds 20% to loop time, but improves overall performance.
);
cards0>>=(ulong)sturns;
cards1>>=(ulong)sturns;
control -= (char)sturns;
turns+=sturns+sturns+1;
#else
++turns;
#endif
#ifdef EMULATOR_REFERENCE
if (threadId==0)
{
printf("%4d: %d ? ? %12I64x %12I64x : %12I64x %12I64x %12I64x %12I64x\n", turns-1, (int)player, cards0.s0, cards1.s0, cards0.s1, cards1.s1, cards0.s2, cards1.s2);
// fprintf(fp, "%4d: %d ? ? %12I64x %12I64x : %12I64x %12I64x %12I64x %12I64x\n", turns-1, (int)player, cards0.s0, cards1.s0, cards0.s1, cards1.s1, cards0.s2, cards1.s2);
}
#endif
ulong mask;
uchar next_card;
__ULONG4 alu = cards0 & 1;
next_card = (uchar)(alu.s0 + alu.s1 + alu.s2 + alu.s2);
cards0>>=1;
cards1 |= alu <<(ulong)-vsp1;
cards0 |= alu <<(ulong)(-vsp0-1);
--control.lo;
sturns=min((uchar)vlp1,min(next_card,(uchar)((bitCounts>>((cards1.s0&15)<<2))&7)));
cards1>>=(ulong)sturns;
control.hi-=(char)sturns;
turns+=(ushort)sturns;
next_card = next_card && (uchar)sturns==next_card;
vlp0=select(vlp0,(char)-vsp0,(char)next_card);
vsp1=select(vsp1,(char)-vlp1,(char)next_card);
mask = ((ulong)1<<(-vsp1))-1;
cards1&=mask;
if (!next_card)
{
#ifdef EMULATOR_REFERENCE
player^=1;
#endif
control.hi ^= control.lo; control.lo ^= control.hi; control.hi ^= control.lo;
cards0^=cards1; cards1^=cards0; cards0^=cards1;
}
if (turns>maxCount) // The presence of this if and the following code makes little impact on performance if removed (at 40M/s innerloops)
{
maxesStartPos = startPos;
maxCount=turns;
maxesShiftCount=(uchar)cardShift;
}
sturns=(uchar)min(vlp0,vlp1);
// End of game maintenance
if (sturns==0)
{
#ifdef EMULATOR_DEBUG
printf("Game %d end %d\n",game+cardShift,threadId);
#endif
ulong mask;
mask = ((ulong)1<<((ulong)cardShift))-1;
cards0 = (cardBase >> cardShift) | ((cardBase & mask)<< ((ulong)52-(ulong)cardShift));
cards1 = cards0>>26;
cards0&=0x3ffffff;
++cardShift;
turns=0;
control=defControl;
// First 9 reference games: 7225, 49, 49, 49 254, 49, 49, 338, 49
}
} // Card shift hit 52. SIMD Program counter reload here. Random across 64 local_threads. We break out about once every 4300 collpased turns
game = atomic_add(&startOffset,1); // This provides uniformity of completion time across the 1500ish threads.
} // totalLoops
game=(uint)atomic_max(&gMaxes[MAXES_TURNS],(uint)maxCount);
if (maxCount>game)
{
gMaxes[MAXES_START_POS]=maxesStartPos;
gMaxes[MAXES_SHIFT_COUNT] = (maxesShiftCount+51)%52;
gMaxes[MAXES_SHIFTS_DONE] = 52;
}
}
And one last test: I reinstalled the 13.12 driver on the system with the R9 270, and the compiled kernel from 14.4 runs slowly. If I delete the compiled kernel (my program calls the clBuildProgam() again), its back to regular speed. Thus we can conclude that it is a) hardware independent and b) Inside the clBuildProgram (OpenCL) compiler that the incorrect code generation occurs.
Thanks,
We will try to reproduce it on our end and get back to you.
Hi Jok2000,
We tried to reproduce it but failed. Would it be possible for you to send the complete code, which we can run?
Thanks,
Reviving the thread. Do you have updates? Would it be possible for you to send the complete code?