7 Replies Latest reply on Jun 18, 2014 2:40 AM by pinform

    AMD Catalyst 14.4 OpenCL driver

    jok2000

      Compiles code that runs at only 50% the speed it did on 13.12.  Running 4x 5870 class GPU cores on one system.  Uninstalled the driver for now.

        • Re: AMD Catalyst 14.4 OpenCL driver
          sudarshan

          Hi,

          Would it be possible for you to upload the code which exhibits it? We are not witnessing this in the code we are running.

            • Re: AMD Catalyst 14.4 OpenCL driver
              jok2000

              Okay.  Give me a day so I can image the system in question and that will

              allow me to switch back and forth quickly from one image to the other

              (with 13.12 on one and 14.4 on the other image).

               

              The program is for my own professional development, and plays a card

              game.  It's going to be (c) 2014 Jeff Kesner Consulting Services. I'm

              sure it'll be fine all around to send it to you.

               

              Basically the 13.12 OpenCL plays 33,000,000 games per hour per shader on

              the 5870/5970s and the 14.4 plays 17,000,000 games per hour per shader.

               

              Maybe I should compare the assembly code from one version to the other

              to see if it is the OpenCL compiler and not the boilerplate that is

              doing this.  If you can wait, I will do this comparison on my own after

              I make the CCC driver 14.4 system image.

              • Re: AMD Catalyst 14.4 OpenCL driver
                jok2000

                Okay I ran a couple of tests.  It has 50% of the 13.12 driver's OpenCL performance both on the R9 270 and the 5870/5970 cards.  Here is the main part of the kernel that slows down under 14.4:  I can send you the whole program if you want, all it does is play a child's card game called "Beggar My Neighbor" looking for unusually long games.  The core part of the kernel loop is near the declaration of "alu".

                 

                enum eParms { PARM_START_POS, PARM_CARDSS0, PARM_BARRIER_SIZE, PARM_LOCAL_THREADS, PARM_THREADS_PER_DEV, PARM_PREV_MAX, PARM_COUNT};

                enum eMaxes { MAXES_TURNS, MAXES_START_POS, MAXES_SHIFT_COUNT, MAXES_INNER_LOOPS, MAXES_SHIFTS_DONE, MAXES_COUNT};

                 

                 

                #define __ULONG2 ulong2

                #define __ULONG3 ulong3

                #define __ULONG4 ulong4

                #define __ULONG8 ulong8

                #define __LONG4 long4

                 

                #define vlp0 control.s0

                #define vlp1 control.s2

                #define vsp0 control.s3

                #define vsp1 control.s1

                 

                __kernel void beggar_kernel_play_vector (

                  __global ulong   *gParms,

                  __global ulong   *gDeck,

                  __global uint   *gMaxes)

                // __global uint   *gDebugArea)

                 

                 

                  {

                  uint threadId = (uint) get_global_id(0);

                  ushort localId = (ushort)get_local_id(0);

                  uint startPos=0;

                  ushort turns;

                  uint innerLoops=0;

                  uchar maxesShiftCount=0;

                  uint maxesStartPos=0;

                #ifdef EMULATOR_REFERENCE

                  ushort player=0;

                #endif

                  ushort maxCount=0;

                  char4 control, defControl;

                 

                 

                  __local uint startOffset;

                  __ULONG4 cards0, cards1, cardBase;

                 

                  // 1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000

                  ulong bitCounts = 0x0102010301020104;

                  uint game;

                  ushort cardShift;

                 

                 

                  int totalLoops = (uint)(gParms[PARM_BARRIER_SIZE]*gParms[PARM_LOCAL_THREADS]);

                  if (threadId == 0)

                  {

                  gParms[PARM_PREV_MAX]=0;

                  }

                  barrier(CLK_GLOBAL_MEM_FENCE);

                  // 12c4 = 495, 8c4 = 70  70*495 = 34650  16c4 = 1820

                  if (localId==0)

                  {

                  startOffset=atomic_add((__global int *)&gParms[PARM_PREV_MAX],totalLoops);

                  }

                  barrier(CLK_LOCAL_MEM_FENCE);

                 

                 

                 

                 

                  turns=0;

                  vlp0=vlp1=26;

                  vsp0=vsp1=-26;

                  defControl = control;

                  uchar sturns;

                 

                 

                  // Run until we run out of work units

                  game = atomic_add(&startOffset,1); // This provides uniformity of completion time across the 1500ish threads.

                  totalLoops += game;

                  while (game<totalLoops)

                  {

                 

                 

                  cardBase.s0 = gParms[PARM_CARDSS0];

                  cardBase.hi = vload2(game,gDeck);

                  cardBase.s1=cardBase.s2;

                  cardBase.s2=cardBase.s3;

                 

                 

                  cards0 = cardBase;

                  cards1 = cards0>>26;

                  cards0&=0x3ffffff;

                  // 7225 Reference solution @ startPos=29184574

                  // cards0 = (long3)(0x1164494,  0x24014, 0x1120004)

                  //  cards1 = (long3)(0x1c90202, 0x490200, 0x1890000)

                 

                 

                  startPos = (uint)gParms[PARM_START_POS]+game;

                  cardShift=1;

                  while (cardShift<=52) // This will be about 52 * 250 loops usually

                  {

                  ++innerLoops;

                  #if 0

                  cards0=0x780413240e601; // This crashes vlp1 to a negative number.

                  cards1=0x1e0;

                  vlp0=43;

                  vsp0=51;

                  vlp1=1;

                  vsp1=9;

                  //617: 0 ? ? 7c02099207300           3e 55 55 -3 -3

                  #endif

                #if 0

                  sturns = min((uchar)vlp1,(uchar)vlp0);

                  sturns = min(

                  (uchar)sturns,

                  min( (uchar)((bitCounts>>((cards0.s0<<2)&31))&3), (uchar)((bitCounts>>((cards1.s0<<2)&31))&3)) // This section adds 20% to loop time, but improves overall performance.

                  );

                  cards0>>=(ulong)sturns;

                  cards1>>=(ulong)sturns;

                  control -= (char)sturns;

                  turns+=sturns+sturns+1;

                #else

                  ++turns;

                #endif

                  #ifdef EMULATOR_REFERENCE

                  if (threadId==0)

                  {

                  printf("%4d: %d ? ? %12I64x %12I64x : %12I64x %12I64x %12I64x %12I64x\n", turns-1, (int)player, cards0.s0, cards1.s0, cards0.s1, cards1.s1, cards0.s2, cards1.s2);

                  // fprintf(fp, "%4d: %d ? ? %12I64x %12I64x : %12I64x %12I64x %12I64x %12I64x\n", turns-1, (int)player, cards0.s0, cards1.s0, cards0.s1, cards1.s1, cards0.s2, cards1.s2);

                  }

                 

                  #endif

                  ulong mask;

                  uchar next_card;

                  __ULONG4 alu = cards0 & 1;

                  next_card = (uchar)(alu.s0 + alu.s1 + alu.s2 + alu.s2);

                  cards0>>=1;

                  cards1 |= alu <<(ulong)-vsp1;

                  cards0 |= alu <<(ulong)(-vsp0-1);

                  --control.lo;

                  sturns=min((uchar)vlp1,min(next_card,(uchar)((bitCounts>>((cards1.s0&15)<<2))&7)));

                  cards1>>=(ulong)sturns;

                  control.hi-=(char)sturns;

                  turns+=(ushort)sturns;

                  next_card = next_card && (uchar)sturns==next_card;

                  vlp0=select(vlp0,(char)-vsp0,(char)next_card);

                  vsp1=select(vsp1,(char)-vlp1,(char)next_card);

                  mask = ((ulong)1<<(-vsp1))-1;

                  cards1&=mask;

                  if (!next_card)

                  {

                  #ifdef EMULATOR_REFERENCE

                  player^=1;

                  #endif

                  control.hi ^= control.lo; control.lo ^= control.hi; control.hi ^= control.lo;

                  cards0^=cards1; cards1^=cards0; cards0^=cards1;

                  }

                 

                 

                  if (turns>maxCount)  // The presence of this if and the following code makes little impact on performance if removed (at 40M/s innerloops)

                  {

                  maxesStartPos = startPos;

                  maxCount=turns;

                  maxesShiftCount=(uchar)cardShift;

                  }

                 

                 

                  sturns=(uchar)min(vlp0,vlp1);

                 

                 

                  // End of game maintenance

                  if (sturns==0)

                  {

                  #ifdef EMULATOR_DEBUG

                  printf("Game %d end %d\n",game+cardShift,threadId);

                  #endif

                  ulong mask;

                  mask = ((ulong)1<<((ulong)cardShift))-1;

                  cards0 = (cardBase >> cardShift) | ((cardBase & mask)<< ((ulong)52-(ulong)cardShift));

                  cards1 = cards0>>26;

                  cards0&=0x3ffffff;

                  ++cardShift;

                  turns=0;

                  control=defControl;

                  // First 9 reference games: 7225, 49, 49, 49 254, 49, 49, 338, 49

                  }

                  } // Card shift hit 52.  SIMD Program counter reload here.  Random across 64 local_threads.  We break out about once every 4300 collpased turns

                  game = atomic_add(&startOffset,1); // This provides uniformity of completion time across the 1500ish threads.

                  } // totalLoops

                 

                 

                 

                 

                  game=(uint)atomic_max(&gMaxes[MAXES_TURNS],(uint)maxCount);

                  if (maxCount>game)

                  {

                  gMaxes[MAXES_START_POS]=maxesStartPos;

                  gMaxes[MAXES_SHIFT_COUNT] = (maxesShiftCount+51)%52;

                  gMaxes[MAXES_SHIFTS_DONE] = 52;

                  }

                  }

                • Re: AMD Catalyst 14.4 OpenCL driver
                  jok2000

                  And one last test:  I reinstalled the 13.12 driver on the system with the R9 270, and the compiled kernel from 14.4 runs slowly.  If I delete the compiled kernel (my program calls the clBuildProgam() again), its back to regular speed.  Thus we can conclude that it is a) hardware independent and b) Inside the clBuildProgram (OpenCL) compiler that the incorrect code generation occurs.