Archives Discussions

bluewanderer · ‎04-08-2016

Here's the code:

// test2: each work-item produces one element of out_output.
//
// The accumulator starts from in_transform[in_inputSize * GW + GX] (the row
// that follows the in_inputSize rows consumed by the loop below) and then adds
// the dot product of input row (GZ * TS + GY) with transform column GX.
//
// LX/LY/LZ, GX/GY/GZ, GW and SYNC are macros defined outside this kernel
// (local ids, global ids, global width and a work-group barrier respectively).
//
// The outer loop advances TS * 4 = 32 input columns per iteration; it assumes
// in_inputSize is a multiple of 32 (no tail handling) -- TODO confirm at the
// call site.
kernel __attribute__((reqd_work_group_size(8, 8, 4))) void test2(global float *in_input, int in_inputSize, global float *in_transform, global float *out_output)
{
 const uint TS = 8;
 const uint TD = 4;
 
 // bi: one TS x TS tile of in_input per LZ slice.
 // bt: TD stacked TS x TS tiles of in_transform, stored as [slice][column][row].
 local float bi[TD][TS][TS];
 local float bt[TD][TS][TS];
 
 // One row of work-items stages the initial accumulator values through local
 // memory so that every (LY, LZ) can pick up the value for its LX.
 if (LZ == 0 && LY == 0)
 {
  bt[0][0][LX] = in_transform[in_inputSize * GW + GX];
 }
 
 SYNC;
 
 float out = bt[0][0][LX];
 // Everyone must have read bt[0][0][*] before the loop overwrites bt.
 SYNC;
 
 for (uint i = 0; i < in_inputSize; i += TS * 4)
 {
  // All 8 x 8 x 4 work-items cooperate to load 32 rows of the transform:
  // bt[LZ][LX][LY] holds in_transform row (i + LZ * TS + LY), column GX.
  bt[LZ][LX][LY] = in_transform[(i + LZ * TS + LY) * GW + GX];
  SYNC;
  for (uint j = 0; j < TS * 4; j += TS)
  {
   // bi[LZ][LY][LX] holds in_input row (GZ * TS + GY), column (i + j + LX).
   // NOTE(review): there is no SYNC between this store and the reads below,
   // nor before the next j iteration overwrites bi. The values read at
   // bi[LZ][LY][k] are written by work-items with a different LX, so this
   // relies on those work-items running in lockstep; OpenCL only guarantees
   // visibility of local-memory writes across work-items after a barrier.
   bi[LZ][LY][LX] = in_input[(GZ * TS + GY) * in_inputSize + (i + j + LX)];
   
   // k walks the shared dimension: input column (i + j + k) against
   // transform row (i + j + k), which lives in slice j / TS of bt.
   for (uint k = 0; k < TS; ++k)
   {
    out += bi[LZ][LY][k] * bt[j / TS][LX][k];
   }
  }
  
  // Keep bt intact until every work-item has finished using it.
  SYNC;
 }
 
 out_output[(GZ * TS + GY) * GW + GX] = out;
}

Here GX/Y/Z are the global ids, LX/Y/Z are the local ids, GW/H/D are the global sizes, and SYNC is a barrier.

I ran this kernel with a work size of 256 * 8 * (65536 * 14 / 8), and I got VALUBusy 23% and MemoryUnitBusy 27%.

Here's the previous version:

// test1: each work-item produces one element of out_output.
//
// The accumulator starts from in_transform[in_inputSize * GW + GX] (the row
// that follows the in_inputSize rows consumed by the loop below) and then adds
// the dot product of input row (GZ * TS + GY) with transform column GX.
//
// LX/LY/LZ, GX/GY/GZ, GW and SYNC are macros defined outside this kernel
// (local ids, global ids, global width and a work-group barrier respectively).
//
// The loop advances TS = 8 input columns per iteration; it assumes
// in_inputSize is a multiple of 8 (no tail handling) -- TODO confirm at the
// call site.
kernel __attribute__((reqd_work_group_size(8, 8, 4))) void test1(global float *in_input, int in_inputSize, global float *in_transform, global float *out_output)
{
 const uint TS = 8;
 const uint TD = 4;
 
 // bi: one TS x TS tile of in_input per LZ slice.
 // bt: a single TS x TS tile of in_transform, stored as [column][row].
 local float bi[TD][TS][TS];
 local float bt[TS][TS];
 
 // One row of work-items stages the initial accumulator values through local
 // memory so that every (LY, LZ) can pick up the value for its LX.
 if (LZ == 0 && LY == 0)
 {
  bt[0][LX] = in_transform[in_inputSize * GW + GX];
 }
 
 SYNC;
 float out = bt[0][LX];
 // Everyone must have read bt[0][*] before the loop overwrites bt.
 SYNC;
 
 for (uint i = 0; i < in_inputSize; i += TS)
 {
  // Only the LZ == 0 slice loads the transform tile, which all LZ slices share:
  // bt[LX][LY] holds in_transform row (i + LY), column GX.
  if (LZ == 0)
  {
   bt[LX][LY] = in_transform[(i + LY) * GW + GX];
  }
  // bi[LZ][LY][LX] holds in_input row (GZ * TS + GY), column (i + LX).
  bi[LZ][LY][LX] = in_input[(GZ * TS + GY) * in_inputSize + (i + LX)];
  
  // Both tiles must be fully written before any work-item reads them.
  SYNC;
  
  // k walks the shared dimension: input column (i + k) against
  // transform row (i + k).
  for (uint k = 0; k < TS; ++k)
  {
   out += bi[LZ][LY][k] * bt[LX][k];
  }
  
  // Keep the tiles intact until every work-item has finished using them.
  SYNC;
 }
 
 out_output[(GZ * TS + GY) * GW + GX] = out;
}

It loads one block of the matrix at a time instead of four, but the total memory access is the same. VALU is about 30% busy and Memory is about 55%. At that point I thought the barriers were the cause of the idleness, so I cut the number of synchronizations to 1/4, which gave the first kernel above. The result was a slightly faster (232 ms vs. 249 ms) but even more idle kernel...

I completely don't understand now.

BTW, what's the right way to do matrix multiplications...?

Archives Discussions

Why are both the VALU and the memory units NOT busy with around 2G work-items?