0 Replies Latest reply on Apr 8, 2016 10:29 AM by bluewanderer

    Why are both the VALU and the memory units NOT busy, even with about 2G work-items?

    bluewanderer

      Here's the code:

      // test2: each work-item computes one element of out_output:
      //   out_output[row * GW + GX] = in_transform[in_inputSize * GW + GX]
      //                             + sum over m of in_input[row * in_inputSize + m] * in_transform[m * GW + GX]
      // with row = GZ * TS + GY, i.e. in_input times in_transform plus the extra
      // row stored at index in_inputSize of in_transform.
      // Per the accompanying text, GX/GY/GZ are global ids, LX/LY/LZ local ids,
      // GW a global size (presumably along X - confirm) and SYNC a barrier.
      //
      // in_input      row-major matrix with in_inputSize elements per row (read only here)
      // in_inputSize  inner dimension; the loops have no tail handling and read
      //               indices up to i + 31, so this presumably must be a multiple
      //               of 32 - TODO confirm with the host code
      // in_transform  matrix with row stride GW; rows 0..in_inputSize-1 are the
      //               factors, row in_inputSize seeds the accumulator
      // out_output    result matrix with row stride GW
      kernel __attribute__((reqd_work_group_size(8, 8, 4))) void test2(global float *in_input, int in_inputSize, global float *in_transform, global float *out_output)
      {
       // Tile edge (TS) and tile depth (TD); they match the 8 x 8 x 4 work-group size.
       const uint TS = 8;
       const uint TD = 4;
       
       // Work-group local staging buffers: bi for in_input, bt for in_transform.
       local float bi[TD][TS][TS];
       local float bt[TD][TS][TS];
       
       // One row of 8 work-items fetches the seed values for this group's columns.
       if (LZ == 0 && LY == 0)
       {
        bt[0][0][LX] = in_transform[in_inputSize * GW + GX];
       }
       
       SYNC;
       
       // Every work-item picks up the seed for its own column.
       float out = bt[0][0][LX];
       // Barrier before bt is reused as tile storage below.
       SYNC;
       
       // Walk the inner dimension 32 (= TS * 4) elements at a time.
       // Note: i is uint and in_inputSize is int, so the comparison is done unsigned.
       for (uint i = 0; i < in_inputSize; i += TS * 4)
       {
        // All 256 work-items load a 32 x 8 slab of in_transform in one go: slice LZ
        // holds rows i + LZ*TS .. i + LZ*TS + 7, stored transposed as [LX][LY].
        bt[LZ][LX][LY] = in_transform[(i + LZ * TS + LY) * GW + GX];
        SYNC;
        // Consume the slab as four 8-wide sub-tiles; j / TS selects the bt slice.
        for (uint j = 0; j < TS * 4; j += TS)
        {
         // Each work-item stores one in_input element of its own row into bi.
         // NOTE(review): there is no SYNC between this store and the reads of
         // bi[LZ][LY][k] below (written by the work-items with LX == k), nor
         // between those reads and the store of the next j iteration. OpenCL only
         // orders local-memory accesses across work-items at a barrier, so this
         // appears to rely on the 64 work-items that share an LZ executing in
         // lockstep - confirm this holds on the target device.
         bi[LZ][LY][LX] = in_input[(GZ * TS + GY) * in_inputSize + (i + j + LX)];
         
         // 8-term partial dot product for this sub-tile.
         for (uint k = 0; k < TS; ++k)
         {
          out += bi[LZ][LY][k] * bt[j / TS][LX][k];
         }
        }
        
        // All reads of bt must finish before the next i iteration overwrites it.
        SYNC;
       }
       
       out_output[(GZ * TS + GY) * GW + GX] = out;
      }
      

       

      Here GX/GY/GZ are the global IDs, LX/LY/LZ are the local IDs, GW/GH/GD are the global sizes, and SYNC is a barrier.

       

      I ran this kernel with a work size of 256 * 8 * (65536 * 14 / 8) and got VALUBusy 23% and MemoryUnitBusy 27%.

       

      Here's the previous version:

      // test1: each work-item computes one element of out_output:
      //   out_output[row * GW + GX] = in_transform[in_inputSize * GW + GX]
      //                             + sum over m of in_input[row * in_inputSize + m] * in_transform[m * GW + GX]
      // with row = GZ * TS + GY. One 8 x 8 tile of in_transform is staged per
      // loop iteration, with two barriers per iteration.
      // Per the accompanying text, GX/GY/GZ are global ids, LX/LY/LZ local ids,
      // GW a global size (presumably along X - confirm) and SYNC a barrier.
      //
      // in_input      row-major matrix with in_inputSize elements per row (read only here)
      // in_inputSize  inner dimension; the loop has no tail handling and reads
      //               indices up to i + 7, so this presumably must be a multiple
      //               of 8 - TODO confirm with the host code
      // in_transform  matrix with row stride GW; rows 0..in_inputSize-1 are the
      //               factors, row in_inputSize seeds the accumulator
      // out_output    result matrix with row stride GW
      kernel __attribute__((reqd_work_group_size(8, 8, 4))) void test1(global float *in_input, int in_inputSize, global float *in_transform, global float *out_output)
      {
       // Tile edge (TS) and tile depth (TD); they match the 8 x 8 x 4 work-group size.
       const uint TS = 8;
       const uint TD = 4;
       
       // Work-group local staging buffers: bi holds one in_input row segment per
       // (LZ, LY); bt holds a single in_transform tile shared by all four LZ slices.
       local float bi[TD][TS][TS];
       local float bt[TS][TS];
       
       // One row of 8 work-items fetches the seed values for this group's columns.
       if (LZ == 0 && LY == 0)
       {
        bt[0][LX] = in_transform[in_inputSize * GW + GX];
       }
       
       SYNC;
       // Every work-item picks up the seed for its own column.
       float out = bt[0][LX];
       // Barrier before bt is reused as tile storage below.
       SYNC;
       
       // Walk the inner dimension 8 (= TS) elements at a time.
       // Note: i is uint and in_inputSize is int, so the comparison is done unsigned.
       for (uint i = 0; i < in_inputSize; i += TS)
       {
        // Only the LZ == 0 slice loads the in_transform tile (rows i .. i + 7),
        // stored transposed as [LX][LY].
        if (LZ == 0)
        {
         bt[LX][LY] = in_transform[(i + LY) * GW + GX];
        }
        // Every work-item stores one in_input element of its own row.
        bi[LZ][LY][LX] = in_input[(GZ * TS + GY) * in_inputSize + (i + LX)];
        
        // Make both tiles visible to the whole work-group before they are read.
        SYNC;
        
        // 8-term partial dot product for this tile.
        for (uint k = 0; k < TS; ++k)
        {
         out += bi[LZ][LY][k] * bt[LX][k];
        }
        
        // All reads must finish before the next iteration overwrites bi and bt.
        SYNC;
       }
       
       out_output[(GZ * TS + GY) * GW + GX] = out;
      }
      

       

      It fetches one block of the matrix at a time instead of four, but the total memory access is the same. VALU is about 30% busy and Memory is about 55%. At that point I thought the barriers were the cause of the idleness, so I cut the number of synchronizations to 1/4 of what the previous kernel had. And I got a slightly faster (232 ms vs. 249 ms) but even more idle kernel...

       

      I don't understand this at all now.

       

      BTW, what's the right way to do matrix multiplications...?