Here's the code:
/*
 * test2: each work-item produces one element of
 *
 *   out_output[r][GX] = in_transform[in_inputSize][GX]
 *                     + sum over c in [0, in_inputSize) of
 *                       in_input[r][c] * in_transform[c][GX]
 *
 * with r = GZ * TS + GY.  in_input is indexed as rows of in_inputSize
 * floats; in_transform and out_output are indexed as rows of GW floats.
 * Row in_inputSize of in_transform supplies the initial (bias) value.
 *
 * The reduction dimension is consumed TS * TD (= 32) columns per outer
 * step, with two barriers per step.  There are no bounds checks: the loads
 * assume in_inputSize is a multiple of TS * TD.
 */
kernel __attribute__((reqd_work_group_size(8, 8, 4))) void test2(global float *in_input, int in_inputSize, global float *in_transform, global float *out_output)
{
    const uint TS = 8; /* tile edge; equals the work-group extent in X and Y */
    const uint TD = 4; /* work-group extent in Z; also the number of sub-tiles per step */

    /* bi[z][y][c]: input row (GZ * TS + y of this group), columns i .. i + TS*TD - 1. */
    local float bi[TD][TS][TS * TD];
    /* bt[s][x][k]: transform row i + s*TS + k, column belonging to local x. */
    local float bt[TD][TS][TS];

    /* Stage the initial value through local memory so one row of work-items
     * reads it from global memory on behalf of the whole group. */
    if (LZ == 0 && LY == 0)
    {
        bt[0][0][LX] = in_transform[in_inputSize * GW + GX];
    }
    SYNC;
    float out = bt[0][0][LX];
    SYNC; /* bt is about to be overwritten below */

    for (uint i = 0; i < in_inputSize; i += TS * TD)
    {
        /* Each of the TD z-slices loads one TS x TS transform sub-tile. */
        bt[LZ][LX][LY] = in_transform[(i + LZ * TS + LY) * GW + GX];

        /*
         * Load every input sub-tile for this step before the barrier.
         * Elements of bi are read by work-items other than the one that
         * wrote them, so all writes must be separated from the reads by
         * SYNC; staging them together keeps that to one barrier.
         */
        for (uint j = 0; j < TS * TD; j += TS)
        {
            bi[LZ][LY][j + LX] = in_input[(GZ * TS + GY) * in_inputSize + (i + j + LX)];
        }
        SYNC;

        /* Accumulate in the same j-then-k order as the loads were issued. */
        for (uint j = 0; j < TS * TD; j += TS)
        {
            for (uint k = 0; k < TS; ++k)
            {
                out += bi[LZ][LY][j + k] * bt[j / TS][LX][k];
            }
        }
        SYNC; /* all reads finished before the next step overwrites bi/bt */
    }

    out_output[(GZ * TS + GY) * GW + GX] = out;
}
Here, GX/Y/Z are the global ids, LX/Y/Z are the local ids, GW/H/D are the global sizes, and SYNC is a barrier.
I ran this kernel with a work size of 256 * 8 * (65536 * 14 / 8), and got VALUBusy 23% and MemoryUnitBusy 27%.
Here's the previous version:
/*
 * test1: each work-item produces one element of
 *
 *   out_output[r][GX] = in_transform[in_inputSize][GX]
 *                     + sum over c in [0, in_inputSize) of
 *                       in_input[r][c] * in_transform[c][GX]
 *
 * with r = GZ * TS + GY.  in_input is indexed as rows of in_inputSize
 * floats; in_transform and out_output are indexed as rows of GW floats.
 * Row in_inputSize of in_transform supplies the initial (bias) value.
 *
 * The reduction dimension is consumed TS (= 8) columns per step, with two
 * barriers per step.  There are no bounds checks: the loads assume
 * in_inputSize is a multiple of TS.
 */
kernel __attribute__((reqd_work_group_size(8, 8, 4))) void test1(global float *in_input, int in_inputSize, global float *in_transform, global float *out_output)
{
    const uint TS = 8; /* tile edge; equals the work-group extent in X and Y */
    const uint TD = 4; /* work-group extent in Z */

    /* bi[z][y][k]: input row (GZ * TS + y of this group), column i + k. */
    local float bi[TD][TS][TS];
    /* bt[x][k]: transform row i + k, column belonging to local x. */
    local float bt[TS][TS];

    /* Stage the initial value through local memory so one row of work-items
     * reads it from global memory on behalf of the whole group. */
    if (LZ == 0 && LY == 0)
    {
        bt[0][LX] = in_transform[in_inputSize * GW + GX];
    }
    SYNC;
    float out = bt[0][LX];
    SYNC; /* bt is about to be overwritten below */

    for (uint i = 0; i < in_inputSize; i += TS)
    {
        /* The transform tile is shared by every z-slice, so only slice 0 loads it. */
        if (LZ == 0)
        {
            bt[LX][LY] = in_transform[(i + LY) * GW + GX];
        }
        bi[LZ][LY][LX] = in_input[(GZ * TS + GY) * in_inputSize + (i + LX)];
        SYNC;

        /* Dot product of one input-tile row with one transform-tile column. */
        for (uint k = 0; k < TS; ++k)
        {
            out += bi[LZ][LY][k] * bt[LX][k];
        }
        SYNC; /* all reads finished before the next step overwrites bi/bt */
    }

    out_output[(GZ * TS + GY) * GW + GX] = out;
}
This version fetches one block of the matrix at a time instead of four, but the total memory access is the same. VALU is about 30% busy and Memory is about 55%. At that point I thought the barriers were the cause of the idleness, so I cut the number of synchronizations to 1/4, which gave the first kernel above (test2). The result is a slightly faster (232 ms vs. 249 ms) but even more idle kernel...
I don't understand this at all now.
BTW, what's the right way to do matrix multiplications...?