Say I have 8 threads, and the thread with lane ID laneID needs the array element tsum[laneID % 8] held by lane 0 (or lane 8).
One possible way is to use 16 DPP move instructions with row shifts, like this:
__asm ( /* Copies tsum[0..7] into s[0..7]: element 0 with a plain per-lane move, \
           element k (k = 1..7) with a DPP move using a row shift of k lanes. \
           NOTE(review): assuming the usual GCN meaning of row_shr:k (lane i of a \
           16-lane row reads lane i-k), lane k and lane 8+k end up holding the \
           tsum[k] of lane 0 and lane 8 respectively in s[k] -- confirm against \
           the ISA manual of the target GPU. \
           NOTE(review): no bound_ctrl modifier is given, so lanes whose shifted \
           source falls outside the row may leave the destination unwritten; as \
           the outputs are write-only, those lanes of s[k] would then be \
           undefined -- verify that callers only read s[k] in lanes that have a \
           valid source. */ \
"s_nop 1\n" /* presumably the wait states needed before a DPP read of a freshly written VGPR -- TODO confirm */ \
"v_mov_b32 %[dst0], %[src0]\n" /* shift 0: ordinary same-lane copy, no DPP modifier */ \
"v_mov_b32_dpp %[dst1], %[src1] row_shr:1\n" /* s[1] <- tsum[1], row shift right by 1 */ \
"v_mov_b32_dpp %[dst2], %[src2] row_shr:2\n" /* s[2] <- tsum[2], row shift right by 2 */ \
"v_mov_b32_dpp %[dst3], %[src3] row_shr:3\n" /* s[3] <- tsum[3], row shift right by 3 */ \
"v_mov_b32_dpp %[dst4], %[src4] row_shr:4\n" /* s[4] <- tsum[4], row shift right by 4 */ \
"v_mov_b32_dpp %[dst5], %[src5] row_shr:5\n" /* s[5] <- tsum[5], row shift right by 5 */ \
"v_mov_b32_dpp %[dst6], %[src6] row_shr:6\n" /* s[6] <- tsum[6], row shift right by 6 */ \
"v_mov_b32_dpp %[dst7], %[src7] row_shr:7\n" /* s[7] <- tsum[7], row shift right by 7 */ \
"s_nop 1\n" /* NOTE(review): reason for this trailing idle is not evident from the snippet -- confirm it is needed */ \
: [dst0] "=&v" (s[0]), /* outputs: s[0..7], each a write-only early-clobber operand (v: presumably a VGPR) */ \
[dst1] "=&v" (s[1]), \
[dst2] "=&v" (s[2]), \
[dst3] "=&v" (s[3]), \
[dst4] "=&v" (s[4]), \
[dst5] "=&v" (s[5]), \
[dst6] "=&v" (s[6]), \
[dst7] "=&v" (s[7]) \
: [src0] "v" (tsum[0]), /* inputs: tsum[0..7], same register class as the outputs */ \
[src1] "v" (tsum[1]), \
[src2] "v" (tsum[2]), \
[src3] "v" (tsum[3]), \
[src4] "v" (tsum[4]), \
[src5] "v" (tsum[5]), \
[src6] "v" (tsum[6]), \
[src7] "v" (tsum[7])); \
Can I do this in a single instruction that selects a different source register from lane 0 based on my thread ID?