Hello,
I am looking at the ISA that RGA (Radeon GPU Analyzer) generates when compiling this compute shader:
#version 450
#pragma shader_stage(compute)
// Push-constant block holding a single 4x4 float matrix `a` (16 floats = 64 bytes).
layout (push_constant) uniform PC { mat4 a; };
// Storage buffer at binding 0 (std430 layout): a runtime-sized array of vec4
// that this shader both reads and writes.
layout (std430, binding = 0) buffer BufferIn { vec4[] data_in; };
// 64 invocations per workgroup along x.
layout (local_size_x = 64) in;
// Each invocation transforms one element in place: data_in[i] = a * data_in[i],
// where i is the invocation's local index in x. The index is workgroup-local,
// so only elements 0..63 are ever addressed.
void main() {
// The vec4(...) constructor wraps a value that is already a vec4.
data_in[gl_LocalInvocationID.x] = a * vec4(data_in[gl_LocalInvocationID.x]);
}
which results in the following ISA (shown for RDNA; the GCN output is very similar):
// Build a 64-bit base address in s[2:3]: the high half is copied from the
// program counter (s1). s2 is never written in this listing -- presumably it
// is preloaded before the shader starts (TODO confirm against the ABI).
s_getpc_b64 s[0:1]
s_mov_b32 s3, s1
// Load one dword from byte offset 0x40 off s[2:3] into s0 (replacing the PC
// low half), then use s[0:1] as the address of a 4-dword value loaded into
// s[4:7]. s[4:7] is the resource operand of the buffer load/store below.
s_load_dword s0, s[2:3], 0x40
s_waitcnt lgkmcnt(0)
s_load_dwordx4 s[4:7], s[0:1], 0x00
// v0 <<= 4, i.e. v0 * 16 = byte offset of one 16-byte vec4 element.
// v0 presumably holds gl_LocalInvocationID.x on entry (not shown here).
v_lshlrev_b32 v0, 4, v0
s_waitcnt lgkmcnt(0)
// Load the four input components into v1..v4.
buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen
// The two loads the question is about: 8 dwords from offset 0x00 into s[8:15]
// and 8 dwords from offset 0x20 into s[16:23] -- 16 consecutive dwords in
// total, matching the size of the mat4 `a`.
s_load_dwordx8 s[8:15], s[2:3], 0x00
s_load_dwordx8 s[16:23], s[2:3], 0x20
// Wait for both the vector load (vmcnt) and the scalar loads (lgkmcnt).
s_waitcnt vmcnt(0) & lgkmcnt(0)
// Matrix * vector as four accumulating passes, one per input component:
// s[8:11] * v1, then += s[12:15] * v2, += s[16:19] * v3, += s[20:23] * v4.
// Partial results live in v5, v6, v7 and v1.
v_mul_f32 v5, s8, v1
v_mul_f32 v6, s9, v1
v_mul_f32 v7, s10, v1
v_mul_f32 v1, s11, v1
v_fmac_f32 v5, s12, v2
v_fmac_f32 v6, s13, v2
v_fmac_f32 v7, s14, v2
v_fmac_f32 v1, s15, v2
v_fmac_f32 v5, s16, v3
v_fmac_f32 v6, s17, v3
v_fmac_f32 v7, s18, v3
v_fmac_f32 v1, s19, v3
v_fmac_f32 v5, s20, v4
v_fmac_f32 v6, s21, v4
v_fmac_f32 v7, s22, v4
// The last component is written to v8 instead of accumulating into v1, so
// the result occupies the contiguous range v[5:8] that the store takes.
v_fma_f32 v8, s23, v4, v1
// Write the result back to the same buffer offset it was read from.
buffer_store_dwordx4 v[5:8], v0, s[4:7], 0 offen glc
s_endpgm
Why does the compiler emit two s_load_dwordx8 instructions instead of a single s_load_dwordx16?
It seems that both the source offsets (0x00 and 0x20 from s[2:3]) and the destination registers (s[8:15] and s[16:23]) are consecutive.
Thank you in advance!