https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109078
Bug ID: 109078
Summary: Missing optimization on aarch64 for types like
`float32x4x2_t`
Product: gcc
Version: 12.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: dorazzsoft at gmail dot com
Target Milestone: ---
Here is a simple code example: https://godbolt.org/z/3qMTTfcfx
#include <arm_neon.h>
#include <stdbool.h>
#include <stddef.h>
/*
 * Multiply-accumulate k steps into one 8-float output row.
 *
 * The accumulator starts at zero. For each i in [0, k):
 *     acc[0..7] += a[i] * b[8*i .. 8*i + 7]
 * The accumulator is then stored to out[0..7].
 *
 * out      - destination of 8 floats; it is also read when zero_out is false.
 * a        - k scalars, one consumed per iteration.
 * b        - 8 * k floats, eight consumed per iteration.
 * k        - iteration count; with k == 0 the accumulator stays all-zero.
 * zero_out - when true, out is overwritten with the accumulator; when false,
 *            the previous contents of out[0..7] are added to it first.
 *
 * The `register` qualifiers and the exact statement order are kept as in the
 * original report, since this function is a code-generation reproducer.
 */
void simple_gemm(
    float* restrict out,
    float const* restrict a,
    float const* restrict b,
    size_t k, bool zero_out
) {
    /* Two 4-lane accumulators, both cleared to 0.0f. */
    register float32x4x2_t o0;
    o0.val[0] = vdupq_n_f32(0.0f);
    o0.val[1] = vdupq_n_f32(0.0f);
    // begin dot
    {
        register float32x4_t a0;
        register float32x4x2_t b0;
        while (k >= 1) {
            b0 = vld1q_f32_x2(b);       /* load 8 consecutive floats from b */
            a0 = vdupq_n_f32(a[0]);     /* broadcast one scalar from a to all lanes */
            o0.val[0] = vfmaq_f32(o0.val[0], a0, b0.val[0]);
            o0.val[1] = vfmaq_f32(o0.val[1], a0, b0.val[1]);
            b += 8;
            a += 1;
            k -= 1;
        }
    } // end dot
    // begin writeback
    {
        if (!zero_out) {
            /* Accumulate on top of what is already stored in out. */
            register float32x4x2_t t0;
            t0 = vld1q_f32_x2(out);
            o0.val[0] = vaddq_f32(o0.val[0], t0.val[0]);
            o0.val[1] = vaddq_f32(o0.val[1], t0.val[1]);
        }
        // TODO: both clang and gcc generate redundant mov instructions here
        // because of bad register allocation.
        vst1q_f32_x2(out, o0);
    } // end writeback
}
The assembly generated:
simple_gemm:
movi v3.4s, 0
and w4, w4, 255
mov v4.16b, v3.16b
cbz x3, .L2
.L3:
ld1 {v0.4s - v1.4s}, [x2], 32
subs x3, x3, #1
ld1r {v2.4s}, [x1], 4
fmla v3.4s, v2.4s, v0.4s
fmla v4.4s, v2.4s, v1.4s
bne .L3
.L2:
cbnz w4, .L4
ld1 {v0.4s - v1.4s}, [x0]
fadd v3.4s, v3.4s, v0.4s
fadd v4.4s, v4.4s, v1.4s
.L4:
mov v0.16b, v3.16b
mov v1.16b, v4.16b
st1 {v0.4s - v1.4s}, [x0]
ret
The two values of float32x4x2_t o0 are assigned to v3 and v4. They should be
able to be used directly as operands of st1, so the two mov instructions at
.L4 are redundant.
I also found that in some code, the two halves of the register pair may not be
allocated to consecutively numbered registers, which also results in redundant
mov instructions.