https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125474
Bug ID: 125474
Summary: LoongArch: missing usdot_prodMN patterns
Product: gcc
Version: 17.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: xry111 at gcc dot gnu.org
Target Milestone: ---
Code simplified from SPEC 2026 706.stockfish_r:
#include <cstdint>
using IndexType = std::uint32_t;
// Reduced reproducer: computes 0x1000 dot products, each of length 0x1000.
// For every i: output[i] = biases[i] + sum over j of
//   weights[i * 0x1000 + j] * input[j].
// weights holds signed 8-bit values and input holds unsigned 8-bit values,
// so every product has one signed and one unsigned 8-bit operand and is
// accumulated into a 32-bit signed sum.
void transform(std::int32_t *__restrict__ output, const std::int8_t *weights,
const std::int32_t *biases, const std::uint8_t *input) {
for (IndexType i = 0; i < 0x1000; ++i) {
// Index of the first weight of row i in the flat weights array.
const IndexType offset = i * 0x1000;
// The accumulator starts from the bias of this output element.
std::int32_t sum = biases[i];
for (IndexType j = 0; j < 0x1000; ++j) {
// Signed-by-unsigned 8-bit multiply, accumulated as 32-bit.
sum += weights[offset + j] * input[j];
}
output[i] = sum;
}
}
The loop body is compiled to:
.L2:
xvldx $xr3,$r13,$r12
xvldx $xr1,$r7,$r12
xvmulwod.h.b $xr2,$xr3,$xr6
xvmulwev.h.b $xr0,$xr3,$xr6
xvhaddw.w.h $xr4,$xr2,$xr2
xvhaddw.w.h $xr0,$xr0,$xr0
addi.d $r12,$r12,32
xvadd.w $xr0,$xr0,$xr4
xvadd.w $xr5,$xr0,$xr5
xvadd.b $xr1,$xr1,$xr7
xvadd.w $xr0,$xr0,$xr5
xvmulwev.h.b $xr2,$xr1,$xr3
xvmulwod.h.b $xr1,$xr1,$xr3
xvhaddw.w.h $xr2,$xr2,$xr2
xvhaddw.w.h $xr1,$xr1,$xr1
xvadd.w $xr2,$xr2,$xr1
xvadd.w $xr5,$xr2,$xr0
bne $r12,$r14,.L2
But if the LoongArch backend provided the usdot_prodMN patterns, we could generate much better code:
.L2:
xvldx $xr0,$r7,$r12
xvldx $xr2,$r13,$r12
addi.d $r12,$r12,32
xvmulwev.h.bu.b $xr1,$xr0,$xr2
xvmulwod.h.bu.b $xr0,$xr0,$xr2
xvhaddw.w.h $xr1,$xr1,$xr1
xvhaddw.w.h $xr0,$xr0,$xr0
xvadd.w $xr1,$xr1,$xr0
xvadd.w $xr3,$xr1,$xr3
bne $r12,$r14,.L2