https://gcc.gnu.org/g:5deed65950ebb551fc85604697bc30a8f1f3e458

commit r16-5332-g5deed65950ebb551fc85604697bc30a8f1f3e458
Author: Artemiy Volkov <[email protected]>
Date:   Wed Nov 12 20:18:44 2025 +0000

forwprop: restrict vector load decomposition in optimize_vector_load ()

Since r15-778-g1d1ef1c22752b3, we are compiling the following snippet:

void foo (int16_t *dst, const uint8_t *src0, const uint8_t *src1)
{
  uint8x16_t s0 = vld1q_u8 (src0);
  uint8x16_t s1 = vld1q_u8 (src1);

  uint16x8_t d0_lo = vsubl_u8 (vget_low_u8 (s0), vget_low_u8 (s1));
  uint16x8_t d0_hi = vsubl_u8 (vget_high_u8 (s0), vget_high_u8 (s1));

  vst1q_s16 (dst, vreinterpretq_s16_u16 (d0_lo));
  vst1q_s16 (dst + 8, vreinterpretq_s16_u16 (d0_hi));
}

into:

        ldp     d0, d29, [x1]
        ldp     d30, d31, [x2]
        usubl   v30.8h, v0.8b, v30.8b
        usubl   v31.8h, v29.8b, v31.8b
        stp     q30, q31, [x0]
        ret

rather than:

        ldr     q31, [x1]
        ldr     q30, [x2]
        usubl   v29.8h, v31.8b, v30.8b
        usubl2  v30.8h, v31.16b, v30.16b
        stp     q29, q30, [x0]
        ret

That is, rather than keeping two 128-bit loads and using the usubl2
instruction, which is designed to operate on the upper halves of 128-bit
vector registers, we now do four 64-bit scalar loads and operate on 64-bit
values, which increases register pressure.

What happens here is that the aforementioned commit lowers the
vget_{low,high}_* () intrinsics to BIT_FIELD_REFs, at which point the logic
in tree-ssa-forwprop.cc::optimize_vector_load () kicks in, breaking vector
loads down into scalar loads whenever all uses are through BIT_FIELD_REFs.

AFAICT, this function (or, before it existed, the code comprising it)
handles the following scenarios:

(1) Introduced in r10-135-ga7eb97ad269b65 in response to PR88983, this code
breaks vector loads down into smaller loads whenever the target does not
natively support the wider load, fixing code quality issues.  This should
always be a win, since the original wide load was not available in the
first place.

(2) Since r12-2728-g2724d1bba6b364, it also handles loads that feed into
VEC_UNPACK expressions, preferring extending scalar loads to a vector load
plus vector unpack, which is beneficial at least on some microarchitectures.

This patch restricts the optimization to those scenarios explicitly, while
adding another one on top:

(3) If any of the BIT_FIELD_REFs has scalar type, prefer scalar loads to
vector loads in order to reduce possible traffic between the scalar and
vector register files.  IOW, only if all BIT_FIELD_REFs extract subvectors,
assume there may be other instructions operating on those subvectors that
never leave the vector register file, and do not perform the transformation.

To summarize, after this patch, narrow loads are preferred if any of (1),
(2), or (3) holds; otherwise the vector load is left intact.

Bootstrapped and regtested on aarch64 and x86_64 with no regressions on
SPEC2017; the code snippet above is added as an aarch64-specific test.

gcc/ChangeLog:

        * tree-ssa-forwprop.cc (optimize_vector_load): Inhibit
        optimization when all uses are through subvectors without
        extension.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/simd/usubl2.c: New test.
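
As an illustration of scenario (3), consider the following minimal sketch
(not part of the commit; the function name is hypothetical), where the only
use of the loaded vector is a scalar lane extract:

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical example: vgetq_lane_u8 is lowered to a BIT_FIELD_REF with
   scalar type, so under the new heuristic the 128-bit load may still be
   decomposed into a scalar load, avoiding a transfer from the vector to
   the general register file.  */
uint8_t
pick_lane (const uint8_t *src)
{
  uint8x16_t v = vld1q_u8 (src);
  return vgetq_lane_u8 (v, 3);
}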

Diff:
---
 gcc/testsuite/gcc.target/aarch64/simd/usubl2.c | 19 +++++++++++++++++++
 gcc/tree-ssa-forwprop.cc                       |  9 +++++++++
 2 files changed, 28 insertions(+)

diff --git a/gcc/testsuite/gcc.target/aarch64/simd/usubl2.c b/gcc/testsuite/gcc.target/aarch64/simd/usubl2.c
new file mode 100644
index 000000000000..442b922e71af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/usubl2.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+void foo(int16_t *dst, const uint8_t *src0, const uint8_t *src1)
+{
+  uint8x16_t s0 = vld1q_u8 (src0);
+  uint8x16_t s1 = vld1q_u8 (src1);
+
+  uint16x8_t d0_lo = vsubl_u8 (vget_low_u8 (s0), vget_low_u8 (s1));
+  uint16x8_t d0_hi = vsubl_u8 (vget_high_u8 (s0), vget_high_u8 (s1));
+
+  vst1q_s16 (dst, vreinterpretq_s16_u16 (d0_lo));
+  vst1q_s16 (dst + 8, vreinterpretq_s16_u16 (d0_hi));
+}
+
+/* { dg-final { scan-assembler "usubl\tv\[0-9\]+\.8h,\ v\[0-9\]+\.8b,\ v\[0-9\]+\.8b" } } */
+/* { dg-final { scan-assembler "usubl2\tv\[0-9\]+\.8h,\ v\[0-9\]+\.16b,\ v\[0-9\]+\.16b" } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 9f8d4ad3b443..052d17404914 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -4245,6 +4245,8 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
   use_operand_p use_p;
   imm_use_iterator iter;
   bool rewrite = true;
+  bool scalar_use = false;
+  bool unpack_use = false;
   auto_vec<gimple *, 8> bf_stmts;
   auto_vec<tree, 8> worklist;
   worklist.quick_push (lhs);
@@ -4278,6 +4280,8 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
                ??? Support VEC_UNPACK_FLOAT_{HI,LO}_EXPR.  */
             && INTEGRAL_TYPE_P (TREE_TYPE (use_rhs)))))
         {
+          if (!VECTOR_TYPE_P (TREE_TYPE (gimple_assign_lhs (use_stmt))))
+            scalar_use = true;
           bf_stmts.safe_push (use_stmt);
           continue;
         }
@@ -4287,6 +4291,7 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
               || use_code == VEC_UNPACK_LO_EXPR)
           && use_rhs == lhs)
         {
+          unpack_use = true;
           worklist.safe_push (gimple_assign_lhs (use_stmt));
           continue;
         }
@@ -4298,6 +4303,10 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
     }
   while (!worklist.is_empty ());

+  rewrite = rewrite && (scalar_use
+                        || unpack_use
+                        || !can_implement_p (mov_optab,
+                                             TYPE_MODE (TREE_TYPE (lhs))));
   if (!rewrite)
     {
       gsi_next (gsi);
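
The unpack_use flag added above corresponds to scenario (2).  A minimal
sketch of code that can reach that path, assuming the vectorizer emits
VEC_UNPACK_{LO,HI}_EXPR for the widening copy (the loop shape and function
name are illustrative, not taken from the patch):

#include <stdint.h>

/* Illustrative only: with auto-vectorization enabled, the u8 vector load
   may feed VEC_UNPACK_{LO,HI}_EXPR to produce u16 vectors, in which case
   optimize_vector_load () may replace the vector load with extending
   scalar loads (scenario (2)).  */
void
widen_copy (uint16_t *restrict dst, const uint8_t *restrict src)
{
  for (int i = 0; i < 16; i++)
    dst[i] = src[i];
}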
