https://gcc.gnu.org/g:5deed65950ebb551fc85604697bc30a8f1f3e458

commit r16-5332-g5deed65950ebb551fc85604697bc30a8f1f3e458
Author: Artemiy Volkov <[email protected]>
Date:   Wed Nov 12 20:18:44 2025 +0000

forwprop: restrict vector load decomposition in optimize_vector_load ()

Since r15-778-g1d1ef1c22752b3, we are compiling the following snippet:

void foo (int16_t *dst, const uint8_t *src0, const uint8_t *src1)
{
  uint8x16_t s0 = vld1q_u8 (src0);
  uint8x16_t s1 = vld1q_u8 (src1);

  uint16x8_t d0_lo = vsubl_u8 (vget_low_u8 (s0), vget_low_u8 (s1));
  uint16x8_t d0_hi = vsubl_u8 (vget_high_u8 (s0), vget_high_u8 (s1));

  vst1q_s16 (dst, vreinterpretq_s16_u16 (d0_lo));
  vst1q_s16 (dst + 8, vreinterpretq_s16_u16 (d0_hi));
}

into:

        ldp     d0, d29, [x1]
        ldp     d30, d31, [x2]
        usubl   v30.8h, v0.8b, v30.8b
        usubl   v31.8h, v29.8b, v31.8b
        stp     q30, q31, [x0]
        ret

rather than:

        ldr     q31, [x1]
        ldr     q30, [x2]
        usubl   v29.8h, v31.8b, v30.8b
        usubl2  v30.8h, v31.16b, v30.16b
        stp     q29, q30, [x0]
        ret

That is, rather than keeping two 128-bit loads and using the usubl2
instruction, which is designed to operate on the upper halves of 128-bit
vector registers, we now do four 64-bit scalar loads and operate on 64-bit
values, which increases register pressure.

What happens here is that the aforementioned commit lowers the
vget_{low,high}_* () intrinsics to BIT_FIELD_REFs, at which point the logic
in tree-ssa-forwprop.cc::optimize_vector_load () kicks in, breaking vector
loads down into scalar loads whenever all uses are through BIT_FIELD_REFs.

AFAICT, this function (or, before it existed, the code comprising it)
handles the following scenarios:

(1) Introduced in r10-135-ga7eb97ad269b65 in response to PR88983, this code
breaks vector loads down into smaller loads whenever the target does not
natively support the wider load, fixing code quality issues.  This should
always be a win, since the original wide load was not available in the
first place.

(2) Since r12-2728-g2724d1bba6b364, it also handles loads that feed into
VEC_UNPACK expressions, preferring extending scalar loads to a vector load
plus vector unpack, which is beneficial at least on some microarchitectures.

This patch restricts the optimization to those scenarios explicitly, while
adding another one on top:

(3) If any of the BIT_FIELD_REFs has scalar type, prefer scalar loads to
vector loads in order to reduce possible traffic between the scalar and
vector register files.  IOW, only if all BIT_FIELD_REFs extract subvectors,
assume there may be other instructions operating on those subvectors that
never leave the vector register file, and do not perform the transformation.

To summarize, after this patch, narrow loads are preferred if any of (1),
(2), or (3) holds; otherwise the vector load is left intact.

Bootstrapped and regtested on aarch64 and x86_64 with no regressions on
SPEC2017; the code snippet above is added as an aarch64-specific test.

gcc/ChangeLog:

        * tree-ssa-forwprop.cc (optimize_vector_load): Inhibit
        optimization when all uses are through subvectors without
        extension.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/simd/usubl2.c: New test.
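
As an illustration of scenario (3), consider the following minimal sketch
(not part of the commit; the function name is hypothetical), where the only
use of the loaded vector is a scalar lane extract:

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical example: vgetq_lane_u8 is lowered to a BIT_FIELD_REF with
   scalar type, so under the new heuristic the 128-bit load may still be
   decomposed into a scalar load, avoiding a transfer from the vector to
   the general register file.  */
uint8_t
pick_lane (const uint8_t *src)
{
  uint8x16_t v = vld1q_u8 (src);
  return vgetq_lane_u8 (v, 3);
}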

Diff:
---
 gcc/testsuite/gcc.target/aarch64/simd/usubl2.c | 19 +++++++++++++++++++
 gcc/tree-ssa-forwprop.cc                       |  9 +++++++++
 2 files changed, 28 insertions(+)

diff --git a/gcc/testsuite/gcc.target/aarch64/simd/usubl2.c b/gcc/testsuite/gcc.target/aarch64/simd/usubl2.c
new file mode 100644
index 000000000000..442b922e71af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/usubl2.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+void foo(int16_t *dst, const uint8_t *src0, const uint8_t *src1)
+{
+  uint8x16_t s0 = vld1q_u8 (src0);
+  uint8x16_t s1 = vld1q_u8 (src1);
+
+  uint16x8_t d0_lo = vsubl_u8 (vget_low_u8 (s0), vget_low_u8 (s1));
+  uint16x8_t d0_hi = vsubl_u8 (vget_high_u8 (s0), vget_high_u8 (s1));
+
+  vst1q_s16 (dst, vreinterpretq_s16_u16 (d0_lo));
+  vst1q_s16 (dst + 8, vreinterpretq_s16_u16 (d0_hi));
+}
+
+/* { dg-final { scan-assembler "usubl\tv\[0-9\]+\.8h,\ v\[0-9\]+\.8b,\ v\[0-9\]+\.8b" } } */
+/* { dg-final { scan-assembler "usubl2\tv\[0-9\]+\.8h,\ v\[0-9\]+\.16b,\ v\[0-9\]+\.16b" } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 9f8d4ad3b443..052d17404914 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -4245,6 +4245,8 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
   use_operand_p use_p;
   imm_use_iterator iter;
   bool rewrite = true;
+  bool scalar_use = false;
+  bool unpack_use = false;
   auto_vec<gimple *, 8> bf_stmts;
   auto_vec<tree, 8> worklist;
   worklist.quick_push (lhs);
@@ -4278,6 +4280,8 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
                ??? Support VEC_UNPACK_FLOAT_{HI,LO}_EXPR.  */
             && INTEGRAL_TYPE_P (TREE_TYPE (use_rhs)))))
         {
+          if (!VECTOR_TYPE_P (TREE_TYPE (gimple_assign_lhs (use_stmt))))
+            scalar_use = true;
           bf_stmts.safe_push (use_stmt);
           continue;
         }
@@ -4287,6 +4291,7 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
               || use_code == VEC_UNPACK_LO_EXPR)
           && use_rhs == lhs)
         {
+          unpack_use = true;
           worklist.safe_push (gimple_assign_lhs (use_stmt));
           continue;
         }
@@ -4298,6 +4303,10 @@ optimize_vector_load (gimple_stmt_iterator *gsi)
     }
   while (!worklist.is_empty ());

+  rewrite = rewrite && (scalar_use
+                        || unpack_use
+                        || !can_implement_p (mov_optab,
+                                             TYPE_MODE (TREE_TYPE (lhs))));
   if (!rewrite)
     {
       gsi_next (gsi);
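
The unpack_use flag added above corresponds to scenario (2).  A minimal
sketch of code that can reach that path, assuming the vectorizer emits
VEC_UNPACK_{LO,HI}_EXPR for the widening copy (the loop shape and function
name are illustrative, not taken from the patch):

#include <stdint.h>

/* Illustrative only: with auto-vectorization enabled, the u8 vector load
   may feed VEC_UNPACK_{LO,HI}_EXPR to produce u16 vectors, in which case
   optimize_vector_load () may replace the vector load with extending
   scalar loads (scenario (2)).  */
void
widen_copy (uint16_t *restrict dst, const uint8_t *restrict src)
{
  for (int i = 0; i < 16; i++)
    dst[i] = src[i];
}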
