https://gcc.gnu.org/g:70391e3958db791edea4e877636592de47a785e7

commit r15-9062-g70391e3958db791edea4e877636592de47a785e7
Author: Kyrylo Tkachov <ktkac...@nvidia.com>
Date:   Mon Mar 24 01:53:06 2025 -0700

    PR middle-end/119442: expr.cc: Fix vec_duplicate into vector boolean modes
    
    In this testcase GCC tries to expand a VNx4BI vector:
      vector(4) <signed-boolean:4> _40;
      _39 = (<signed-boolean:4>) _24;
      _40 = {_39, _39, _39, _39};
    
    This ends up in a scalarised sequence of bitfield insert operations.
    This is despite the fact that AArch64 provides a vec_duplicate pattern
    specifically for vec_duplicate into VNx4BI.
    
    The store_constructor code is overly conservative when trying vec_duplicate
    as it sees a requested VNx4BImode and an element mode of QImode, which
    I guess is the storage mode of BImode objects.
    
    The vec_duplicate expander in aarch64-sve.md explicitly allows QImode
    element modes so it should be safe to use it.  This patch extends that
    mode check to allow such expanders.
    
    The testcase is heavily auto-reduced from a real application and is
    nonsensical in itself, but it does demonstrate the current problematic
    codegen.
    
    With this patch the testcase goes from:
            pfalse  p15.b
            str     p15, [sp, #6, mul vl]
            mov     w0, 0
            ldr     w2, [sp, 12]
            bfi     w2, w0, 0, 4
            uxtw    x2, w2
            bfi     w2, w0, 4, 4
            uxtw    x2, w2
            bfi     w2, w0, 8, 4
            uxtw    x2, w2
            bfi     w2, w0, 12, 4
            str     w2, [sp, 12]
            ldr     p15, [sp, #6, mul vl]
    
    into:
            whilelo p15.s, wzr, wzr
    
    The whilelo could be optimised away into a pfalse of course, but the
    important part is that the bfis are gone.
    
    Bootstrapped and tested on aarch64-none-linux-gnu.
    
    Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
    
    gcc/
    
            PR middle-end/119442
            * expr.cc (store_constructor): Also allow element modes explicitly
            accepted by target vec_duplicate pattern.
    
    gcc/testsuite/
    
            PR middle-end/119442
            * gcc.target/aarch64/vls_sve_vec_dup_1.c: New test.

Diff:
---
 gcc/expr.cc                                          | 11 ++++++++---
 gcc/testsuite/gcc.target/aarch64/vls_sve_vec_dup_1.c | 15 +++++++++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/gcc/expr.cc b/gcc/expr.cc
index 9f4382d7986b..2147eedad7be 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -7920,11 +7920,16 @@ store_constructor (tree exp, rtx target, int cleared, poly_int64 size,
        gcc_assert (eltmode != BLKmode);
 
        /* Try using vec_duplicate_optab for uniform vectors.  */
+       icode = optab_handler (vec_duplicate_optab, mode);
        if (!TREE_SIDE_EFFECTS (exp)
            && VECTOR_MODE_P (mode)
-           && eltmode == GET_MODE_INNER (mode)
-           && ((icode = optab_handler (vec_duplicate_optab, mode))
-               != CODE_FOR_nothing)
+           && icode != CODE_FOR_nothing
+           /* If the vec_duplicate target pattern does not specify an element
+              mode check that eltmode is the normal inner mode of the
+              requested vector mode.  But if the target allows eltmode
+              explicitly go ahead and use it.  */
+           && (eltmode == GET_MODE_INNER (mode)
+               || insn_data[icode].operand[1].mode == eltmode)
            && (elt = uniform_vector_p (exp))
            && !VECTOR_TYPE_P (TREE_TYPE (elt)))
          {
diff --git a/gcc/testsuite/gcc.target/aarch64/vls_sve_vec_dup_1.c b/gcc/testsuite/gcc.target/aarch64/vls_sve_vec_dup_1.c
new file mode 100644
index 000000000000..ada0d4fc0a43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vls_sve_vec_dup_1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+sve -msve-vector-bits=128" } */
+
+float fasten_main_etot_0;
+void fasten_main() {
+  for (int l = 0; l < 2;) {
+    int phphb_nz;
+    for (; l < 32; l++) {
+      float dslv_e = l && phphb_nz;
+      fasten_main_etot_0 += dslv_e;
+    }
+  }
+}
+
+/* { dg-final { scan-assembler-not {bfi\tw\[0-9\]+} } } */

Reply via email to