https://gcc.gnu.org/g:a166a6ccdc6c3d6532a24ba3a2057a177ce44752

commit r15-5386-ga166a6ccdc6c3d6532a24ba3a2057a177ce44752
Author: Robin Dapp <rd...@ventanamicro.com>
Date:   Thu Aug 8 10:30:58 2024 +0200

    aarch64: Add masked-load else operands.
    
    This adds zero else operands to masked loads and their intrinsics.
    I needed to adjust more than initially expected because we rely on
    combine for several instructions, so a change in a "base" pattern
    needs to propagate to all of them.
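
    A minimal ACLE-level sketch of what the zero else operand encodes:
    LD1 uses zeroing ("/z") predication, so inactive elements of a
    contiguous masked load are zero, and the new operand carries exactly
    that value.  The function name below is illustrative.

        #include <arm_sve.h>

        /* Sketch only: inactive elements of the result are zero, which
           is the value the new else operand of the load patterns (and of
           IFN_MASK_LOAD) represents.  */
        svint32_t
        load_masked (svbool_t pg, const int32_t *base)
        {
          return svld1_s32 (pg, base);
        }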
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-sve-builtins-base.cc: Add else
            handling.
            * config/aarch64/aarch64-sve-builtins.cc
            (function_expander::use_contiguous_load_insn): Ditto.
            * config/aarch64/aarch64-sve-builtins.h: Add else operand to
            contiguous load.
            * config/aarch64/aarch64-sve.md
            (@aarch64_load<SVE_PRED_LOAD:pred_load>_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
            Split and add else operand.
            (@aarch64_load_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
            Ditto.
            (*aarch64_load_<ANY_EXTEND:optab>_mov<SVE_HSDI:mode><SVE_PARTIAL_I:mode>):
            Ditto.
            * config/aarch64/aarch64-sve2.md: Ditto.
            * config/aarch64/iterators.md: Remove unused iterators.
            * config/aarch64/predicates.md (aarch64_maskload_else_operand):
            Add zero else operand.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc | 24 +++++++-----
 gcc/config/aarch64/aarch64-sve-builtins.cc      | 12 +++++-
 gcc/config/aarch64/aarch64-sve-builtins.h       |  2 +-
 gcc/config/aarch64/aarch64-sve.md               | 52 +++++++++++++++++++++----
 gcc/config/aarch64/aarch64-sve2.md              |  3 +-
 gcc/config/aarch64/iterators.md                 |  4 --
 gcc/config/aarch64/predicates.md                |  4 ++
 7 files changed, 77 insertions(+), 24 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 2117eceb6063..20820fb1985c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1524,11 +1524,12 @@ public:
     gimple_seq stmts = NULL;
     tree pred = f.convert_pred (stmts, vectype, 0);
     tree base = f.fold_contiguous_base (stmts, vectype);
+    tree els = build_zero_cst (vectype);
     gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT);
 
     tree cookie = f.load_store_cookie (TREE_TYPE (vectype));
-    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
-                                                 base, cookie, pred);
+    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 4,
+                                                 base, cookie, pred, els);
     gimple_call_set_lhs (new_call, f.lhs);
     return new_call;
   }
@@ -1542,7 +1543,7 @@ public:
                                     e.vector_mode (0), e.gp_mode (0));
     else
       icode = code_for_aarch64 (UNSPEC_LD1_COUNT, e.tuple_mode (0));
-    return e.use_contiguous_load_insn (icode);
+    return e.use_contiguous_load_insn (icode, true);
   }
 };
 
@@ -1555,10 +1556,10 @@ public:
   rtx
   expand (function_expander &e) const override
   {
-    insn_code icode = code_for_aarch64_load (UNSPEC_LD1_SVE, extend_rtx_code (),
+    insn_code icode = code_for_aarch64_load (extend_rtx_code (),
                                             e.vector_mode (0),
                                             e.memory_vector_mode ());
-    return e.use_contiguous_load_insn (icode);
+    return e.use_contiguous_load_insn (icode, true);
   }
 };
 
@@ -1577,6 +1578,8 @@ public:
     e.prepare_gather_address_operands (1);
     /* Put the predicate last, as required by mask_gather_load_optab.  */
     e.rotate_inputs_left (0, 5);
+    /* Add the else operand.  */
+    e.args.quick_push (CONST0_RTX (e.vector_mode (0)));
     machine_mode mem_mode = e.memory_vector_mode ();
     machine_mode int_mode = aarch64_sve_int_mode (mem_mode);
     insn_code icode = convert_optab_handler (mask_gather_load_optab,
@@ -1600,6 +1603,8 @@ public:
     e.rotate_inputs_left (0, 5);
     /* Add a constant predicate for the extension rtx.  */
     e.args.quick_push (CONSTM1_RTX (VNx16BImode));
+    /* Add the else operand.  */
+    e.args.quick_push (CONST0_RTX (e.vector_mode (1)));
     insn_code icode = code_for_aarch64_gather_load (extend_rtx_code (),
                                                    e.vector_mode (0),
                                                    e.memory_vector_mode ());
@@ -1742,6 +1747,7 @@ public:
     /* Get the predicate and base pointer.  */
     gimple_seq stmts = NULL;
     tree pred = f.convert_pred (stmts, vectype, 0);
+    tree els = build_zero_cst (vectype);
     tree base = f.fold_contiguous_base (stmts, vectype);
     gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT);
 
@@ -1760,8 +1766,8 @@ public:
 
     /* Emit the load itself.  */
     tree cookie = f.load_store_cookie (TREE_TYPE (vectype));
-    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
-                                                 base, cookie, pred);
+    gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 4,
+                                                 base, cookie, pred, els);
     gimple_call_set_lhs (new_call, lhs_array);
     gsi_insert_after (f.gsi, new_call, GSI_SAME_STMT);
 
@@ -1774,7 +1780,7 @@ public:
     machine_mode tuple_mode = e.result_mode ();
     insn_code icode = convert_optab_handler (vec_mask_load_lanes_optab,
                                             tuple_mode, e.vector_mode (0));
-    return e.use_contiguous_load_insn (icode);
+    return e.use_contiguous_load_insn (icode, true);
   }
 };
 
@@ -1845,7 +1851,7 @@ public:
                       ? code_for_aarch64_ldnt1 (e.vector_mode (0))
                       : code_for_aarch64 (UNSPEC_LDNT1_COUNT,
                                           e.tuple_mode (0)));
-    return e.use_contiguous_load_insn (icode);
+    return e.use_contiguous_load_insn (icode, true);
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index b3d961452d32..e937431df107 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -4284,9 +4284,12 @@ function_expander::use_vcond_mask_insn (insn_code icode,
 /* Implement the call using instruction ICODE, which loads memory operand 1
    into register operand 0 under the control of predicate operand 2.
    Extending loads have a further predicate (operand 3) that nominally
-   controls the extension.  */
+   controls the extension.
+   HAS_ELSE is true if the pattern has an additional operand that specifies
+   the values of inactive lanes.  This exists to match the general maskload
+   interface and is always zero for AArch64.  */
 rtx
-function_expander::use_contiguous_load_insn (insn_code icode)
+function_expander::use_contiguous_load_insn (insn_code icode, bool has_else)
 {
   machine_mode mem_mode = memory_vector_mode ();
 
@@ -4295,6 +4298,11 @@ function_expander::use_contiguous_load_insn (insn_code icode)
   add_input_operand (icode, args[0]);
   if (GET_MODE_UNIT_BITSIZE (mem_mode) < type_suffix (0).element_bits)
     add_input_operand (icode, CONSTM1_RTX (VNx16BImode));
+
+  /* If we have an else operand, add it.  */
+  if (has_else)
+    add_input_operand (icode, CONST0_RTX (mem_mode));
+
   return generate_insn (icode);
 }
 
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h
index 5bd9b88d1179..4094f8207f9c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins.h
@@ -696,7 +696,7 @@ public:
   rtx use_pred_x_insn (insn_code);
   rtx use_cond_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO);
   rtx use_vcond_mask_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO);
-  rtx use_contiguous_load_insn (insn_code);
+  rtx use_contiguous_load_insn (insn_code, bool = false);
   rtx use_contiguous_prefetch_insn (insn_code);
   rtx use_contiguous_store_insn (insn_code);
 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index affdb24a93d5..7a48f900fa52 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1291,7 +1291,8 @@
   [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
        (unspec:SVE_ALL
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
-          (match_operand:SVE_ALL 1 "memory_operand" "m")]
+          (match_operand:SVE_ALL 1 "memory_operand" "m")
+          (match_operand:SVE_ALL 3 "aarch64_maskload_else_operand")]
          UNSPEC_LD1_SVE))]
   "TARGET_SVE"
   "ld1<Vesize>\t%0.<Vctype>, %2/z, %1"
@@ -1302,11 +1303,13 @@
   [(set (match_operand:SVE_STRUCT 0 "register_operand")
        (unspec:SVE_STRUCT
          [(match_dup 2)
-          (match_operand:SVE_STRUCT 1 "memory_operand")]
+          (match_operand:SVE_STRUCT 1 "memory_operand")
+          (match_dup 3)]
          UNSPEC_LDN))]
   "TARGET_SVE"
   {
     operands[2] = aarch64_ptrue_reg (<VPRED>mode);
+    operands[3] = CONST0_RTX (<MODE>mode);
   }
 )
 
@@ -1315,7 +1318,8 @@
   [(set (match_operand:SVE_STRUCT 0 "register_operand" "=w")
        (unspec:SVE_STRUCT
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
-          (match_operand:SVE_STRUCT 1 "memory_operand" "m")]
+          (match_operand:SVE_STRUCT 1 "memory_operand" "m")
+          (match_operand 3 "aarch64_maskload_else_operand")]
          UNSPEC_LDN))]
   "TARGET_SVE"
   "ld<vector_count><Vesize>\t%0, %2/z, %1"
@@ -1334,15 +1338,16 @@
 ;; -------------------------------------------------------------------------
 
 ;; Predicated load and extend, with 8 elements per 128-bit block.
-(define_insn_and_rewrite "@aarch64_load<SVE_PRED_LOAD:pred_load>_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
+(define_insn_and_rewrite "@aarch64_load_<ANY_EXTEND:optab><SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
   [(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
        (unspec:SVE_HSDI
          [(match_operand:<SVE_HSDI:VPRED> 3 "general_operand" "UplDnm")
           (ANY_EXTEND:SVE_HSDI
             (unspec:SVE_PARTIAL_I
               [(match_operand:<SVE_PARTIAL_I:VPRED> 2 "register_operand" "Upl")
-               (match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")]
-              SVE_PRED_LOAD))]
+               (match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")
+               (match_operand:SVE_PARTIAL_I 4 "aarch64_maskload_else_operand")]
+              UNSPEC_LD1_SVE))]
          UNSPEC_PRED_X))]
   "TARGET_SVE && (~<SVE_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
   "ld1<ANY_EXTEND:s><SVE_PARTIAL_I:Vesize>\t%0.<SVE_HSDI:Vctype>, %2/z, %1"
@@ -1352,6 +1357,26 @@
   }
 )
 
+;; Same as above without the maskload_else_operand to still allow combine to
+;; match a sign-extended pred_mov pattern.
+(define_insn_and_rewrite "*aarch64_load_<ANY_EXTEND:optab>_mov<SVE_HSDI:mode><SVE_PARTIAL_I:mode>"
+  [(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
+       (unspec:SVE_HSDI
+         [(match_operand:<SVE_HSDI:VPRED> 3 "general_operand" "UplDnm")
+          (ANY_EXTEND:SVE_HSDI
+            (unspec:SVE_PARTIAL_I
+              [(match_operand:<SVE_PARTIAL_I:VPRED> 2 "register_operand" "Upl")
+               (match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")]
+               UNSPEC_PRED_X))]
+          UNSPEC_PRED_X))]
+  "TARGET_SVE && (~<SVE_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
+  "ld1<ANY_EXTEND:s><SVE_PARTIAL_I:Vesize>\t%0.<SVE_HSDI:Vctype>, %2/z, %1"
+  "&& !CONSTANT_P (operands[3])"
+  {
+    operands[3] = CONSTM1_RTX (<SVE_HSDI:VPRED>mode);
+  }
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- First-faulting contiguous loads
 ;; -------------------------------------------------------------------------
@@ -1433,7 +1458,8 @@
   [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
        (unspec:SVE_FULL
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
-          (match_operand:SVE_FULL 1 "memory_operand" "m")]
+          (match_operand:SVE_FULL 1 "memory_operand" "m")
+          (match_operand:SVE_FULL 3 "aarch64_maskload_else_operand")]
          UNSPEC_LDNT1_SVE))]
   "TARGET_SVE"
   "ldnt1<Vesize>\t%0.<Vetype>, %2/z, %1"
@@ -1456,11 +1482,13 @@
           (match_operand:<V_INT_CONTAINER> 2 "register_operand")
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_dup 6)
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
   {
     operands[5] = aarch64_ptrue_reg (<VPRED>mode);
+    operands[6] = CONST0_RTX (<MODE>mode);
   }
 )
 
@@ -1474,6 +1502,7 @@
           (match_operand:VNx4SI 2 "register_operand")
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_operand:SVE_4 6 "aarch64_maskload_else_operand")
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1503,6 +1532,7 @@
           (match_operand:VNx2DI 2 "register_operand")
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_operand:SVE_2 6 "aarch64_maskload_else_operand")
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1531,6 +1561,7 @@
             UNSPEC_PRED_X)
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_operand:SVE_2 7 "aarch64_maskload_else_operand")
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1561,6 +1592,7 @@
             UNSPEC_PRED_X)
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_operand:SVE_2 7 "aarch64_maskload_else_operand")
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1588,6 +1620,7 @@
             (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+          (match_operand:SVE_2 7 "aarch64_maskload_else_operand")
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE && TARGET_NON_STREAMING"
@@ -1624,6 +1657,7 @@
                (match_operand:VNx4SI 2 "register_operand")
                (match_operand:DI 3 "const_int_operand")
                (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_4BHI:Vesize>")
+               (match_operand:SVE_4BHI 7 "aarch64_maskload_else_operand")
                (mem:BLK (scratch))]
               UNSPEC_LD1_GATHER))]
          UNSPEC_PRED_X))]
@@ -1663,6 +1697,7 @@
                (match_operand:VNx2DI 2 "register_operand")
                (match_operand:DI 3 "const_int_operand")
                (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+               (match_operand:SVE_2BHSI 7 "aarch64_maskload_else_operand")
                (mem:BLK (scratch))]
               UNSPEC_LD1_GATHER))]
          UNSPEC_PRED_X))]
@@ -1701,6 +1736,7 @@
                  UNSPEC_PRED_X)
                (match_operand:DI 3 "const_int_operand")
                (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+               (match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
                (mem:BLK (scratch))]
               UNSPEC_LD1_GATHER))]
          UNSPEC_PRED_X))]
@@ -1738,6 +1774,7 @@
                  UNSPEC_PRED_X)
                (match_operand:DI 3 "const_int_operand")
                (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+               (match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
                (mem:BLK (scratch))]
               UNSPEC_LD1_GATHER))]
          UNSPEC_PRED_X))]
@@ -1772,6 +1809,7 @@
                  (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
                (match_operand:DI 3 "const_int_operand")
                (match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_2BHSI:Vesize>")
+               (match_operand:SVE_2BHSI 8 "aarch64_maskload_else_operand")
                (mem:BLK (scratch))]
               UNSPEC_LD1_GATHER))]
          UNSPEC_PRED_X))]
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 9383c777d80e..e67421bad84b 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -264,7 +264,8 @@
  [(set (match_operand:SVE_FULLx24 0 "aligned_register_operand" "=Uw<vector_count>")
        (unspec:SVE_FULLx24
          [(match_operand:VNx16BI 2 "register_operand" "Uph")
-          (match_operand:SVE_FULLx24 1 "memory_operand" "m")]
+          (match_operand:SVE_FULLx24 1 "memory_operand" "m")
+          (match_operand:SVE_FULLx24 3 "aarch64_maskload_else_operand")]
          LD1_COUNT))]
   "TARGET_SVE2p1_OR_SME2"
   "<optab><Vesize>\t%0, %K2/z, %1"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index ce8f032c1410..d7cb27e18852 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3331,10 +3331,6 @@
 
 (define_int_iterator SVE_LDFF1_LDNF1 [UNSPEC_LDFF1 UNSPEC_LDNF1])
 
-(define_int_iterator SVE_PRED_LOAD [UNSPEC_PRED_X UNSPEC_LD1_SVE])
-
-(define_int_attr pred_load [(UNSPEC_PRED_X "_x") (UNSPEC_LD1_SVE "")])
-
 (define_int_iterator LD1_COUNT [UNSPEC_LD1_COUNT UNSPEC_LDNT1_COUNT])
 
 (define_int_iterator ST1_COUNT [UNSPEC_ST1_COUNT UNSPEC_STNT1_COUNT])
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 6ad9a4bd8b92..26cfaed2402c 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -1067,3 +1067,7 @@
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op),  -4096, 4080)
                    && !(INTVAL (op) & 0xf)")))
+
+(define_predicate "aarch64_maskload_else_operand"
+  (and (match_code "const_int,const_vector")
+       (match_test "op == CONST0_RTX (GET_MODE (op))")))
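
A minimal sketch of the autovectorized path, assuming SVE code generation
(for example -O3 -march=armv8-a+sve); the function name and flags are
illustrative.  When the loop below is if-converted and vectorized, the
conditional load becomes an IFN_MASK_LOAD call which, with this change,
carries an explicit zero else value for the inactive lanes, matching the
four-argument calls built in aarch64-sve-builtins-base.cc above.

    /* Sketch only: whether the loop is vectorized depends on the cost
       model and flags.  */
    void
    cond_copy (int *restrict dst, const int *restrict src,
               const int *restrict cond, int n)
    {
      for (int i = 0; i < n; i++)
        if (cond[i])
          dst[i] = src[i];
    }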
