Add a new tuning flag to select whether to prefer the zeroing form of the SVE
predicated move immediate, or the merging form after first zeroing the
destination.  Since the expander is bypassed in multiple places, do this with
a split after reload that zeroes the destination.  Enable the flag on some
cores where the latter sequence was measured to be slightly faster.

Passes regression testing.  OK for commit?

gcc:
        * config/aarch64/aarch64.h (TARGET_SVE_PREFER_ZEROING_MOVIMM): New
        define.
        * config/aarch64/aarch64-sve.md (*vcond_mask_<mode><vpred>): Convert
        to define_insn_and_rewrite and split after reload when the else
        operand is zero.
        * config/aarch64/aarch64-tuning-flags.def: Add AVOID_MOVIMM_Z tune.
        * config/aarch64/tuning_models/neoversev1.h (tune_flags): Update.
        * config/aarch64/tuning_models/neoversev2.h (tune_flags): Update.

---

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 4d67ad0dc8753baab1a69428ad4cb1f04736a583..8dc8278f743ecd415aba29dfd3d1f1c31b7650d2 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8609,7 +8609,7 @@ (define_expand "@vcond_mask_<mode><vpred>"
 ;; This creates a false dependency on z0 which can result in stalls.
 ;; The zeroing will be done via a movi d0, 0 which is cheaper.
 ;;
-(define_insn "*vcond_mask_<mode><vpred>"
+(define_insn_and_rewrite "*vcond_mask_<mode><vpred>"
   [(set (match_operand:SVE_ALL 0 "register_operand")
        (unspec:SVE_ALL
          [(match_operand:<VPRED> 3 "aarch64_predicate_operand")
@@ -8627,6 +8627,13 @@ (define_insn "*vcond_mask_<mode><vpred>"
      [ ?&w      , vss , w  , Upa ; yes            ] movprfx\t%0, %2\;mov\t%0.<Vetype>, %3/m, #%I1
      [ ?&w      , Ufc , w  , Upa ; yes            ] movprfx\t%0, %2\;fmov\t%0.<Vetype>, %3/m, #%1
   }
+  "&& reload_completed
+   && aarch64_simd_or_scalar_imm_zero (operands[2], <MODE>mode)
+   && !TARGET_SVE_PREFER_ZEROING_MOVIMM"
+  {
+    emit_move_insn (operands[0], operands[2]);
+    operands[2] = copy_rtx (operands[0]);
+  }
 )
 
 ;; Optimize selects between a duplicated scalar variable and another vector.
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 058dadecccaac458e30a6585b558581a7bef6f54..4d8df80120d48ce03f586b5b3177992c28aa519f 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -77,4 +77,7 @@ AARCH64_EXTRA_TUNING_OPTION ("dispatch_sched", DISPATCH_SCHED)
    32 bits are unused.  */
 AARCH64_EXTRA_TUNING_OPTION ("narrow_gp_writes", NARROW_GP_WRITES)
 
+/* Enable when the target prefers SVE merging movimm over zeroing.  */
+AARCH64_EXTRA_TUNING_OPTION ("avoid_zeroing_movimm", AVOID_MOVIMM_Z)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index c3c61c6939c0a5e1c66d0660c3c66b3a27467eb8..7f95d22bf2c83ea9e06c45a1c33266a4936879c5 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -505,6 +505,10 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
                                 && (aarch64_tune_params.extra_tuning_flags \
                                     & AARCH64_EXTRA_TUNE_AVOID_PRED_RMW))
 
+/* Set if we prefer SVE zeroing predicated mov immediate over merging.  */
+#define TARGET_SVE_PREFER_ZEROING_MOVIMM \
+  !(aarch64_tune_params.extra_tuning_flags & AARCH64_EXTRA_TUNE_AVOID_MOVIMM_Z)
+
 /* fp8 instructions are enabled through +fp8.  */
 #define TARGET_FP8 AARCH64_HAVE_ISA (FP8)
 
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
index 253f11e87a68548a51201ee8e1318aaae606cab2..004676a8dbacf438beb5c5598a39f80988bb6a48 100644
--- a/gcc/config/aarch64/tuning_models/neoversev1.h
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -229,7 +229,8 @@ static const struct tune_params neoversev1_tunings =
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_MOVIMM_Z),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index 061fa6b8445c55bc14683f4c92c009d8bcb2c819..bddd583b7f7d3dd25c9fab6b6e46977ce8b48677 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -359,6 +359,7 @@ static const struct tune_params neoversev2_tunings =
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_AVOID_MOVIMM_Z
    | AARCH64_EXTRA_TUNE_AVOID_LDAPUR
    | AARCH64_EXTRA_TUNE_DISPATCH_SCHED),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,

Reply via email to