[PATCH 6/6] aarch64: Improve early-ra handling of reductions

Richard Sandiford Wed, 13 Nov 2024 02:37:41 -0800

At the moment, early-ra ducks out of allocating any region
that contains a register with both a strong FPR affinity and
a strong GPR affinity.  The proper allocators are much better
at handling that situation.


But this means that early-ra tends not to allocate a region
of vector code that ends in a reduction to a scalar integer
if any later arithmetic is done on the scalar integer result.

Currently, if a block acts as an isolated allocation region, the pass
will try to split the block into subregions *between* instructions if
there are no live FPRs or FPR allocnos.  In the reduction case described
above, it's convenient to try the same thing *within* instructions.
If a block of vector code ends in a reduction, all FPRs and FPR
allocnos will be dead between the "use phase" and the "def phase"
of the reduction: the vector input will then have died, but the
scalar result will not yet have been born.

If we split the block that way, the problematic reduction result
will be part of the second region, which we can skip allocating,
but the vector work will be part of a separate region, which we
might be able to allocate.

This avoids a MOV in the testcase and also helps a small amount
with x264.

gcc/
        * config/aarch64/aarch64-early-ra.cc
        (early_ra::IGNORE_REG): New flag.
        (early_ra::fpr_preference): Handle it.
        (early_ra::record_constraints): Fail the allocation if an
        IGNORE_REG output operand is not independent of the inputs.
        (defines_multi_def_pseudo): New function.
        (early_ra::could_split_region_here): New member function, split
        out from...
        (early_ra::process_block): ...here.  Try splitting a block into
        multiple regions between the definition and use phases of an
        instruction.  Set IGNORE_REG on the output registers if we do so.

gcc/testsuite/
        * gcc.target/aarch64/early_ra_1.c: New test.
---
 gcc/config/aarch64/aarch64-early-ra.cc        | 108 ++++++++++++++++--
 gcc/testsuite/gcc.target/aarch64/early_ra_1.c |  14 +++
 2 files changed, 113 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/early_ra_1.c

diff --git a/gcc/config/aarch64/aarch64-early-ra.cc 
b/gcc/config/aarch64/aarch64-early-ra.cc
index 33d82ea64c2..3d5355b040e 100644
--- a/gcc/config/aarch64/aarch64-early-ra.cc
+++ b/gcc/config/aarch64/aarch64-early-ra.cc
@@ -160,6 +160,9 @@ private:
   // as a multi-register vector operand in some other non-move instruction.
   static constexpr unsigned int HAS_FLEXIBLE_STRIDE = 1U << 10;
   static constexpr unsigned int HAS_FIXED_STRIDE = 1U << 11;
+  //
+  // Whether we have decided not to allocate the pseudo register.
+  static constexpr unsigned int IGNORE_REG = 1U << 12;
 
   // Flags that should be propagated across moves between pseudo registers.
   static constexpr unsigned int PSEUDO_COPY_FLAGS = ~(HAS_FLEXIBLE_STRIDE
@@ -500,6 +503,7 @@ private:
 
   void process_region ();
   bool is_dead_insn (rtx_insn *);
+  bool could_split_region_here ();
   void process_block (basic_block, bool);
   void process_blocks ();
 
@@ -1227,7 +1231,9 @@ early_ra::fpr_preference (unsigned int regno)
 {
   auto mode = m_pseudo_regs[regno].mode;
   auto flags = m_pseudo_regs[regno].flags;
-  if (mode == VOIDmode || !targetm.hard_regno_mode_ok (V0_REGNUM, mode))
+  if (flags & IGNORE_REG)
+    return -4;
+  else if (mode == VOIDmode || !targetm.hard_regno_mode_ok (V0_REGNUM, mode))
     return -3;
   else if (flags & HAS_FLEXIBLE_STRIDE)
     return 3;
@@ -1932,19 +1938,34 @@ early_ra::record_constraints (rtx_insn *insn)
       });
     }
 
-  // Record if there is an output operand that is never earlyclobber and never
-  // matched to an input.  See the comment below for how this is used.
   rtx dest_op = NULL_RTX;
   for (int opno = 0; opno < recog_data.n_operands; ++opno)
     {
+      rtx op = recog_data.operand[opno];
       auto op_mask = operand_mask (1) << opno;
+      // Record if there is an output operand that is "independent" of the
+      // inputs, in the sense that it is never earlyclobber and is never
+      // matched to an input.  See the comment below for how this is used.
       if (recog_data.operand_type[opno] == OP_OUT
          && (earlyclobber_operands & op_mask) == 0
          && (matched_operands & op_mask) == 0)
        {
-         dest_op = recog_data.operand[opno];
+         dest_op = op;
          break;
        }
+      // We sometimes decide not to allocate pseudos even if they meet
+      // the normal FPR preference criteria; see the setters of IGNORE_REG
+      // for details.  However, the premise is that we should only ignore
+      // definitions of such pseudos if they are independent of the other
+      // operands in the instruction.
+      else if (recog_data.operand_type[opno] != OP_IN
+              && REG_P (op)
+              && !HARD_REGISTER_P (op)
+              && (m_pseudo_regs[REGNO (op)].flags & IGNORE_REG) != 0)
+       record_allocation_failure ([&](){
+         fprintf (dump_file, "ignored register r%d depends on input operands",
+                  REGNO (op));
+       });
     }
 
   for (int opno = 0; opno < recog_data.n_operands; ++opno)
@@ -3488,6 +3509,35 @@ early_ra::is_dead_insn (rtx_insn *insn)
   return true;
 }
 
+// Return true if we could split a single-block region into two regions
+// at the current program point.  This is true if the block up to this
+// point is worth treating as an independent region and if no registers
+// that would affect the allocation are currently live.
+inline bool
+early_ra::could_split_region_here ()
+{
+  return (m_accurate_live_ranges
+         && bitmap_empty_p (m_live_allocnos)
+         && m_live_fprs == 0
+         && !m_allocnos.is_empty ());
+}
+
+// Return true if any of the pseudos defined by INSN are also defined
+// elsewhere.
+static bool
+defines_multi_def_pseudo (rtx_insn *insn)
+{
+  df_ref ref;
+
+  FOR_EACH_INSN_DEF (ref, insn)
+    {
+      unsigned int regno = DF_REF_REGNO (ref);
+      if (!HARD_REGISTER_NUM_P (regno) && DF_REG_DEF_COUNT (regno) > 1)
+       return true;
+    }
+  return false;
+}
+
 // Build up information about block BB.  IS_ISOLATED is true if the
 // block is not part of a larger region.
 void
@@ -3559,6 +3609,50 @@ early_ra::process_block (basic_block bb, bool 
is_isolated)
       else
        {
          record_insn_defs (insn);
+
+         // If we've decided not to allocate the current region, and if
+         // all relevant registers are now dead, consider splitting the
+         // block into two regions at this point.
+         //
+         // This is useful when a sequence of vector code ends in a
+         // vector-to-scalar-integer operation.  We wouldn't normally
+         // want to allocate the scalar integer, since IRA is much better
+         // at handling cross-file moves.  But if nothing of relevance is
+         // live between the death of the final vector and the birth of the
+         // scalar integer, we can try to split the region at that point.
+         // We'd then allocate the vector input and leave the real RA
+         // to handle the scalar output.
+         if (is_isolated
+             && !m_allocation_successful
+             && could_split_region_here ()
+             && !defines_multi_def_pseudo (insn))
+           {
+             // Mark that we've deliberately chosen to leave the output
+             // pseudos to the real RA.
+             df_ref ref;
+             FOR_EACH_INSN_DEF (ref, insn)
+               {
+                 unsigned int regno = DF_REF_REGNO (ref);
+                 if (!HARD_REGISTER_NUM_P (regno))
+                   {
+                     if (dump_file && (dump_flags & TDF_DETAILS))
+                       fprintf (dump_file, "Ignoring register r%d\n", regno);
+                     m_pseudo_regs[regno].flags |= IGNORE_REG;
+                     bitmap_clear_bit (m_fpr_pseudos, regno);
+                   }
+               }
+
+             if (dump_file && (dump_flags & TDF_DETAILS))
+               fprintf (dump_file, "\nStarting new region at insn %d\n",
+                        INSN_UID (insn));
+
+             // Start a new region and replay the definitions.  The replay
+             // is needed if the instruction sets a hard FP register.
+             start_new_region ();
+             record_insn_defs (insn);
+             start_insn = insn;
+           }
+
          if (auto *call_insn = dyn_cast<rtx_call_insn *> (insn))
            record_insn_call (call_insn);
          record_insn_uses (insn);
@@ -3574,11 +3668,7 @@ early_ra::process_block (basic_block bb, bool 
is_isolated)
 
       // See whether we have a complete region, with no remaining live
       // allocnos.
-      if (is_isolated
-         && m_accurate_live_ranges
-         && bitmap_empty_p (m_live_allocnos)
-         && m_live_fprs == 0
-         && !m_allocnos.is_empty ())
+      if (is_isolated && could_split_region_here ())
        {
          rtx_insn *prev_insn = PREV_INSN (insn);
          if (m_allocation_successful)
diff --git a/gcc/testsuite/gcc.target/aarch64/early_ra_1.c 
b/gcc/testsuite/gcc.target/aarch64/early_ra_1.c
new file mode 100644
index 00000000000..c3d7f425be0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/early_ra_1.c
@@ -0,0 +1,14 @@
+/* { dg-options "-O2 -fno-vect-cost-model" } */
+
+typedef unsigned int v4si __attribute__((vector_size(16)));
+
+unsigned int
+foo (v4si *ptr)
+{
+  v4si x = __builtin_shufflevector (ptr[0], ptr[2], 0, 7, 1, 5);
+  v4si y = x ^ ptr[4];
+  return (y[0] + y[1] + y[2] + y[3]) >> 1;
+}
+
+/* { dg-final { scan-assembler {\taddv\t} } } */
+/* { dg-final { scan-assembler-not {\tmov\t} } } */
-- 
2.25.1

[PATCH 6/6] aarch64: Improve early-ra handling of reductions

Reply via email to