https://gcc.gnu.org/g:a50916a6c0a6c73c1537d033509d4f7034341f75

commit r15-2768-ga50916a6c0a6c73c1537d033509d4f7034341f75
Author: Tamar Christina <tamar.christ...@arm.com>
Date:   Tue Aug 6 22:41:10 2024 +0100

    AArch64: take gather/scatter decode overhead into account
    
    Gathers and scatters are not usually beneficial when the loop count is small.
    This is because there's not only a cost to their execution within the loop but
    there is also some cost to enter loops with them.
    
    As such this patch models this overhead.  For generic tuning we however still
    prefer gathers/scatters when the loop costs work out.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-protos.h (struct sve_vec_cost): Add
            gather_load_x32_init_cost and gather_load_x64_init_cost.
            * config/aarch64/aarch64.cc (aarch64_vector_costs): Add
            m_sve_gather_scatter_init_cost.
            (aarch64_vector_costs::add_stmt_cost): Use them.
            (aarch64_vector_costs::finish_cost): Likewise.
            * config/aarch64/tuning_models/a64fx.h: Update.
            * config/aarch64/tuning_models/cortexx925.h: Update.
            * config/aarch64/tuning_models/generic.h: Update.
            * config/aarch64/tuning_models/generic_armv8_a.h: Update.
            * config/aarch64/tuning_models/generic_armv9_a.h: Update.
            * config/aarch64/tuning_models/neoverse512tvb.h: Update.
            * config/aarch64/tuning_models/neoversen2.h: Update.
            * config/aarch64/tuning_models/neoversen3.h: Update.
            * config/aarch64/tuning_models/neoversev1.h: Update.
            * config/aarch64/tuning_models/neoversev2.h: Update.
            * config/aarch64/tuning_models/neoversev3.h: Update.
            * config/aarch64/tuning_models/neoversev3ae.h: Update.

Diff:
---
 gcc/config/aarch64/aarch64-protos.h                | 10 +++++++++
 gcc/config/aarch64/aarch64.cc                      | 26 ++++++++++++++++++++++
 gcc/config/aarch64/tuning_models/a64fx.h           |  2 ++
 gcc/config/aarch64/tuning_models/cortexx925.h      |  2 ++
 gcc/config/aarch64/tuning_models/generic.h         |  2 ++
 gcc/config/aarch64/tuning_models/generic_armv8_a.h |  2 ++
 gcc/config/aarch64/tuning_models/generic_armv9_a.h |  2 ++
 gcc/config/aarch64/tuning_models/neoverse512tvb.h  |  2 ++
 gcc/config/aarch64/tuning_models/neoversen2.h      |  2 ++
 gcc/config/aarch64/tuning_models/neoversen3.h      |  2 ++
 gcc/config/aarch64/tuning_models/neoversev1.h      |  2 ++
 gcc/config/aarch64/tuning_models/neoversev2.h      |  2 ++
 gcc/config/aarch64/tuning_models/neoversev3.h      |  2 ++
 gcc/config/aarch64/tuning_models/neoversev3ae.h    |  2 ++
 14 files changed, 60 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f64afe288901..44b881b5c57a 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -262,6 +262,8 @@ struct sve_vec_cost : simd_vec_cost
                          unsigned int fadda_f64_cost,
                          unsigned int gather_load_x32_cost,
                          unsigned int gather_load_x64_cost,
+                         unsigned int gather_load_x32_init_cost,
+                         unsigned int gather_load_x64_init_cost,
                          unsigned int scatter_store_elt_cost)
     : simd_vec_cost (base),
       clast_cost (clast_cost),
@@ -270,6 +272,8 @@ struct sve_vec_cost : simd_vec_cost
       fadda_f64_cost (fadda_f64_cost),
       gather_load_x32_cost (gather_load_x32_cost),
       gather_load_x64_cost (gather_load_x64_cost),
+      gather_load_x32_init_cost (gather_load_x32_init_cost),
+      gather_load_x64_init_cost (gather_load_x64_init_cost),
       scatter_store_elt_cost (scatter_store_elt_cost)
   {}
 
@@ -289,6 +293,12 @@ struct sve_vec_cost : simd_vec_cost
   const int gather_load_x32_cost;
   const int gather_load_x64_cost;
 
+  /* Additional loop initialization cost of using a gather load instruction.
+     The x32 value is for loads of 32-bit elements and the x64 value is for
+     loads of 64-bit elements.  */
+  const int gather_load_x32_init_cost;
+  const int gather_load_x64_init_cost;
+
   /* The per-element cost of a scatter store.  */
   const int scatter_store_elt_cost;
 };
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9e12bd9711cd..2ac5a22c848e 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16231,6 +16231,10 @@ private:
      supported by Advanced SIMD and SVE2.  */
   bool m_has_avg = false;
 
+  /* Additional initialization costs for using gather or scatter operation in
+     the current loop.  */
+  unsigned int m_sve_gather_scatter_init_cost = 0;
+
   /* True if the vector body contains a store to a decl and if the
      function is known to have a vld1 from the same decl.
 
@@ -17295,6 +17299,23 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
        stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
                                                        stmt_info, vectype,
                                                        where, stmt_cost);
+
+      /* Check if we've seen an SVE gather/scatter operation and which size.  */
+      if (kind == scalar_load
+         && aarch64_sve_mode_p (TYPE_MODE (vectype))
+         && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
+       {
+         const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
+         if (sve_costs)
+           {
+             if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
+               m_sve_gather_scatter_init_cost
+                 += sve_costs->gather_load_x64_init_cost;
+             else
+               m_sve_gather_scatter_init_cost
+                 += sve_costs->gather_load_x32_init_cost;
+           }
+       }
     }
 
   /* Do any SVE-specific adjustments to the cost.  */
@@ -17680,6 +17701,11 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
       m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
                                             m_costs[vect_body]);
       m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+
+      /* For gather and scatters there's an additional overhead for the first
+        iteration.  For low count loops they're not beneficial so model the
+        overhead as loop prologue costs.  */
+      m_costs[vect_prologue] += m_sve_gather_scatter_init_cost;
     }
 
   /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
diff --git a/gcc/config/aarch64/tuning_models/a64fx.h b/gcc/config/aarch64/tuning_models/a64fx.h
index 6091289d4c3c..378a1b3889ee 100644
--- a/gcc/config/aarch64/tuning_models/a64fx.h
+++ b/gcc/config/aarch64/tuning_models/a64fx.h
@@ -104,6 +104,8 @@ static const sve_vec_cost a64fx_sve_vector_cost =
   13, /* fadda_f64_cost  */
   64, /* gather_load_x32_cost  */
   32, /* gather_load_x64_cost  */
+  0, /* gather_load_x32_init_cost  */
+  0, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
index 6cae5b7de5ca..b509cae75841 100644
--- a/gcc/config/aarch64/tuning_models/cortexx925.h
+++ b/gcc/config/aarch64/tuning_models/cortexx925.h
@@ -135,6 +135,8 @@ static const sve_vec_cost cortexx925_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/generic.h b/gcc/config/aarch64/tuning_models/generic.h
index 2b1f68b30521..101969bdbb9c 100644
--- a/gcc/config/aarch64/tuning_models/generic.h
+++ b/gcc/config/aarch64/tuning_models/generic.h
@@ -105,6 +105,8 @@ static const sve_vec_cost generic_sve_vector_cost =
   2, /* fadda_f64_cost  */
   4, /* gather_load_x32_cost  */
   2, /* gather_load_x64_cost  */
+  12, /* gather_load_x32_init_cost  */
+  4, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
index b38b9a8c5cad..b5088afe068a 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
@@ -106,6 +106,8 @@ static const sve_vec_cost generic_armv8_a_sve_vector_cost =
   2, /* fadda_f64_cost  */
   4, /* gather_load_x32_cost  */
   2, /* gather_load_x64_cost  */
+  12, /* gather_load_x32_init_cost  */
+  4, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
index 7156dbe5787e..999985ed40f6 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
@@ -136,6 +136,8 @@ static const sve_vec_cost generic_armv9_a_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   3 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
index 825c6a64990b..d2a0b647791d 100644
--- a/gcc/config/aarch64/tuning_models/neoverse512tvb.h
+++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
@@ -79,6 +79,8 @@ static const sve_vec_cost neoverse512tvb_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   3 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
index d41e714aa045..1a5b66901b5c 100644
--- a/gcc/config/aarch64/tuning_models/neoversen2.h
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -135,6 +135,8 @@ static const sve_vec_cost neoversen2_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   3 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversen3.h b/gcc/config/aarch64/tuning_models/neoversen3.h
index c027cefbd2fd..3e2b84ca497e 100644
--- a/gcc/config/aarch64/tuning_models/neoversen3.h
+++ b/gcc/config/aarch64/tuning_models/neoversen3.h
@@ -135,6 +135,8 @@ static const sve_vec_cost neoversen3_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
index 0fc41ce6a41b..705ed025730f 100644
--- a/gcc/config/aarch64/tuning_models/neoversev1.h
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -126,6 +126,8 @@ static const sve_vec_cost neoversev1_sve_vector_cost =
   8, /* fadda_f64_cost  */
   32, /* gather_load_x32_cost  */
   16, /* gather_load_x64_cost  */
+  96, /* gather_load_x32_init_cost  */
+  32, /* gather_load_x64_init_cost  */
   3 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index bd259a37e9c9..1ebb96b296d3 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -135,6 +135,8 @@ static const sve_vec_cost neoversev2_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   3 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h b/gcc/config/aarch64/tuning_models/neoversev3.h
index c602d067c711..c91e8c829532 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3.h
@@ -135,6 +135,8 @@ static const sve_vec_cost neoversev3_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h b/gcc/config/aarch64/tuning_models/neoversev3ae.h
index 96d7ccf03cd9..61e439326eb6 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
@@ -135,6 +135,8 @@ static const sve_vec_cost neoversev3ae_sve_vector_cost =
      operation more than a 64-bit gather.  */
   14, /* gather_load_x32_cost  */
   12, /* gather_load_x64_cost  */
+  42, /* gather_load_x32_init_cost  */
+  24, /* gather_load_x64_init_cost  */
   1 /* scatter_store_elt_cost  */
 };

Reply via email to