[gcc r15-6752] AArch64: Fix costing of emulated gathers/scatters [PR118188]

Tamar Christina via Gcc-cvs Thu, 09 Jan 2025 13:32:08 -0800

https://gcc.gnu.org/g:08b6e875c6b1b52c6e98f4a2e37124bf8c6a6ccb


commit r15-6752-g08b6e875c6b1b52c6e98f4a2e37124bf8c6a6ccb
Author: Tamar Christina <tamar.christ...@arm.com>
Date:   Thu Jan 9 21:31:05 2025 +0000

    AArch64: Fix costing of emulated gathers/scatters [PR118188]
    
    When a target does not support gathers and scatters the vectorizer tries to
    emulate these using scalar loads/stores and a reconstruction of vectors from
    scalars.
    
    The loads are still marked with VMAT_GATHER_SCATTER to indicate that they are
    gather/scatters.  However, the vectorizer also asks the target to cost the
    instruction that generates the indexes for the emulated instructions.
    
    This is done by asking the target to cost vec_to_scalar and vec_construct 
with
    a stmt_vinfo being the VMAT_GATHER_SCATTER.
    
    Since Adv. SIMD does not have an LD1 variant that takes an Adv. SIMD Scalar
    element the operation is lowered entirely into a sequence of GPR loads to 
create
    the x registers for the indexes.
    
    At the moment however we don't cost these, and so the vectorizer thinks that
    emulating the instructions is much cheaper than using an actual
    gather/scatter with SVE.  Consider:
    
    #define iterations 100000
    #define LEN_1D 32000
    
    float a[LEN_1D], b[LEN_1D];
    
    float
    s4115 (int *ip)
    {
        float sum = 0.;
        for (int i = 0; i < LEN_1D; i++)
            {
                sum += a[i] * b[ip[i]];
            }
        return sum;
    }
    
    which before this patch with -mcpu=<sve-core> generates:
    
    .L2:
            add     x3, x0, x1
            ldrsw   x4, [x0, x1]
            ldrsw   x6, [x3, 4]
            ldpsw   x3, x5, [x3, 8]
            ldr     s1, [x2, x4, lsl 2]
            ldr     s30, [x2, x6, lsl 2]
            ldr     s31, [x2, x5, lsl 2]
            ldr     s29, [x2, x3, lsl 2]
            uzp1    v30.2s, v30.2s, v31.2s
            ldr     q31, [x7, x1]
            add     x1, x1, 16
            uzp1    v1.2s, v1.2s, v29.2s
            zip1    v30.4s, v1.4s, v30.4s
            fmla    v0.4s, v31.4s, v30.4s
            cmp     x1, x8
            bne     .L2
    
    but during costing:
    
    a[i_18] 1 times vector_load costs 4 in body
    *_4 1 times unaligned_load (misalign -1) costs 4 in body
    b[_5] 4 times vec_to_scalar costs 32 in body
    b[_5] 4 times scalar_load costs 16 in body
    b[_5] 1 times vec_construct costs 3 in body
    _1 * _6 1 times vector_stmt costs 2 in body
    _7 + sum_16 1 times scalar_to_vec costs 4 in prologue
    _7 + sum_16 1 times vector_stmt costs 2 in epilogue
    _7 + sum_16 1 times vec_to_scalar costs 4 in epilogue
    _7 + sum_16 1 times vector_stmt costs 2 in body
    
    Here we see that the latency for the vec_to_scalar is very high.  We know 
the
    intermediate vector isn't usable by the target ISA and will always be 
elided.
    However these latencies need to remain high because when costing 
gather/scatters
    IFNs we still pass the nunits of the type along.  In other words, the 
vectorizer
    is still costing vector gather/scatters as scalar load/stores.
    
    Lowering the cost for the emulated gathers would result in emulation being
    seemingly cheaper.  So while the emulated costs are very high, they need to 
be
    higher than those for the IFN costing.
    
    i.e. the vectorizer generates:
    
      vect__5.9_8 = MEM <vector(4) intD.7> [(intD.7 *)vectp_ip.7_14];
      _35 = BIT_FIELD_REF <vect__5.9_8, 32, 0>;
      _36 = (sizetype) _35;
      _37 = _36 * 4;
      _38 = _34 + _37;
      _39 = (voidD.55 *) _38;
      # VUSE <.MEM_10(D)>
      _40 = MEM[(floatD.32 *)_39];
    
    which after IVopts is:
    
      _63 = &MEM <vector(4) int> [(int *)ip_11(D) + ivtmp.19_27 * 1];
      _47 = BIT_FIELD_REF <MEM <vector(4) int> [(int *)_63], 32, 64>;
      _41 = BIT_FIELD_REF <MEM <vector(4) int> [(int *)_63], 32, 32>;
      _35 = BIT_FIELD_REF <MEM <vector(4) int> [(int *)_63], 32, 0>;
      _53 = BIT_FIELD_REF <MEM <vector(4) int> [(int *)_63], 32, 96>;
    
    Which we correctly lower in RTL to individual loads to avoid the repeated 
umov.
    
    As such, we should cost the vec_to_scalar as GPR loads and also do so for 
the
    throughput which we at the moment cost as:
    
      note:  Vector issue estimate:
      note:    load operations = 6
      note:    store operations = 0
      note:    general operations = 6
      note:    reduction latency = 2
      note:    estimated min cycles per iteration = 2.000000
    
    Which means 3 loads for the GPR indexes are missing, making it seem like the
    emulated loop has a much lower cycles per iter than it actually does since the
    bottleneck on the load units is not modelled.
    
    But worse, because the vectorizer costs gathers/scatters IFNs as scalar
    load/stores the number of loads required for an SVE gather is always much
    higher than the equivalent emulated variant.
    
    gcc/ChangeLog:
    
            PR target/118188
            * config/aarch64/aarch64.cc (aarch64_vector_costs::count_ops): 
Adjust
            throughput of emulated gather and scatters.
    
    gcc/testsuite/ChangeLog:
    
            PR target/118188
            * gcc.target/aarch64/sve/gather_load_12.c: New test.
            * gcc.target/aarch64/sve/gather_load_13.c: New test.
            * gcc.target/aarch64/sve/gather_load_14.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.cc                      | 41 ++++++++++++++++++++++
 .../gcc.target/aarch64/sve/gather_load_12.c        | 20 +++++++++++
 .../gcc.target/aarch64/sve/gather_load_13.c        | 20 +++++++++++
 .../gcc.target/aarch64/sve/gather_load_14.c        | 20 +++++++++++
 4 files changed, 101 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 78d2cc4bbe49..6fe0fa2722bd 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17388,6 +17388,47 @@ aarch64_vector_costs::count_ops (unsigned int count, 
vect_cost_for_stmt kind,
        return;
     }
 
+  /* Detect the case where we are using an emulated gather/scatter.  When a
+     target does not support gathers and scatters directly the vectorizer
+     emulates these by constructing an index vector and then issuing an
+     extraction for every lane in the vector.  If the index vector is loaded
+     from memory, the vector load and extractions are subsequently lowered by
+     veclower into a series of scalar index loads.  After the final loads are
+     done it issues a vec_construct to recreate the vector from the scalar.  
For
+     costing when we see a vec_to_scalar on a stmt with VMAT_GATHER_SCATTER we
+     are dealing with an emulated instruction and should adjust costing
+     properly.  */
+  if (kind == vec_to_scalar
+      && (m_vec_flags & VEC_ADVSIMD)
+      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+    {
+      auto dr = STMT_VINFO_DATA_REF (stmt_info);
+      tree dr_ref = DR_REF (dr);
+      while (handled_component_p (dr_ref))
+       {
+         if (TREE_CODE (dr_ref) == ARRAY_REF)
+           {
+             tree offset = TREE_OPERAND (dr_ref, 1);
+             if (SSA_VAR_P (offset))
+               {
+                 if (gimple_vuse (SSA_NAME_DEF_STMT (offset)))
+                   {
+                     if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
+                       ops->loads += count - 1;
+                     else
+                         /* Stores want to count both the index to array and 
data to
+                            array using vec_to_scalar.  However we have index 
stores
+                            in Adv.SIMD and so we only want to adjust the index
+                            loads.  */
+                       ops->loads += count / 2;
+                     return;
+                   }
+                 break;
+               }
+           }
+         dr_ref = TREE_OPERAND (dr_ref, 0);
+       }
+    }
 
   /* Count the basic operation cost associated with KIND.  */
   switch (kind)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c 
b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c
new file mode 100644
index 000000000000..d550f005d638
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_12.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
+
+#define iterations 100000
+#define LEN_1D 32000
+
+float a[LEN_1D], b[LEN_1D];
+
+float
+s4115 (int *ip)
+{
+    float sum = 0.;
+    for (int i = 0; i < LEN_1D; i++)
+      {
+        sum += a[i] * b[ip[i]];
+      }
+    return sum;
+}
+
+/* { dg-final { scan-assembler {\s+ld1w\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c 
b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c
new file mode 100644
index 000000000000..24da0646a75e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_13.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
+
+#define iterations 100000
+#define LEN_1D 32000
+
+float a[LEN_1D], b[LEN_1D];
+
+float
+s4115 (int *ip)
+{
+    float sum = 0.;
+    for (int i = 0; i < LEN_1D; i++)
+      {
+        sum += a[i] * b[ip[i] + 1];
+      }
+    return sum;
+}
+
+/* { dg-final { scan-assembler {\s+ld1w\t} { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c 
b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c
new file mode 100644
index 000000000000..77d06d2a0e0e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_14.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */
+
+#define iterations 100000
+#define LEN_1D 32000
+
+float a[LEN_1D], b[LEN_1D];
+
+float
+s4115 (int *ip)
+{
+    float sum = 0.;
+    for (int i = 0; i < LEN_1D; i++)
+      {
+        sum += a[i] * b[ip[i]];
+      }
+    return sum;
+}
+
+/* { dg-final { scan-assembler-not {\s+st1w\t} } } */

[gcc r15-6752] AArch64: Fix costing of emulated gathers/scatters [PR118188]

Reply via email to