https://gcc.gnu.org/g:e84e5d034124c6733d3b36d8623c56090d4d17f7

commit r15-3767-ge84e5d034124c6733d3b36d8623c56090d4d17f7
Author: Tamar Christina <tamar.christ...@arm.com>
Date:   Sun Sep 22 13:34:10 2024 +0100

    aarch64: Take into account when VF is higher than known scalar iters
    
    Consider low-overhead loops like:
    
    void
    foo (char *restrict a, int *restrict b, int *restrict c, int n)
    {
      for (int i = 0; i < 9; i++)
        {
          int res = c[i];
          int t = b[i];
          if (a[i] != 0)
            res = t;
          c[i] = res;
        }
    }
    
    For such loops we use latency-only costing, since the loop bound is known
    and small.
    
    The current costing, however, does not consider the case where
    niters < VF.
    
    So when comparing the scalar and vector costs, it does not take into
    account that the scalar loop cannot actually run for VF iterations when
    niters < VF.  This overestimates the cost of the scalar loop, and we
    incorrectly vectorize.
    
    This patch takes the minimum of the VF and niters in such cases.
    Before the patch we generate:
    
     note:  Original vector body cost = 46
     note:  Vector loop iterates at most 1 times
     note:  Scalar issue estimate:
     note:    load operations = 2
     note:    store operations = 1
     note:    general operations = 1
     note:    reduction latency = 0
     note:    estimated min cycles per iteration = 1.000000
     note:    estimated cycles per vector iteration (for VF 32) = 32.000000
     note:  SVE issue estimate:
     note:    load operations = 5
     note:    store operations = 4
     note:    general operations = 11
     note:    predicate operations = 12
     note:    reduction latency = 0
     note:    estimated min cycles per iteration without predication = 5.500000
     note:    estimated min cycles per iteration for predication = 12.000000
     note:    estimated min cycles per iteration = 12.000000
     note:  Low iteration count, so using pure latency costs
     note:  Cost model analysis:
    
    vs after:
    
     note:  Original vector body cost = 46
     note:  Known loop bounds, capping VF to 9 for analysis
     note:  Vector loop iterates at most 1 times
     note:  Scalar issue estimate:
     note:    load operations = 2
     note:    store operations = 1
     note:    general operations = 1
     note:    reduction latency = 0
     note:    estimated min cycles per iteration = 1.000000
     note:    estimated cycles per vector iteration (for VF 9) = 9.000000
     note:  SVE issue estimate:
     note:    load operations = 5
     note:    store operations = 4
     note:    general operations = 11
     note:    predicate operations = 12
     note:    reduction latency = 0
     note:    estimated min cycles per iteration without predication = 5.500000
     note:    estimated min cycles per iteration for predication = 12.000000
     note:    estimated min cycles per iteration = 12.000000
     note:  Increasing body cost to 1472 because the scalar code could issue within the limit imposed by predicate operations
     note:  Low iteration count, so using pure latency costs
     note:  Cost model analysis:
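
    To make the decision concrete, here is a small standalone sketch that
    redoes the comparison with the numbers from the dumps above.  It only
    illustrates the arithmetic; the variable names are invented and it is not
    the code in aarch64.cc (that code scales scalar_ops.min_cycles_per_iter ()
    by estimated_vf, as shown in the diff below).

    #include <stdio.h>

    int
    main (void)
    {
      /* "estimated min cycles per iteration" from the scalar issue estimate.  */
      double scalar_cycles_per_iter = 1.0;
      /* "estimated min cycles per iteration" from the SVE issue estimate.  */
      double sve_cycles_per_iter = 12.0;

      /* Before the patch the scalar cost is scaled by the full estimated VF
         of 32, even though only 9 scalar iterations exist.  */
      double scalar_before = scalar_cycles_per_iter * 32;   /* 32.0 > 12.0 */

      /* After the patch the VF used for the comparison is capped at the
         known iteration count of 9.  */
      double scalar_after = scalar_cycles_per_iter * 9;     /* 9.0 < 12.0 */

      printf ("before: scalar %.1f vs vector %.1f cycles\n",
              scalar_before, sve_cycles_per_iter);
      printf ("after:  scalar %.1f vs vector %.1f cycles\n",
              scalar_after, sve_cycles_per_iter);
      return 0;
    }

    With the capped VF the scalar loop no longer appears slower than the SVE
    loop, so the vector body cost is increased and vectorization is no longer
    chosen for this loop.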
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64.cc (adjust_body_cost):
            Cap VF for low iteration loops.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/sve/asrdiv_4.c: Update bounds.
            * gcc.target/aarch64/sve/cond_asrd_2.c: Likewise.
            * gcc.target/aarch64/sve/cond_uxt_6.c: Likewise.
            * gcc.target/aarch64/sve/cond_uxt_7.c: Likewise.
            * gcc.target/aarch64/sve/cond_uxt_8.c: Likewise.
            * gcc.target/aarch64/sve/miniloop_1.c: Likewise.
            * gcc.target/aarch64/sve/spill_6.c: Likewise.
            * gcc.target/aarch64/sve/sve_iters_low_1.c: New test.
            * gcc.target/aarch64/sve/sve_iters_low_2.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.cc                        | 13 +++++++++++++
 gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c      | 12 ++++++------
 gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c   | 12 ++++++------
 gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c    |  8 ++++----
 gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c    |  8 ++++----
 gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c    |  8 ++++----
 gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c    |  2 +-
 gcc/testsuite/gcc.target/aarch64/sve/spill_6.c       |  8 ++++----
 .../gcc.target/aarch64/sve/sve_iters_low_1.c         | 17 +++++++++++++++++
 .../gcc.target/aarch64/sve/sve_iters_low_2.c         | 20 ++++++++++++++++++++
 10 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 92763d403c75..68913beaee20 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17565,6 +17565,19 @@ adjust_body_cost (loop_vec_info loop_vinfo,
     dump_printf_loc (MSG_NOTE, vect_location,
                     "Original vector body cost = %d\n", body_cost);
 
+  /* If we know we have a single partial vector iteration, cap the VF
+     to the number of scalar iterations for costing purposes.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      auto niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+      if (niters < estimated_vf && dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Scalar loop iterates at most %wd times.  Capping VF "
+                        " from %d to %wd\n", niters, estimated_vf, niters);
+
+      estimated_vf = MIN (estimated_vf, niters);
+    }
+
   fractional_cost scalar_cycles_per_iter
     = scalar_ops.min_cycles_per_iter () * estimated_vf;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
index 6684fe1c1244..10a96a894afd 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
@@ -15,12 +15,12 @@
   }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
index e4040ee3520c..db1721efbc7b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
@@ -14,12 +14,12 @@
   }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
index e47276a3a352..b8b3e862d0a1 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T)                    \
-  T (int32_t, uint16_t, 0xff, 3)       \
+  T (int32_t, uint16_t, 0xff, 30)      \
                                        \
-  T (int64_t, uint16_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
index f49915c4ac14..2d02fb70f33f 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T)                    \
-  T (int32_t, uint16_t, 0xff, 3)       \
+  T (int32_t, uint16_t, 0xff, 30)      \
                                        \
-  T (int64_t, uint16_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
index 42eb4b2661b3..8fe2455687b5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T)                    \
-  T (int32_t, uint16_t, 0xff, 3)       \
+  T (int32_t, uint16_t, 0xff, 30)      \
                                        \
-  T (int64_t, uint16_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xff, 5)       \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xff, 50)      \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
index 09eb4146816c..cd1fd2b8a078 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
@@ -6,7 +6,7 @@ void loop (int * __restrict__ a, int * __restrict__ b, int * __restrict__ c,
           int * __restrict__ g, int * __restrict__ h)
 {
   int i = 0;
-  for (i = 0; i < 3; i++)
+  for (i = 0; i < 30; i++)
     {
       a[i] += i;
       b[i] += i;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
index ae9c338f5696..2ff969ced009 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
@@ -11,20 +11,20 @@ void consumer (void *);
   {                                                                    \
     if (which)                                                         \
       {                                                                \
-       for (int i = 0; i < 7; ++i)                                     \
+       for (int i = 0; i < 70; ++i)                                    \
          x1[i] += VAL;                                                 \
        consumer (x1);                                                  \
-       for (int i = 0; i < 7; ++i)                                     \
+       for (int i = 0; i < 70; ++i)                                    \
          x2[i] -= VAL;                                                 \
        consumer (x2);                                                  \
       }                                                                \
     else                                                               \
       {                                                                \
-       for (int i = 0; i < 7; ++i)                                     \
+       for (int i = 0; i < 70; ++i)                                    \
          x3[i] &= VAL;                                                 \
        consumer (x3);                                                  \
       }                                                                \
-    for (int i = 0; i < 7; ++i)                                        \
+    for (int i = 0; i < 70; ++i)                                       \
       x4[i] |= VAL;                                                    \
     consumer (x4);                                                     \
   }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
new file mode 100644
index 000000000000..952a4b1cd580
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n)
+{
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
new file mode 100644
index 000000000000..02d10de2a621
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n, int stride)
+{
+  if (stride <= 1)
+    return;
+
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i*stride];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
