[gcc r15-9778] Fix regression from x86 multi-epilogue tuning

Richard Biener via Gcc-cvs Fri, 06 Jun 2025 02:38:30 -0700

https://gcc.gnu.org/g:e93f02828faf7dc0df6a4d67b1b6b2a30bd713cb


commit r15-9778-ge93f02828faf7dc0df6a4d67b1b6b2a30bd713cb
Author: Richard Biener <rguent...@suse.de>
Date:   Wed May 14 16:36:29 2025 +0200

    Fix regression from x86 multi-epilogue tuning
    
    With the avx512_two_epilogues tuning enabled for zen4 and zen5
    the gcc.target/i386/vect-epilogues-5.c testcase below regresses
    and ends up using AVX2 sized vectors for the masked epilogue
    rather than AVX512 sized vectors.  The following patch rectifies
    this and adds coverage for the intended behavior.
    
            * config/i386/i386.cc (ix86_vector_costs::finish_cost):
            Do not suggest a first epilogue mode for AVX512 sized
            main loops with X86_TUNE_AVX512_TWO_EPILOGUES as that
            interferes with using a masked epilogue.
    
            * gcc.target/i386/vect-epilogues-1.c: New testcase.
            * gcc.target/i386/vect-epilogues-2.c: Likewise.
            * gcc.target/i386/vect-epilogues-3.c: Likewise.
            * gcc.target/i386/vect-epilogues-4.c: Likewise.
            * gcc.target/i386/vect-epilogues-5.c: Likewise.
    
    (cherry picked from commit 75c7f90bfe6fa8e6c1a70b784e98a3412861646d)

Diff:
---
 gcc/config/i386/i386.cc                          | 10 +++-------
 gcc/testsuite/gcc.target/i386/vect-epilogues-1.c | 14 ++++++++++++++
 gcc/testsuite/gcc.target/i386/vect-epilogues-2.c | 15 +++++++++++++++
 gcc/testsuite/gcc.target/i386/vect-epilogues-3.c | 15 +++++++++++++++
 gcc/testsuite/gcc.target/i386/vect-epilogues-4.c | 13 +++++++++++++
 gcc/testsuite/gcc.target/i386/vect-epilogues-5.c | 13 +++++++++++++
 6 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 38df84f7db24..a6f0a582c3d2 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25545,14 +25545,10 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
   /* When X86_TUNE_AVX512_TWO_EPILOGUES is enabled arrange for both
      a AVX2 and a SSE epilogue for AVX512 vectorized loops.  */
   if (loop_vinfo
+      && LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32
       && ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES])
-    {
-      if (GET_MODE_SIZE (loop_vinfo->vector_mode) == 64)
-       m_suggested_epilogue_mode = V32QImode;
-      else if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
-              && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32)
-       m_suggested_epilogue_mode = V16QImode;
-    }
+    m_suggested_epilogue_mode = V16QImode;
   /* When a 128bit SSE vectorized epilogue still has a VF of 16 or larger
      enable a 64bit SSE epilogue.  */
   if (loop_vinfo
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-1.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-1.c
new file mode 100644
index 000000000000..a7f5f12c71bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -mno-avx512f -mtune=generic -fdump-tree-vect-optimized" } */
+
+int test (signed char *data, int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; ++i)
+    sum += data[i];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "loop vectorized using 32 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 8 byte vectors" "vect" { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-2.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-2.c
new file mode 100644
index 000000000000..d6c06edcacd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512bw -mtune=generic -fdump-tree-vect-optimized" } */
+
+int test (signed char *data, int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; ++i)
+    sum += data[i];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 32 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump-not "loop vectorized using 16 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump-not "loop vectorized using 8 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c
new file mode 100644
index 000000000000..0ee610f5e3ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512bw -mtune=znver4 -fdump-tree-vect-optimized" } */
+
+int test (signed char *data, int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; ++i)
+    sum += data[i];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 32 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 8 byte vectors" "vect" { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-4.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-4.c
new file mode 100644
index 000000000000..498db6b5a13c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-4.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512bw -mtune=generic --param vect-partial-vector-usage=1 -fdump-tree-vect-optimized" } */
+
+int test (signed char *data, int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; ++i)
+    sum += data[i];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-not "loop vectorized using 32 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-5.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-5.c
new file mode 100644
index 000000000000..6772cabeb4a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-5.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512bw -mtune=znver4 --param vect-partial-vector-usage=1 -fdump-tree-vect-optimized" } */
+
+int test (signed char *data, int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; ++i)
+    sum += data[i];
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-not "loop vectorized using 32 byte vectors" "vect" } } */

[gcc r15-9778] Fix regression from x86 multi-epilogue tuning

Reply via email to