https://gcc.gnu.org/g:e9079e4f43d13579c41110ce1871051a43c577b6

commit r16-2088-ge9079e4f43d13579c41110ce1871051a43c577b6
Author: Richard Biener <rguent...@suse.de>
Date:   Sun May 25 19:29:04 2025 +0200

    add masked-epilogue tuning
    
    The following adds an x86 tuning that enables the use of AVX512
    masked epilogues in cases where we heuristically determine they are
    unlikely to be detrimental.  The problematic cases are when there
    are data streams that are both stored to and loaded from, and an
    outer loop could end up executing only the inner loop's masked
    epilogue; with unlucky data stream advancement from the outer loop
    we can end up needing to forward from masked stores to masked loads.
    This isn't handled very well, especially for the case where unmasked
    operations would not need to forward at all - that is, when
    forwarding happens entirely from the masked-out portion of the store
    (like from the AVX upper half of the store to the AVX lower half of
    a load).  There's also the case where the number of iterations is
    known at compile time; only with cost comparison would we consider a
    non-masked epilogue, and since we are not doing that we have to add
    a heuristic that avoids masking when a single vector epilogue
    iteration would cover all remaining scalar iterations (this is
    exercised by gcc.target/i386/pr110310.c).
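    
    As an illustration (a sketch only, mirroring the new testcase
    vect-mask-epilogue-2.c below; not part of the patch), the
    problematic shape is an inner loop that both loads and stores the
    same data stream while an outer loop advances through it:
    
        void foo (double *a, double b, double c, int n, int m)
        {
          /* With a small 'n' the inner loop may execute only as a
             masked epilogue; the next 'j' iteration then reads
             elements the previous masked store just wrote (or masked
             out), which store forwarding handles badly.  */
          for (int j = 0; j < m; ++j)
            for (int i = 0; i < n; ++i)
              a[j*n + i] = a[j*n + i] * b + c;
        }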
    
    SPEC CPU 2017 shows 3% text size savings over not using masked
    epilogues, with the performance impact in the noise.  Masking all
    vector epilogues raises that to 4% text size savings but causes
    major runtime regressions in 503.bwaves_r and 527.cam4_r (measured
    on a Zen4 system); with the implemented heuristic we leave a 5%
    improvement for 549.fotonik3d_r unrealized.
    
    With the heuristics we turn 22513 vector epilogues plus up to 12305
    scalar epilogues into 12305 masked vector epilogues, of which 574
    are for AVX vector sizes, 79 for SSE vector sizes and the rest for
    AVX512.  When masking all epilogues we get 14567 masked epilogues
    from 29467 vector plus up to 14567 scalar epilogues, so the
    heuristics disable an additional 20% of masked epilogues.
    
            * config/i386/x86-tune.def (X86_TUNE_AVX512_MASKED_EPILOGUES):
            New tunable, default on for m_ZNVER4 and m_ZNVER5.
            * config/i386/i386.cc (ix86_vector_costs::finish_cost): With
            X86_TUNE_AVX512_MASKED_EPILOGUES and when the main loop
            had a vectorization factor > 2 use a masked epilogue when
            possible and when not obviously problematic.
    
            * gcc.target/i386/vect-mask-epilogue-1.c: New testcase.
            * gcc.target/i386/vect-mask-epilogue-2.c: Likewise.
            * gcc.target/i386/vect-epilogues-3.c: Adjust.
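    
    For experimentation independent of -mtune=znver4/znver5, the tunable
    should be controllable like other x86 tuning knobs via the developer
    option -mtune-ctrl.  A sketch in the style of the new testcases (an
    assumption for illustration, not part of the commit):
    
        /* { dg-do compile } */
        /* { dg-options "-O3 -mavx512bw -mtune-ctrl=avx512_masked_epilogues -fdump-tree-vect-optimized" } */
    
        void bar (double *a, double *b, double c, int n, int m)
        {
          for (int j = 0; j < m; ++j)
            for (int i = 0; i < n; ++i)
              a[j*n + i] = b[j*n + i] + c;
        }
    
        /* If the heuristic fires we would expect the vect dump to
           contain "epilogue loop vectorized using masked ... vectors".  */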

Diff:
---
 gcc/config/i386/i386.cc                            | 59 ++++++++++++++++++++++
 gcc/config/i386/x86-tune.def                       |  5 ++
 gcc/testsuite/gcc.target/i386/vect-epilogues-3.c   |  2 +-
 .../gcc.target/i386/vect-mask-epilogue-1.c         | 11 ++++
 .../gcc.target/i386/vect-mask-epilogue-2.c         | 14 +++++
 5 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index fd3f35de14d3..ad7360ec71a4 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26295,6 +26295,65 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
       && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () >= 16)
     m_suggested_epilogue_mode = V8QImode;
 
+  /* When X86_TUNE_AVX512_MASKED_EPILOGUES is enabled try to use
+     a masked epilogue if that doesn't seem detrimental.  */
+  if (loop_vinfo
+      && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () > 2
+      && ix86_tune_features[X86_TUNE_AVX512_MASKED_EPILOGUES]
+      && !OPTION_SET_P (param_vect_partial_vector_usage))
+    {
+      bool avoid = false;
+      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+         && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+       {
+         unsigned int peel_niter
+           = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+         if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+           peel_niter += 1;
+         /* When we know the number of scalar iterations of the epilogue,
+            avoid masking when a single vector epilog iteration handles
+            it in full.  */
+         if (pow2p_hwi ((LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter)
+                        % LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ()))
+           avoid = true;
+       }
+      if (!avoid && loop_outer (loop_outer (LOOP_VINFO_LOOP (loop_vinfo))))
+       for (auto ddr : LOOP_VINFO_DDRS (loop_vinfo))
+         {
+           if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+             ;
+           else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
+             ;
+           else
+             {
+               int loop_depth
+                   = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
+                                         DDR_LOOP_NEST (ddr));
+               if (DDR_NUM_DIST_VECTS (ddr) == 1
+                   && DDR_DIST_VECTS (ddr)[0][loop_depth] == 0)
+                 {
+                   /* Avoid the case when there's an outer loop that might
+                      traverse a multi-dimensional array with the inner
+                      loop just executing the masked epilogue with a
+                      read-write where the next outer iteration might
+                      read from the masked part of the previous write,
+                      'n' filling half a vector.
+                        for (j = 0; j < m; ++j)
+                          for (i = 0; i < n; ++i)
+                            a[j][i] = c * a[j][i];  */
+                   avoid = true;
+                   break;
+                 }
+             }
+         }
+      if (!avoid)
+       {
+         m_suggested_epilogue_mode = loop_vinfo->vector_mode;
+         m_masked_epilogue = 1;
+       }
+    }
+
   vector_costs::finish_cost (scalar_costs);
 }
 
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 91cdca7fbfc2..4773e5dd5ad1 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -639,6 +639,11 @@ DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
 DEF_TUNE (X86_TUNE_AVX512_TWO_EPILOGUES, "avx512_two_epilogues",
          m_ZNVER4 | m_ZNVER5)
 
+/* X86_TUNE_AVX512_MASKED_EPILOGUES: Use masked vector epilogues
+   when they fit.  */
+DEF_TUNE (X86_TUNE_AVX512_MASKED_EPILOGUES, "avx512_masked_epilogues",
+         m_ZNVER4 | m_ZNVER5)
+
 /*****************************************************************************/
 /*****************************************************************************/
 /* Historical relics: tuning flags that helps a specific old CPU designs     */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c
index 0ee610f5e3ef..e88ab30c770f 100644
--- a/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -mavx512bw -mtune=znver4 -fdump-tree-vect-optimized" } */
+/* { dg-options "-O3 -mavx512bw -mtune=znver4 --param 
vect-partial-vector-usage=0 -fdump-tree-vect-optimized" } */
 
 int test (signed char *data, int n)
 {
diff --git a/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-1.c b/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-1.c
new file mode 100644
index 000000000000..55519aa87fdc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=znver5 -fdump-tree-vect-optimized" } */
+
+void bar (double *a, double *b, double c, int n, int m)
+{
+  for (int j = 0; j < m; ++j)
+    for (int i = 0; i < n; ++i)
+      a[j*n + i] = b[j*n + i] + c;
+}
+
+/* { dg-final { scan-tree-dump "epilogue loop vectorized using masked 64 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-2.c b/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-2.c
new file mode 100644
index 000000000000..3dc28b39b625
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=znver5 -fdump-tree-vect-optimized" } */
+
+void foo (double *a, double b, double c, int n, int m)
+{
+  for (int j = 0; j < m; ++j)
+    for (int i = 0; i < n; ++i)
+      a[j*n + i] = a[j*n + i] * b + c;
+}
+
+/* We do not want to use a masked epilogue for the inner loop as the next
+   outer iteration will possibly immediately read from elements masked off
+   in the previous inner loop epilogue, and that never forwards.  */
+/* { dg-final { scan-tree-dump "epilogue loop vectorized using 32 byte vectors" "vect" } } */
