https://gcc.gnu.org/g:753e5c8a3b04320ae183a7546fb8b926a4678bdb

commit r16-4031-g753e5c8a3b04320ae183a7546fb8b926a4678bdb
Author: liuhongt <[email protected]>
Date:   Thu Sep 18 19:13:22 2025 -0700

    Disable vect unroll for znver2/znver1.
    
    Since it regressed SPEC performance (refer to PR121994), I guess
    it's related to register pressure and can be tuned by adjusting
    reduc_lat_mult_thr. I don't have a Zen2 machine, so for simplicity,
    I'll just disable unrolling in the vectorizer for Zen2.
    
    Also double the reduction counts for vector modes that are split
    under {AVX256,AVX512}_SPLIT_REGS.
    
    gcc/ChangeLog:
    
            PR target/121994
            * config/i386/x86-tune-costs.h (znver2_cost): Set
            vect_unroll_limit to 1.
            (znver1_cost): Ditto.
            * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
            Adjust count number for {AVX256,AVX512}_SPLIT_REGS.

Diff:
---
 gcc/config/i386/i386.cc          | 18 +++++++++++++-----
 gcc/config/i386/x86-tune-costs.h |  4 ++--
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 5ef7c315091d..6eb26cd7b824 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26144,6 +26144,14 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   /* Record number of load/store/gather/scatter in vectorized body.  */
   if (where == vect_body && !m_costing_for_scalar)
     {
+      int scale = 1;
+      if (vectype
+         && ((GET_MODE_SIZE (TYPE_MODE (vectype)) == 64
+             && TARGET_AVX512_SPLIT_REGS)
+             || (GET_MODE_SIZE (TYPE_MODE (vectype)) == 32
+                 && TARGET_AVX256_SPLIT_REGS)))
+       scale = 2;
+
       switch (kind)
        {
          /* Emulated gather/scatter or any scalarization.  */
@@ -26166,7 +26174,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
              /* Handle __builtin_fma.  */
              if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA)
                {
-                 m_num_reduc[X86_REDUC_FMA] += count;
+                 m_num_reduc[X86_REDUC_FMA] += count * scale;
                  break;
                }
 
@@ -26203,12 +26211,12 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                      && (def = SSA_NAME_DEF_STMT (rhs1), true)
                      && is_gimple_assign (def)
                      && gimple_assign_rhs_code (def) == MULT_EXPR)
-                   m_num_reduc[X86_REDUC_FMA] += count;
+                   m_num_reduc[X86_REDUC_FMA] += count * scale;
                  else if (TREE_CODE (rhs2) == SSA_NAME
                           && (def = SSA_NAME_DEF_STMT (rhs2), true)
                           && is_gimple_assign (def)
                           && gimple_assign_rhs_code (def) == MULT_EXPR)
-                   m_num_reduc[X86_REDUC_FMA] += count;
+                   m_num_reduc[X86_REDUC_FMA] += count * scale;
                  break;
 
                  /* Vectorizer lane_reducing_op_p supports DOT_PROX_EXPR,
@@ -26237,7 +26245,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                             ? TARGET_AVX10_2
                             : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2));
                    }
-                 m_num_reduc[X86_REDUC_DOT_PROD] += count;
+                 m_num_reduc[X86_REDUC_DOT_PROD] += count * scale;
 
                  /* Dislike to do unroll and partial sum for
                     emulated DOT_PROD_EXPR.  */
@@ -26246,7 +26254,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                  break;
 
                case SAD_EXPR:
-                 m_num_reduc[X86_REDUC_SAD] += count;
+                 m_num_reduc[X86_REDUC_SAD] += count * scale;
                  break;
 
                default:
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 1649ea2fe3e5..c7a0f6805ca1 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1744,7 +1744,7 @@ struct processor_costs znver1_cost = {
                                           FMA/DOT_PROD_EXPR/SAD_EXPR,
                                           it's used to determine unroll
                                           factor in the vectorizer.  */
-  4,                                   /* Limit how much the autovectorizer
+  1,                                   /* Limit how much the autovectorizer
                                           may unroll a loop.  */
   znver1_memcpy,
   znver1_memset,
@@ -1918,7 +1918,7 @@ struct processor_costs znver2_cost = {
                                           FMA/DOT_PROD_EXPR/SAD_EXPR,
                                           it's used to determine unroll
                                           factor in the vectorizer.  */
-  4,                                   /* Limit how much the autovectorizer
+  1,                                   /* Limit how much the autovectorizer
                                           may unroll a loop.  */
   znver2_memcpy,
   znver2_memset,

Reply via email to