On Tue, Sep 23, 2025 at 8:17 AM liuhongt <[email protected]> wrote:
>
> Update in V2:
> 1. Also disable vectorizer unrolling for Znver1.
> 2. Double the count for {AVX512,AVX256}_SPLIT_REGS for the corresponding
> vector length.
>
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
>
> Since it regressed SPEC performance (refer to PR121994), I guess
> it's related to register pressure and can be tuned by adjusting
> reduc_lat_mult_thr. I don't have a Zen2 machine, so for simplicity, I'll
> just disable unrolling in the vectorizer for Zen2.
>
> Also adjust count number for {AVX256,AVX512}_SPLIT_REGS.

LGTM.

Richard.

> gcc/ChangeLog:
>
>         PR target/121994
>         * config/i386/x86-tune-costs.h (znver2_cost): Set
>         vect_unroll_limit to 1.
>         (znver1_cost): Ditto.
>         * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
>         Adjust count number for {AVX256,AVX512}_SPLIT_REGS.
> ---
>  gcc/config/i386/i386.cc          | 18 +++++++++++++-----
>  gcc/config/i386/x86-tune-costs.h |  4 ++--
>  2 files changed, 15 insertions(+), 7 deletions(-)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 5ef7c315091..6eb26cd7b82 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -26144,6 +26144,14 @@ ix86_vector_costs::add_stmt_cost (int count, 
> vect_cost_for_stmt kind,
>    /* Record number of load/store/gather/scatter in vectorized body.  */
>    if (where == vect_body && !m_costing_for_scalar)
>      {
> +      int scale = 1;
> +      if (vectype
> +         && ((GET_MODE_SIZE (TYPE_MODE (vectype)) == 64
> +             && TARGET_AVX512_SPLIT_REGS)
> +             || (GET_MODE_SIZE (TYPE_MODE (vectype)) == 32
> +                 && TARGET_AVX256_SPLIT_REGS)))
> +       scale = 2;
> +
>        switch (kind)
>         {
>           /* Emulated gather/scatter or any scalarization.  */
> @@ -26166,7 +26174,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
> vect_cost_for_stmt kind,
>               /* Handle __builtin_fma.  */
>               if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA)
>                 {
> -                 m_num_reduc[X86_REDUC_FMA] += count;
> +                 m_num_reduc[X86_REDUC_FMA] += count * scale;
>                   break;
>                 }
>
> @@ -26203,12 +26211,12 @@ ix86_vector_costs::add_stmt_cost (int count, 
> vect_cost_for_stmt kind,
>                       && (def = SSA_NAME_DEF_STMT (rhs1), true)
>                       && is_gimple_assign (def)
>                       && gimple_assign_rhs_code (def) == MULT_EXPR)
> -                   m_num_reduc[X86_REDUC_FMA] += count;
> +                   m_num_reduc[X86_REDUC_FMA] += count * scale;
>                   else if (TREE_CODE (rhs2) == SSA_NAME
>                            && (def = SSA_NAME_DEF_STMT (rhs2), true)
>                            && is_gimple_assign (def)
>                            && gimple_assign_rhs_code (def) == MULT_EXPR)
> -                   m_num_reduc[X86_REDUC_FMA] += count;
> +                   m_num_reduc[X86_REDUC_FMA] += count * scale;
>                   break;
>
>                   /* Vectorizer lane_reducing_op_p supports DOT_PROX_EXPR,
> @@ -26237,7 +26245,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
> vect_cost_for_stmt kind,
>                              ? TARGET_AVX10_2
>                              : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2));
>                     }
> -                 m_num_reduc[X86_REDUC_DOT_PROD] += count;
> +                 m_num_reduc[X86_REDUC_DOT_PROD] += count * scale;
>
>                   /* Dislike to do unroll and partial sum for
>                      emulated DOT_PROD_EXPR.  */
> @@ -26246,7 +26254,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
> vect_cost_for_stmt kind,
>                   break;
>
>                 case SAD_EXPR:
> -                 m_num_reduc[X86_REDUC_SAD] += count;
> +                 m_num_reduc[X86_REDUC_SAD] += count * scale;
>                   break;
>
>                 default:
> diff --git a/gcc/config/i386/x86-tune-costs.h 
> b/gcc/config/i386/x86-tune-costs.h
> index 1649ea2fe3e..c7a0f6805ca 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -1744,7 +1744,7 @@ struct processor_costs znver1_cost = {
>                                            FMA/DOT_PROD_EXPR/SAD_EXPR,
>                                            it's used to determine unroll
>                                            factor in the vectorizer.  */
> -  4,                                   /* Limit how much the autovectorizer
> +  1,                                   /* Limit how much the autovectorizer
>                                            may unroll a loop.  */
>    znver1_memcpy,
>    znver1_memset,
> @@ -1918,7 +1918,7 @@ struct processor_costs znver2_cost = {
>                                            FMA/DOT_PROD_EXPR/SAD_EXPR,
>                                            it's used to determine unroll
>                                            factor in the vectorizer.  */
> -  4,                                   /* Limit how much the autovectorizer
> +  1,                                   /* Limit how much the autovectorizer
>                                            may unroll a loop.  */
>    znver2_memcpy,
>    znver2_memset,
> --
> 2.34.1
>

Reply via email to