On Tue, 29 Jul 2025, liuhongt wrote:

> The patch is trying to unroll the vectorized loop when there are
> FMA/DOT_PROD_EXPR reductions; it will break the cross-iteration
> dependence and enable more parallelism (since the vectorizer will
> also enable partial sums).
> 
> When there's gather/scatter or scalarization in the loop, don't do the
> unroll since the performance bottleneck is not at the reduction.
> 
> The unroll factor is set as
> 
> +       unsigned unroll_factor
> +         = 1 << ceil_log2 (issue_rate / m_num_reduction);
> +
> +       /* Default unroll limit 4.  */
> +       m_suggested_unroll_factor
> +         = MIN ((unsigned) ix86_vect_unroll_limit, unroll_factor);
> 
> Note when DOT_PROD_EXPR is not native support,
> m_num_reduction += 3 * count which almost prevents unroll.
> 
> There's a performance boost for simple benchmarks with a
> DOT_PROD_EXPR/FMA chain, and a slight improvement in SPEC2017
> performance.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Any comments?

Few comments inline below - thanks for looking at this, IIRC this
has come up before.

> gcc/ChangeLog:
> 
>       * config/i386/i386.cc (ix86_vector_costs::ix86_vector_costs):
>       Add new members m_num_reduction, m_prefer_unroll.
>       (ix86_vector_costs::add_stmt_cost): Set m_prefer_unroll and
>       m_num_reduction.
>       (ix86_vector_costs::finish_cost): Determine
>       m_suggested_unroll_factor with consideration of issue_rate,
>       m_num_reduction and ix86_vect_unroll_limit.
>       * config/i386/i386.opt: Add -param=ix86-vect-unroll-limit.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/i386/vect_unroll-1.c: New test.
>       * gcc.target/i386/vect_unroll-2.c: New test.
>       * gcc.target/i386/vect_unroll-3.c: New test.
>       * gcc.target/i386/vect_unroll-4.c: New test.
>       * gcc.target/i386/vect_unroll-5.c: New test.
> ---
>  gcc/config/i386/i386.cc                       | 138 +++++++++++++++++-
>  gcc/config/i386/i386.opt                      |   4 +
>  gcc/testsuite/gcc.target/i386/vect_unroll-1.c |  12 ++
>  gcc/testsuite/gcc.target/i386/vect_unroll-2.c |  12 ++
>  gcc/testsuite/gcc.target/i386/vect_unroll-3.c |  12 ++
>  gcc/testsuite/gcc.target/i386/vect_unroll-4.c |  12 ++
>  gcc/testsuite/gcc.target/i386/vect_unroll-5.c |  13 ++
>  7 files changed, 200 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-5.c
> 
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 49bd3939eb4..801f916b161 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -25762,15 +25762,20 @@ private:
>    unsigned m_num_sse_needed[3];
>    /* Number of 256-bit vector permutation.  */
>    unsigned m_num_avx256_vec_perm[3];
> +  /* Number of reductions for FMA/DOT_PROD_EXPR.  */
> +  unsigned m_num_reduction;
> +  /* Number of load/store/gather/scatter in loop body.  */
> +  bool m_prefer_unroll;

The comment doesn't match the bool type.

>  };
>  
>  ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool 
> costing_for_scalar)
>    : vector_costs (vinfo, costing_for_scalar),
>      m_num_gpr_needed (),
>      m_num_sse_needed (),
> -    m_num_avx256_vec_perm ()
> -{
> -}
> +    m_num_avx256_vec_perm (),
> +    m_num_reduction (),
> +    m_prefer_unroll (true)
> +{}
>  
>  /* Implement targetm.vectorize.create_costs.  */
>  
> @@ -26067,6 +26072,118 @@ ix86_vector_costs::add_stmt_cost (int count, 
> vect_cost_for_stmt kind,
>       }
>      }
>  
> +  /* Record number of load/store/gather/scatter in vectorized body.  */
> +  if (where == vect_body && !m_costing_for_scalar)
> +    {
> +      switch (kind)
> +     {
> +       /* Emulated gather/scatter or any scalarization.  */
> +     case scalar_load:
> +     case scalar_stmt:
> +     case scalar_store:
> +     case vector_gather_load:
> +     case vector_scatter_store:
> +       m_prefer_unroll = false;
> +       break;
> +
> +     case vector_stmt:
> +     case vec_to_scalar:
> +       /* Count number of reduction FMA and "real" DOT_PROD_EXPR,
> +          unroll in the vectorizer will enable partial sum.  */
> +       if (stmt_info
> +           && vect_is_reduction (stmt_info)
> +           && stmt_info->stmt)
> +         {
> +           /* Handle __builtin_fma.  */
> +           if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA)
> +             {
> +               m_num_reduction += count;
> +               break;
> +             }
> +
> +           if (gimple_code (stmt_info->stmt) != GIMPLE_ASSIGN)

is_gimple_assign (stmt_info->stmt)

> +             break;
> +
> +           tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
> +           machine_mode inner_mode = GET_MODE_INNER (mode);
> +           tree rhs1, rhs2;
> +           bool native_vnni_p = true;
> +           gimple* def;
> +           machine_mode mode_rhs;
> +           switch (subcode)
> +             {
> +             case PLUS_EXPR:
> +             case MINUS_EXPR:
> +               if (!fp || !flag_associative_math
> +                   || flag_fp_contract_mode != FP_CONTRACT_FAST)
> +                 break;
> +
> +               /* FMA condition for different modes.  */
> +               if (((inner_mode == DFmode || inner_mode == SFmode)
> +                    && !TARGET_FMA && !TARGET_AVX512VL)
> +                   || (inner_mode == HFmode && !TARGET_AVX512FP16)
> +                   || (inner_mode == BFmode && !TARGET_AVX10_2))
> +                 break;
> +
> +               /* MULT_EXPR + PLUS_EXPR/MINUS_EXPR is transformed
> +                  to FMA/FNMA after vectorization.  */
> +               rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
> +               rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
> +               if (subcode == PLUS_EXPR
> +                   && TREE_CODE (rhs1) == SSA_NAME
> +                   && (def = SSA_NAME_DEF_STMT (rhs1), true)
> +                   && is_gimple_assign (def)
> +                   && gimple_assign_rhs_code (def) == MULT_EXPR)
> +                 m_num_reduction += count;
> +               else if (TREE_CODE (rhs2) == SSA_NAME
> +                        && (def = SSA_NAME_DEF_STMT (rhs2), true)
> +                        && is_gimple_assign (def)
> +                        && gimple_assign_rhs_code (def) == MULT_EXPR)
> +                 m_num_reduction += count;
> +               break;
> +
> +             case DOT_PROD_EXPR:

There's also SAD_EXPR?  The vectorizer has lane_reducing_op_p ()
for this that also lists WIDEN_SUM_EXPR.

> +               rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
> +               mode_rhs = TYPE_MODE (TREE_TYPE (rhs1));
> +               if (mode_rhs == QImode)
> +                 {
> +                   rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
> +                   signop signop1_p = TYPE_SIGN (TREE_TYPE (rhs1));
> +                   signop signop2_p = TYPE_SIGN (TREE_TYPE (rhs2));
> +
> +                   /* vpdpbusd.  */
> +                   if (signop1_p != signop2_p)
> +                     native_vnni_p
> +                       = (GET_MODE_SIZE (mode) == 64
> +                          ? TARGET_AVX512VNNI
> +                          : ((TARGET_AVX512VNNI && TARGET_AVX512VL)
> +                             || TARGET_AVXVNNI));
> +                   else
> +                     /* vpdpbssd.  */
> +                     native_vnni_p
> +                       = (GET_MODE_SIZE (mode) == 64
> +                          ? TARGET_AVX10_2
> +                          : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2));
> +                 }
> +               m_num_reduction += count;
> +
> +               /* Dislike to do unroll and partial sum for
> +                  emulated DOT_PROD_EXPR.  */
> +               if (!native_vnni_p)
> +                 m_num_reduction += 3 * count;
> +               break;
> +
> +             default:
> +               break;
> +             }
> +         }
> +
> +     default:
> +       break;
> +     }
> +    }
> +
> +
>    combined_fn cfn;
>    if ((kind == vector_stmt || kind == scalar_stmt)
>        && stmt_info
> @@ -26282,6 +26399,21 @@ ix86_vector_costs::finish_cost (const vector_costs 
> *scalar_costs)
>         && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
>             > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
>       m_costs[vect_body] = INT_MAX;
> +
> +      unsigned issue_rate = ix86_issue_rate ();

is issue rate a good measure here?  I think for the given
operation it's more like the number of ops that can be
issued in parallel (like 2 for FMA) times the latency
(like 3), thus the number of op that can be in flight?
ix86_issue_rate should only be a very rough approximation of
that?  I suppose we should have separate tuning entries
for this, like one for the number of FMAC units and the
FMAC units (best case) latency?  As for SAD_EXPR that
would be an integer op, that probably goes to a different
pipeline unit.

In general we should have a look at register pressure, I
suppose issue_rate / m_num_reductions ensures we're never
getting close to this in practice.

Thanks,
Richard.

> +      if (m_num_reduction && m_num_reduction < issue_rate
> +       /* Not much gain for loop with gather and scatter.  */
> +       && m_prefer_unroll
> +       && !LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> +     {
> +       unsigned unroll_factor
> +         = 1 << ceil_log2 (issue_rate / m_num_reduction);
> +
> +       /* Default unroll limit 4.  */
> +       m_suggested_unroll_factor
> +         = MIN ((unsigned) ix86_vect_unroll_limit, unroll_factor);
> +     }
> +
>      }
>  
>    ix86_vect_estimate_reg_pressure ();
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index c93c0b1bb38..6bda22f4843 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1246,6 +1246,10 @@ munroll-only-small-loops
>  Target Var(ix86_unroll_only_small_loops) Init(0) Optimization
>  Enable conservative small loop unrolling.
>  
> +-param=ix86-vect-unroll-limit=
> +Target Joined UInteger Var(ix86_vect_unroll_limit) Init(4) Param
> +Limit how much the autovectorizer may unroll a loop.
> +
>  mlam=
>  Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none)
>  -mlam=[none|u48|u57] Instrument meta data position in user data pointers.
> diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-1.c 
> b/gcc/testsuite/gcc.target/i386/vect_unroll-1.c
> new file mode 100644
> index 00000000000..2e294d3aea6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v3 -Ofast" } */
> +/* { dg-final { scan-assembler-times {(?n)vfmadd[1-3]*ps[^\n]*ymm} 4 } } */
> +
> +float
> +foo (float* a, float* b, int n)
> +{
> +  float sum = 0;
> +  for (int i = 0; i != n; i++)
> +    sum += a[i] * b[i];
> +  return sum;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-2.c 
> b/gcc/testsuite/gcc.target/i386/vect_unroll-2.c
> new file mode 100644
> index 00000000000..069f7d37ae7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v3 -Ofast" } */
> +/* { dg-final { scan-assembler-times {(?n)vfnmadd[1-3]*ps[^\n]*ymm} 4 } } */
> +
> +float
> +foo (float* a, float* b, int n)
> +{
> +  float sum = 0;
> +  for (int i = 0; i != n; i++)
> +    sum -= a[i] * b[i];
> +  return sum;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-3.c 
> b/gcc/testsuite/gcc.target/i386/vect_unroll-3.c
> new file mode 100644
> index 00000000000..6860c2ffbd5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-3.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavxvnni -O3" } */
> +/* { dg-final { scan-assembler-times {(?n)vpdpbusd[^\n]*ymm} 4 } } */
> +
> +int
> +foo (unsigned char* a, char* b, int n)
> +{
> +  int sum = 0;
> +  for (int i = 0; i != n; i++)
> +    sum += a[i] * b[i];
> +  return sum;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-4.c 
> b/gcc/testsuite/gcc.target/i386/vect_unroll-4.c
> new file mode 100644
> index 00000000000..442c2e9529f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-4.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v3 -O3 -mno-avxvnni" } */
> +/* { dg-final { scan-assembler-times {(?n)vpmaddwd[^\n]*ymm} 2 } } */
> +
> +int
> +foo (unsigned char* a, char* b, int n)
> +{
> +  int sum = 0;
> +  for (int i = 0; i != n; i++)
> +    sum += a[i] * b[i];
> +  return sum;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-5.c 
> b/gcc/testsuite/gcc.target/i386/vect_unroll-5.c
> new file mode 100644
> index 00000000000..c6375b1bc8d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-5.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v3 -Ofast -mgather" } */
> +/* { dg-final { scan-assembler-times {(?n)vfmadd[1-3]*ps[^\n]*ymm} 1 } } */
> +
> +float
> +foo (float* a, int* b, float* c, int n)
> +{
> +  float sum = 0;
> +  for (int i = 0; i != n; i++)
> +    sum += a[b[i]] *c[i];
> +  return sum;
> +}
> +
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to