On Tue, 29 Jul 2025, liuhongt wrote:

> The patch tries to unroll the vectorized loop when there are
> FMA/DOT_PROD_EXPR reductions; this breaks the cross-iteration dependence
> and enables more parallelism (since the vectorizer will also enable
> partial sums).
>
> When there's a gather/scatter or scalarization in the loop, don't do the
> unroll, since the performance bottleneck is not at the reduction.
>
> The unroll factor is set as
>
> +          unsigned unroll_factor
> +            = 1 << ceil_log2 (issue_rate / m_num_reduction);
> +
> +          /* Default unroll limit 4. */
> +          m_suggested_unroll_factor
> +            = MIN ((unsigned) ix86_vect_unroll_limit, unroll_factor);
>
> Note that when DOT_PROD_EXPR is not natively supported,
> m_num_reduction += 3 * count, which almost prevents the unroll.
>
> There's a performance boost for simple benchmarks with a
> DOT_PROD_EXPR/FMA chain, and a slight improvement in SPEC2017
> performance.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Any comments?
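To make the "partial sums" point above concrete: unrolling gives each copy of
the reduction its own accumulator, so the FMAs no longer form one serial
dependence chain.  A scalar sketch of the effect (illustrative only, not code
from the patch; the vectorizer does the equivalent on vector accumulators):

  float
  dot (const float *a, const float *b, int n)
  {
    /* Two independent accumulators: the multiply-adds feeding sum0 and
       sum1 can overlap instead of waiting on one loop-carried sum.  */
    float sum0 = 0.0f, sum1 = 0.0f;
    int i;
    for (i = 0; i + 1 < n; i += 2)
      {
        sum0 += a[i] * b[i];
        sum1 += a[i + 1] * b[i + 1];
      }
    for (; i < n; i++)
      sum0 += a[i] * b[i];
    return sum0 + sum1;
  }

(The reassociation is only valid under -fassociative-math, which is why the
patch checks flag_associative_math.)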
A few comments inline below - thanks for looking at this; IIRC this has
come up before.

> gcc/ChangeLog:
>
>         * config/i386/i386.cc (ix86_vector_costs::ix86_vector_costs):
>         Add new members m_num_reduction and m_prefer_unroll.
>         (ix86_vector_costs::add_stmt_cost): Set m_prefer_unroll and
>         m_num_reduction.
>         (ix86_vector_costs::finish_cost): Determine
>         m_suggested_unroll_factor based on issue_rate,
>         m_num_reduction and ix86_vect_unroll_limit.
>         * config/i386/i386.opt: Add -param=ix86-vect-unroll-limit.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/vect_unroll-1.c: New test.
>         * gcc.target/i386/vect_unroll-2.c: New test.
>         * gcc.target/i386/vect_unroll-3.c: New test.
>         * gcc.target/i386/vect_unroll-4.c: New test.
>         * gcc.target/i386/vect_unroll-5.c: New test.
> ---
>  gcc/config/i386/i386.cc                       | 138 +++++++++++++++++-
>  gcc/config/i386/i386.opt                      |   4 +
>  gcc/testsuite/gcc.target/i386/vect_unroll-1.c |  12 ++
>  gcc/testsuite/gcc.target/i386/vect_unroll-2.c |  12 ++
>  gcc/testsuite/gcc.target/i386/vect_unroll-3.c |  12 ++
>  gcc/testsuite/gcc.target/i386/vect_unroll-4.c |  12 ++
>  gcc/testsuite/gcc.target/i386/vect_unroll-5.c |  13 ++
>  7 files changed, 200 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect_unroll-5.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 49bd3939eb4..801f916b161 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -25762,15 +25762,20 @@ private:
>    unsigned m_num_sse_needed[3];
>    /* Number of 256-bit vector permutation. */
>    unsigned m_num_avx256_vec_perm[3];
> +  /* Number of reductions for FMA/DOT_PROD_EXPR. */
> +  unsigned m_num_reduction;
> +  /* Number of load/store/gather/scatter in loop body. */
> +  bool m_prefer_unroll;

The comment doesn't match the bool type.

>  };
>
>  ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
>    : vector_costs (vinfo, costing_for_scalar),
>      m_num_gpr_needed (),
>      m_num_sse_needed (),
> -    m_num_avx256_vec_perm ()
> -{
> -}
> +    m_num_avx256_vec_perm (),
> +    m_num_reduction (),
> +    m_prefer_unroll (true)
> +{}
>
>  /* Implement targetm.vectorize.create_costs. */
>
> @@ -26067,6 +26072,118 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>        }
>      }
>
> +  /* Record number of load/store/gather/scatter in vectorized body. */
> +  if (where == vect_body && !m_costing_for_scalar)
> +    {
> +      switch (kind)
> +        {
> +          /* Emulated gather/scatter or any scalarization. */
> +        case scalar_load:
> +        case scalar_stmt:
> +        case scalar_store:
> +        case vector_gather_load:
> +        case vector_scatter_store:
> +          m_prefer_unroll = false;
> +          break;
> +
> +        case vector_stmt:
> +        case vec_to_scalar:
> +          /* Count number of reduction FMA and "real" DOT_PROD_EXPR,
> +             unroll in the vectorizer will enable partial sum. */
> +          if (stmt_info
> +              && vect_is_reduction (stmt_info)
> +              && stmt_info->stmt)
> +            {
> +              /* Handle __builtin_fma. */
> +              if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA)
> +                {
> +                  m_num_reduction += count;
> +                  break;
> +                }
> +
> +              if (gimple_code (stmt_info->stmt) != GIMPLE_ASSIGN)

Please use is_gimple_assign (stmt_info->stmt) here.

> +                break;
> +
> +              tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
> +              machine_mode inner_mode = GET_MODE_INNER (mode);
> +              tree rhs1, rhs2;
> +              bool native_vnni_p = true;
> +              gimple* def;
> +              machine_mode mode_rhs;
> +              switch (subcode)
> +                {
> +                case PLUS_EXPR:
> +                case MINUS_EXPR:
> +                  if (!fp || !flag_associative_math
> +                      || flag_fp_contract_mode != FP_CONTRACT_FAST)
> +                    break;
> +
> +                  /* FMA condition for different modes. */
> +                  if (((inner_mode == DFmode || inner_mode == SFmode)
> +                       && !TARGET_FMA && !TARGET_AVX512VL)
> +                      || (inner_mode == HFmode && !TARGET_AVX512FP16)
> +                      || (inner_mode == BFmode && !TARGET_AVX10_2))
> +                    break;
> +
> +                  /* MULT_EXPR + PLUS_EXPR/MINUS_EXPR is transformed
> +                     to FMA/FNMA after vectorization. */
> +                  rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
> +                  rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
> +                  if (subcode == PLUS_EXPR
> +                      && TREE_CODE (rhs1) == SSA_NAME
> +                      && (def = SSA_NAME_DEF_STMT (rhs1), true)
> +                      && is_gimple_assign (def)
> +                      && gimple_assign_rhs_code (def) == MULT_EXPR)
> +                    m_num_reduction += count;
> +                  else if (TREE_CODE (rhs2) == SSA_NAME
> +                           && (def = SSA_NAME_DEF_STMT (rhs2), true)
> +                           && is_gimple_assign (def)
> +                           && gimple_assign_rhs_code (def) == MULT_EXPR)
> +                    m_num_reduction += count;
> +                  break;
> +
> +                case DOT_PROD_EXPR:

There's also SAD_EXPR?  The vectorizer has lane_reducing_op_p () for this,
which also lists WIDEN_SUM_EXPR.

> +                  rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
> +                  mode_rhs = TYPE_MODE (TREE_TYPE (rhs1));
> +                  if (mode_rhs == QImode)
> +                    {
> +                      rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
> +                      signop signop1_p = TYPE_SIGN (TREE_TYPE (rhs1));
> +                      signop signop2_p = TYPE_SIGN (TREE_TYPE (rhs2));
> +
> +                      /* vpdpbusd. */
> +                      if (signop1_p != signop2_p)
> +                        native_vnni_p
> +                          = (GET_MODE_SIZE (mode) == 64
> +                             ? TARGET_AVX512VNNI
> +                             : ((TARGET_AVX512VNNI && TARGET_AVX512VL)
> +                                || TARGET_AVXVNNI));
> +                      else
> +                        /* vpdpbssd. */
> +                        native_vnni_p
> +                          = (GET_MODE_SIZE (mode) == 64
> +                             ? TARGET_AVX10_2
> +                             : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2));
> +                    }
> +                  m_num_reduction += count;
> +
> +                  /* Dislike to do unroll and partial sum for
> +                     emulated DOT_PROD_EXPR. */
> +                  if (!native_vnni_p)
> +                    m_num_reduction += 3 * count;
> +                  break;
> +
> +                default:
> +                  break;
> +                }
> +            }
> +
> +        default:
> +          break;
> +        }
> +    }
> +
> +
>    combined_fn cfn;
>    if ((kind == vector_stmt || kind == scalar_stmt)
>        && stmt_info
> @@ -26282,6 +26399,21 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
>        && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
>            > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
>      m_costs[vect_body] = INT_MAX;
> +
> +      unsigned issue_rate = ix86_issue_rate ();

Is issue rate a good measure here?  I think for the given operation it's
more like the number of ops that can be issued in parallel (like 2 for FMA)
times the latency (like 3), i.e. the number of ops that can be in flight;
ix86_issue_rate is only a very rough approximation of that.  I suppose we
should have separate tuning entries for this, like one for the number of
FMAC units and one for the (best-case) FMAC latency.  As for SAD_EXPR, that
would be an integer op, so it probably goes to a different pipeline unit.
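For illustration only, such a tuning-based heuristic might look roughly like
the sketch below; fma_pipes, fma_latency and suggested_unroll are made-up
names with assumed values for this example, not existing tuning entries and
not part of the patch:

  static unsigned
  suggested_unroll (unsigned num_reduction, unsigned unroll_limit)
  {
    const unsigned fma_pipes = 2;    /* assumed number of FMA units */
    const unsigned fma_latency = 4;  /* assumed best-case FMA latency */
    /* FMAs that can be in flight, i.e. roughly how many partial sums
       are needed to hide the latency.  */
    unsigned in_flight = fma_pipes * fma_latency;
    if (num_reduction == 0 || num_reduction >= in_flight)
      return 1;
    /* Round up to a power of two and cap at the unroll limit.  */
    unsigned factor = 1;
    while (factor * num_reduction < in_flight && factor < unroll_limit)
      factor <<= 1;
    return factor;
  }

With a single FMA reduction chain that gives 2 * 4 = 8 in-flight FMAs, so
the raw factor would be 8 before the ix86-vect-unroll-limit default of 4
caps it.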
In general we should have a look at register pressure; I suppose
issue_rate / m_num_reduction ensures we're never getting close to that in
practice.

Thanks,
Richard.

> +      if (m_num_reduction && m_num_reduction < issue_rate
> +          /* Not much gain for loop with gather and scatter. */
> +          && m_prefer_unroll
> +          && !LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> +        {
> +          unsigned unroll_factor
> +            = 1 << ceil_log2 (issue_rate / m_num_reduction);
> +
> +          /* Default unroll limit 4. */
> +          m_suggested_unroll_factor
> +            = MIN ((unsigned) ix86_vect_unroll_limit, unroll_factor);
> +        }
> +
>      }
>
>    ix86_vect_estimate_reg_pressure ();
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index c93c0b1bb38..6bda22f4843 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1246,6 +1246,10 @@ munroll-only-small-loops
>  Target Var(ix86_unroll_only_small_loops) Init(0) Optimization
>  Enable conservative small loop unrolling.
>
> +-param=ix86-vect-unroll-limit=
> +Target Joined UInteger Var(ix86_vect_unroll_limit) Init(4) Param
> +Limit how much the autovectorizer may unroll a loop.
> +
>  mlam=
>  Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none)
>  -mlam=[none|u48|u57] Instrument meta data position in user data pointers.
> diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-1.c b/gcc/testsuite/gcc.target/i386/vect_unroll-1.c
> new file mode 100644
> index 00000000000..2e294d3aea6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v3 -Ofast" } */
> +/* { dg-final { scan-assembler-times {(?n)vfmadd[1-3]*ps[^\n]*ymm} 4 } } */
> +
> +float
> +foo (float* a, float* b, int n)
> +{
> +  float sum = 0;
> +  for (int i = 0; i != n; i++)
> +    sum += a[i] * b[i];
> +  return sum;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-2.c b/gcc/testsuite/gcc.target/i386/vect_unroll-2.c
> new file mode 100644
> index 00000000000..069f7d37ae7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v3 -Ofast" } */
> +/* { dg-final { scan-assembler-times {(?n)vfnmadd[1-3]*ps[^\n]*ymm} 4 } } */
> +
> +float
> +foo (float* a, float* b, int n)
> +{
> +  float sum = 0;
> +  for (int i = 0; i != n; i++)
> +    sum -= a[i] * b[i];
> +  return sum;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-3.c b/gcc/testsuite/gcc.target/i386/vect_unroll-3.c
> new file mode 100644
> index 00000000000..6860c2ffbd5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-3.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavxvnni -O3" } */
> +/* { dg-final { scan-assembler-times {(?n)vpdpbusd[^\n]*ymm} 4 } } */
> +
> +int
> +foo (unsigned char* a, char* b, int n)
> +{
> +  int sum = 0;
> +  for (int i = 0; i != n; i++)
> +    sum += a[i] * b[i];
> +  return sum;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-4.c b/gcc/testsuite/gcc.target/i386/vect_unroll-4.c
> new file mode 100644
> index 00000000000..442c2e9529f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-4.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v3 -O3 -mno-avxvnni" } */
> +/* { dg-final { scan-assembler-times {(?n)vpmaddwd[^\n]*ymm} 2 } } */
> +
> +int
> +foo (unsigned char* a, char* b, int n)
> +{
> +  int sum = 0;
> +  for (int i = 0; i != n; i++)
> +    sum += a[i] * b[i];
> +  return sum;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/vect_unroll-5.c b/gcc/testsuite/gcc.target/i386/vect_unroll-5.c
> new file mode 100644
> index 00000000000..c6375b1bc8d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect_unroll-5.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v3 -Ofast -mgather" } */
> +/* { dg-final { scan-assembler-times {(?n)vfmadd[1-3]*ps[^\n]*ymm} 1 } } */
> +
> +float
> +foo (float* a, int* b, float* c, int n)
> +{
> +  float sum = 0;
> +  for (int i = 0; i != n; i++)
> +    sum += a[b[i]] *c[i];
> +  return sum;
> +}
> +

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)