On Wed, 16 Oct 2024, Robin Dapp wrote:

> Hi,
>
> this is probably more of an RFC than a patch, as I'm not sure whether
> reassoc is the right place to fix it.  On top of that, the heuristic
> might be a bit "ad hoc".  Maybe we can also work around it in the
> vectorizer?
>
> The following function is vectorized in a very inefficient way because
> we construct vectors from scalar loads.
>
> uint64_t
> foo (uint8_t *pix, int i_stride)
> {
>   uint32_t sum = 0, sqr = 0;
>   int x, y;
>   for (y = 0; y < 16; y++)
>     {
>       for (x = 0; x < 16; x++)
>         sum += pix[x];
>       pix += i_stride;
>     }
>   return sum;
> }
>
> The reason for this is that reassoc reorders the last three operands of
> the summation sequence by rank, introducing a temporary in the process
> that breaks the "homogeneity" (sum_n = sum_{n - 1} + pix[n]) of the sum.
>
> This patch adds a function likely_vectorizable_p that checks whether an
> operand vector contains only operands of the same rank except for the
> last one.  In that case the sequence is likely vectorizable, and the
> easier vectorization will outweigh any CSE opportunity we could expose
> by swapping operands.
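For illustration, a standalone four-element sketch (hypothetical code,
not part of the patch) of the two shapes being described:

  #include <stdint.h>

  /* Homogeneous chain: every statement has the shape sum = sum + pix[n],
     which the vectorizer recognizes as a plain reduction.  */
  uint32_t
  sum4_homogeneous (const uint8_t *pix, uint32_t sum)
  {
    sum = sum + pix[0];
    sum = sum + pix[1];
    sum = sum + pix[2];
    sum = sum + pix[3];
    return sum;
  }

  /* After the rank-based swap: the first two loads are added into a
     temporary, which breaks the uniform sum = sum + x shape.  */
  uint32_t
  sum4_reassociated (const uint8_t *pix, uint32_t sum)
  {
    uint32_t tmp = pix[0] + pix[1];
    sum = tmp + sum;
    sum = sum + pix[2];
    sum = sum + pix[3];
    return sum;
  }

Both compute the same value since unsigned addition is associative and
commutative; only the association order differs, but the second no
longer matches the uniform reduction pattern.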
Hmm, reassoc isn't supposed to apply width > 1 before vectorization;
IIRC I "fixed" that at some point, but this is what I see:

  # sum_126 = PHI <sum_145(5), 0(2)>
  ...
  _23 = *pix_134;
  _24 = (unsigned int) _23;
  _31 = MEM[(uint8_t *)pix_134 + 1B];
  _32 = (unsigned int) _31;
  _118 = _24 + _32;
  sum_33 = _118 + sum_126;
  ...

I think the place you add the condition to, the guard on
swap_ops_for_binary_stmt, simply lacks a !reassoc_insert_powi_p check
like the one we have on the earlier rewrite_expr_tree_parallel call.
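A minimal, untested sketch of that alternative (same hunk as in the
patch below, but keeping !has_fma unchanged and gating the swap on the
pass instead):

 	      int len = ops.length ();
 	      if (len >= 3
+		  && !reassoc_insert_powi_p
 		  && (!has_fma
 		      /* width > 1 means ranking ops results in better
 			 parallelism.  Check current value to avoid
 			 calling get_reassociation_width again.  */

This mirrors the !reassoc_insert_powi_p guard on
rewrite_expr_tree_parallel, so the rank-based swap would only run in
the second reassoc pass, which comes after vectorization.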
Richard.

> Bootstrapped and regtested on x86, regtested on rv64gcv.
>
> Regards
>  Robin
>
> gcc/ChangeLog:
>
> 	* tree-ssa-reassoc.cc (likely_vectorizable_p): New function.
> 	(reassociate_bb): Use new function.
> 	(dump_ops_vector): Change prototype.
> 	(debug_ops_vector): Change prototype.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.dg/tree-ssa/reassoc-52.c: New test.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c |  27 ++++++
>  gcc/tree-ssa-reassoc.cc                    | 102 ++++++++++++++++++++-
>  2 files changed, 124 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c
> new file mode 100644
> index 00000000000..b117b7519bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -fdump-tree-reassoc-details" } */
> +
> +typedef unsigned char uint8_t;
> +typedef unsigned int uint32_t;
> +typedef unsigned long uint64_t;
> +
> +uint64_t
> +foo (uint8_t *pix, int i_stride)
> +{
> +  uint32_t sum = 0, sqr = 0;
> +  int x, y;
> +  for (y = 0; y < 16; y++)
> +    {
> +      for (x = 0; x < 16; x++)
> +	sum += pix[x];
> +      pix += i_stride;
> +    }
> +  return sum;
> +}
> +
> +/* Ensure that we only add to sum variables and don't create a temporary that
> +   does something else.  In doing so we enable a much more efficient
> +   vectorization scheme.  */
> +
> +/* { dg-final { scan-tree-dump-times "\\s+sum_\\d+\\s+=\\s+sum_\\d+\\s+\\\+" 16 "reassoc1" } } */
> +/* { dg-final { scan-tree-dump-times "\\s+sum_\\d+\\s=.*\\\+\\ssum_\\d+;" 1 "reassoc1" } } */
> diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
> index 556ecdebe2d..13f8a1070b6 100644
> --- a/gcc/tree-ssa-reassoc.cc
> +++ b/gcc/tree-ssa-reassoc.cc
> @@ -6992,6 +6992,96 @@ rank_ops_for_fma (vec<operand_entry *> *ops)
>      }
>    return mult_num;
>  }
> +
> +/* Check if the operand vector contains operands that all have the same rank
> +   apart from the last one.  The last one must be a PHI node which links back
> +   to the first operand.
> +   This can indicate an easily vectorizable sequence, in which case we don't
> +   want to reorder the first three elements for rank reasons.
> +
> +   Consider the following example:
> +
> +   for (int y = 0; y < 16; y++)
> +     {
> +       for (int x = 0; x < 16; x++)
> +	 {
> +	   sum += pix[x];
> +	 }
> +       ...
> +     }
> +
> +   # sum_201 = PHI <sum_230(3), 0(2)>
> +   # y_149 = PHI <y_24(3), 0(2)>
> +
> +   _33 = *pix_214;
> +   _34 = (unsigned intD.4) _33;
> +   sum_35 = sum_201 + _34;
> +
> +   _46 = MEM[(uint8_tD.2849 *)pix_214 + 1B];
> +   _47 = (unsigned intD.4) _46;
> +   sum_48 = sum_35 + _47;
> +   _49 = (intD.1) _46;
> +
> +   All loads of pix are of the same rank, just the sum PHI has a different
> +   one in {..., _46, _33, sum_201}.
> +   swap_ops_for_binary_stmt will move sum_201 before _46 and _33 so that
> +   the first operation _33 OP _46 is exposed as an optimization opportunity.
> +   In doing so it will trip up the vectorizer, which now needs to work much
> +   harder because it can't recognize the whole sequence as doing the same
> +   operation.  */
> +
> +static bool
> +likely_vectorizable_p (vec<operand_entry *> *ops)
> +{
> +  operand_entry *oe;
> +  unsigned int i;
> +  unsigned int rank = 0;
> +
> +  if (ops->length () < 3)
> +    return false;
> +
> +  /* Check if the last operand is a PHI.  */
> +  operand_entry *oelast = (*ops)[ops->length () - 1];
> +
> +  if (TREE_CODE (oelast->op) != SSA_NAME)
> +    return false;
> +
> +  gimple *glast = SSA_NAME_DEF_STMT (oelast->op);
> +
> +  if (gimple_code (glast) != GIMPLE_PHI
> +      || gimple_phi_num_args (glast) != 2)
> +    return false;
> +
> +  /* If so, check if its first argument is a gimple assign.  */
> +  tree phi_arg = gimple_phi_arg_def (glast, 0);
> +
> +  if (TREE_CODE (phi_arg) != SSA_NAME)
> +    return false;
> +
> +  gimple *garg_def = SSA_NAME_DEF_STMT (phi_arg);
> +  if (!is_gimple_assign (garg_def))
> +    return false;
> +
> +  /* It must link back to the beginning of our sequence.  */
> +  if (gimple_assign_rhs2 (garg_def) != (*ops)[0]->op)
> +    return false;
> +
> +  /* Make sure everything apart from the PHI has the same rank.  */
> +  FOR_EACH_VEC_ELT (*ops, i, oe)
> +    {
> +      if (i < ops->length () - 1)
> +	{
> +	  if (rank == 0)
> +	    rank = oe->rank;
> +	  else
> +	    if (rank != oe->rank)
> +	      return false;
> +	}
> +    }
> +
> +  return true;
> +}
> +
>  /* Reassociate expressions in basic block BB and its post-dominator as
>     children.
>
> @@ -7182,6 +7272,8 @@ reassociate_bb (basic_block bb)
>  	      mult_num = rank_ops_for_fma (&ops);
>  	    }
>
> +	  bool vectorizable_p = likely_vectorizable_p (&ops);
> +
>  	  /* Only rewrite the expression tree to parallel in the
>  	     last reassoc pass to avoid useless work back-and-forth
>  	     with initial linearization.  */
> @@ -7208,7 +7300,7 @@ reassociate_bb (basic_block bb)
>  		 binary op are chosen wisely.  */
>  	      int len = ops.length ();
>  	      if (len >= 3
> -		  && (!has_fma
> +		  && ((!has_fma && !vectorizable_p)
>  		      /* width > 1 means ranking ops results in better
>  			 parallelism.  Check current value to avoid
>  			 calling get_reassociation_width again.  */
> @@ -7346,13 +7438,13 @@ branch_fixup (void)
>    reassoc_branch_fixups.release ();
>  }
>
> -void dump_ops_vector (FILE *file, vec<operand_entry *> ops);
> -void debug_ops_vector (vec<operand_entry *> ops);
> +void dump_ops_vector (FILE *file, const vec<operand_entry *> &ops);
> +void debug_ops_vector (const vec<operand_entry *> &ops);
>
>  /* Dump the operand entry vector OPS to FILE.  */
>
>  void
> -dump_ops_vector (FILE *file, vec<operand_entry *> ops)
> +dump_ops_vector (FILE *file, const vec<operand_entry *> &ops)
>  {
>    operand_entry *oe;
>    unsigned int i;
> @@ -7368,7 +7460,7 @@ dump_ops_vector (FILE *file, vec<operand_entry *> ops)
>  /* Dump the operand entry vector OPS to STDERR.  */
>
>  DEBUG_FUNCTION void
> -debug_ops_vector (vec<operand_entry *> ops)
> +debug_ops_vector (const vec<operand_entry *> &ops)
>  {
>    dump_ops_vector (stderr, ops);
>  }
> --

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)