On Wed, 16 Oct 2024, Robin Dapp wrote:

> Hi,
> 
> this is probably more of an RFC than a patch, as I'm not sure whether
> reassoc is the right place to fix it.  On top, the heuristic might
> be a bit "ad-hoc".  Maybe we can also work around it in the vectorizer?
> 
> The following function is vectorized in a very inefficient way because we
> construct vectors from scalar loads.
> 
> uint64_t
> foo (uint8_t *pix, int i_stride)
> {
>   uint32_t sum = 0, sqr = 0;
>   int x, y;
>   for (y = 0; y < 16; y++)
>     {
>       for (x = 0; x < 16; x++)
>       sum += pix[x];
>       pix += i_stride;
>     }
>   return sum;
> }
> 
> The reason for this is that reassoc reorders the last three operands of
> the summation sequence by rank, introducing a temporary in the process
> that breaks the "homogeneity" (sum_n = sum_{n - 1} + pix[n]) of the sum.
> 
> This patch adds a function likely_vectorizable_p that checks if an
> operand vector contains only operands of the same rank except the last
> one.  In that case the sequence is likely vectorizable and the easier
> vectorization will outweigh any CSE opportunity we can expose by
> swapping operands.

Hmm, reassoc isn't supposed to apply width > 1 before vectorization;
IIRC I "fixed" that at some point, but this is what I see:

  # sum_126 = PHI <sum_145(5), 0(2)>
...
  _23 = *pix_134; 
  _24 = (unsigned int) _23;
  _31 = MEM[(uint8_t *)pix_134 + 1B]; 
  _32 = (unsigned int) _31;
  _118 = _24 + _32;
  sum_33 = _118 + sum_126; 
...

I think the place you add the condition to on swap_ops_for_binary_stmt
simply lacks a !reassoc_insert_powi_p check like we have on the earlier
rewrite_expr_tree_parallel.

Richard.

> Bootstrapped and regtested on x86, regtested on rv64gcv.
> 
> Regards
>  Robin
> 
> gcc/ChangeLog:
> 
>       * tree-ssa-reassoc.cc (likely_vectorizable_p): New function.
>       (reassociate_bb): Use new function.
>       (dump_ops_vector): Change prototype.
>       (debug_ops_vector): Change prototype.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.dg/tree-ssa/reassoc-52.c: New test.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c |  27 ++++++
>  gcc/tree-ssa-reassoc.cc                    | 102 ++++++++++++++++++++-
>  2 files changed, 124 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c
> 
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c
> new file mode 100644
> index 00000000000..b117b7519bb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -fdump-tree-reassoc-details" } */
> +
> +typedef unsigned char uint8_t;
> +typedef unsigned int uint32_t;
> +typedef unsigned long uint64_t;
> +
> +uint64_t
> +foo (uint8_t *pix, int i_stride)
> +{
> +  uint32_t sum = 0, sqr = 0;
> +  int x, y;
> +  for (y = 0; y < 16; y++)
> +    {
> +      for (x = 0; x < 16; x++)
> +     sum += pix[x];
> +      pix += i_stride;
> +    }
> +  return sum;
> +}
> +
> +/* Ensure that we only add to sum variables and don't create a temporary that
> +   does something else.  In doing so we enable a much more efficient
> +   vectorization scheme.  */
> +
> +/* { dg-final { scan-tree-dump-times "\\s+sum_\\d+\\s+=\\s+sum_\\d+\\s+\\\+" 
> 16 "reassoc1" } } */
> +/* { dg-final { scan-tree-dump-times "\\s+sum_\\d+\\s=.*\\\+\\ssum_\\d+;" 1 
> "reassoc1" } } */
> diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
> index 556ecdebe2d..13f8a1070b6 100644
> --- a/gcc/tree-ssa-reassoc.cc
> +++ b/gcc/tree-ssa-reassoc.cc
> @@ -6992,6 +6992,96 @@ rank_ops_for_fma (vec<operand_entry *> *ops)
>      }
>    return mult_num;
>  }
> +
> +/* Check if the operand vector contains operands that all have the same rank
> +   apart from the last one.  The last one must be a PHI node which links back
> +   to the first operand.
> +   This can indicate an easily vectorizable sequence in which case we don't
> +   want to reorder the first three elements for rank reasons.
> +
> +   Consider the following example:
> +
> +    for (int y = 0; y < 16; y++)
> +      {
> +     for (int x = 0; x < 16; x++)
> +       {
> +         sum += pix[x];
> +       }
> +       ...
> +      }
> +
> +    # sum_201 = PHI <sum_230(3), 0(2)>
> +    # y_149 = PHI <y_24(3), 0(2)>
> +
> +    _33 = *pix_214;
> +    _34 = (unsigned intD.4) _33;
> +    sum_35 = sum_201 + _34;
> +
> +    _46 = MEM[(uint8_tD.2849 *)pix_214 + 1B];
> +    _47 = (unsigned intD.4) _46;
> +    sum_48 = sum_35 + _47;
> +    _49 = (intD.1) _46;
> +
> +    All loads of pix are of the same rank, just the sum PHI has a different
> +    one in {..., _46, _33, sum_201}.
> +    swap_ops_for_binary_stmt will move sum_201 before _46 and _33 so that
> +    the first operation _33 OP _46 is exposed as an optimization opportunity.
> +    In doing so it will trip up the vectorizer which now needs to work much
> +    harder because it can't recognize the whole sequence as doing the same
> +    operation.  */
> +
> +static bool
> +likely_vectorizable_p (vec<operand_entry *> *ops)
> +{
> +  operand_entry *oe;
> +  unsigned int i;
> +  unsigned int rank = 0;
> +
> +  if (ops->length () < 3)
> +    return false;
> +
> +  /* Check if the last operand is a PHI.  */
> +  operand_entry *oelast = (*ops)[ops->length () - 1];
> +
> +  if (TREE_CODE (oelast->op) != SSA_NAME)
> +    return false;
> +
> +  gimple *glast = SSA_NAME_DEF_STMT (oelast->op);
> +
> +  if (gimple_code (glast) != GIMPLE_PHI
> +      || gimple_phi_num_args (glast) != 2)
> +    return false;
> +
> +  /* If so, check if its first argument is a gimple assign.  */
> +  tree phi_arg = gimple_phi_arg_def (glast, 0);
> +
> +  if (TREE_CODE (phi_arg) != SSA_NAME)
> +    return false;
> +
> +  gimple *garg_def = SSA_NAME_DEF_STMT (phi_arg);
> +  if (!is_gimple_assign (garg_def))
> +    return false;
> +
> +  /* It must link back to the beginning of our sequence.  */
> +  if (gimple_assign_rhs2 (garg_def) != (*ops)[0]->op)
> +    return false;
> +
> +  /* Make sure everything apart from the PHI has the same rank.  */
> +  FOR_EACH_VEC_ELT (*ops, i, oe)
> +    {
> +      if (i < ops->length () - 1)
> +     {
> +       if (rank == 0)
> +         rank = oe->rank;
> +       else
> +         if (rank != oe->rank)
> +           return false;
> +     }
> +    }
> +
> +  return true;
> +}
> +
>  /* Reassociate expressions in basic block BB and its post-dominator as
>     children.
>  
> @@ -7182,6 +7272,8 @@ reassociate_bb (basic_block bb)
>                     mult_num = rank_ops_for_fma (&ops);
>                   }
>  
> +               bool vectorizable_p = likely_vectorizable_p (&ops);
> +
>                 /* Only rewrite the expression tree to parallel in the
>                    last reassoc pass to avoid useless work back-and-forth
>                    with initial linearization.  */
> @@ -7208,7 +7300,7 @@ reassociate_bb (basic_block bb)
>                        binary op are chosen wisely.  */
>                     int len = ops.length ();
>                     if (len >= 3
> -                       && (!has_fma
> +                       && ((!has_fma && !vectorizable_p)
>                             /* width > 1 means ranking ops results in better
>                                parallelism.  Check current value to avoid
>                                calling get_reassociation_width again.  */
> @@ -7346,13 +7438,13 @@ branch_fixup (void)
>    reassoc_branch_fixups.release ();
>  }
>  
> -void dump_ops_vector (FILE *file, vec<operand_entry *> ops);
> -void debug_ops_vector (vec<operand_entry *> ops);
> +void dump_ops_vector (FILE *file, const vec<operand_entry *> &ops);
> +void debug_ops_vector (const vec<operand_entry *> &ops);
>  
>  /* Dump the operand entry vector OPS to FILE.  */
>  
>  void
> -dump_ops_vector (FILE *file, vec<operand_entry *> ops)
> +dump_ops_vector (FILE *file, const vec<operand_entry *> &ops)
>  {
>    operand_entry *oe;
>    unsigned int i;
> @@ -7368,7 +7460,7 @@ dump_ops_vector (FILE *file, vec<operand_entry *> ops)
>  /* Dump the operand entry vector OPS to STDERR.  */
>  
>  DEBUG_FUNCTION void
> -debug_ops_vector (vec<operand_entry *> ops)
> +debug_ops_vector (const vec<operand_entry *> &ops)
>  {
>    dump_ops_vector (stderr, ops);
>  }
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to