On Wed, Mar 5, 2025 at 12:50 PM Richard Biener <rguent...@suse.de> wrote:
>
> On Tue, 4 Mar 2025, Richard Sandiford wrote:
>
> > Richard Biener <rguent...@suse.de> writes:
> > > When vectorizing a shift of u16 data by an amount that's known to
> > > be less than 16 we currently fail to emit a vector u16 shift.  The
> > > first reason is that the promotion of the shift amount is hoisted
> > > only by PRE and that cannot preserve range info, the second reason
> > > is that pattern detection doesn't use range info when computing
> > > the precision required for an operation.
> > >
> > > The following addresses the first issue by making LIM hoist all
> > > expressions for the pass that runs right before PRE and the
> > > second issue by querying ranges for the shift amount.
> > >
> > > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> > >
> > >     PR tree-optimization/119103
> > >     * tree-ssa-loop-im.cc (in_loop_pipeline): Globalize.
> > >     (compute_invariantness): Override costing when we run
> > >     right before PRE and PRE is enabled.
> > >     (pass_lim::execute): Adjust.
> > >     * tree-vect-patterns.cc (vect_determine_precisions_from_users):
> > >     For variable shift amounts use range information.
> > >
> > >     * gcc.target/i386/pr119103.c: New testcase.
> >
> > Nice!
> >
> > > [...]
> > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > > index 4f0a7ea162b..09408359cf2 100644
> > > --- a/gcc/tree-vect-patterns.cc
> > > +++ b/gcc/tree-vect-patterns.cc
> > > @@ -6544,10 +6544,19 @@ vect_determine_precisions_from_users 
> > > (stmt_vec_info stmt_info, gassign *stmt)
> > >      case RSHIFT_EXPR:
> > >        {
> > >     tree shift = gimple_assign_rhs2 (stmt);
> > > -   if (TREE_CODE (shift) != INTEGER_CST
> > > -       || !wi::ltu_p (wi::to_widest (shift), precision))
> > > +   unsigned int const_shift;
> > > +   wide_int min_shift, max_shift;
> > > +   if (TREE_CODE (shift) == SSA_NAME
> > > +       && vect_get_range_info (shift, &min_shift, &max_shift)
> > > +       && wi::ge_p (min_shift, 0, TYPE_SIGN (TREE_TYPE (shift)))
> > > +       && wi::lt_p (max_shift, TYPE_PRECISION (type),
> > > +                    TYPE_SIGN (TREE_TYPE (shift))))
> > > +     const_shift = max_shift.to_uhwi ();
> >
> > Don't we need to use min_shift rather than max_shift for LSHIFT_EXPR's:
> >
> >           /* We need CONST_SHIFT fewer bits of the input.  */
> >           min_input_precision = (MAX (operation_precision, const_shift)
> >                                  - const_shift);
> >
> > ?  If so, I guess that means replacing const_shift with a minimum and
> > a maximum.
>
> Right - fixed as below.

Pushed to trunk now after re-bootstrapping and testing on
x86_64-unknown-linux-gnu.

Richard.

> Richard.
>
> From 41db126ca782410c69480b9d305c240b5e9166b2 Mon Sep 17 00:00:00 2001
> From: Richard Biener <rguent...@suse.de>
> Date: Tue, 4 Mar 2025 10:34:39 +0100
> Subject: [PATCH] tree-optimization/119103 - missed overwidening detection for
>  shift
> To: gcc-patches@gcc.gnu.org
>
> When vectorizing a shift of u16 data by an amount that's known to
> be less than 16 we currently fail to emit a vector u16 shift.  The
> first reason is that the promotion of the shift amount is hoisted
> only by PRE and that cannot preserve range info, the second reason
> is that pattern detection doesn't use range info when computing
> the precision required for an operation.
>
> The following addresses the first issue by making LIM hoist all
> expressions for the pass that runs right before PRE and the
> second issue by querying ranges for the shift amount.
>
>         PR tree-optimization/119103
>         * tree-ssa-loop-im.cc (in_loop_pipeline): Globalize.
>         (compute_invariantness): Override costing when we run
>         right before PRE and PRE is enabled.
>         (pass_lim::execute): Adjust.
>         * tree-vect-patterns.cc (vect_determine_precisions_from_users):
>         For variable shift amounts use range information.
>
>         * gcc.target/i386/pr119103.c: New testcase.
> ---
>  gcc/testsuite/gcc.target/i386/pr119103.c | 13 ++++++++++++
>  gcc/tree-ssa-loop-im.cc                  | 10 +++++++--
>  gcc/tree-vect-patterns.cc                | 26 +++++++++++++++++-------
>  3 files changed, 40 insertions(+), 9 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr119103.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr119103.c 
> b/gcc/testsuite/gcc.target/i386/pr119103.c
> new file mode 100644
> index 00000000000..57210dc3bbe
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr119103.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2" } */
> +
> +void lshift(unsigned short *x, unsigned char amount)
> +{
> +  if (amount > 15)
> +    __builtin_unreachable();
> +
> +  for (int i = 0; i < 16; i++)
> +    x[i] <<= amount;
> +}
> +
> +/* { dg-final { scan-assembler "vpsllw" } } */
> diff --git a/gcc/tree-ssa-loop-im.cc b/gcc/tree-ssa-loop-im.cc
> index 225964c6215..a3ca5af3e3e 100644
> --- a/gcc/tree-ssa-loop-im.cc
> +++ b/gcc/tree-ssa-loop-im.cc
> @@ -143,6 +143,8 @@ public:
>                                    different modes.  */
>  };
>
> +static bool in_loop_pipeline;
> +
>  /* We use six bits per loop in the ref->dep_loop bitmap to record
>     the dep_kind x dep_state combinations.  */
>
> @@ -1239,7 +1241,11 @@ compute_invariantness (basic_block bb)
>                    lim_data->cost);
>         }
>
> -      if (lim_data->cost >= LIM_EXPENSIVE)
> +      if (lim_data->cost >= LIM_EXPENSIVE
> +         /* When we run before PRE and PRE is active hoist all expressions
> +            since PRE would do so anyway and we can preserve range info
> +            but PRE cannot.  */
> +         || (flag_tree_pre && !in_loop_pipeline))
>         set_profitable_level (stmt);
>      }
>  }
> @@ -3759,7 +3765,7 @@ public:
>  unsigned int
>  pass_lim::execute (function *fun)
>  {
> -  bool in_loop_pipeline = scev_initialized_p ();
> +  in_loop_pipeline = scev_initialized_p ();
>    if (!in_loop_pipeline)
>      loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
>
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index 4f0a7ea162b..ca19add83c0 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -6544,10 +6544,22 @@ vect_determine_precisions_from_users (stmt_vec_info 
> stmt_info, gassign *stmt)
>      case RSHIFT_EXPR:
>        {
>         tree shift = gimple_assign_rhs2 (stmt);
> -       if (TREE_CODE (shift) != INTEGER_CST
> -           || !wi::ltu_p (wi::to_widest (shift), precision))
> +       unsigned int min_const_shift, max_const_shift;
> +       wide_int min_shift, max_shift;
> +       if (TREE_CODE (shift) == SSA_NAME
> +           && vect_get_range_info (shift, &min_shift, &max_shift)
> +           && wi::ge_p (min_shift, 0, TYPE_SIGN (TREE_TYPE (shift)))
> +           && wi::lt_p (max_shift, TYPE_PRECISION (type),
> +                        TYPE_SIGN (TREE_TYPE (shift))))
> +         {
> +           min_const_shift = min_shift.to_uhwi ();
> +           max_const_shift = max_shift.to_uhwi ();
> +         }
> +       else if (TREE_CODE (shift) == INTEGER_CST
> +                && wi::ltu_p (wi::to_widest (shift), precision))
> +         min_const_shift = max_const_shift = TREE_INT_CST_LOW (shift);
> +       else
>           return;
> -       unsigned int const_shift = TREE_INT_CST_LOW (shift);
>         if (code == LSHIFT_EXPR)
>           {
>             /* Avoid creating an undefined shift.
> @@ -6559,16 +6571,16 @@ vect_determine_precisions_from_users (stmt_vec_info 
> stmt_info, gassign *stmt)
>                of vectorization.  This sort of thing should really be
>                handled before vectorization.  */
>             operation_precision = MAX (stmt_info->min_output_precision,
> -                                      const_shift + 1);
> +                                      max_const_shift + 1);
>             /* We need CONST_SHIFT fewer bits of the input.  */
> -           min_input_precision = (MAX (operation_precision, const_shift)
> -                                  - const_shift);
> +           min_input_precision = (MAX (operation_precision, max_const_shift)
> +                                  - min_const_shift);
>           }
>         else
>           {
>             /* We need CONST_SHIFT extra bits to do the operation.  */
>             operation_precision = (stmt_info->min_output_precision
> -                                  + const_shift);
> +                                  + max_const_shift);
>             min_input_precision = operation_precision;
>           }
>         break;
> --
> 2.43.0
>

Reply via email to