On Wed, Mar 5, 2025 at 12:50 PM Richard Biener <rguent...@suse.de> wrote: > > On Tue, 4 Mar 2025, Richard Sandiford wrote: > > > Richard Biener <rguent...@suse.de> writes: > > > When vectorizing a shift of u16 data by an amount that's known to > > > be less than 16 we currently fail to emit a vector u16 shift. The > > > first reason is that the promotion of the shift amount is hoisted > > > only by PRE and that cannot preserve range info, the second reason > > > is that pattern detection doesn't use range info when computing > > > the precision required for an operation. > > > > > > The following addresses the first issue by making LIM hoist all > > > expressions for the pass that runs right before PRE and the > > > second issue by querying ranges for the shift amount. > > > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu. > > > > > > PR tree-optimization/119103 > > > * tree-ssa-loop-im.cc (in_loop_pipeline): Globalize. > > > (compute_invariantness): Override costing when we run > > > right before PRE and PRE is enabled. > > > (pass_lim::execute): Adjust. > > > * tree-vect-patterns.cc (vect_determine_precisions_from_users): > > > For variable shift amounts use range information. > > > > > > * gcc.target/i386/pr119103.c: New testcase. > > > > Nice! > > > > > [...] > > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > > > index 4f0a7ea162b..09408359cf2 100644 > > > --- a/gcc/tree-vect-patterns.cc > > > +++ b/gcc/tree-vect-patterns.cc > > > @@ -6544,10 +6544,19 @@ vect_determine_precisions_from_users > > > (stmt_vec_info stmt_info, gassign *stmt) > > > case RSHIFT_EXPR: > > > { > > > tree shift = gimple_assign_rhs2 (stmt); > > > - if (TREE_CODE (shift) != INTEGER_CST > > > - || !wi::ltu_p (wi::to_widest (shift), precision)) > > > + unsigned int const_shift; > > > + wide_int min_shift, max_shift; > > > + if (TREE_CODE (shift) == SSA_NAME > > > + && vect_get_range_info (shift, &min_shift, &max_shift) > > > + && wi::ge_p (min_shift, 0, TYPE_SIGN (TREE_TYPE (shift))) > > > + && wi::lt_p (max_shift, TYPE_PRECISION (type), > > > + TYPE_SIGN (TREE_TYPE (shift)))) > > > + const_shift = max_shift.to_uhwi (); > > > > Don't we need to use min_shift rather than max_shift for LSHIFT_EXPR's: > > > > /* We need CONST_SHIFT fewer bits of the input. */ > > min_input_precision = (MAX (operation_precision, const_shift) > > - const_shift); > > > > ? If so, I guess that means replacing const_shift with a minimum and > > a maximum. > > Right - fixed as below.
Pushed to trunk now after re-bootstrapping and testing on x86_64-unknown-linux-gnu. Richard. > Richard. > > From 41db126ca782410c69480b9d305c240b5e9166b2 Mon Sep 17 00:00:00 2001 > From: Richard Biener <rguent...@suse.de> > Date: Tue, 4 Mar 2025 10:34:39 +0100 > Subject: [PATCH] tree-optimization/119103 - missed overwidening detection for > shift > To: gcc-patches@gcc.gnu.org > > When vectorizing a shift of u16 data by an amount that's known to > be less than 16 we currently fail to emit a vector u16 shift. The > first reason is that the promotion of the shift amount is hoisted > only by PRE and that cannot preserve range info, the second reason > is that pattern detection doesn't use range info when computing > the precision required for an operation. > > The following addresses the first issue by making LIM hoist all > expressions for the pass that runs right before PRE and the > second issue by querying ranges for the shift amount. > > PR tree-optimization/119103 > * tree-ssa-loop-im.cc (in_loop_pipeline): Globalize. > (compute_invariantness): Override costing when we run > right before PRE and PRE is enabled. > (pass_lim::execute): Adjust. > * tree-vect-patterns.cc (vect_determine_precisions_from_users): > For variable shift amounts use range information. > > * gcc.target/i386/pr119103.c: New testcase. > --- > gcc/testsuite/gcc.target/i386/pr119103.c | 13 ++++++++++++ > gcc/tree-ssa-loop-im.cc | 10 +++++++-- > gcc/tree-vect-patterns.cc | 26 +++++++++++++++++------- > 3 files changed, 40 insertions(+), 9 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr119103.c > > diff --git a/gcc/testsuite/gcc.target/i386/pr119103.c > b/gcc/testsuite/gcc.target/i386/pr119103.c > new file mode 100644 > index 00000000000..57210dc3bbe > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr119103.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx2" } */ > + > +void lshift(unsigned short *x, unsigned char amount) > +{ > + if (amount > 15) > + __builtin_unreachable(); > + > + for (int i = 0; i < 16; i++) > + x[i] <<= amount; > +} > + > +/* { dg-final { scan-assembler "vpsllw" } } */ > diff --git a/gcc/tree-ssa-loop-im.cc b/gcc/tree-ssa-loop-im.cc > index 225964c6215..a3ca5af3e3e 100644 > --- a/gcc/tree-ssa-loop-im.cc > +++ b/gcc/tree-ssa-loop-im.cc > @@ -143,6 +143,8 @@ public: > different modes. */ > }; > > +static bool in_loop_pipeline; > + > /* We use six bits per loop in the ref->dep_loop bitmap to record > the dep_kind x dep_state combinations. */ > > @@ -1239,7 +1241,11 @@ compute_invariantness (basic_block bb) > lim_data->cost); > } > > - if (lim_data->cost >= LIM_EXPENSIVE) > + if (lim_data->cost >= LIM_EXPENSIVE > + /* When we run before PRE and PRE is active hoist all expressions > + since PRE would do so anyway and we can preserve range info > + but PRE cannot. */ > + || (flag_tree_pre && !in_loop_pipeline)) > set_profitable_level (stmt); > } > } > @@ -3759,7 +3765,7 @@ public: > unsigned int > pass_lim::execute (function *fun) > { > - bool in_loop_pipeline = scev_initialized_p (); > + in_loop_pipeline = scev_initialized_p (); > if (!in_loop_pipeline) > loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS); > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > index 4f0a7ea162b..ca19add83c0 100644 > --- a/gcc/tree-vect-patterns.cc > +++ b/gcc/tree-vect-patterns.cc > @@ -6544,10 +6544,22 @@ vect_determine_precisions_from_users (stmt_vec_info > stmt_info, gassign *stmt) > case RSHIFT_EXPR: > { > tree shift = gimple_assign_rhs2 (stmt); > - if (TREE_CODE (shift) != INTEGER_CST > - || !wi::ltu_p (wi::to_widest (shift), precision)) > + unsigned int min_const_shift, max_const_shift; > + wide_int min_shift, max_shift; > + if (TREE_CODE (shift) == SSA_NAME > + && vect_get_range_info (shift, &min_shift, &max_shift) > + && wi::ge_p (min_shift, 0, TYPE_SIGN (TREE_TYPE (shift))) > + && wi::lt_p (max_shift, TYPE_PRECISION (type), > + TYPE_SIGN (TREE_TYPE (shift)))) > + { > + min_const_shift = min_shift.to_uhwi (); > + max_const_shift = max_shift.to_uhwi (); > + } > + else if (TREE_CODE (shift) == INTEGER_CST > + && wi::ltu_p (wi::to_widest (shift), precision)) > + min_const_shift = max_const_shift = TREE_INT_CST_LOW (shift); > + else > return; > - unsigned int const_shift = TREE_INT_CST_LOW (shift); > if (code == LSHIFT_EXPR) > { > /* Avoid creating an undefined shift. > @@ -6559,16 +6571,16 @@ vect_determine_precisions_from_users (stmt_vec_info > stmt_info, gassign *stmt) > of vectorization. This sort of thing should really be > handled before vectorization. */ > operation_precision = MAX (stmt_info->min_output_precision, > - const_shift + 1); > + max_const_shift + 1); > /* We need CONST_SHIFT fewer bits of the input. */ > - min_input_precision = (MAX (operation_precision, const_shift) > - - const_shift); > + min_input_precision = (MAX (operation_precision, max_const_shift) > + - min_const_shift); > } > else > { > /* We need CONST_SHIFT extra bits to do the operation. */ > operation_precision = (stmt_info->min_output_precision > - + const_shift); > + + max_const_shift); > min_input_precision = operation_precision; > } > break; > -- > 2.43.0 >