On Mon, Apr 28, 2025 at 9:09 PM Richard Biener <richard.guent...@gmail.com> wrote: > > On Wed, Mar 5, 2025 at 12:50 PM Richard Biener <rguent...@suse.de> wrote: > > > > On Tue, 4 Mar 2025, Richard Sandiford wrote: > > > > > Richard Biener <rguent...@suse.de> writes: > > > > When vectorizing a shift of u16 data by an amount that's known to > > > > be less than 16 we currently fail to emit a vector u16 shift. The > > > > first reason is that the promotion of the shift amount is hoisted > > > > only by PRE and that cannot preserve range info, the second reason > > > > is that pattern detection doesn't use range info when computing > > > > the precision required for an operation. > > > > > > > > The following addresses the first issue by making LIM hoist all > > > > expressions for the pass that runs right before PRE and the > > > > second issue by querying ranges for the shift amount. > > > > > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu. > > > > > > > > PR tree-optimization/119103 > > > > * tree-ssa-loop-im.cc (in_loop_pipeline): Globalize. > > > > (compute_invariantness): Override costing when we run > > > > right before PRE and PRE is enabled. > > > > (pass_lim::execute): Adjust. > > > > * tree-vect-patterns.cc (vect_determine_precisions_from_users): > > > > For variable shift amounts use range information. > > > > > > > > * gcc.target/i386/pr119103.c: New testcase. > > > > > > Nice! > > > > > > > [...] > > > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > > > > index 4f0a7ea162b..09408359cf2 100644 > > > > --- a/gcc/tree-vect-patterns.cc > > > > +++ b/gcc/tree-vect-patterns.cc > > > > @@ -6544,10 +6544,19 @@ vect_determine_precisions_from_users > > > > (stmt_vec_info stmt_info, gassign *stmt) > > > > case RSHIFT_EXPR: > > > > { > > > > tree shift = gimple_assign_rhs2 (stmt); > > > > - if (TREE_CODE (shift) != INTEGER_CST > > > > - || !wi::ltu_p (wi::to_widest (shift), precision)) > > > > + unsigned int const_shift; > > > > + wide_int min_shift, max_shift; > > > > + if (TREE_CODE (shift) == SSA_NAME > > > > + && vect_get_range_info (shift, &min_shift, &max_shift) > > > > + && wi::ge_p (min_shift, 0, TYPE_SIGN (TREE_TYPE (shift))) > > > > + && wi::lt_p (max_shift, TYPE_PRECISION (type), > > > > + TYPE_SIGN (TREE_TYPE (shift)))) > > > > + const_shift = max_shift.to_uhwi (); > > > > > > Don't we need to use min_shift rather than max_shift for LSHIFT_EXPR's: > > > > > > /* We need CONST_SHIFT fewer bits of the input. */ > > > min_input_precision = (MAX (operation_precision, const_shift) > > > - const_shift); > > > > > > ? If so, I guess that means replacing const_shift with a minimum and > > > a maximum. > > > > Right - fixed as below. > > Pushed to trunk now after re-bootstrapping and testing on > x86_64-unknown-linux-gnu.
This caused: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119982 > Richard. > > > Richard. > > > > From 41db126ca782410c69480b9d305c240b5e9166b2 Mon Sep 17 00:00:00 2001 > > From: Richard Biener <rguent...@suse.de> > > Date: Tue, 4 Mar 2025 10:34:39 +0100 > > Subject: [PATCH] tree-optimization/119103 - missed overwidening detection > > for > > shift > > To: gcc-patches@gcc.gnu.org > > > > When vectorizing a shift of u16 data by an amount that's known to > > be less than 16 we currently fail to emit a vector u16 shift. The > > first reason is that the promotion of the shift amount is hoisted > > only by PRE and that cannot preserve range info, the second reason > > is that pattern detection doesn't use range info when computing > > the precision required for an operation. > > > > The following addresses the first issue by making LIM hoist all > > expressions for the pass that runs right before PRE and the > > second issue by querying ranges for the shift amount. > > > > PR tree-optimization/119103 > > * tree-ssa-loop-im.cc (in_loop_pipeline): Globalize. > > (compute_invariantness): Override costing when we run > > right before PRE and PRE is enabled. > > (pass_lim::execute): Adjust. > > * tree-vect-patterns.cc (vect_determine_precisions_from_users): > > For variable shift amounts use range information. > > > > * gcc.target/i386/pr119103.c: New testcase. > > --- > > gcc/testsuite/gcc.target/i386/pr119103.c | 13 ++++++++++++ > > gcc/tree-ssa-loop-im.cc | 10 +++++++-- > > gcc/tree-vect-patterns.cc | 26 +++++++++++++++++------- > > 3 files changed, 40 insertions(+), 9 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr119103.c > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr119103.c > > b/gcc/testsuite/gcc.target/i386/pr119103.c > > new file mode 100644 > > index 00000000000..57210dc3bbe > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr119103.c > > @@ -0,0 +1,13 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mavx2" } */ > > + > > +void lshift(unsigned short *x, unsigned char amount) > > +{ > > + if (amount > 15) > > + __builtin_unreachable(); > > + > > + for (int i = 0; i < 16; i++) > > + x[i] <<= amount; > > +} > > + > > +/* { dg-final { scan-assembler "vpsllw" } } */ > > diff --git a/gcc/tree-ssa-loop-im.cc b/gcc/tree-ssa-loop-im.cc > > index 225964c6215..a3ca5af3e3e 100644 > > --- a/gcc/tree-ssa-loop-im.cc > > +++ b/gcc/tree-ssa-loop-im.cc > > @@ -143,6 +143,8 @@ public: > > different modes. */ > > }; > > > > +static bool in_loop_pipeline; > > + > > /* We use six bits per loop in the ref->dep_loop bitmap to record > > the dep_kind x dep_state combinations. */ > > > > @@ -1239,7 +1241,11 @@ compute_invariantness (basic_block bb) > > lim_data->cost); > > } > > > > - if (lim_data->cost >= LIM_EXPENSIVE) > > + if (lim_data->cost >= LIM_EXPENSIVE > > + /* When we run before PRE and PRE is active hoist all expressions > > + since PRE would do so anyway and we can preserve range info > > + but PRE cannot. */ > > + || (flag_tree_pre && !in_loop_pipeline)) > > set_profitable_level (stmt); > > } > > } > > @@ -3759,7 +3765,7 @@ public: > > unsigned int > > pass_lim::execute (function *fun) > > { > > - bool in_loop_pipeline = scev_initialized_p (); > > + in_loop_pipeline = scev_initialized_p (); > > if (!in_loop_pipeline) > > loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS); > > > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > > index 4f0a7ea162b..ca19add83c0 100644 > > --- a/gcc/tree-vect-patterns.cc > > +++ b/gcc/tree-vect-patterns.cc > > @@ -6544,10 +6544,22 @@ vect_determine_precisions_from_users (stmt_vec_info > > stmt_info, gassign *stmt) > > case RSHIFT_EXPR: > > { > > tree shift = gimple_assign_rhs2 (stmt); > > - if (TREE_CODE (shift) != INTEGER_CST > > - || !wi::ltu_p (wi::to_widest (shift), precision)) > > + unsigned int min_const_shift, max_const_shift; > > + wide_int min_shift, max_shift; > > + if (TREE_CODE (shift) == SSA_NAME > > + && vect_get_range_info (shift, &min_shift, &max_shift) > > + && wi::ge_p (min_shift, 0, TYPE_SIGN (TREE_TYPE (shift))) > > + && wi::lt_p (max_shift, TYPE_PRECISION (type), > > + TYPE_SIGN (TREE_TYPE (shift)))) > > + { > > + min_const_shift = min_shift.to_uhwi (); > > + max_const_shift = max_shift.to_uhwi (); > > + } > > + else if (TREE_CODE (shift) == INTEGER_CST > > + && wi::ltu_p (wi::to_widest (shift), precision)) > > + min_const_shift = max_const_shift = TREE_INT_CST_LOW (shift); > > + else > > return; > > - unsigned int const_shift = TREE_INT_CST_LOW (shift); > > if (code == LSHIFT_EXPR) > > { > > /* Avoid creating an undefined shift. > > @@ -6559,16 +6571,16 @@ vect_determine_precisions_from_users (stmt_vec_info > > stmt_info, gassign *stmt) > > of vectorization. This sort of thing should really be > > handled before vectorization. */ > > operation_precision = MAX (stmt_info->min_output_precision, > > - const_shift + 1); > > + max_const_shift + 1); > > /* We need CONST_SHIFT fewer bits of the input. */ > > - min_input_precision = (MAX (operation_precision, const_shift) > > - - const_shift); > > + min_input_precision = (MAX (operation_precision, > > max_const_shift) > > + - min_const_shift); > > } > > else > > { > > /* We need CONST_SHIFT extra bits to do the operation. */ > > operation_precision = (stmt_info->min_output_precision > > - + const_shift); > > + + max_const_shift); > > min_input_precision = operation_precision; > > } > > break; > > -- > > 2.43.0 > > -- H.J.