On Sat, Jan 27, 2024 at 4:44 PM Richard Sandiford <richard.sandif...@arm.com> wrote: > > This was another PR caused by the way that > vect_determine_precisions_from_range handle shifts. We tried to > narrow 32768 >> x to a 16-bit shift based on range information for > the inputs and outputs, with vect_recog_over_widening_pattern > (after PR110828) adjusting the shift amount. But this doesn't > work for the case where x is in [16, 31], since then 32-bit > 32768 >> x is a well-defined zero, whereas no well-defined > 16-bit 32768 >> y will produce 0. > > We could perhaps generate x < 16 ? 32768 >> x : 0 instead, > but since vect_determine_precisions_from_range was never really > supposed to rely on fix-ups, it seems better to fix that instead. > > The patch also makes the code more selective about which codes > can be narrowed based on input and output ranges. This showed > that vect_truncatable_operation_p was missing cases for > BIT_NOT_EXPR (equivalent to BIT_XOR_EXPR of -1) and NEGATE_EXPR > (equivalent to BIT_NOT_EXPR followed by a PLUS_EXPR of 1). > > pr113281-1.c is the original testcase. pr113281-[23].c failed > before the patch due to overly optimistic narrowing. pr113281-[45].c > previously passed and are meant to protect against accidental > optimisation regressions. > > Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?
OK. Thanks, Richard. > Richard > > > gcc/ > PR target/113281 > * tree-vect-patterns.cc (vect_recog_over_widening_pattern): Remove > workaround for right shifts. > (vect_truncatable_operation_p): Handle NEGATE_EXPR and BIT_NOT_EXPR. > (vect_determine_precisions_from_range): Be more selective about > which codes can be narrowed based on their input and output ranges. > For shifts, require at least one more bit of precision than the > maximum shift amount. > > gcc/testsuite/ > PR target/113281 > * gcc.dg/vect/pr113281-1.c: New test. > * gcc.dg/vect/pr113281-2.c: Likewise. > * gcc.dg/vect/pr113281-3.c: Likewise. > * gcc.dg/vect/pr113281-4.c: Likewise. > * gcc.dg/vect/pr113281-5.c: Likewise. > --- > gcc/testsuite/gcc.dg/vect/pr113281-1.c | 17 +++ > gcc/testsuite/gcc.dg/vect/pr113281-2.c | 50 +++++++++ > gcc/testsuite/gcc.dg/vect/pr113281-3.c | 39 +++++++ > gcc/testsuite/gcc.dg/vect/pr113281-4.c | 55 ++++++++++ > gcc/testsuite/gcc.dg/vect/pr113281-5.c | 66 ++++++++++++ > gcc/tree-vect-patterns.cc | 144 +++++++++++++------------ > 6 files changed, 305 insertions(+), 66 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/pr113281-1.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr113281-2.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr113281-3.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr113281-4.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr113281-5.c > > diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-1.c > b/gcc/testsuite/gcc.dg/vect/pr113281-1.c > new file mode 100644 > index 00000000000..6df4231cb5f > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr113281-1.c > @@ -0,0 +1,17 @@ > +#include "tree-vect.h" > + > +unsigned char a; > + > +int main() { > + check_vect (); > + > + short b = a = 0; > + for (; a != 19; a++) > + if (a) > + b = 32872 >> a; > + > + if (b == 0) > + return 0; > + else > + return 1; > +} > diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-2.c > b/gcc/testsuite/gcc.dg/vect/pr113281-2.c > new file mode 100644 > index 00000000000..3a1170c28b6 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr113281-2.c > @@ -0,0 +1,50 @@ > +/* { dg-do compile } */ > + > +#define N 128 > + > +short x[N]; > +short y[N]; > + > +void > +f1 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= y[i]; > +} > + > +void > +f2 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= (y[i] < 32 ? y[i] : 32); > +} > + > +void > +f3 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= (y[i] < 31 ? y[i] : 31); > +} > + > +void > +f4 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= (y[i] & 31); > +} > + > +void > +f5 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= 0x8000 >> y[i]; > +} > + > +void > +f6 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= 0x8000 >> (y[i] & 31); > +} > + > +/* { dg-final { scan-tree-dump-not {can narrow[^\n]+>>} "vect" } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-3.c > b/gcc/testsuite/gcc.dg/vect/pr113281-3.c > new file mode 100644 > index 00000000000..5982dd2d16f > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr113281-3.c > @@ -0,0 +1,39 @@ > +/* { dg-do compile } */ > + > +#define N 128 > + > +short x[N]; > +short y[N]; > + > +void > +f1 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= (y[i] < 30 ? y[i] : 30); > +} > + > +void > +f2 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= ((y[i] & 15) + 2); > +} > + > +void > +f3 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= (y[i] < 16 ? y[i] : 16); > +} > + > +void > +f4 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] = 32768 >> ((y[i] & 15) + 3); > +} > + > +/* { dg-final { scan-tree-dump {can narrow to signed:31 without loss > [^\n]+>>} "vect" } } */ > +/* { dg-final { scan-tree-dump {can narrow to signed:18 without loss > [^\n]+>>} "vect" } } */ > +/* { dg-final { scan-tree-dump {can narrow to signed:17 without loss > [^\n]+>>} "vect" } } */ > +/* { dg-final { scan-tree-dump {can narrow to unsigned:19 without loss > [^\n]+>>} "vect" } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-4.c > b/gcc/testsuite/gcc.dg/vect/pr113281-4.c > new file mode 100644 > index 00000000000..10fbc0e8405 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr113281-4.c > @@ -0,0 +1,55 @@ > +/* { dg-do compile } */ > + > +#define N 128 > + > +short x[N]; > +short y[N]; > + > +void > +f1 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= (y[i] & 15); > +} > + > +void > +f2 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= ((y[i] & 7) + 8); > +} > + > +void > +f3 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= ((y[i] & 7) ^ 11); > +} > + > +void > +f4 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= (y[i] < 15 ? y[i] : 15); > +} > + > +void > +f5 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] >>= (y[i] < 15 ? y[i] : 1); > +} > + > +void > +f6 (void) > +{ > + for (int i = 0; i < N; ++i) > + x[i] = 32768 >> (y[i] & 15); > +} > + > +/* { dg-final { scan-tree-dump {:11:[^\n]+can narrow to signed:16 without > loss [^\n]+>>} "vect" } } */ > +/* { dg-final { scan-tree-dump {:18:[^\n]+can narrow to signed:16 without > loss [^\n]+>>} "vect" } } */ > +/* { dg-final { scan-tree-dump {:25:[^\n]+can narrow to signed:16 without > loss [^\n]+>>} "vect" } } */ > +/* { dg-final { scan-tree-dump {:32:[^\n]+can narrow to signed:16 without > loss [^\n]+>>} "vect" } } */ > +/* { dg-final { scan-tree-dump {:39:[^\n]+can narrow to signed:16 without > loss [^\n]+>>} "vect" } } */ > +/* { dg-final { scan-tree-dump {can narrow to unsigned:16 without loss > [^\n]+>>} "vect" } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/pr113281-5.c > b/gcc/testsuite/gcc.dg/vect/pr113281-5.c > new file mode 100644 > index 00000000000..4a4571792e2 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr113281-5.c > @@ -0,0 +1,66 @@ > +/* { dg-do compile } */ > + > +#define N 128 > + > +short x[N]; > +short y[N]; > + > +void > +f1 (void) > +{ > + for (int i = 0; i < N; ++i) > + { > + int a = y[i]; > + int b = ~a; > + x[i] = b; > + } > +} > + > +void > +f2 (void) > +{ > + for (int i = 0; i < N; ++i) > + { > + int a = y[i]; > + int b = -a; > + x[i] = b; > + } > +} > + > +void > +f3 (void) > +{ > + for (int i = 0; i < N; ++i) > + { > + int a = x[i]; > + int b = a / y[i]; > + x[i] = b; > + } > +} > + > +void > +f4 (void) > +{ > + for (int i = 0; i < N; ++i) > + { > + int a = x[i]; > + int b = a < y[i] ? a : y[i]; > + x[i] = b; > + } > +} > + > +void > +f5 (void) > +{ > + for (int i = 0; i < N; ++i) > + { > + int a = x[i]; > + int b = a > y[i] ? a : y[i]; > + x[i] = b; > + } > +} > + > +/* { dg-final { scan-tree-dump {can narrow to signed:17 without loss [^\n]+= > -} "vect" } } */ > +/* { dg-final { scan-tree-dump {can narrow to signed:16 without loss [^\n]+= > ~} "vect" } } */ > +/* { dg-final { scan-tree-dump {can narrow to signed:16 without loss [^\n]+ > MIN_EXPR} "vect" } } */ > +/* { dg-final { scan-tree-dump {can narrow to signed:16 without loss [^\n]+ > MAX_EXPR} "vect" } } */ > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > index f1a95ef34b9..d562f57920f 100644 > --- a/gcc/tree-vect-patterns.cc > +++ b/gcc/tree-vect-patterns.cc > @@ -3164,46 +3164,9 @@ vect_recog_over_widening_pattern (vec_info *vinfo, > tree ops[3] = {}; > for (unsigned int i = 1; i < first_op; ++i) > ops[i - 1] = gimple_op (last_stmt, i); > - /* For right shifts limit the shift operand. */ > vect_convert_inputs (vinfo, last_stmt_info, nops, &ops[first_op - 1], > op_type, &unprom[0], op_vectype); > > - /* Limit shift operands. */ > - if (code == RSHIFT_EXPR) > - { > - wide_int min_value, max_value; > - if (TREE_CODE (ops[1]) == INTEGER_CST) > - ops[1] = wide_int_to_tree (op_type, > - wi::umin (wi::to_wide (ops[1]), > - new_precision - 1)); > - else if (!vect_get_range_info (ops[1], &min_value, &max_value) > - || wi::ge_p (max_value, new_precision, TYPE_SIGN (op_type))) > - { > - /* ??? Note the following bad for SLP as that only supports > - same argument widened shifts and it un-CSEs same arguments. */ > - tree new_var = vect_recog_temp_ssa_var (op_type, NULL); > - gimple *pattern_stmt > - = gimple_build_assign (new_var, MIN_EXPR, ops[1], > - build_int_cst (op_type, new_precision - > 1)); > - gimple_set_location (pattern_stmt, gimple_location (last_stmt)); > - if (ops[1] == unprom[1].op && unprom[1].dt == vect_external_def) > - { > - if (edge e = vect_get_external_def_edge (vinfo, ops[1])) > - { > - basic_block new_bb > - = gsi_insert_on_edge_immediate (e, pattern_stmt); > - gcc_assert (!new_bb); > - } > - else > - return NULL; > - } > - else > - append_pattern_def_seq (vinfo, last_stmt_info, pattern_stmt, > - op_vectype); > - ops[1] = new_var; > - } > - } > - > /* Use the operation to produce a result of type OP_TYPE. */ > tree new_var = vect_recog_temp_ssa_var (op_type, NULL); > gimple *pattern_stmt = gimple_build_assign (new_var, code, > @@ -6359,9 +6322,11 @@ vect_truncatable_operation_p (tree_code code) > { > switch (code) > { > + case NEGATE_EXPR: > case PLUS_EXPR: > case MINUS_EXPR: > case MULT_EXPR: > + case BIT_NOT_EXPR: > case BIT_AND_EXPR: > case BIT_IOR_EXPR: > case BIT_XOR_EXPR: > @@ -6520,38 +6485,85 @@ vect_determine_precisions_from_range (stmt_vec_info > stmt_info, gassign *stmt) > unsigned int nops = gimple_num_ops (stmt); > > if (!vect_truncatable_operation_p (code)) > - /* Check that all relevant input operands are compatible, and update > - [MIN_VALUE, MAX_VALUE] to include their ranges. */ > - for (unsigned int i = 1; i < nops; ++i) > - { > - tree op = gimple_op (stmt, i); > - if (TREE_CODE (op) == INTEGER_CST) > - { > - /* Don't require the integer to have RHS_TYPE (which it might > - not for things like shift amounts, etc.), but do require it > - to fit the type. */ > - if (!int_fits_type_p (op, type)) > - return; > - > - min_value = wi::min (min_value, wi::to_wide (op, precision), > sign); > - max_value = wi::max (max_value, wi::to_wide (op, precision), > sign); > - } > - else if (TREE_CODE (op) == SSA_NAME) > - { > - /* Ignore codes that don't take uniform arguments. */ > - if (!types_compatible_p (TREE_TYPE (op), type)) > - return; > + { > + /* Handle operations that can be computed in type T if all inputs > + and outputs can be represented in type T. Also handle left and > + right shifts, where (in addition) the maximum shift amount must > + be less than the number of bits in T. */ > + bool is_shift; > + switch (code) > + { > + case LSHIFT_EXPR: > + case RSHIFT_EXPR: > + is_shift = true; > + break; > > - wide_int op_min_value, op_max_value; > - if (!vect_get_range_info (op, &op_min_value, &op_max_value)) > - return; > + case ABS_EXPR: > + case MIN_EXPR: > + case MAX_EXPR: > + case TRUNC_DIV_EXPR: > + case CEIL_DIV_EXPR: > + case FLOOR_DIV_EXPR: > + case ROUND_DIV_EXPR: > + case EXACT_DIV_EXPR: > + /* Modulus is excluded because it is typically calculated by doing > + a division, for which minimum signed / -1 isn't representable in > + the original signed type. We could take the division range into > + account instead, if handling modulus ever becomes important. */ > + is_shift = false; > + break; > > - min_value = wi::min (min_value, op_min_value, sign); > - max_value = wi::max (max_value, op_max_value, sign); > - } > - else > + default: > return; > - } > + } > + for (unsigned int i = 1; i < nops; ++i) > + { > + tree op = gimple_op (stmt, i); > + wide_int op_min_value, op_max_value; > + if (TREE_CODE (op) == INTEGER_CST) > + { > + unsigned int op_precision = TYPE_PRECISION (TREE_TYPE (op)); > + op_min_value = op_max_value = wi::to_wide (op, op_precision); > + } > + else if (TREE_CODE (op) == SSA_NAME) > + { > + if (!vect_get_range_info (op, &op_min_value, &op_max_value)) > + return; > + } > + else > + return; > + > + if (is_shift && i == 2) > + { > + /* There needs to be one more bit than the maximum shift amount. > + > + If the maximum shift amount is already 1 less than PRECISION > + then we can't narrow the shift further. Dealing with that > + case first ensures that we can safely use an unsigned range > + below. > + > + op_min_value isn't relevant, since shifts by negative amounts > + are UB. */ > + if (wi::geu_p (op_max_value, precision - 1)) > + return; > + unsigned int min_bits = op_max_value.to_uhwi () + 1; > + > + /* As explained below, we can convert a signed shift into an > + unsigned shift if the sign bit is always clear. At this > + point we've already processed the ranges of the output and > + the first input. */ > + auto op_sign = sign; > + if (sign == SIGNED && !wi::neg_p (min_value)) > + op_sign = UNSIGNED; > + op_min_value = wide_int::from (wi::min_value (min_bits, > op_sign), > + precision, op_sign); > + op_max_value = wide_int::from (wi::max_value (min_bits, > op_sign), > + precision, op_sign); > + } > + min_value = wi::min (min_value, op_min_value, sign); > + max_value = wi::max (max_value, op_max_value, sign); > + } > + } > > /* Try to switch signed types for unsigned types if we can. > This is better for two reasons. First, unsigned ops tend > -- > 2.25.1 >