On Wed, Mar 12, 2025 at 12:00 PM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Using a combination of rules, we were able to fold
>
>   ((X >> C1) & C2) * (1 << C1)  -->  X & (C2 << C1)
>
> if everything was done at the same precision, but we couldn't fold
> it if the AND was done at a different precision.  The optimisation is
> often (but not always) valid for that case too.
>
> This patch adds a dedicated rule for the case where different precisions
> are involved.
>
> An alternative would be to extend the individual folds that together
> handle the same-precision case so that those rules handle differing
> precisions.  But the risk is that that could replace narrow operations
> with wide operations, which would be especially harmful on targets
> like avr.  It's also not obviously free of cycles.
>
> I also wondered whether the converts should be non-optional.
>
> gcc/
>         * match.pd: Fold ((X >> C1) & C2) * (1 << C1) to X & (C2 << C1).
>
> gcc/testsuite/
>         * gcc.dg/fold-mul-and-lshift-1.c: New test.
>         * gcc.dg/fold-mul-and-lshift-2.c: Likewise.
> ---
>  gcc/match.pd                                 | 29 ++++
>  gcc/testsuite/gcc.dg/fold-mul-and-lshift-1.c | 59 ++++++++++++++++++++
>  gcc/testsuite/gcc.dg/fold-mul-and-lshift-2.c | 15 +++++
>  3 files changed, 103 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/fold-mul-and-lshift-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/fold-mul-and-lshift-2.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 5c679848bdf..3197d1cac75 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -5231,6 +5231,35 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>      (if (mask)
>       (bit_op (shift (convert @0) @1) { mask; })))))))
>
> +/* Fold ((X >> C1) & C2) * (1 << C1) into X & (C2 << C1), including cases where
> +   the & happens in a different type.  It is the conversion case that isn't
> +   a composition of other folds.
> +
> +   Let the type of the * and >> be T1 and the type of the & be T2.
> +   The fold is valid if the conversion to T2 preserves all information;
> +   that is, if T2 is wider than T1 or drops no more than C1 bits from T1.
> +   In that case, the & might operate on bits that are dropped by the
> +   later conversion to T1 and the multiplication by (1 << C1), but those
> +   bits are also dropped by ANDing with C2 << C1 (converted to T1).
> +
> +   If the conversion to T2 is not information-preserving, we have to be
> +   careful about the later conversion to T1 acting as a sign extension.
> +   We need either T2 to be unsigned or the top (sign) bit of C2 to be clear.
> +   That is equivalent to testing whether C2 is nonnegative.  */
> +(simplify
> + (mult
> +  (convert? (bit_and (convert? (rshift @0 INTEGER_CST@1)) INTEGER_CST@2))
> +  INTEGER_CST@3)
> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
> +  (with { auto prec = element_precision (type); }

Since we know this needs to be a scalar, using TYPE_PRECISION here is
fine too.
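
Something like the following is what I mean (untested, just a sketch of
the suggested change rather than anything from the patch as posted):

  (with { auto prec = TYPE_PRECISION (type); }

It is equivalent here because the type is known to be a scalar.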
> +   (if (wi::ltu_p (wi::to_widest (@1), prec))

I think using wi::to_wide is better than using wi::to_widest here.
The other place in match which checks shift count does:

      /* Use a signed compare to leave negative shift counts alone. */
      && wi::ges_p (wi::to_wide (uniform_integer_cst_p (@1)),
                    element_precision (type)))

> +    (with { auto shift = tree_to_uhwi (@1); }
> +     (if ((prec <= element_precision (TREE_TYPE (@2)) + shift
> +           || wi::to_widest (@2) >= 0)

I think `wi::to_widest (@2) >= 0` can be written as
`!tree_int_cst_sign_bit (@2)`.

Other than that the patch looks good.

Thanks,
Andrew

> +          && wi::to_wide (@3) == wi::set_bit_in_zero (shift, prec))
> +      (with { auto mask = wide_int::from (wi::to_wide (@2), prec, UNSIGNED); }
> +       (bit_and @0 { wide_int_to_tree (type, mask << shift); }))))))))
> +
>  /* ~(~X >> Y) -> X >> Y (for arithmetic shift).  */
>  (simplify
>   (bit_not (convert1?:s (rshift:s (convert2?@0 (bit_not @1)) @2)))
> diff --git a/gcc/testsuite/gcc.dg/fold-mul-and-lshift-1.c b/gcc/testsuite/gcc.dg/fold-mul-and-lshift-1.c
> new file mode 100644
> index 00000000000..b1ce10495e3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/fold-mul-and-lshift-1.c
> @@ -0,0 +1,59 @@
> +/* { dg-do compile { target lp64 } } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-not { >> } "optimized" } } */
> +/* { dg-final { scan-tree-dump-not { \* } "optimized" } } */
> +
> +unsigned int
> +f1 (unsigned int x)
> +{
> +  x >>= 1;
> +  unsigned long y = x;
> +  y &= 255;
> +  x = y;
> +  x *= 2;
> +  return x;
> +}
> +
> +unsigned int
> +f2 (unsigned int x)
> +{
> +  x >>= 1;
> +  unsigned long y = x;
> +  y &= -2UL;
> +  x = y;
> +  x *= 2;
> +  return x;
> +}
> +
> +unsigned int
> +f3 (unsigned int x)
> +{
> +  x >>= 1;
> +  unsigned short y = x;
> +  y &= 255;
> +  x = y;
> +  x *= 2;
> +  return x;
> +}
> +
> +unsigned int
> +f4 (unsigned int x)
> +{
> +  x >>= 1;
> +  short y = x;
> +  y &= (unsigned short) ~0U >> 1;
> +  x = y;
> +  x *= 2;
> +  return x;
> +}
> +
> +unsigned int
> +f5 (unsigned int x)
> +{
> +  x >>= 16;
> +  short y = x;
> +  y &= -2;
> +  x = y;
> +  x *= 1 << 16;
> +  return x;
> +}
> diff --git a/gcc/testsuite/gcc.dg/fold-mul-and-lshift-2.c b/gcc/testsuite/gcc.dg/fold-mul-and-lshift-2.c
> new file mode 100644
> index 00000000000..86eabef0fef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/fold-mul-and-lshift-2.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile { target int32 } } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump { >> 15;} "optimized" } } */
> +/* { dg-final { scan-tree-dump { \* 32768;} "optimized" } } */
> +
> +unsigned int
> +f1 (unsigned int x)
> +{
> +  x >>= 15;
> +  short y = x;
> +  y &= -2;
> +  x = y;
> +  x *= 1 << 15;
> +  return x;
> +}
> --
> 2.25.1
>
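
Putting the two suggestions together, the inner conditions would end up
looking something like this (untested sketch, indentation approximate):

  (if (wi::ltu_p (wi::to_wide (@1), prec))
   (with { auto shift = tree_to_uhwi (@1); }
    (if ((prec <= element_precision (TREE_TYPE (@2)) + shift
          || !tree_int_cst_sign_bit (@2))
         && wi::to_wide (@3) == wi::set_bit_in_zero (shift, prec))
     ...)))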