On Mon, May 9, 2022 at 7:19 AM liuhongt <hongtao....@intel.com> wrote: > > This patch will enable below optimization: > > { > - int bit; > - long long unsigned int _1; > - long long unsigned int _2; > - > <bb 2> [local count: 46707768]: > - > - <bb 3> [local count: 1027034057]: > - # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)> > - # bit_13 = PHI <bit_9(3), 63(2)> > - _1 = 1 << bit_13; > - _2 = ~_1; > - tmp_8 = _2 & tmp_11; > - bit_9 = bit_13 + -3; > - if (bit_9 != -3(OVF)) > - goto <bb 3>; [95.65%] > - else > - goto <bb 4>; [4.35%] > - > - <bb 4> [local count: 46707768]: > - return tmp_8; > + tmp_12 = tmp_6(D) & 7905747460161236406; > + return tmp_12; > > } > > > Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,} > Ok for trunk? > > gcc/ChangeLog: > > PR middle-end/103462 > * match.pd (bitwise_induction_p): New match. > * tree-scalar-evolution.c (gimple_bitwise_induction_p): > Declare. > (analyze_and_compute_bitwise_induction_effect): New function. > (enum bit_op_kind): New enum. > (final_value_replacement_loop): Enhanced to handle bitwise > induction. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr103462-1.c: New test. > * gcc.target/i386/pr103462-2.c: New test. > * gcc.target/i386/pr103462-3.c: New test. > * gcc.target/i386/pr103462-4.c: New test. > * gcc.target/i386/pr103462-5.c: New test. > * gcc.target/i386/pr103462-6.c: New test. > --- > gcc/match.pd | 7 + > gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++ > gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++ > gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++ > gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++ > gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++ > gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++ > gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++- > 8 files changed, 654 insertions(+), 1 deletion(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c > > diff --git a/gcc/match.pd b/gcc/match.pd > index 6d691d302b3..24ff5f9e6a8 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -7746,3 +7746,10 @@ and, > == TYPE_UNSIGNED (TREE_TYPE (@3)))) > && single_use (@4) > && single_use (@5)))) > + > +(for bit_op (bit_and bit_ior bit_xor) > + (match (bitwise_induction_p @0 @2 @3) > + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 > @2)))) @3))) > + > +(match (bitwise_induction_p @0 @2 @3) > + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) > @3)))) > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c > b/gcc/testsuite/gcc.target/i386/pr103462-1.c > new file mode 100644 > index 00000000000..1dc4c2acad6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c > @@ -0,0 +1,111 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } > */ > + > +unsigned long long > +__attribute__((noipa)) > +foo (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp &= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo1 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp &= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo2 (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp &= (1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo3 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp &= (1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo4 (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp |= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo5 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp |= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo6 (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp |= (1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo7 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp |= (1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo8 (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp ^= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo9 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp ^= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo10 (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp ^= (1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo11 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp ^= (1ULL << bit); > + return tmp; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c > b/gcc/testsuite/gcc.target/i386/pr103462-2.c > new file mode 100644 > index 00000000000..bc375cb78d4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c > @@ -0,0 +1,45 @@ > +/* { dg-do run } */ > +/* { dg-options "-O1" } */ > + > +#include "pr103462-1.c" > + > +int main() > +{ > + unsigned long long tmp = 0x1111111111111111ULL; > + if (foo (tmp) != 0x110110110110110ULL) > + __builtin_abort (); > + > + if (foo1 (tmp) != 0x110110110110110ULL) > + __builtin_abort (); > + > + if (foo2 (tmp) != 0x0ULL) > + __builtin_abort (); > + > + if (foo3 (tmp) != 0x0ULL) > + __builtin_abort (); > + > + if (foo4 (tmp) != 0xffffffffffffffffULL) > + __builtin_abort (); > + > + if (foo5 (tmp) != 0xffffffffffffffffULL) > + __builtin_abort (); > + > + if (foo6 (tmp) != 0x9359359359359359ULL) > + __builtin_abort (); > + > + if (foo7 (tmp) != 0x9359359359359359ULL) > + __builtin_abort (); > + > + if (foo8 (tmp) != 0x8358358358358358ULL) > + __builtin_abort (); > + > + if (foo9 (tmp) != 0x8358358358358358ULL) > + __builtin_abort (); > + > + if (foo10 (tmp) != 0x8358358358358358ULL) > + __builtin_abort (); > + > + if (foo11 (tmp) != 0x8358358358358358ULL) > + __builtin_abort (); > +} > + > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c > b/gcc/testsuite/gcc.target/i386/pr103462-3.c > new file mode 100644 > index 00000000000..4ba248a4872 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c > @@ -0,0 +1,111 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } > */ > + > +unsigned int > +__attribute__((noipa)) > +foo (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp &= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo1 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp &= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo2 (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp &= (1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo3 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp &= (1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo4 (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp |= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo5 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp |= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo6 (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp |= (1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo7 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp |= (1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo8 (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp ^= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo9 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp ^= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo10 (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp ^= (1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo11 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp ^= (1U << bit); > + return tmp; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c > b/gcc/testsuite/gcc.target/i386/pr103462-4.c > new file mode 100644 > index 00000000000..e2f4056f525 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c > @@ -0,0 +1,46 @@ > +/* { dg-do run } */ > +/* { dg-options "-O1" } */ > + > +#include "pr103462-3.c" > + > +int main() > +{ > + unsigned int tmp = 0x11111111U; > + > + if (foo (tmp) != 0x10110110U) > + __builtin_abort (); > + > + if (foo1 (tmp) != 0x1101101U) > + __builtin_abort (); > + > + if (foo2 (tmp) != 0x0U) > + __builtin_abort (); > + > + if (foo3 (tmp) != 0x0U) > + __builtin_abort (); > + > + if (foo4 (tmp) != 0xffffffffU) > + __builtin_abort (); > + > + if (foo5 (tmp) != 0xffffffffU) > + __builtin_abort (); > + > + if (foo6 (tmp) != 0x59359359U) > + __builtin_abort (); > + > + if (foo7 (tmp) != 0x93593593U) > + __builtin_abort (); > + > + if (foo8 (tmp) != 0xa7ca7ca7U) > + __builtin_abort (); > + > + if (foo9 (tmp) != 0x7ca7ca7cU) > + __builtin_abort (); > + > + if (foo10 (tmp) != 0x58358358U) > + __builtin_abort (); > + > + if (foo11 (tmp) != 0x83583583U) > + __builtin_abort (); > +} > + > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c > b/gcc/testsuite/gcc.target/i386/pr103462-5.c > new file mode 100644 > index 00000000000..1f4ffa34b48 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c > @@ -0,0 +1,111 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } > */ > + > +unsigned short > +__attribute__((noipa)) > +foo (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp &= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo1 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp &= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo2 (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp &= (1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo3 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp &= (1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo4 (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp |= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo5 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp |= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo6 (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp |= (1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo7 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp |= (1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo8 (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp ^= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo9 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp ^= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo10 (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp ^= (1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo11 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp ^= (1U << bit); > + return tmp; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c > b/gcc/testsuite/gcc.target/i386/pr103462-6.c > new file mode 100644 > index 00000000000..65426d71efe > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c > @@ -0,0 +1,46 @@ > +/* { dg-do run } */ > +/* { dg-options "-O1" } */ > + > +#include "pr103462-5.c" > + > +int main() > +{ > + unsigned short tmp = 0x1111U; > + > + if (foo (tmp) != 0x110) > + __builtin_abort (); > + > + if (foo1 (tmp) != 0x110) > + __builtin_abort (); > + > + if (foo2 (tmp) != 0x0) > + __builtin_abort (); > + > + if (foo3 (tmp) != 0x0) > + __builtin_abort (); > + > + if (foo4 (tmp) != 0xffff) > + __builtin_abort (); > + > + if (foo5 (tmp) != 0xffff) > + __builtin_abort (); > + > + if (foo6 (tmp) != 0x9359) > + __builtin_abort (); > + > + if (foo7 (tmp) != 0x9359) > + __builtin_abort (); > + > + if (foo8 (tmp) != 0x8358) > + __builtin_abort (); > + > + if (foo9 (tmp) != 0x8358) > + __builtin_abort (); > + > + if (foo10 (tmp) != 0x8358) > + __builtin_abort (); > + > + if (foo11 (tmp) != 0x8358) > + __builtin_abort (); > +} > + > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc > index 72ceb4001e3..9b0aec4fd09 100644 > --- a/gcc/tree-scalar-evolution.cc > +++ b/gcc/tree-scalar-evolution.cc > @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr) > || expanded_size > cache.elements ()); > } > > +/* Match.pd function to match bitwise inductive expression. > + .i.e. > + _2 = 1 << _1; > + _3 = ~_2; > + tmp_9 = _3 & tmp_12; */ > +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree)); > + > +/* Return the inductive expression of bitwise operation if possible, > + otherwise returns DEF. */ > +static tree > +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop, > + class loop* loop, > + tree phidef, > + tree def, > + tree niter) > +{ > + tree match_op[3],inv, bitwise_scev, bitwise_res; > + bool folded_casts = false; > + tree type = TREE_TYPE (phidef); > + gimple* header_phi = NULL;
use a gphi * > + > + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF) > + > + op2 = PHI <phidef, inv> > + _1 = (int) bit_17; > + _3 = 1 << _1; > + op1 = ~_3; > + phidef = op1 & op2; */ > + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL) > + || TREE_CODE (match_op[2]) != SSA_NAME > + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2]))) > + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2) and then use header_phi here, if you pass in the PHI to use you'll always have 2 args > + return def; > + > + header_phi = SSA_NAME_DEF_STMT (match_op[2]); > + if (gimple_bb (header_phi) != loop->header) > + return def; I think passing in the PHI node and the exit edge instead of phidef would make the above more clear > + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef) > + return def; > + > + inv = gimple_phi_arg_def (header_phi, 0); > + if (inv == phidef) > + inv = gimple_phi_arg_def (header_phi, 1); inv = gimple_phi_arg_def_from_edge (header_phi, loop_preheader_edge (loop)); where is this used? > + > + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop, > + match_op[1], > + &folded_casts); you want bitwise_scev = analyze_scalar_evolution (loop, match_op[1]); bitwise_scev = instantiate_parameters (loop, bitwise_scev); instead since you are interested in in-loop behavior? > + > + /* Make sure bits is in range of type precision. */ > + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC I think you want to test || !INTEGRAL_TYPE_P (bitwise_scev) since you do not rule out pointers or vector types. > + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev)) > + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type) > + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev))) > + return def; > + > + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev); > + if (!tree_fits_uhwi_p (bitwise_res) > + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type)) > + return def; Hmm, so I think that bitwise_res is the final value of the 'bit' IV, correct? So name it bit_final? > + > +enum bit_op_kind > + { > + INDUCTION_BIT_CLEAR, > + INDUCTION_BIT_IOR, > + INDUCTION_BIT_XOR, > + INDUCTION_BIT_RESET, > + INDUCTION_ZERO, > + INDUCTION_ALL > + }; > + > + enum bit_op_kind induction_kind; > + enum tree_code code1 > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef)); > + enum tree_code code2 > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0])); > + > + /* BIT_CLEAR: A &= ~(1 << bit) > + BIT_RESET: A ^= (1 << bit). > + BIT_IOR: A |= (1 << bit) > + BIT_ZERO: A &= (1 << bit) > + BIT_ALL: A |= ~(1 << bit) > + BIT_XOR: A ^= ~(1 << bit). > + bit is induction variable. */ > + switch (code1) > + { > + case BIT_AND_EXPR: > + induction_kind = code2 == BIT_NOT_EXPR > + ? INDUCTION_BIT_CLEAR > + : INDUCTION_ZERO; > + break; > + case BIT_IOR_EXPR: > + induction_kind = code2 == BIT_NOT_EXPR > + ? INDUCTION_ALL > + : INDUCTION_BIT_IOR; > + break; > + case BIT_XOR_EXPR: > + induction_kind = code2 == BIT_NOT_EXPR > + ? INDUCTION_BIT_XOR > + : INDUCTION_BIT_RESET; > + break; > + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */ > + case BIT_NOT_EXPR: > + gcc_assert (code2 == BIT_XOR_EXPR); > + induction_kind = INDUCTION_BIT_XOR; > + break; > + default: > + gcc_unreachable(); > + } > + > + if (induction_kind == INDUCTION_ZERO) > + return build_zero_cst (type); is that true even when the loop doesn't iterate? > + if (induction_kind == INDUCTION_ALL) > + return build_all_ones_cst (type); Likewise? > + > + wide_int bits = wi::zero (TYPE_PRECISION (type)); > + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev)); you checked fits_uhwi above, name it bit_start? > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev)); > + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter); > + > + /* Loop tripcount should be num_iter + 1. */ > + for (unsigned i = 0; i != num_iter + 1; i++) > + { > + bits = wi::bit_or (bits, > + wi::lshift (wi::one (TYPE_PRECISION (type)), > + start)); you can use bits = wi::set_bit (bits, start); > + start += step; > + } Uh. You don't limit 'num_iter' but you limit bit_final. So supposed that the IV wraps and is unsigned char and you have a 256 bit integer the above could run quite a long time. Please limit num_iter to TYPE_PRECISION as well (do you have a testcase with a wrapping IV? try some != CST exit condition and a large increment) > + > + bool inverted = false; > + switch (induction_kind) > + { > + case INDUCTION_BIT_CLEAR: > + code1 = BIT_AND_EXPR; > + inverted = true; > + break; > + case INDUCTION_BIT_IOR: > + code1 = BIT_IOR_EXPR; > + break; > + case INDUCTION_BIT_RESET: > + code1 = BIT_XOR_EXPR; > + break; > + /* A ^= ~(1 << bit) is special, when loop tripcount is even, > + it's equal to A ^= bits, else A ^= ~bits. */ > + case INDUCTION_BIT_XOR: > + code1 = BIT_XOR_EXPR; > + if (num_iter % 2 == 0) > + inverted = true; > + break; > + default: > + gcc_unreachable (); > + } > + > + if (inverted) > + bits = wi::bit_not (bits); > + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits)); > +} > + > /* Do final value replacement for LOOP, return true if we did anything. */ > > bool > @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop) > { > gphi *phi = psi.phi (); > tree rslt = PHI_RESULT (phi); > - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit); > + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit); > + tree def = phidef; > if (virtual_operand_p (def)) > { > gsi_next (&psi); > @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop) > def = analyze_scalar_evolution_in_loop (ex_loop, loop, def, > &folded_casts); > def = compute_overall_effect_of_inner_loop (ex_loop, def); > + > + /* Handle bitwise induction expression. > + > + .i.e. > + for (int i = 0; i != 64; i+=3) > + res &= ~(1UL << i); > + > + RES can't be analyzed out by SCEV because it is not polynomially > + expressible, but in fact final value of RES can be replaced by > + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63} > + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */ > + if (tree_fits_uhwi_p (niter) > + && tree_to_uhwi (niter) that's redundant - ah, it's the not iterating case that's excluded. please write it as tree_to_uhwi (niter) != 0, also consider assigning to an unsigned HOST_WIDE_INT and pass it that way to analyze_and_compute_bitwise_induction_effect > + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U) > + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop, > + phidef, def, > niter); > + it seems you overwrite the SCEV analysis result unconditionally here, please move this inside of the following if () so it is done as fallback only (or add it as && !(def = ...) to the condition) > if (!tree_does_not_contain_chrecs (def) > || chrec_contains_symbols_defined_in_loop (def, ex_loop->num) > /* Moving the computation from the loop may prolong life range > -- > 2.18.1 >