On Tue, 1 Dec 2020, Jakub Jelinek wrote: > Hi! > > The following patch implements what Thomas wrote about, in particular > that we can handle also double-word divison by the constants for which > the earlier patch optimized modulo (if it would be otherwise a library > call) and that we can also easily handle such constants shifted to the left. > Unfortunately, seems CSE isn't able to optimize away the two almost > identical sequences (one to compute remainder, one to compute quotient), > probably because of the ADD_OVERFLOW introduced jumps, so the patch also > adjusts expand_DIVMOD. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK. Richard. > 2020-12-01 Jakub Jelinek <ja...@redhat.com> > > PR rtl-optimization/97459 > * optabs.h (expand_doubleword_divmod): Declare. > * optabs.c (expand_doubleword_divmod): New function. > (expand_binop): Use it. > * internal-fn.c (expand_DIVMOD): Likewise. > > * gcc.target/i386/pr97282.c (foo): Use 123456 divisor instead of > 10. > * gcc.dg/pr97459-1.c (TESTS): Add tests for 10, 12 and > 6144. > * gcc.dg/pr97459-2.c (TESTS): Likewise. > * gcc.dg/pr97459-3.c: New test. > * gcc.dg/pr97459-4.c: New test. > * gcc.dg/pr97459-5.c: New test. > * gcc.dg/pr97459-6.c: New test. > > --- gcc/optabs.h.jj 2020-12-01 13:19:12.831127718 +0100 > +++ gcc/optabs.h 2020-12-01 17:40:08.783137431 +0100 > @@ -183,6 +183,8 @@ extern bool force_expand_binop (machine_ > enum optab_methods); > extern rtx expand_vector_broadcast (machine_mode, rtx); > > +extern rtx expand_doubleword_divmod (machine_mode, rtx, rtx, rtx *, bool); > + > /* Generate code for a simple binary or unary operation. "Simple" in > this case means "can be unambiguously described by a (mode, code) > pair and mapped to a single optab." */ > --- gcc/optabs.c.jj 2020-12-01 16:25:01.567733333 +0100 > +++ gcc/optabs.c 2020-12-01 18:11:12.878230572 +0100 > @@ -1118,6 +1118,99 @@ expand_doubleword_mod (machine_mode mode > } > return NULL_RTX; > } > + > +/* Similarly to the above function, but compute both quotient and remainder. > + Quotient can be computed from the remainder as: > + rem = op0 % op1; // Handled using expand_doubleword_mod > + quot = (op0 - rem) * inv; // inv is multiplicative inverse of op1 modulo > + // 2 * BITS_PER_WORD > + > + We can also handle cases where op1 is a multiple of power of two constant > + and constant handled by expand_doubleword_mod. > + op11 = 1 << __builtin_ctz (op1); > + op12 = op1 / op11; > + rem1 = op0 % op12; // Handled using expand_doubleword_mod > + quot1 = (op0 - rem1) * inv; // inv is multiplicative inverse of op12 > modulo > + // 2 * BITS_PER_WORD > + rem = (quot1 % op11) * op12 + rem1; > + quot = quot1 / op11; */ > + > +rtx > +expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem, > + bool unsignedp) > +{ > + *rem = NULL_RTX; > + > + /* Negative dividend should have been optimized into positive, > + similarly modulo by 1 and modulo by power of two is optimized > + differently too. */ > + if (INTVAL (op1) <= 1 || pow2p_hwi (INTVAL (op1))) > + return NULL_RTX; > + > + rtx op11 = const1_rtx; > + rtx op12 = op1; > + if ((INTVAL (op1) & 1) == 0) > + { > + int bit = ctz_hwi (INTVAL (op1)); > + op11 = GEN_INT (HOST_WIDE_INT_1 << bit); > + op12 = GEN_INT (INTVAL (op1) >> bit); > + } > + > + rtx rem1 = expand_doubleword_mod (mode, op0, op12, unsignedp); > + if (rem1 == NULL_RTX) > + return NULL_RTX; > + > + int prec = 2 * BITS_PER_WORD; > + wide_int a = wide_int::from (INTVAL (op12), prec + 1, UNSIGNED); > + wide_int b = wi::shifted_mask (prec, 1, false, prec + 1); > + wide_int m = wide_int::from (wi::mod_inv (a, b), prec, UNSIGNED); > + rtx inv = immed_wide_int_const (m, mode); > + > + rtx_insn *last = get_last_insn (); > + rtx quot1 = expand_simple_binop (mode, MINUS, op0, rem1, > + NULL_RTX, unsignedp, OPTAB_DIRECT); > + if (quot1 == NULL_RTX) > + return NULL_RTX; > + > + quot1 = expand_simple_binop (mode, MULT, quot1, inv, > + NULL_RTX, unsignedp, OPTAB_DIRECT); > + if (quot1 == NULL_RTX) > + return NULL_RTX; > + > + if (op11 != const1_rtx) > + { > + rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11, > + NULL_RTX, unsignedp); > + if (rem2 == NULL_RTX) > + return NULL_RTX; > + > + rem2 = expand_simple_binop (mode, MULT, rem2, op12, NULL_RTX, > + unsignedp, OPTAB_DIRECT); > + if (rem2 == NULL_RTX) > + return NULL_RTX; > + > + rem2 = expand_simple_binop (mode, PLUS, rem2, rem1, NULL_RTX, > + unsignedp, OPTAB_DIRECT); > + if (rem2 == NULL_RTX) > + return NULL_RTX; > + > + rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11, > + NULL_RTX, unsignedp); > + if (quot2 == NULL_RTX) > + return NULL_RTX; > + > + rem1 = rem2; > + quot1 = quot2; > + } > + > + /* Punt if we need any library calls. */ > + for (; last; last = NEXT_INSN (last)) > + if (CALL_P (last)) > + return NULL_RTX; > + > + *rem = rem1; > + return quot1; > +} > > /* Wrapper around expand_binop which takes an rtx code to specify > the operation to perform, not an optab pointer. All other > @@ -1999,7 +2092,10 @@ expand_binop (machine_mode mode, optab b > } > > /* Attempt to synthetize double word modulo by constant divisor. */ > - if ((binoptab == umod_optab || binoptab == smod_optab) > + if ((binoptab == umod_optab > + || binoptab == smod_optab > + || binoptab == udiv_optab > + || binoptab == sdiv_optab) > && optimize > && CONST_INT_P (op1) > && is_int_mode (mode, &int_mode) > @@ -2008,21 +2104,33 @@ expand_binop (machine_mode mode, optab b > && optab_handler (add_optab, word_mode) != CODE_FOR_nothing > && optimize_insn_for_speed_p ()) > { > - rtx remainder = expand_doubleword_mod (int_mode, op0, op1, > - binoptab == umod_optab); > - if (remainder != NULL_RTX) > + rtx res = NULL_RTX; > + if ((binoptab == umod_optab || binoptab == smod_optab) > + && (INTVAL (op1) & 1) == 0) > + res = expand_doubleword_mod (int_mode, op0, op1, > + binoptab == umod_optab); > + else > + { > + rtx quot = expand_doubleword_divmod (int_mode, op0, op1, &res, > + binoptab == umod_optab > + || binoptab == udiv_optab); > + if (quot == NULL_RTX) > + res = NULL_RTX; > + else if (binoptab == udiv_optab || binoptab == sdiv_optab) > + res = quot; > + } > + if (res != NULL_RTX) > { > if (optab_handler (mov_optab, int_mode) != CODE_FOR_nothing) > { > - rtx_insn *move = emit_move_insn (target ? target : remainder, > - remainder); > - set_dst_reg_note (move, > - REG_EQUAL, > - gen_rtx_fmt_ee (UMOD, int_mode, > - copy_rtx (op0), op1), > - target ? target : remainder); > + rtx_insn *move = emit_move_insn (target ? target : res, > + res); > + set_dst_reg_note (move, REG_EQUAL, > + gen_rtx_fmt_ee (optab_to_code (binoptab), > + int_mode, copy_rtx (op0), op1), > + target ? target : res); > } > - return remainder; > + return res; > } > else > delete_insns_since (last); > --- gcc/internal-fn.c.jj 2020-11-30 10:55:33.134963320 +0100 > +++ gcc/internal-fn.c 2020-12-01 18:18:20.964436077 +0100 > @@ -3230,27 +3230,68 @@ expand_DIVMOD (internal_fn, gcall *call_ > the division and modulo and if it emits any library calls or any > {,U}{DIV,MOD} rtxes throw it away and use a divmod optab or > divmod libcall. */ > - struct separate_ops ops; > - ops.code = TRUNC_DIV_EXPR; > - ops.type = type; > - ops.op0 = make_tree (ops.type, op0); > - ops.op1 = arg1; > - ops.op2 = NULL_TREE; > - ops.location = gimple_location (call_stmt); > - start_sequence (); > - quotient = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL); > - if (contains_call_div_mod (get_insns ())) > - quotient = NULL_RTX; > - else > + scalar_int_mode int_mode; > + if (remainder == NULL_RTX > + && optimize > + && CONST_INT_P (op1) > + && !pow2p_hwi (INTVAL (op1)) > + && is_int_mode (TYPE_MODE (type), &int_mode) > + && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD > + && optab_handler (and_optab, word_mode) != CODE_FOR_nothing > + && optab_handler (add_optab, word_mode) != CODE_FOR_nothing > + && optimize_insn_for_speed_p ()) > { > - ops.code = TRUNC_MOD_EXPR; > - remainder = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL); > + rtx_insn *last = get_last_insn (); > + remainder = NULL_RTX; > + quotient = expand_doubleword_divmod (int_mode, op0, op1, &remainder, > + TYPE_UNSIGNED (type)); > + if (quotient != NULL_RTX) > + { > + if (optab_handler (mov_optab, int_mode) != CODE_FOR_nothing) > + { > + rtx_insn *move = emit_move_insn (quotient, quotient); > + set_dst_reg_note (move, REG_EQUAL, > + gen_rtx_fmt_ee (TYPE_UNSIGNED (type) > + ? UDIV : DIV, int_mode, > + copy_rtx (op0), op1), > + quotient); > + move = emit_move_insn (remainder, remainder); > + set_dst_reg_note (move, REG_EQUAL, > + gen_rtx_fmt_ee (TYPE_UNSIGNED (type) > + ? UMOD : MOD, int_mode, > + copy_rtx (op0), op1), > + quotient); > + } > + } > + else > + delete_insns_since (last); > + } > + > + if (remainder == NULL_RTX) > + { > + struct separate_ops ops; > + ops.code = TRUNC_DIV_EXPR; > + ops.type = type; > + ops.op0 = make_tree (ops.type, op0); > + ops.op1 = arg1; > + ops.op2 = NULL_TREE; > + ops.location = gimple_location (call_stmt); > + start_sequence (); > + quotient = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL); > if (contains_call_div_mod (get_insns ())) > - remainder = NULL_RTX; > + quotient = NULL_RTX; > + else > + { > + ops.code = TRUNC_MOD_EXPR; > + remainder = expand_expr_real_2 (&ops, NULL_RTX, mode, > + EXPAND_NORMAL); > + if (contains_call_div_mod (get_insns ())) > + remainder = NULL_RTX; > + } > + if (remainder) > + insns = get_insns (); > + end_sequence (); > } > - if (remainder) > - insns = get_insns (); > - end_sequence (); > } > > if (remainder) > --- gcc/testsuite/gcc.target/i386/pr97282.c.jj 2020-10-06 > 10:32:14.769771587 +0200 > +++ gcc/testsuite/gcc.target/i386/pr97282.c 2020-12-01 21:52:57.901708048 > +0100 > @@ -18,8 +18,8 @@ foo (T x) > unsigned long ret = 0; > while (x > 0) > { > - ret = ret + x % 10; > - x = x / 10; > + ret = ret + x % 123456; > + x = x / 123456; > } > return ret; > } > --- gcc/testsuite/gcc.dg/pr97459-1.c.jj 2020-11-30 10:55:33.135963309 > +0100 > +++ gcc/testsuite/gcc.dg/pr97459-1.c 2020-12-01 18:23:25.243031163 +0100 > @@ -24,7 +24,7 @@ T __attribute__((noipa)) foo (T x, T n) > #define C3(n) C2(n##0) C2(n##4) C2(n##9) > #define C4(n) C3(n##0) C3(n##3) C3(n##7) > #endif > -#define TESTS C4(1) > +#define TESTS C4(1) C1(10010) C1(10012) C1(16144) > > TESTS > > --- gcc/testsuite/gcc.dg/pr97459-2.c.jj 2020-11-30 10:55:33.136963298 > +0100 > +++ gcc/testsuite/gcc.dg/pr97459-2.c 2020-12-01 18:23:51.423738268 +0100 > @@ -26,7 +26,7 @@ T __attribute__((noipa)) foo (T x, T n) > #define C3(n) C2(n##0) C2(n##4) C2(n##9) > #define C4(n) C3(n##0) C3(n##3) C3(n##7) > #endif > -#define TESTS C4(1) > +#define TESTS C4(1) C1(10010) C1(10012) C1(16144) > > TESTS > > --- gcc/testsuite/gcc.dg/pr97459-3.c.jj 2020-12-01 18:25:38.452540896 > +0100 > +++ gcc/testsuite/gcc.dg/pr97459-3.c 2020-12-01 18:26:00.662292428 +0100 > @@ -0,0 +1,54 @@ > +/* PR rtl-optimization/97459 */ > +/* { dg-do run } */ > +/* { dg-options "-O2" } */ > +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */ > + > +#ifdef __SIZEOF_INT128__ > +typedef __uint128_t T; > +#else > +typedef unsigned long long T; > +#endif > + > +T __attribute__((noipa)) foo (T x, T n) { return x / n; } > +#define C(n) T __attribute__((noipa)) foo##n (T x) { return x / (n - 10000); > } > + > +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9) > +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \ > + C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9) > +#ifdef EXPENSIVE > +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \ > + C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9) > +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \ > + C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9) > +#else > +#define C3(n) C2(n##0) C2(n##4) C2(n##9) > +#define C4(n) C3(n##0) C3(n##3) C3(n##7) > +#endif > +#define TESTS C4(1) C1(10010) C1(10012) C1(16144) > + > +TESTS > + > +struct S { T x; T (*foo) (T); }; > + > +#undef C > +#define C(n) { n - 10000, foo##n }, > + > +struct S tests[] = { > +TESTS > + { 0, 0 } > +}; > + > +int > +main () > +{ > + int i, j, k; > + for (k = 0; tests[k].x; k++) > + for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++) > + for (j = -5; j <= 5; j++) > + { > + T x = ((T) 1 << i) + j; > + if (foo (x, tests[k].x) != tests[k].foo (x)) > + __builtin_abort (); > + } > + return 0; > +} > --- gcc/testsuite/gcc.dg/pr97459-4.c.jj 2020-12-01 18:26:10.272184915 > +0100 > +++ gcc/testsuite/gcc.dg/pr97459-4.c 2020-12-01 18:26:27.345993905 +0100 > @@ -0,0 +1,57 @@ > +/* PR rtl-optimization/97459 */ > +/* { dg-do run } */ > +/* { dg-options "-O2" } */ > +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */ > + > +#ifdef __SIZEOF_INT128__ > +typedef __int128_t T; > +typedef __uint128_t U; > +#else > +typedef long long T; > +typedef unsigned long long U; > +#endif > + > +T __attribute__((noipa)) foo (T x, T n) { return x / n; } > +#define C(n) T __attribute__((noipa)) foo##n (T x) { return x / (n - 10000); > } > + > +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9) > +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \ > + C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9) > +#ifdef EXPENSIVE > +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \ > + C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9) > +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \ > + C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9) > +#else > +#define C3(n) C2(n##0) C2(n##4) C2(n##9) > +#define C4(n) C3(n##0) C3(n##3) C3(n##7) > +#endif > +#define TESTS C4(1) C1(10010) C1(10012) C1(16144) > + > +TESTS > + > +struct S { T x; T (*foo) (T); }; > + > +#undef C > +#define C(n) { n - 10000, foo##n }, > + > +struct S tests[] = { > +TESTS > + { 0, 0 } > +}; > + > +int > +main () > +{ > + int i, j, k; > + for (k = 0; tests[k].x; k++) > + for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++) > + for (j = -5; j <= 5; j++) > + { > + U x = ((U) 1 << i) + j; > + if (foo ((T) x, tests[k].x) != tests[k].foo ((T) x) > + || foo ((T) -x, tests[k].x) != tests[k].foo ((T) -x)) > + __builtin_abort (); > + } > + return 0; > +} > --- gcc/testsuite/gcc.dg/pr97459-5.c.jj 2020-12-01 18:27:03.386590701 > +0100 > +++ gcc/testsuite/gcc.dg/pr97459-5.c 2020-12-01 18:28:30.324618095 +0100 > @@ -0,0 +1,56 @@ > +/* PR rtl-optimization/97459 */ > +/* { dg-do run } */ > +/* { dg-options "-O2" } */ > +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */ > + > +#ifdef __SIZEOF_INT128__ > +typedef __uint128_t T; > +#else > +typedef unsigned long long T; > +#endif > + > +T __attribute__((noipa)) foo (T x, T n, T *r) { *r = x % n; return x / n; } > +#define C(n) T __attribute__((noipa)) foo##n (T x, T *r) { *r = x % (n - > 10000); return x / (n - 10000); } > + > +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9) > +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \ > + C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9) > +#ifdef EXPENSIVE > +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \ > + C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9) > +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \ > + C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9) > +#else > +#define C3(n) C2(n##0) C2(n##4) C2(n##9) > +#define C4(n) C3(n##0) C3(n##3) C3(n##7) > +#endif > +#define TESTS C4(1) C1(10010) C1(10012) C1(16144) > + > +TESTS > + > +struct S { T x; T (*foo) (T, T *); }; > + > +#undef C > +#define C(n) { n - 10000, foo##n }, > + > +struct S tests[] = { > +TESTS > + { 0, 0 } > +}; > + > +int > +main () > +{ > + int i, j, k; > + for (k = 0; tests[k].x; k++) > + for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++) > + for (j = -5; j <= 5; j++) > + { > + T x = ((T) 1 << i) + j; > + T r1, r2; > + if (foo (x, tests[k].x, &r1) != tests[k].foo (x, &r2) > + || r1 != r2) > + __builtin_abort (); > + } > + return 0; > +} > --- gcc/testsuite/gcc.dg/pr97459-6.c.jj 2020-12-01 18:28:55.452336978 > +0100 > +++ gcc/testsuite/gcc.dg/pr97459-6.c 2020-12-01 18:31:50.667376785 +0100 > @@ -0,0 +1,62 @@ > +/* PR rtl-optimization/97459 */ > +/* { dg-do run } */ > +/* { dg-options "-O2" } */ > +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */ > + > +#ifdef __SIZEOF_INT128__ > +typedef __int128_t T; > +typedef __uint128_t U; > +#else > +typedef long long T; > +typedef unsigned long long U; > +#endif > + > +T __attribute__((noipa)) foo (T x, T n, T *r) { *r = x % n; return x / n; } > +#define C(n) T __attribute__((noipa)) foo##n (T x, T *r) { *r = x % (n - > 10000); return x / (n - 10000); } > + > +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9) > +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \ > + C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9) > +#ifdef EXPENSIVE > +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \ > + C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9) > +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \ > + C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9) > +#else > +#define C3(n) C2(n##0) C2(n##4) C2(n##9) > +#define C4(n) C3(n##0) C3(n##3) C3(n##7) > +#endif > +#define TESTS C4(1) C1(10010) C1(10012) C1(16144) > + > +TESTS > + > +struct S { T x; T (*foo) (T, T *); }; > + > +#undef C > +#define C(n) { n - 10000, foo##n }, > + > +struct S tests[] = { > +TESTS > + { 0, 0 } > +}; > + > +int > +main () > +{ > + int i, j, k; > + for (k = 0; tests[k].x; k++) > + for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++) > + for (j = -5; j <= 5; j++) > + { > + U x = ((U) 1 << i) + j; > + T r1 = 0, r2 = 0; > + if (foo ((T) x, tests[k].x, &r1) != tests[k].foo ((T) x, &r2) > + || r1 != r2) > + __builtin_abort (); > + r1 = 0; r2 = 0; > + if (foo ((T) -x, tests[k].x, &r1) != tests[k].foo ((T) -x, &r2) > + || r1 != r2) > + __builtin_abort (); > + } > + return 0; > +} > > Jakub > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)