From: MITSUNARI Shigeo <[email protected]>

For 32-bit unsigned integer division by constants that require 33-bit
magic multipliers (mh != 0, IsAdd case), use a pre-shifted 64-bit magic
constant and a single 64-bit high-part multiply instead of the traditional
sub/shift/add sequence.

The 33-bit magic constant (2^32 + ml) is pre-shifted by (32 - post_shift)
bits, allowing the quotient to be obtained directly from the upper 64 bits
of a 64x64 multiplication, then truncated to 32 bits.

This reduces the instruction count for divisions like x/7 from 7
instructions to 4 on x86_64.

Before (x / 7):
    movl    %edi, %eax
    imulq   $613566757, %rax, %rax
    shrq    $32, %rax
    subl    %eax, %edi
    shrl    %edi
    addl    %edi, %eax
    shrl    $2, %eax

After:
    movabsq $2635249153617166336, %rcx
    movl    %edi, %eax
    mulq    %rcx
    movl    %edx, %eax

gcc/ChangeLog:

* expmed.cc (expand_divmod): For 32-bit unsigned division with
33-bit magic on 64-bit targets, use pre-shifted 64-bit multiply.

Signed-off-by: MITSUNARI Shigeo <[email protected]>
---
 gcc/expmed.cc                                 | 90 +++++++++++++------
 gcc/testsuite/gcc.target/i386/mulq-highpart.c | 19 ++++
 2 files changed, 82 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/mulq-highpart.c

diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index d57ea78d6b1..e440aee16a8 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -4521,33 +4521,69 @@ expand_divmod (int rem_flag, enum tree_code code, 
machine_mode mode,
                        else
                          pre_shift = 0;
 
-                       if (mh != 0)
-                         {
-                           rtx t1, t2, t3, t4;
-
-                           if (post_shift - 1 >= BITS_PER_WORD)
-                             goto fail1;
-
-                           extra_cost
-                             = (shift_cost (speed, int_mode, post_shift - 1)
-                                + shift_cost (speed, int_mode, 1)
-                                + 2 * add_cost (speed, int_mode));
-                           t1 = expmed_mult_highpart
-                             (int_mode, op0, gen_int_mode (ml, int_mode),
-                              NULL_RTX, 1, max_cost - extra_cost);
-                           if (t1 == 0)
-                             goto fail1;
-                           t2 = force_operand (gen_rtx_MINUS (int_mode,
-                                                              op0, t1),
-                                               NULL_RTX);
-                           t3 = expand_shift (RSHIFT_EXPR, int_mode,
-                                              t2, 1, NULL_RTX, 1);
-                           t4 = force_operand (gen_rtx_PLUS (int_mode,
-                                                             t1, t3),
-                                               NULL_RTX);
-                           quotient = expand_shift
-                             (RSHIFT_EXPR, int_mode, t4,
-                              post_shift - 1, tquotient, 1);
+                           if (mh != 0)
+                             {
+                               bool did_wide_mulh_path = false;
+
+                           /* For 32-bit unsigned division, if converting to a 
wider
+                              mode lets us obtain the high part of the product
+                              directly, pre-shift the 33-bit magic constant
+                              (2^32 + ml) into that wider mode and use the high
+                              part of the widened multiply instead of the
+                              sub/shift/add sequence.
+                              Pre-shift by (32 - post_shift) so that the high
+                              bits of (x64 * magic) give the quotient directly.
+                              Note: mh!=0 implies pre_shift==0.  */
+                           if (size == 32 && post_shift >= 1)
+                             {
+                               scalar_int_mode wide_mode
+                                 = GET_MODE_WIDER_MODE (int_mode).require ();
+                               uint64_t magic
+                                 = (uint64_t (ml) + (uint64_t (1) << 32))
+                                   << (32 - post_shift);
+                               start_sequence ();
+                               rtx x64 = convert_to_mode (wide_mode, op0, 1);
+                               rtx hi = expmed_mult_highpart (wide_mode, x64,
+                                  gen_int_mode (magic, wide_mode),
+                                  NULL_RTX, 1, max_cost);
+                               rtx_insn *insns = end_sequence ();
+                               if (hi != NULL_RTX)
+                                         {
+                                           emit_insn (insns);
+                                           quotient = gen_lowpart (int_mode, 
hi);
+                                           did_wide_mulh_path = true;
+                                         }
+                             }
+
+                           if (!did_wide_mulh_path)
+                             {
+                               rtx t1, t2, t3, t4;
+
+                               if (post_shift - 1 >= BITS_PER_WORD)
+                                 goto fail1;
+
+                               extra_cost
+                                 = (shift_cost (speed, int_mode,
+                                                post_shift - 1)
+                                    + shift_cost (speed, int_mode, 1)
+                                    + 2 * add_cost (speed, int_mode));
+                               t1 = expmed_mult_highpart
+                                 (int_mode, op0, gen_int_mode (ml, int_mode),
+                                  NULL_RTX, 1, max_cost - extra_cost);
+                               if (t1 == 0)
+                                 goto fail1;
+                               t2 = force_operand (gen_rtx_MINUS (int_mode,
+                                                                  op0, t1),
+                                                       NULL_RTX);
+                               t3 = expand_shift (RSHIFT_EXPR, int_mode,
+                                                  t2, 1, NULL_RTX, 1);
+                               t4 = force_operand (gen_rtx_PLUS (int_mode,
+                                                                 t1, t3),
+                                                       NULL_RTX);
+                               quotient = expand_shift
+                                 (RSHIFT_EXPR, int_mode, t4,
+                                  post_shift - 1, tquotient, 1);
+                             }
                          }
                        else
                          {
diff --git a/gcc/testsuite/gcc.target/i386/mulq-highpart.c 
b/gcc/testsuite/gcc.target/i386/mulq-highpart.c
new file mode 100644
index 00000000000..133c9b35d57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/mulq-highpart.c
@@ -0,0 +1,19 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-bmi2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target *-*-linux* *-*-gnu* 
} } } */
+
+/*
+**div7:
+**     movabsq \$2635249153617166336, %rcx
+**     movl    %edi, %eax
+**     mulq    %rcx
+**     movl    %edx, %eax
+**     ret
+**...
+*/
+
+unsigned int
+div7 (unsigned int x)
+{
+  return x / 7;
+}
\ No newline at end of file
-- 
2.43.0

Reply via email to