Greetings!  I am requesting feedback on whether the gcc community is
interested in this contribution and any technical input on the proposed
approach. Thank you!

*Background:*

The current unsigned integer division algorithm in GCC (see expmed.cc)
replaces division with a multiplication and shift sequence.

1. QUOTIENT = (OP0 * M) >> (SIZE + POST_SHIFT)

where the multiplication constant (M) can be up to SIZE + 1 bits, where
SIZE is the width of the numerator operand (OP0). The multiplier is
expressed as the sum of the low (ML) and high (MH) parts, where MH is 1 or
0.

M = (MH << SIZE) + ML

If MH == 1, the current implementation assumes that the machine mode is not
sufficiently wide to accommodate the multiplication and distributes the
multiplication.

2. QUOTIENT = (OP0 + T1) >> POST_SHIFT

where T1 = (ML * OP0) >> SIZE. The current implementation further assumes
that the intermediate sum in 2 will overflow, and uses the midpoint trick:
a + b >> 1 == a + (b - a) >> 1 where b >= a .

3. QUOTIENT = (T1 + ((OP0 - T1) >> 1)) >> (POST_SHIFT - 1)

Finally, if the divisor is even, the numerator can be “pre-shifted” to
reduce the size of the magic number, which results in a pre-shifted version
of path 1. The current implementation implicitly assumes (and I confirmed
with benchmarking) that the pre-shift is not worthwhile unless MH == 1.

4. QUOTIENT = ((OP0 >> PRE_SHIFT) * ML) >> (SIZE + POST_SHIFT)

*Proposal:*

This proposal addresses some missed opportunities to use more efficient
paths for the MH == 1 case. If the machine mode is sufficiently wide to
compute the quotient using paths 1 or 2, the resulting code is faster and
more compact. The pre-shift logic is left as-is, as benchmarks demonstrate
that the pre-shift is counter productive and results in slower code if MH
== 0, even if the RTL cost model predicts otherwise. Please see the
attached patch and github link for implementation details.

https://github.com/gcc-mirror/gcc/compare/master...chuckyschluz:gcc:wide_udiv

*Benchmarks:*

Tested using D = 7 (MH = 1, no pre-shift), and D = 14 (MH = 1, pre-shift)
with google benchmark on x86_64 (i7-13700KF) at -O3.


int16_t

7: 1.97x speedup (avoid path 3 and use path 1)
14: 1.47x speedup (avoid path 4 and use path 1)

int32_t

7: 1.33x speedup (avoid path 3 and use path 2)
14: no change (path 4 is optimal)


Charlie Schlosser
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index d57ea78d6b1..635cce82378 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -44,6 +44,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "langhooks.h"
 #include "tree-vector-builder.h"
 #include "recog.h"
+#include "machmode.h"
 
 struct target_expmed default_target_expmed;
 #if SWITCHABLE_TARGET
@@ -4219,7 +4220,213 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
   emit_label (label);
   return expand_shift (RSHIFT_EXPR, mode, temp, logd, NULL_RTX, 0);
 }
-
+
+/*
+Expand unsigned division of OP0 by D using a multiplication-shift strategy in
+INT_MODE and successively wider modes. Choose the method with the lowest
+cost.
+
+The quotient OP0 / D is calculated using the magic number M and POST_SHIFT that
+correspond to the divisor D. M is at-most SIZE + 1 bits, and can be decomposed
+such that M = (MH << SIZE) + ML, where MH is 1 or 0.
+
+If the mode precision is sufficient, the quotient can be computed directly.
+1. QUOTIENT = (M * OP0) >> (SIZE + POST_SHIFT)
+This requires precision PREC such that:
+  a. Product: PREC >= WIDTH(M) + WIDTH(OP0)
+  b. Shift: PREC > SIZE + POST_SHIFT
+
+The quantity T1 := (ML * OP0) >> SIZE can be computed directly or with the
+"highpart" multiplication, which may use a synthesized widening multiplication.
+If the highpart multiplication is used, there is no explicit precision
+requirement, as that logic is delegated to 'expmed_mult_highpart'.
+
+If MH is 1, the product can be distributed to accomodate lower precision modes.
+2. QUOTIENT = (OP0 + T1) >> POST_SHIFT
+  a. Product: PREC >= WIDTH(ML) + WIDTH(OP0)
+  b. Sum: PREC > WIDTH(OP0)
+  c. Shift 1: PREC > SIZE
+  d. Shift 2: PREC > POST_SHIFT
+
+If the precision is insufficient to accomodate the intermediate addition,
+seuence 2 can be distributed further.
+3. QUOTIENT = (T1 + ((OP0 - T1) >> 1)) >> (POST_SHIFT - 1)
+  a. Product: PREC >= WIDTH(ML) + WIDTH(OP0)
+  b. Shift 1: PREC > SIZE
+  c. Shift 2: PREC > POST_SHIFT - 1
+
+Any of the above strategies can also be attempted by pre-shifting OP0 if D has
+leading zeros, which renders MH == 0. This is useful to avoid scenarios 2 and 3,
+but is otherwise counterproductive. If MH == 1 and D has leading zeros, all the
+strategies will be re-evaluated with the pre-shift sequence.
+*/
+
+static rtx expand_udiv_using_mult(rtx op0, rtx target, scalar_int_mode int_mode,
+                                  unsigned HOST_WIDE_INT d, int max_cost) {
+  struct strat
+  {
+    rtx result;
+    rtx_insn *seq;
+  };
+
+  int size = GET_MODE_BITSIZE(int_mode);
+  auto_vec<strat> strat_vec;
+
+  unsigned HOST_WIDE_INT ml;
+  int post_shift;
+  unsigned HOST_WIDE_INT mh =
+      choose_multiplier(d, size, size, &ml, &post_shift);
+
+  int ml_width = (ml == 0) ? 0 : (HOST_BITS_PER_WIDE_INT - clz_hwi(ml));
+  int m_width = mh ? size + 1 : ml_width;
+
+  bool can_preshift = (d & 1) == 0;
+
+  for (opt_scalar_int_mode mode_iter = GET_MODE_WIDER_MODE(int_mode);
+       mode_iter.exists();
+       mode_iter = GET_MODE_WIDER_MODE(mode_iter.require())) {
+    scalar_int_mode compute_mode = mode_iter.require();
+    int prec = GET_MODE_PRECISION(compute_mode);
+    if (prec > HOST_BITS_PER_WIDE_INT) {
+      break;
+    }
+
+    /* Attempt the direct multiply-shift sequence. */
+    if (prec >= (m_width + size) && prec > (size + post_shift)) {
+      wide_int ml_wi = wi::zext(wi::uhwi(ml, prec), size);
+      wide_int mh_wi = wi::lshift(wi::uhwi(mh, prec), size);
+      wide_int m_wi = wi::bit_or(mh_wi, ml_wi);
+      rtx t1 = NULL_RTX, result = NULL_RTX;
+
+      start_sequence();
+      rtx op0_c = convert_modes(compute_mode, int_mode, op0, 1);
+      if (op0_c) {
+        t1 = expand_binop(compute_mode, smul_optab, op0_c,
+                          immed_wide_int_const(m_wi, compute_mode), NULL_RTX, 1,
+                          OPTAB_DIRECT);
+      }
+      if (t1) {
+        result = expand_shift(RSHIFT_EXPR, compute_mode, t1, size + post_shift,
+                              NULL_RTX, 1);
+      }
+      if (result) {
+        convert_move(target, result, 1);
+      }
+      rtx_insn *seq = end_sequence();
+      if (result && seq) {
+        strat_vec.safe_push({target, seq});
+      }
+    }
+
+    /* Attempt the distributed multiply-shift sequence if MH == 1. */
+    if (mh && !can_preshift && prec >= (ml_width + size) && prec > size &&
+        prec > post_shift) {
+      rtx t1 = NULL_RTX, t2 = NULL_RTX, t3 = NULL_RTX, result = NULL_RTX;
+
+      start_sequence();
+      rtx op0_c = convert_modes(compute_mode, int_mode, op0, 1);
+      if (op0_c) {
+        t1 = expand_binop(compute_mode, smul_optab, op0_c,
+                          gen_int_mode(ml, compute_mode), NULL_RTX, 1,
+                          OPTAB_DIRECT);
+      }
+      if (t1) {
+        t2 = expand_shift(RSHIFT_EXPR, compute_mode, t1, size, NULL_RTX, 1);
+      }
+      if (t2) {
+        t3 = expand_binop(compute_mode, add_optab, op0_c, t2, NULL_RTX, 1,
+                          OPTAB_DIRECT);
+      }
+      if (t3) {
+        result = expand_shift(RSHIFT_EXPR, compute_mode, t3, post_shift,
+                              NULL_RTX, 1);
+      }
+      if (result) {
+        convert_move(target, result, 1);
+      }
+      rtx_insn *seq = end_sequence();
+      if (result && seq) {
+        strat_vec.safe_push({target, seq});
+      }
+    }
+  }
+
+  /* We can do better for even divisors using an initial right shift. */
+  int pre_shift;
+  if (mh && can_preshift) {
+    pre_shift = ctz_or_zero(d);
+    mh = choose_multiplier(d >> pre_shift, size, size - pre_shift, &ml,
+                           &post_shift);
+    gcc_assert(!(mh && pre_shift));
+  } else {
+    pre_shift = 0;
+  }
+
+  /* Attempt the distributed multiply-highpart-shift sequence. */
+  if (size > post_shift) {
+    rtx t1 = NULL_RTX, result = NULL_RTX;
+    start_sequence();
+    rtx op0_pre = pre_shift ? expand_shift(RSHIFT_EXPR, int_mode, op0,
+                                           pre_shift, NULL_RTX, 1)
+                            : op0;
+    if (op0_pre) {
+      t1 = expmed_mult_highpart(int_mode, op0_pre, gen_int_mode(ml, int_mode),
+                                NULL_RTX, 1, max_cost);
+    }
+    if (mh) {
+      gcc_assert(post_shift >= 1);
+      rtx t2 = NULL_RTX, t3 = NULL_RTX, t4 = NULL_RTX;
+      if (t1) {
+        t2 = expand_binop(int_mode, sub_optab, op0_pre, t1, NULL_RTX, 1,
+                          OPTAB_DIRECT);
+      }
+      if (t2) {
+        t3 = expand_shift(RSHIFT_EXPR, int_mode, t2, 1, NULL_RTX, 1);
+      }
+      if (t3) {
+        t4 = expand_binop(int_mode, add_optab, t1, t3, NULL_RTX, 1,
+                          OPTAB_DIRECT);
+      }
+      if (t4) {
+        result =
+            expand_shift(RSHIFT_EXPR, int_mode, t4, post_shift - 1, target, 1);
+      }
+    } else {
+      if (t1) {
+        result = expand_shift(RSHIFT_EXPR, int_mode, t1, post_shift, target, 1);
+      }
+    }
+    rtx_insn *seq = end_sequence();
+    if (result && seq) {
+      strat_vec.safe_push({result, seq});
+    }
+  }
+
+  rtx result = NULL_RTX;
+  rtx_insn *seq = NULL;
+
+  unsigned cost = max_cost;
+  bool speed = optimize_insn_for_speed_p ();
+
+  for (unsigned i = 0; i < strat_vec.length(); i++) {
+    rtx this_result = strat_vec[i].result;
+    rtx_insn *this_seq = strat_vec[i].seq;
+    unsigned this_cost = seq_cost(this_seq, speed);
+    if (this_cost < cost) {
+      result = this_result;
+      seq = this_seq;
+      cost = this_cost;
+    }
+  }
+
+  if (seq) {
+    emit_insn(seq);
+    return result;
+  }
+
+  return NULL_RTX;
+}
+
 /* Emit the code to divide OP0 by OP1, putting the result in TARGET
    if that is convenient, and returning where the result is.
    You may request either the quotient or the remainder as the result;
@@ -4469,14 +4676,12 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 	    int size = GET_MODE_BITSIZE (int_mode);
 	    if (unsignedp)
 	      {
-		unsigned HOST_WIDE_INT mh, ml;
-		int pre_shift, post_shift;
 		wide_int wd = rtx_mode_t (op1, int_mode);
 		unsigned HOST_WIDE_INT d = wd.to_uhwi ();
 
 		if (wi::popcount (wd) == 1)
 		  {
-		    pre_shift = floor_log2 (d);
+		    int pre_shift = floor_log2 (d);
 		    if (rem_flag)
 		      {
 			unsigned HOST_WIDE_INT mask
@@ -4502,77 +4707,12 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 		      }
 		    else
 		      {
-			/* Find a suitable multiplier and right shift count
-			   instead of directly dividing by D.  */
-			mh = choose_multiplier (d, size, size,
-						&ml, &post_shift);
-
-			/* If the suggested multiplier is more than SIZE bits,
-			   we can do better for even divisors, using an
-			   initial right shift.  */
-			if (mh != 0 && (d & 1) == 0)
-			  {
-			    pre_shift = ctz_or_zero (d);
-			    mh = choose_multiplier (d >> pre_shift, size,
-						    size - pre_shift,
-						    &ml, &post_shift);
-			    gcc_assert (!mh);
-			  }
-			else
-			  pre_shift = 0;
+			quotient
+			  = expand_udiv_using_mult (op0, tquotient, int_mode, d,
+						    max_cost);
 
-			if (mh != 0)
-			  {
-			    rtx t1, t2, t3, t4;
-
-			    if (post_shift - 1 >= BITS_PER_WORD)
-			      goto fail1;
-
-			    extra_cost
-			      = (shift_cost (speed, int_mode, post_shift - 1)
-				 + shift_cost (speed, int_mode, 1)
-				 + 2 * add_cost (speed, int_mode));
-			    t1 = expmed_mult_highpart
-			      (int_mode, op0, gen_int_mode (ml, int_mode),
-			       NULL_RTX, 1, max_cost - extra_cost);
-			    if (t1 == 0)
-			      goto fail1;
-			    t2 = force_operand (gen_rtx_MINUS (int_mode,
-							       op0, t1),
-						NULL_RTX);
-			    t3 = expand_shift (RSHIFT_EXPR, int_mode,
-					       t2, 1, NULL_RTX, 1);
-			    t4 = force_operand (gen_rtx_PLUS (int_mode,
-							      t1, t3),
-						NULL_RTX);
-			    quotient = expand_shift
-			      (RSHIFT_EXPR, int_mode, t4,
-			       post_shift - 1, tquotient, 1);
-			  }
-			else
-			  {
-			    rtx t1, t2;
-
-			    if (pre_shift >= BITS_PER_WORD
-				|| post_shift >= BITS_PER_WORD)
-			      goto fail1;
-
-			    t1 = expand_shift
-			      (RSHIFT_EXPR, int_mode, op0,
-			       pre_shift, NULL_RTX, 1);
-			    extra_cost
-			      = (shift_cost (speed, int_mode, pre_shift)
-				 + shift_cost (speed, int_mode, post_shift));
-			    t2 = expmed_mult_highpart
-			      (int_mode, t1,
-			       gen_int_mode (ml, int_mode),
-			       NULL_RTX, 1, max_cost - extra_cost);
-			    if (t2 == 0)
-			      goto fail1;
-			    quotient = expand_shift
-			      (RSHIFT_EXPR, int_mode, t2,
-			       post_shift, tquotient, 1);
-			  }
+			if (!quotient)
+			  goto fail1;
 		      }
 		  }
 		else		/* Too wide mode to use tricky code */

Reply via email to