Suppose the following shift:
T A = (T) B << C
which met the following conditions:
* T is a signed type with 2 x word_size.
* B is with signed type with the size <= word_size.
* C + sizeof (B) < word_size.
|< T >|
| high | low |
| B shifted to |
| E | D |
gcc is generating sub-optimal instruction pattern for most targets, like
arm/mips/sparc, during rtl expand.
gcc's expand logic is:
1. assign low with B.
2. assign high with sign bit of B spreaded.
3. logic left shift low by C bit to get the final low.
4. logic left shift high by E bit.
5. logic right shift B by D bit.
6. or the result of 4 and 5 to get final high.
step 2, 4, 5, 6 could actually be combined into one step: arithmetic right
shift B by D bit.
aarch64 example
================
what gcc generate for
__int128_t
load3 (int data)
{
return (__int128_t) data << 50;
}
==>
sxtw x2, w0
asr x1, x2, 63
lsl x0, x2, 50
lsl x1, x1, 50
orr x1, x1, x2, lsr 14
actually could be simplified to
sxtw x1, w0
lsl x0, x1, 50
asr x1, x1, 14
the following three instructions are actually doing "asr x1, x1, 14"
asr x1, x2, 63
lsl x1, x1, 50
orr x1, x1, x2, lsr 14
arm example (.fpu softvfp)
===========
what gcc generated for
long long
shift (int data)
{
return (long long) data << 20;
}
==>
shift:
stmfd sp!, {r4, r5}
mov r5, r0, asr #31
mov r3, r0
mov r0, r0, asl #20
mov r1, r5, asl #20
orr r1, r1, r3, lsr #12
ldmfd sp!, {r4, r5}
bx lr
actually could be simplified to
shift:
mov r1, r0
mov r0, r0, asl #20
mov r1, r1, asr #12
bx lr
this patch tring to combine step 2, 4, 5, 6 into one single arithmetic
right shift in gcc's combine pass.
test done
===
bootstrap OK on x86-64/aarch64
no regresson aarch64-none-elf/arm-none-eabi
please review, thanks.
-- Jiong
gcc/
* simplify-rtx.c (simplify_binary_operation_1): New combine check for
ASHIFTRT.
gcc/testsuite
* gcc.dg/wide_shift_64_1.c: New testcase.
* gcc.dg/wide_shift_128_1.c: Ditto.
* gcc.target/aarch64/ashlti3_1.c: Ditto.
* gcc.target/arm/ashldisi_1.c: Ditto.
commit 2cdd04f4de73db87b1f27d801922ff409ca7b3a8
Author: Jiong Wang <jiong.w...@arm.com>
Date: Tue Jul 29 14:57:01 2014 +0100
commit
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 9f6dbe1..5fb663e 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -2599,6 +2599,126 @@ simplify_binary_operation_1 (enum rtx_code code, enum machine_mode mode,
XEXP (op0, 1));
}
+ /* If we have
+
+ (ior (lshiftrt A imm1) (and (ashiftrt (A imm2)) mask))
+
+ where imm1 = imm2 + 1
+ mask = ((1 << imm1) - 1) << (mode_size (A) - imm1)
+
+ then we may simplify this into
+
+ (ashiftrt A imm1).
+
+ For example:
+
+ (ior:DI (lshiftrt:DI (reg:DI 79 [ data ])
+ (const_int 14 [0xe]))
+ (and:DI (ashiftrt:DI (reg:DI 79 [ data ])
+ (const_int 13 [0xd]))
+ (const_int -1125899906842624 [0xfffc000000000000]))).
+
+ is actually:
+
+ (ashiftrt:DI (reg:DI 79 [ data ]) (const_int 14 [0xe])).
+
+ There is another form:
+
+ (ior (lshiftrt A imm1)
+ (ashift (subreg (sign_extend A) high_offset) imm2)
+
+ where imm1 + imm2 == mode_size (A) could be simplified into
+
+ (ashiftrt A imm1). */
+
+ if (GET_CODE (op1) == LSHIFTRT)
+ {
+ opleft = op1;
+ opright = op0;
+ }
+ else
+ {
+ opright = op1;
+ opleft = op0;
+ }
+
+ if (GET_CODE (opleft) == LSHIFTRT
+ && GET_CODE (opright) == ASHIFT
+ && CONST_INT_P (XEXP (opleft, 1))
+ && CONST_INT_P (XEXP (opright, 1))
+ && (INTVAL (XEXP (opleft, 1)) + INTVAL (XEXP (opright, 1)))
+ == GET_MODE_SIZE (GET_MODE (XEXP (opleft, 0)))
+ && GET_CODE (XEXP (opright, 0)) == SUBREG
+ && GET_CODE (XEXP (XEXP (opright, 0), 0)) == SIGN_EXTEND
+ && (SUBREG_BYTE (XEXP (opright, 0))
+ == subreg_highpart_offset (GET_MODE (XEXP (opright, 0)),
+ GET_MODE (XEXP (XEXP (opright, 0),
+ 0))))
+ && rtx_equal_p (XEXP (opleft, 0), XEXP (XEXP (XEXP (opright, 0), 0),
+ 0))
+ && !side_effects_p (XEXP (opleft, 0))
+ && have_insn_for (ASHIFTRT, mode))
+ return simplify_gen_binary (ASHIFTRT, mode, XEXP (opleft, 0),
+ XEXP (opleft, 1));
+
+ if (GET_CODE (opleft) == LSHIFTRT
+ && GET_CODE (opright) == AND
+ && CONST_INT_P (XEXP (opleft, 1))
+ && CONST_INT_P (XEXP (opright, 1))
+ && GET_CODE (XEXP (opright, 0)) == ASHIFTRT
+ && CONST_INT_P (XEXP (XEXP (opright, 0), 1))
+ && rtx_equal_p (XEXP (opleft, 0), XEXP (XEXP (opright, 0), 0))
+ && !side_effects_p (XEXP (opleft, 0))
+ && have_insn_for (ASHIFTRT, mode))
+ {
+ int lshiftrt_imm = INTVAL (XEXP (opleft, 1));
+ HOST_WIDE_INT mask_nonzero_part, and_mask = INTVAL (XEXP (opright, 1));
+ int ashiftrt_imm, t_zero, l_one;
+
+ if (width < HOST_BITS_PER_WIDE_INT)
+ and_mask &= ((unsigned HOST_WIDE_INT) 1 << width) - 1;
+
+ if (!and_mask)
+ return opleft;
+
+ ashiftrt_imm = INTVAL (XEXP (XEXP (opright, 0), 1));
+#if GCC_VERSION >= 3004
+ t_zero = (width <= HOST_BITS_PER_INT
+ ? __builtin_ctz (and_mask)
+ : (width <= HOST_BITS_PER_LONG
+ ? __builtin_ctzl (and_mask)
+ : __builtin_ctzll (and_mask)));
+#else
+ t_zero = 0;
+ while (((~and_mask) >> t_zero) & 1)
+ ++t_zero;
+#endif
+ mask_nonzero_part = (((unsigned HOST_WIDE_INT) and_mask) >> t_zero);
+#if GCC_VERSION >= 3400
+ l_one = (width <= HOST_BITS_PER_INT
+ ? __builtin_popcount (mask_nonzero_part)
+ : (width <= HOST_BITS_PER_LONG
+ ? __builtin_popcountl (mask_nonzero_part)
+ : __builtin_popcountll (mask_nonzero_part)));
+#else
+ /* calculate the number of leading 1-bits.
+ we use _popcount above, because the second condition check
+ of "if" below will make sure there is no gap between
+ those 1-bits. */
+ l_one = sizeof (and_mask) * CHAR_BIT;
+ while ((and_mask >> (l_one - 1)) & 1)
+ --l_one;
+ l_one = sizeof (and_mask) * CHAR_BIT - l_one;
+#endif
+
+ if ((ashiftrt_imm + 1) == lshiftrt_imm
+ && (mask_nonzero_part
+ == ((HOST_WIDE_INT_C (1) << (ashiftrt_imm + 1)) - 1))
+ && lshiftrt_imm == l_one)
+ return simplify_gen_binary (ASHIFTRT, mode, XEXP (opleft, 0),
+ XEXP (opleft, 1));
+ }
+
tem = simplify_byte_swapping_operation (code, mode, op0, op1);
if (tem)
return tem;
diff --git a/gcc/testsuite/gcc.dg/wide-shift-128.c b/gcc/testsuite/gcc.dg/wide-shift-128.c
new file mode 100644
index 0000000..9b62715
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-128.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target aarch64*-*-* mips64*-*-* sparc64*-*-* } } */
+/* { dg-require-effective-target int128 } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+__int128_t
+load2 (int data)
+{
+ return (__int128_t) data << 50;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
+/* { dg-final { cleanup-rtl-dump "combine" } } */
diff --git a/gcc/testsuite/gcc.dg/wide-shift-64.c b/gcc/testsuite/gcc.dg/wide-shift-64.c
new file mode 100644
index 0000000..5bc278f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/wide-shift-64.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target arm*-*-* mips*-*-* sparc*-*-* } } */
+/* { dg-options "-O2 -fdump-rtl-combine" } */
+
+long long
+load1 (int data)
+{
+ return (long long) data << 12;
+}
+
+/* { dg-final { scan-rtl-dump-not "ior" "combine" } } */
+/* { dg-final { cleanup-rtl-dump "combine" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/ashltidisi.c b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
new file mode 100644
index 0000000..aeb2a24
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ashltidisi.c
@@ -0,0 +1,49 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x, y, z)\
+__uint128_t __attribute__ ((noinline))\
+ushift_##x##_##z (unsigned y data)\
+{\
+ return (__uint128_t) data << x;\
+}\
+__int128_t __attribute__ ((noinline)) \
+shift_##x##_##z (y data) \
+{\
+ return (__int128_t) data << x;\
+}
+
+GEN_TEST_CASE (53, int, i)
+GEN_TEST_CASE (3, long long, ll)
+GEN_TEST_CASE (13, long long, ll)
+GEN_TEST_CASE (53, long long, ll)
+
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y, z, p) \
+ if (ushift_##y##_##p (x)\
+ != ((__uint128_t) (unsigned z) x << y)) \
+ abort ();\
+ if (shift_##y##_##p (x)\
+ != ((__uint128_t) (signed z) x << y)) \
+ abort ();
+
+ SHIFT_CHECK (0x12345678, 53, int, i)
+ SHIFT_CHECK (0xcafecafe, 53, int, i)
+
+ SHIFT_CHECK (0x1234567890abcdefLL, 3, long long, ll)
+ SHIFT_CHECK (0x1234567890abcdefLL, 13, long long, ll)
+ SHIFT_CHECK (0x1234567890abcdefLL, 53, long long, ll)
+ SHIFT_CHECK (0xcafecafedeaddeadLL, 3, long long, ll)
+ SHIFT_CHECK (0xcafecafedeaddeadLL, 13, long long, ll)
+ SHIFT_CHECK (0xcafecafedeaddeadLL, 53, long long, ll)
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 4 } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/arm/ashldisi.c b/gcc/testsuite/gcc.target/arm/ashldisi.c
new file mode 100644
index 0000000..bb5297a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/ashldisi.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+extern void abort (void);
+
+#define GEN_TEST_CASE(x)\
+unsigned long long __attribute__ ((noinline))\
+ushift_ ## x (unsigned int data)\
+{\
+ return (unsigned long long) data << x;\
+}\
+long long __attribute__ ((noinline)) \
+shift_ ## x (int data) \
+{\
+ return (long long) data << x;\
+}
+
+GEN_TEST_CASE (3)
+GEN_TEST_CASE (23)
+GEN_TEST_CASE (30)
+int
+main (int argc, char **argv)
+{
+
+#define SHIFT_CHECK(x, y) \
+ if (ushift_ ## y (x)\
+ != ((unsigned long long) (unsigned) x << y)) \
+ abort (); \
+ if (shift_ ## y (x)\
+ != ((long long) (signed) x << y)) \
+ abort ();
+
+ SHIFT_CHECK (0x12345678, 3)
+ SHIFT_CHECK (0xcafecafe, 3)
+ SHIFT_CHECK (0x12345678, 23)
+ SHIFT_CHECK (0xcafecafe, 23)
+ SHIFT_CHECK (0x12345678, 30)
+ SHIFT_CHECK (0xcafecafe, 30)
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "asr" 3 } } */
+/* { dg-final { cleanup-saved-temps } } */