Similar for (v + B) * C + D -> C * v + BCD.
Don't simplify it when there's overflow and overflow is UB for type v.
There's new failure
gcc.dg/vect/slp-11a.c scan-tree-dump-times vect "vectorizing stmts using SLP" 0
It's because the patch simplify different operations to mult + add and enables
SLP. So I adjust the testcase to prevent simplication by making the
multiplication result overflow.
Also with -fwrapv, benchmark in the PR now 100% faster than
origin(scalar version).
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?
gcc/ChangeLog:
PR tree-optimization/53533
* match.pd: Simplify (B * v + C) * D -> BD * v + CD and
(v + B) * C + D -> C * v + BCD when B,C,D are all INTEGER_CST,
and there's no overflow or TYPE_OVERFLOW_WRAP.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr53533-1.c: New test.
* gcc.target/i386/pr53533-2.c: New test.
* gcc.target/i386/pr53533-3.c: New test.
* gcc.target/i386/pr53533-4.c: New test.
* gcc.dg/vect/slp-11a.c: Adjust testcase.
---
gcc/match.pd | 36 ++++++++++++++++++
gcc/testsuite/gcc.dg/vect/slp-11a.c | 10 ++---
gcc/testsuite/gcc.target/i386/pr53533-1.c | 23 ++++++++++++
gcc/testsuite/gcc.target/i386/pr53533-2.c | 46 +++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr53533-3.c | 24 ++++++++++++
gcc/testsuite/gcc.target/i386/pr53533-4.c | 46 +++++++++++++++++++++++
6 files changed, 180 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr53533-4.c
diff --git a/gcc/match.pd b/gcc/match.pd
index 88c6c414881..b753f7bda3c 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -489,6 +489,42 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(if (!overflow || TYPE_OVERFLOW_WRAPS (type))
(mult @0 { wide_int_to_tree (type, mul); }))))
+/* Similar to above, but there could be an extra add/sub between
+ successive multuiplications. */
+(simplify
+ (mult:c (plus:c@4 (mult:c@5 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
+ (if (single_use (@4)
+ && single_use (@5))
+ (with {
+ wi::overflow_type overflow;
+ wi::overflow_type overflow2;
+ wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@3),
+ TYPE_SIGN (type), &overflow);
+ wide_int add = wi::mul (wi::to_wide (@2), wi::to_wide (@3),
+ TYPE_SIGN (type), &overflow2);
+ }
+ /* Skip folding on overflow. */
+ (if (!(overflow || overflow2) || TYPE_OVERFLOW_WRAPS (type))
+ (plus (mult @0 { wide_int_to_tree (type, mul); })
+ { wide_int_to_tree (type, add); })))))
+
+/* Similar to above, but a multiplication between successive additions. */
+(simplify
+ (plus:c (mult:c@4 (plus:c@5 @0 INTEGER_CST@1) INTEGER_CST@2) INTEGER_CST@3)
+ (if (single_use (@4)
+ && single_use (@5))
+ (with {
+ wi::overflow_type overflow;
+ wi::overflow_type overflow2;
+ wide_int mul = wi::mul (wi::to_wide (@1), wi::to_wide (@2),
+ TYPE_SIGN (type), &overflow);
+ wide_int add = wi::add (mul, wi::to_wide (@3),
+ TYPE_SIGN (type), &overflow2);
+ }
+ /* Skip folding on overflow. */
+ (if (!(overflow || overflow2) || TYPE_OVERFLOW_WRAPS (type))
+ (plus (mult @0 @2) { wide_int_to_tree (type, add); })))))
+
/* Optimize A / A to 1.0 if we don't care about
NaNs or Infinities. */
(simplify
diff --git a/gcc/testsuite/gcc.dg/vect/slp-11a.c
b/gcc/testsuite/gcc.dg/vect/slp-11a.c
index bcd3c861ca4..e6632fa77be 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-11a.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-11a.c
@@ -9,14 +9,14 @@ int
main1 ()
{
int i;
- unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4,
b5, b6, b7;
- unsigned int in[N*8] =
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
+ int in[N*8] =
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
/* Different operations - not SLPable. */
for (i = 0; i < N; i++)
{
a0 = in[i*8] + 5;
- a1 = in[i*8 + 1] * 6;
+ a1 = in[i*8 + 1] * 51072;
a2 = in[i*8 + 2] + 7;
a3 = in[i*8 + 3] + 8;
a4 = in[i*8 + 4] + 9;
@@ -25,7 +25,7 @@ main1 ()
a7 = in[i*8 + 7] + 12;
b0 = a0 * 3;
- b1 = a1 * 2;
+ b1 = a1 * 51072;
b2 = a2 * 12;
b3 = a3 * 5;
b4 = a4 * 8;
@@ -47,7 +47,7 @@ main1 ()
for (i = 0; i < N; i++)
{
if (out[i*8] != (in[i*8] + 5) * 3 - 2
- || out[i*8 + 1] != (in[i*8 + 1] * 6) * 2 - 3
+ || out[i*8 + 1] != (in[i*8 + 1] * 51072) * 51072 - 3
|| out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
|| out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
|| out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
diff --git a/gcc/testsuite/gcc.target/i386/pr53533-1.c
b/gcc/testsuite/gcc.target/i386/pr53533-1.c
new file mode 100644
index 00000000000..095de665366
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53533-1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+/* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
+/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { !
ia32 } } } } */
+
+void
+__attribute__((noipa))
+foo (unsigned a[256], unsigned b[256])
+{
+ int i;
+ for (i = 0; i < 256; ++i)
+ {
+ unsigned tmp = a[i] + 12345U;
+ tmp *= 914237U;
+ tmp += 12332U;
+ tmp *= 914237U;
+ tmp += 12332U;
+ tmp *= 914237U;
+ tmp -= 13U;
+ tmp *= 8000U;
+ b[i] = tmp;
+ }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr53533-2.c
b/gcc/testsuite/gcc.target/i386/pr53533-2.c
new file mode 100644
index 00000000000..c31b6ff4dec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53533-2.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+#include "pr53533-1.c"
+
+void
+__attribute__((optimize("-O0")))
+foo1 (unsigned a[256], unsigned b[256])
+{
+ int i;
+ for (i = 0; i < 256; ++i)
+ {
+ unsigned tmp = a[i] + 12345U;
+ tmp *= 914237U;
+ tmp += 12332U;
+ tmp *= 914237U;
+ tmp += 12332U;
+ tmp *= 914237U;
+ tmp -= 13U;
+ tmp *= 8000U;
+ b[i] = tmp;
+ }
+}
+
+int main()
+{
+ unsigned int a[256];
+ unsigned int b[256];
+ unsigned int c[256];
+ for (unsigned int i = 0; i != 256; i++)
+ {
+ b[i] = 0;
+ c[i] = 1;
+ a[i] = i * i - 10 * i + 33;
+ }
+ foo (a, b);
+ foo1 (a, c);
+
+ for (unsigned int i = 0; i != 256; i++)
+ {
+ if (b[i] != c[i])
+ __builtin_abort ();
+ }
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr53533-3.c
b/gcc/testsuite/gcc.target/i386/pr53533-3.c
new file mode 100644
index 00000000000..3b260d134e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53533-3.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fwrapv" } */
+/* { dg-final { scan-assembler-times "imull\[ \t\]" "1" } } */
+/* { dg-final { scan-assembler-times "(?:addl|subl)\[ \t\]" "1" { target { !
ia32 } } } } */
+
+void
+__attribute__((noipa))
+foo (int a[256], int b[256])
+{
+ int i;
+ for (i = 0; i < 256; ++i)
+ {
+ int tmp = a[i] + 12345;
+ tmp *= 914237;
+ tmp += 12332;
+ tmp *= 914237;
+ tmp += 12332;
+ tmp *= 914237;
+ tmp -= 13;
+ tmp *= 8000;
+ b[i] = tmp;
+ }
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr53533-4.c
b/gcc/testsuite/gcc.target/i386/pr53533-4.c
new file mode 100644
index 00000000000..c29f90a44dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr53533-4.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fwrapv" } */
+
+#include "pr53533-3.c"
+
+void
+__attribute__((optimize("-O0")))
+foo1 (int a[256], int b[256])
+{
+ int i;
+ for (i = 0; i < 256; ++i)
+ {
+ int tmp = a[i] + 12345;
+ tmp *= 914237;
+ tmp += 12332;
+ tmp *= 914237;
+ tmp += 12332;
+ tmp *= 914237;
+ tmp -= 13;
+ tmp *= 8000;
+ b[i] = tmp;
+ }
+}
+
+int main()
+{
+ int a[256];
+ int b[256];
+ int c[256];
+ for (int i = 0; i != 256; i++)
+ {
+ b[i] = 0;
+ c[i] = 1;
+ a[i] = i * i - 10 * i + 33;
+ }
+ foo (a, b);
+ foo1 (a, c);
+
+ for (unsigned int i = 0; i != 256; i++)
+ {
+ if (b[i] != c[i])
+ __builtin_abort ();
+ }
+
+ return 0;
+}
--
2.18.1