1) Optimize (a >> 1) + (b >> 1) + ((a | b) & 1) to .AVG_CEIL (a, b)
2) Optimize (a | b) - ((a ^ b) >> 1) to .AVG_CEIL (a, b)
   (both identities are sanity-checked by the scalar sketch below)
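
A minimal scalar sketch (not part of the patch, just an illustration)
that exhaustively checks both expressions against the rounding-up
average (a + b + 1) >> 1 for all unsigned char inputs:

#include <assert.h>

int
main (void)
{
  for (unsigned a = 0; a < 256; a++)
    for (unsigned b = 0; b < 256; b++)
      {
        unsigned expected = (a + b + 1) >> 1;
        /* Pattern 1: per-element floor halves plus a carry-in when
           either operand is odd.  */
        assert ((a >> 1) + (b >> 1) + ((a | b) & 1) == expected);
        /* Pattern 2: (a | b) == (a + b + (a ^ b)) / 2, so subtracting
           (a ^ b) >> 1 leaves the rounded-up average.  */
        assert ((a | b) - ((a ^ b) >> 1) == expected);
      }
  return 0;
}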

Proof is at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118994#c6

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

        PR middle-end/118994
        * match.pd ((a >> 1) + (b >> 1) + ((a | b) & 1) to
        .AVG_CEIL (a, b)): New pattern.
        ((a | b) - ((a ^ b) >> 1) to .AVG_CEIL (a, b)): Ditto.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/pr118994-1.c: New test.
        * gcc.target/i386/pr118994-2.c: New test.
---
 gcc/match.pd                               | 23 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr118994-1.c | 37 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr118994-2.c | 37 ++++++++++++++++++++++
 3 files changed, 97 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr118994-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr118994-2.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 96136404f5e..d391ac86edc 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -11455,3 +11455,26 @@ and,
       }
       (if (full_perm_p)
        (vec_perm (op@3 @0 @1) @3 @2))))))
+
+#if GIMPLE
+/* Simplify (a >> 1) + (b >> 1) + ((a | b) & 1) to .AVG_CEIL (a, b).
+   Similar for (a | b) - ((a ^ b) >> 1).  */
+
+(simplify
+  (plus:c
+    (plus (rshift @0 integer_onep@1) (rshift @2 @1))
+    (bit_and (bit_ior @0 @2) integer_onep@3))
+  (if (cfun && (cfun->curr_properties & PROP_last_full_fold) != 0
+      && VECTOR_TYPE_P (type)
+      && direct_internal_fn_supported_p (IFN_AVG_CEIL, type, OPTIMIZE_FOR_BOTH))
+      (IFN_AVG_CEIL @0 @2)))
+
+(simplify
+  (minus
+    (bit_ior @0 @2)
+    (rshift (bit_xor @0 @2) integer_onep@1))
+  (if (cfun && (cfun->curr_properties & PROP_last_full_fold) != 0
+      && VECTOR_TYPE_P (type)
+      && direct_internal_fn_supported_p (IFN_AVG_CEIL, type, OPTIMIZE_FOR_BOTH))
+      (IFN_AVG_CEIL @0 @2)))
+#endif
diff --git a/gcc/testsuite/gcc.target/i386/pr118994-1.c b/gcc/testsuite/gcc.target/i386/pr118994-1.c
new file mode 100644
index 00000000000..5f40ababccc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr118994-1.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times "\.AVG_CEIL" 6 "optimized"} } */
+
+#define VecRoundingAvg(a, b) ((a >> 1) + (b >> 1) + ((a | b) & 1))
+
+typedef unsigned char GccU8x16Vec __attribute__((__vector_size__(16)));
+typedef unsigned short GccU16x8Vec __attribute__((__vector_size__(16)));
+typedef unsigned char GccU8x32Vec __attribute__((__vector_size__(32)));
+typedef unsigned short GccU16x16Vec __attribute__((__vector_size__(32)));
+typedef unsigned char GccU8x64Vec __attribute__((__vector_size__(64)));
+typedef unsigned short GccU16x32Vec __attribute__((__vector_size__(64)));
+
+GccU8x16Vec U8x16VecRoundingAvg(GccU8x16Vec a, GccU8x16Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x8Vec U16x8VecRoundingAvg(GccU16x8Vec a, GccU16x8Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU8x32Vec U8x32VecRoundingAvg(GccU8x32Vec a, GccU8x32Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x16Vec U16x16VecRoundingAvg(GccU16x16Vec a, GccU16x16Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU8x64Vec U8x64VecRoundingAvg(GccU8x64Vec a, GccU8x64Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x32Vec U16x32VecRoundingAvg(GccU16x32Vec a, GccU16x32Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr118994-2.c b/gcc/testsuite/gcc.target/i386/pr118994-2.c
new file mode 100644
index 00000000000..ba90e0a2992
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr118994-2.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times "\.AVG_CEIL" 6 "optimized"} } */
+
+#define VecRoundingAvg(a, b) ((a | b) - ((a ^ b) >> 1))
+
+typedef unsigned char GccU8x16Vec __attribute__((__vector_size__(16)));
+typedef unsigned short GccU16x8Vec __attribute__((__vector_size__(16)));
+typedef unsigned char GccU8x32Vec __attribute__((__vector_size__(32)));
+typedef unsigned short GccU16x16Vec __attribute__((__vector_size__(32)));
+typedef unsigned char GccU8x64Vec __attribute__((__vector_size__(64)));
+typedef unsigned short GccU16x32Vec __attribute__((__vector_size__(64)));
+
+GccU8x16Vec U8x16VecRoundingAvg(GccU8x16Vec a, GccU8x16Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x8Vec U16x8VecRoundingAvg(GccU16x8Vec a, GccU16x8Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU8x32Vec U8x32VecRoundingAvg(GccU8x32Vec a, GccU8x32Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x16Vec U16x16VecRoundingAvg(GccU16x16Vec a, GccU16x16Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU8x64Vec U8x64VecRoundingAvg(GccU8x64Vec a, GccU8x64Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x32Vec U16x32VecRoundingAvg(GccU16x32Vec a, GccU16x32Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
-- 
2.34.1
