1) Optimize (a >> 1) + (b >> 1) + ((a | b) & 1) to .AVG_CEIL (a, b).
2) Optimize (a | b) - ((a ^ b) >> 1) to .AVG_CEIL (a, b).
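Both forms compute the rounding average ceil ((a + b) / 2) without
needing a wider type: writing a + b = (a ^ b) + 2 * (a & b), we get
ceil ((a + b) / 2) = (a & b) + (a ^ b) - ((a ^ b) >> 1)
                   = (a | b) - ((a ^ b) >> 1)
since (a & b) + (a ^ b) = (a | b); the first form splits a and b into
their high bits and low bit instead.  As a standalone exhaustive check
of both identities over all uint8_t pairs (a reviewer-side sketch, not
part of the patch):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  for (unsigned a = 0; a < 256; a++)
    for (unsigned b = 0; b < 256; b++)
      {
        uint8_t x = a, y = b;
        /* Reference rounding average, computed in unsigned int so
           x + y + 1 cannot overflow.  */
        uint8_t ref = ((unsigned) x + y + 1) >> 1;
        uint8_t v1 = (x >> 1) + (y >> 1) + ((x | y) & 1);
        uint8_t v2 = (x | y) - ((x ^ y) >> 1);
        if (v1 != ref || v2 != ref)
          {
            printf ("mismatch at a=%u b=%u\n", a, b);
            return 1;
          }
      }
  puts ("all 65536 pairs OK");
  return 0;
}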
Proof is at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118994#c6.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR middle-end/118994
	* match.pd ((a >> 1) + (b >> 1) + ((a | b) & 1) to .AVG_CEIL
	(a, b)): New pattern.
	((a | b) - ((a ^ b) >> 1) to .AVG_CEIL (a, b)): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr118994-1.c: New test.
	* gcc.target/i386/pr118994-2.c: New test.
---
 gcc/match.pd                               | 23 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr118994-1.c | 37 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr118994-2.c | 37 ++++++++++++++++++++++
 3 files changed, 97 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr118994-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr118994-2.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 96136404f5e..d391ac86edc 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -11455,3 +11455,26 @@ and,
      }
      (if (full_perm_p)
       (vec_perm (op@3 @0 @1) @3 @2))))))
+
+#if GIMPLE
+/* Simplify (a >> 1) + (b >> 1) + ((a | b) & 1) to .AVG_CEIL (a, b).
+   Similarly for (a | b) - ((a ^ b) >> 1).  */
+(simplify
+ (plus:c
+  (plus (rshift @0 integer_onep@1) (rshift @2 @1))
+  (bit_and (bit_ior @0 @2) integer_onep@3))
+ (if (cfun && (cfun->curr_properties & PROP_last_full_fold) != 0
+      && VECTOR_TYPE_P (type)
+      && direct_internal_fn_supported_p (IFN_AVG_CEIL, type, OPTIMIZE_FOR_BOTH))
+  (IFN_AVG_CEIL @0 @2)))
+
+(simplify
+ (minus
+  (bit_ior @0 @2)
+  (rshift (bit_xor @0 @2) integer_onep@1))
+ (if (cfun && (cfun->curr_properties & PROP_last_full_fold) != 0
+      && VECTOR_TYPE_P (type)
+      && direct_internal_fn_supported_p (IFN_AVG_CEIL, type, OPTIMIZE_FOR_BOTH))
+  (IFN_AVG_CEIL @0 @2)))
+#endif
diff --git a/gcc/testsuite/gcc.target/i386/pr118994-1.c b/gcc/testsuite/gcc.target/i386/pr118994-1.c
new file mode 100644
index 00000000000..5f40ababccc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr118994-1.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times "\.AVG_CEIL" 6 "optimized" } } */
+
+#define VecRoundingAvg(a, b) ((a >> 1) + (b >> 1) + ((a | b) & 1))
+
+typedef unsigned char GccU8x16Vec __attribute__((__vector_size__(16)));
+typedef unsigned short GccU16x8Vec __attribute__((__vector_size__(16)));
+typedef unsigned char GccU8x32Vec __attribute__((__vector_size__(32)));
+typedef unsigned short GccU16x16Vec __attribute__((__vector_size__(32)));
+typedef unsigned char GccU8x64Vec __attribute__((__vector_size__(64)));
+typedef unsigned short GccU16x32Vec __attribute__((__vector_size__(64)));
+
+GccU8x16Vec U8x16VecRoundingAvg(GccU8x16Vec a, GccU8x16Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x8Vec U16x8VecRoundingAvg(GccU16x8Vec a, GccU16x8Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU8x32Vec U8x32VecRoundingAvg(GccU8x32Vec a, GccU8x32Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x16Vec U16x16VecRoundingAvg(GccU16x16Vec a, GccU16x16Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU8x64Vec U8x64VecRoundingAvg(GccU8x64Vec a, GccU8x64Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x32Vec U16x32VecRoundingAvg(GccU16x32Vec a, GccU16x32Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr118994-2.c b/gcc/testsuite/gcc.target/i386/pr118994-2.c
new file mode 100644
index 00000000000..ba90e0a2992
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr118994-2.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times "\.AVG_CEIL" 6 "optimized" } } */
+
+#define VecRoundingAvg(a, b) ((a | b) - ((a ^ b) >> 1))
+
+typedef unsigned char GccU8x16Vec __attribute__((__vector_size__(16)));
+typedef unsigned short GccU16x8Vec __attribute__((__vector_size__(16)));
+typedef unsigned char GccU8x32Vec __attribute__((__vector_size__(32)));
+typedef unsigned short GccU16x16Vec __attribute__((__vector_size__(32)));
+typedef unsigned char GccU8x64Vec __attribute__((__vector_size__(64)));
+typedef unsigned short GccU16x32Vec __attribute__((__vector_size__(64)));
+
+GccU8x16Vec U8x16VecRoundingAvg(GccU8x16Vec a, GccU8x16Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x8Vec U16x8VecRoundingAvg(GccU16x8Vec a, GccU16x8Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU8x32Vec U8x32VecRoundingAvg(GccU8x32Vec a, GccU8x32Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x16Vec U16x16VecRoundingAvg(GccU16x16Vec a, GccU16x16Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU8x64Vec U8x64VecRoundingAvg(GccU8x64Vec a, GccU8x64Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
+GccU16x32Vec U16x32VecRoundingAvg(GccU16x32Vec a, GccU16x32Vec b) {
+  return VecRoundingAvg(a, b);
+}
+
-- 
2.34.1
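For reference, on the targets above .AVG_CEIL on vectors of unsigned
char/short is expected to expand to vpavgb/vpavgw, so besides the tree
dump the result can be eyeballed with a patched gcc (illustrative
command; assumes the test file is in the current directory):

  gcc -O2 -mavx512bw -mavx512vl -S -o - pr118994-1.c | grep -c 'vpavg[bw]'

Each of the six functions should contribute one vpavgb or vpavgw, so
the count should be 6, matching the scan-tree-dump-times count.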