On Thu, May 15, 2025 at 10:04 AM liuhongt <hongtao....@intel.com> wrote: > > 1) Optimize (a >> 1) + (b >> 1) + ((a | b) & 1) to .AVG_CEIL (a, b) > 2) Optimize (a | b) - ((a ^ b) >> 1) to .AVG_CEIL (a, b) > > Proof is at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118994#c6 > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk?
OK. Thanks, Richard. > gcc/ChangeLog: > > PR middle-end/118994 > * match.pd ((a >> 1) + (b >> 1) + ((a | b) & 1) to > .AVG_CEIL (a, b)): New pattern. > ((a | b) - ((a ^ b) >> 1) to .AVG_CEIL (a, b)): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr118994-1.c: New test. > * gcc.target/i386/pr118994-2.c: New test. > --- > gcc/match.pd | 23 ++++++++++++++ > gcc/testsuite/gcc.target/i386/pr118994-1.c | 37 ++++++++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr118994-2.c | 37 ++++++++++++++++++++++ > 3 files changed, 97 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/i386/pr118994-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr118994-2.c > > diff --git a/gcc/match.pd b/gcc/match.pd > index 96136404f5e..d391ac86edc 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -11455,3 +11455,26 @@ and, > } > (if (full_perm_p) > (vec_perm (op@3 @0 @1) @3 @2)))))) > + > +#if GIMPLE > +/* Simplify (a >> 1) + (b >> 1) + ((a | b) & 1) to .AVG_CEIL (a, b). > + Similar for (a | b) - ((a ^ b) >> 1). 
*/ > + > +(simplify > + (plus:c > + (plus (rshift @0 integer_onep@1) (rshift @2 @1)) > + (bit_and (bit_ior @0 @2) integer_onep@3)) > + (if (cfun && (cfun->curr_properties & PROP_last_full_fold) != 0 > + && VECTOR_TYPE_P (type) > + && direct_internal_fn_supported_p (IFN_AVG_CEIL, type, > OPTIMIZE_FOR_BOTH)) > + (IFN_AVG_CEIL @0 @2))) > + > +(simplify > + (minus > + (bit_ior @0 @2) > + (rshift (bit_xor @0 @2) integer_onep@1)) > + (if (cfun && (cfun->curr_properties & PROP_last_full_fold) != 0 > + && VECTOR_TYPE_P (type) > + && direct_internal_fn_supported_p (IFN_AVG_CEIL, type, > OPTIMIZE_FOR_BOTH)) > + (IFN_AVG_CEIL @0 @2))) > +#endif > diff --git a/gcc/testsuite/gcc.target/i386/pr118994-1.c > b/gcc/testsuite/gcc.target/i386/pr118994-1.c > new file mode 100644 > index 00000000000..5f40ababccc > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr118994-1.c > @@ -0,0 +1,37 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512bw -mavx512vl -O2 -fdump-tree-optimized" } */ > +/* { dg-final { scan-tree-dump-times "\.AVG_CEIL" 6 "optimized"} } */ > + > +#define VecRoundingAvg(a, b) ((a >> 1) + (b >> 1) + ((a | b) & 1)) > + > +typedef unsigned char GccU8x16Vec __attribute__((__vector_size__(16))); > +typedef unsigned short GccU16x8Vec __attribute__((__vector_size__(16))); > +typedef unsigned char GccU8x32Vec __attribute__((__vector_size__(32))); > +typedef unsigned short GccU16x16Vec __attribute__((__vector_size__(32))); > +typedef unsigned char GccU8x64Vec __attribute__((__vector_size__(64))); > +typedef unsigned short GccU16x32Vec __attribute__((__vector_size__(64))); > + > +GccU8x16Vec U8x16VecRoundingAvg(GccU8x16Vec a, GccU8x16Vec b) { > + return VecRoundingAvg(a, b); > +} > + > +GccU16x8Vec U16x8VecRoundingAvg(GccU16x8Vec a, GccU16x8Vec b) { > + return VecRoundingAvg(a, b); > +} > + > +GccU8x32Vec U8x32VecRoundingAvg(GccU8x32Vec a, GccU8x32Vec b) { > + return VecRoundingAvg(a, b); > +} > + > +GccU16x16Vec U16x16VecRoundingAvg(GccU16x16Vec a, GccU16x16Vec b) { > 
+ return VecRoundingAvg(a, b); > +} > + > +GccU8x64Vec U8x64VecRoundingAvg(GccU8x64Vec a, GccU8x64Vec b) { > + return VecRoundingAvg(a, b); > +} > + > +GccU16x32Vec U16x32VecRoundingAvg(GccU16x32Vec a, GccU16x32Vec b) { > + return VecRoundingAvg(a, b); > +} > + > diff --git a/gcc/testsuite/gcc.target/i386/pr118994-2.c > b/gcc/testsuite/gcc.target/i386/pr118994-2.c > new file mode 100644 > index 00000000000..ba90e0a2992 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr118994-2.c > @@ -0,0 +1,37 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx512bw -mavx512vl -O2 -fdump-tree-optimized" } */ > +/* { dg-final { scan-tree-dump-times "\.AVG_CEIL" 6 "optimized"} } */ > + > +#define VecRoundingAvg(a, b) ((a | b) - ((a ^ b) >> 1)) > + > +typedef unsigned char GccU8x16Vec __attribute__((__vector_size__(16))); > +typedef unsigned short GccU16x8Vec __attribute__((__vector_size__(16))); > +typedef unsigned char GccU8x32Vec __attribute__((__vector_size__(32))); > +typedef unsigned short GccU16x16Vec __attribute__((__vector_size__(32))); > +typedef unsigned char GccU8x64Vec __attribute__((__vector_size__(64))); > +typedef unsigned short GccU16x32Vec __attribute__((__vector_size__(64))); > + > +GccU8x16Vec U8x16VecRoundingAvg(GccU8x16Vec a, GccU8x16Vec b) { > + return VecRoundingAvg(a, b); > +} > + > +GccU16x8Vec U16x8VecRoundingAvg(GccU16x8Vec a, GccU16x8Vec b) { > + return VecRoundingAvg(a, b); > +} > + > +GccU8x32Vec U8x32VecRoundingAvg(GccU8x32Vec a, GccU8x32Vec b) { > + return VecRoundingAvg(a, b); > +} > + > +GccU16x16Vec U16x16VecRoundingAvg(GccU16x16Vec a, GccU16x16Vec b) { > + return VecRoundingAvg(a, b); > +} > + > +GccU8x64Vec U8x64VecRoundingAvg(GccU8x64Vec a, GccU8x64Vec b) { > + return VecRoundingAvg(a, b); > +} > + > +GccU16x32Vec U16x32VecRoundingAvg(GccU16x32Vec a, GccU16x32Vec b) { > + return VecRoundingAvg(a, b); > +} > + > -- > 2.34.1 >