On Fri, Feb 11, 2022 at 2:38 AM liuhongt <hongtao....@intel.com> wrote: > > >>> Confirmed. When uncond_op is expensive (there's *div amongst them) that's > >>> definitely unwanted. OTOH when it is cheap then combining will reduce > >>> latency. > >>> > >>> GIMPLE-wise it's a neutral transform if uncond_op is not single-use unless > >>> we need two v_c_es. > >> > >> We can leave it to RTL combine/fwprop which will consider rtx_cost for > >> them. > >> > > > >That certainly makes sense for the !single_use case. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and > aarch64-unknown-linux-gnu. > Also bootstrapped and regtested on CLX with GCC configured with --with-arch=native > --with-cpu=native. > > Ok for trunk?
OK. Thanks, Richard. > gcc/ChangeLog: > > PR tree-optimization/104479 > * match.pd (uncond_op + vec_cond -> cond_op): Add single_use > for the dest of uncond_op. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr104479.c: New test. > * gcc.target/i386/cond_op_shift_w-1.c: Adjust testcase. > --- > gcc/match.pd | 12 ++++--- > .../gcc.target/i386/cond_op_shift_w-1.c | 3 +- > gcc/testsuite/gcc.target/i386/pr104479.c | 33 +++++++++++++++++++ > 3 files changed, 42 insertions(+), 6 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr104479.c > > diff --git a/gcc/match.pd b/gcc/match.pd > index 7bbb80172fc..c195c8cc882 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -7385,13 +7385,15 @@ and, > (vec_cond @0 (view_convert? (uncond_op@4 @1 @2)) @3) > (with { tree op_type = TREE_TYPE (@4); } > (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), > op_type) > - && is_truth_type_for (op_type, TREE_TYPE (@0))) > + && is_truth_type_for (op_type, TREE_TYPE (@0)) > + && single_use (@4)) > (view_convert (cond_op @0 @1 @2 (view_convert:op_type @3)))))) > (simplify > (vec_cond @0 @1 (view_convert? (uncond_op@4 @2 @3))) > (with { tree op_type = TREE_TYPE (@4); } > (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), > op_type) > - && is_truth_type_for (op_type, TREE_TYPE (@0))) > + && is_truth_type_for (op_type, TREE_TYPE (@0)) > + && single_use (@4)) > (view_convert (cond_op (bit_not @0) @2 @3 (view_convert:op_type @1))))))) > > /* Same for ternary operations. */ > @@ -7401,13 +7403,15 @@ and, > (vec_cond @0 (view_convert? (uncond_op@5 @1 @2 @3)) @4) > (with { tree op_type = TREE_TYPE (@5); } > (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), > op_type) > - && is_truth_type_for (op_type, TREE_TYPE (@0))) > + && is_truth_type_for (op_type, TREE_TYPE (@0)) > + && single_use (@5)) > (view_convert (cond_op @0 @1 @2 @3 (view_convert:op_type @4)))))) > (simplify > (vec_cond @0 @1 (view_convert? 
(uncond_op@5 @2 @3 @4))) > (with { tree op_type = TREE_TYPE (@5); } > (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), > op_type) > - && is_truth_type_for (op_type, TREE_TYPE (@0))) > + && is_truth_type_for (op_type, TREE_TYPE (@0)) > + && single_use (@5)) > (view_convert (cond_op (bit_not @0) @2 @3 @4 > (view_convert:op_type @1))))))) > #endif > diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c > b/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c > index 54c854f2f37..23ab8fa166f 100644 > --- a/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c > +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c > @@ -1,7 +1,6 @@ > /* { dg-do compile } */ > /* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized > -DTYPE=int16" } */ > -/* { dg-final { scan-tree-dump-times ".COND_SHR" 1 "optimized" } } */ > -/* { dg-final { scan-tree-dump-times ".COND_SHL" 1 "optimized" } } */ > +/* { dg-final { scan-tree-dump-times "\.COND_" 4 "optimized" } } */ > /* { dg-final { scan-assembler-times "vpsraw" 1 } } */ > /* { dg-final { scan-assembler-times "vpsllw" 1 } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr104479.c > b/gcc/testsuite/gcc.target/i386/pr104479.c > new file mode 100644 > index 00000000000..4ca4c482542 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr104479.c > @@ -0,0 +1,33 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=icelake-server -Ofast -fdump-tree-optimized" } */ > +/* { dg-final { scan-tree-dump-not "\.COND_SHR" "optimized" } } */ > +/* { dg-final { scan-tree-dump-not "\.COND_FMA" "optimized" } } */ > + > +void > +cond_shr (unsigned int* __restrict dst, > + unsigned int* __restrict src, > + unsigned int* __restrict y, > + int i_width) > +{ > + for(int x = 0; x < i_width; x++) > + { > + unsigned int temp = src[x] >> 3; > + dst[x] = temp > 255 ? 
temp : y[x]; > + } > +} > + > + > +void > +cond_fma (float* __restrict dst, > + float* __restrict src1, > + float* __restrict src2, > + float* __restrict src3, > + unsigned int* __restrict y, > + int i_width) > +{ > + for(int x = 0; x < i_width; x++) > + { > + float temp = __builtin_fmaf (src1[x], src2[x], src3[x]); > + dst[x] = temp > 0.0f ? temp : y[x]; > + } > +} > -- > 2.18.1 >