> gcc/ChangeLog:
> 
>       * config/i386/i386-expand.cc (emit_reduc_half): Use shuffles to
>       generate reduc half for V4SI and similar modes.
>       * config/i386/i386.h (TARGET_SSE_REDUCTION_PREFER_PSHUF): New macro.
>       * config/i386/x86-tune.def (X86_TUNE_SSE_REDUCTION_PREFER_PSHUF):
>       New tuning.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/i386/reduc-pshuf.c: New test.
> ---
>  gcc/config/i386/i386-expand.cc              | 28 ++++++++++++++++++---
>  gcc/config/i386/i386.h                      |  2 ++
>  gcc/config/i386/x86-tune.def                |  5 ++++
>  gcc/testsuite/gcc.target/i386/reduc-pshuf.c | 14 +++++++++++
>  4 files changed, 46 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/reduc-pshuf.c
> 
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 7fd03c88630..c7aec716a55 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -18724,9 +18724,31 @@ emit_reduc_half (rtx dest, rtx src, int i)
>      case E_V8HFmode:
>      case E_V4SImode:
>      case E_V2DImode:
> -      d = gen_reg_rtx (V1TImode);
> -      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
> -                             GEN_INT (i / 2));
> +      if (TARGET_SSE_REDUCTION_PREFER_PSHUF) {
> +        if (i == 128) {
> +          d = gen_reg_rtx(V4SImode);
> +          tem = gen_sse2_pshufd_1(
> +              d, force_reg(V4SImode, gen_lowpart(V4SImode, src)), GEN_INT(2),
> +              GEN_INT(3), GEN_INT(2), GEN_INT(3));
> +        } else if (i == 64) {
> +          d = gen_reg_rtx(V4SImode);
> +          tem = gen_sse2_pshufd_1(
> +              d, force_reg(V4SImode, gen_lowpart(V4SImode, src)), GEN_INT(1),
> +              GEN_INT(1), GEN_INT(1), GEN_INT(1));
> +        } else if (i == 32) {
> +          d = gen_reg_rtx(V8HImode);
> +          tem = gen_sse2_pshuflw_1(
> +              d, force_reg(V8HImode, gen_lowpart(V8HImode, src)), GEN_INT(1),
> +              GEN_INT(1), GEN_INT(1), GEN_INT(1));
> +        } else {
> +          d = gen_reg_rtx(V1TImode);
> +          tem =
> +              gen_sse2_lshrv1ti3(d, gen_lowpart(V1TImode, src), GEN_INT(i / 
> 2));
> +        }
> +      } else {
> +        d = gen_reg_rtx(V1TImode);
> +        tem = gen_sse2_lshrv1ti3(d, gen_lowpart(V1TImode, src), GEN_INT(i / 
> 2));

Instead of duplicating the gen_sse2_lshrv1ti3 call, it is probably cleaner to
simply break after each gen_sse2_pshufd_1/gen_sse2_pshuflw_1 call and remove
the else branches.

OK with that change.
Honza

Reply via email to