On Wed, Jul 31, 2019 at 9:10 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> As mentioned in the PR, we can use psadbw to shorten the final reductions to
> scalar for 8-bit elements.  E.g. for -mavx2 the difference is:
> -       vmovdqa %xmm1, %xmm0
> -       vextracti128    $0x1, %ymm1, %xmm1
> -       vpaddb  %xmm1, %xmm0, %xmm0
> -       vpsrldq $8, %xmm0, %xmm1
> -       vpaddb  %xmm1, %xmm0, %xmm0
> -       vpsrldq $4, %xmm0, %xmm1
> -       vpaddb  %xmm1, %xmm0, %xmm0
> -       vpsrldq $2, %xmm0, %xmm1
> -       vpaddb  %xmm1, %xmm0, %xmm0
> -       vpsrldq $1, %xmm0, %xmm1
> -       vpaddb  %xmm1, %xmm0, %xmm0
> +       vextracti128    $0x1, %ymm1, %xmm0
> +       vpaddb  %xmm1, %xmm0, %xmm1
> +       vpsrldq $8, %xmm1, %xmm0
> +       vpaddb  %xmm0, %xmm1, %xmm1
> +       vpxor   %xmm0, %xmm0, %xmm0
> +       vpsadbw %xmm0, %xmm1, %xmm0
>         vpextrb $0, %xmm0, %eax
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2019-07-31  Jakub Jelinek  <ja...@redhat.com>
>
>         PR tree-optimization/91201
>         * config/i386/sse.md (reduc_plus_scal_v16qi): New expander.
>         (REDUC_PLUS_MODE): Add V32QImode for TARGET_AVX and V64QImode for
>         TARGET_AVX512F.
>         (reduc_plus_scal_<mode>): Improve formatting by introducing
>         a temporary.
>
>         * gcc.target/i386/sse2-pr91201.c: New test.
>         * gcc.target/i386/avx2-pr91201.c: New test.
>         * gcc.target/i386/avx512bw-pr91201.c: New test.

OK.

Thanks,
Uros.

> --- gcc/config/i386/sse.md.jj   2019-07-30 12:19:45.999490854 +0200
> +++ gcc/config/i386/sse.md      2019-07-30 12:19:55.379352735 +0200
> @@ -2728,9 +2728,30 @@ (define_expand "reduc_plus_scal_<mode>"
>    DONE;
>  })
>
> +(define_expand "reduc_plus_scal_v16qi"
> + [(plus:V16QI
> +    (match_operand:QI 0 "register_operand")
> +    (match_operand:V16QI 1 "register_operand"))]
> + "TARGET_SSE2"
> +{
> +  rtx tmp = gen_reg_rtx (V1TImode);
> +  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, operands[1]),
> +                                GEN_INT (64)));
> +  rtx tmp2 = gen_reg_rtx (V16QImode);
> +  emit_insn (gen_addv16qi3 (tmp2, operands[1], gen_lowpart (V16QImode, tmp)));
> +  rtx tmp3 = gen_reg_rtx (V16QImode);
> +  emit_move_insn (tmp3, CONST0_RTX (V16QImode));
> +  rtx tmp4 = gen_reg_rtx (V2DImode);
> +  emit_insn (gen_sse2_psadbw (tmp4, tmp2, tmp3));
> +  tmp4 = gen_lowpart (V16QImode, tmp4);
> +  emit_insn (gen_vec_extractv16qiqi (operands[0], tmp4, const0_rtx));
> +  DONE;
> +})
> +
>  (define_mode_iterator REDUC_PLUS_MODE
>   [(V4DF "TARGET_AVX") (V8SF "TARGET_AVX")
> -  (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")])
> +  (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
> +  (V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")])
>
>  (define_expand "reduc_plus_scal_<mode>"
>   [(plus:REDUC_PLUS_MODE
> @@ -2741,8 +2762,8 @@ (define_expand "reduc_plus_scal_<mode>"
>    rtx tmp = gen_reg_rtx (<ssehalfvecmode>mode);
>    emit_insn (gen_vec_extract_hi_<mode> (tmp, operands[1]));
>    rtx tmp2 = gen_reg_rtx (<ssehalfvecmode>mode);
> -  emit_insn (gen_add<ssehalfvecmodelower>3
> -    (tmp2, tmp, gen_lowpart (<ssehalfvecmode>mode, operands[1])));
> +  rtx tmp3 = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
> +  emit_insn (gen_add<ssehalfvecmodelower>3 (tmp2, tmp, tmp3));
>    emit_insn (gen_reduc_plus_scal_<ssehalfvecmodelower> (operands[0], tmp2));
>    DONE;
>  })
> --- gcc/testsuite/gcc.target/i386/sse2-pr91201.c.jj     2019-07-30 12:23:48.930913778 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-pr91201.c        2019-07-30 12:23:45.518964018 +0200
> @@ -0,0 +1,18 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -msse2 -mno-sse3" } */
> +/* { dg-final { scan-assembler "\tpsadbw\t" } } */
> +
> +unsigned char bytes[1024];
> +
> +unsigned char
> +sum (void)
> +{
> +  unsigned char r = 0;
> +  unsigned char *p = (unsigned char *) bytes;
> +  int n;
> +
> +  for (n = 0; n < sizeof (bytes); ++n)
> +    r += p[n];
> +  return r;
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-pr91201.c.jj     2019-07-30 12:24:05.199674228 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-pr91201.c        2019-07-30 12:24:34.544242142 +0200
> @@ -0,0 +1,6 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -mno-avx512f" } */
> +/* { dg-final { scan-assembler "\tvpsadbw\t" } } */
> +
> +#include "sse2-pr91201.c"
> --- gcc/testsuite/gcc.target/i386/avx512bw-pr91201.c.jj 2019-07-30 12:24:50.079013395 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512bw-pr91201.c    2019-07-30 12:25:10.685709971 +0200
> @@ -0,0 +1,6 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512bw -mprefer-vector-width=512" } */
> +/* { dg-final { scan-assembler "\tvpsadbw\t" } } */
> +
> +#include "sse2-pr91201.c"
>
>         Jakub

Reply via email to