On Wed, Jul 31, 2019 at 9:10 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> As mentioned in the PR, we can use psadbw to shorten the final reductions to
> scalar for 8-bit elements. E.g. for -mavx2 the difference is:
> -       vmovdqa %xmm1, %xmm0
> -       vextracti128 $0x1, %ymm1, %xmm1
> -       vpaddb %xmm1, %xmm0, %xmm0
> -       vpsrldq $8, %xmm0, %xmm1
> -       vpaddb %xmm1, %xmm0, %xmm0
> -       vpsrldq $4, %xmm0, %xmm1
> -       vpaddb %xmm1, %xmm0, %xmm0
> -       vpsrldq $2, %xmm0, %xmm1
> -       vpaddb %xmm1, %xmm0, %xmm0
> -       vpsrldq $1, %xmm0, %xmm1
> -       vpaddb %xmm1, %xmm0, %xmm0
> +       vextracti128 $0x1, %ymm1, %xmm0
> +       vpaddb %xmm1, %xmm0, %xmm1
> +       vpsrldq $8, %xmm1, %xmm0
> +       vpaddb %xmm0, %xmm1, %xmm1
> +       vpxor %xmm0, %xmm0, %xmm0
> +       vpsadbw %xmm0, %xmm1, %xmm0
>         vpextrb $0, %xmm0, %eax
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2019-07-31 Jakub Jelinek <ja...@redhat.com>
>
>         PR tree-optimization/91201
>         * config/i386/sse.md (reduc_plus_scal_v16qi): New expander.
>         (REDUC_PLUS_MODE): Add V32QImode for TARGET_AVX and V64QImode for
>         TARGET_AVX512F.
>         (reduc_plus_scal_<mode>): Improve formatting by introducing
>         a temporary.
>
>         * gcc.target/i386/sse2-pr91201.c: New test.
>         * gcc.target/i386/avx2-pr91201.c: New test.
>         * gcc.target/i386/avx512bw-pr91201.c: New test.

OK.

Thanks,
Uros.

> --- gcc/config/i386/sse.md.jj 2019-07-30 12:19:45.999490854 +0200
> +++ gcc/config/i386/sse.md 2019-07-30 12:19:55.379352735 +0200
> @@ -2728,9 +2728,30 @@ (define_expand "reduc_plus_scal_<mode>"
>    DONE;
>  })
>
> +(define_expand "reduc_plus_scal_v16qi"
> + [(plus:V16QI
> +    (match_operand:QI 0 "register_operand")
> +    (match_operand:V16QI 1 "register_operand"))]
> + "TARGET_SSE2"
> +{
> +  rtx tmp = gen_reg_rtx (V1TImode);
> +  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, operands[1]),
> +                                 GEN_INT (64)));
> +  rtx tmp2 = gen_reg_rtx (V16QImode);
> +  emit_insn (gen_addv16qi3 (tmp2, operands[1], gen_lowpart (V16QImode,
> tmp)));
> +  rtx tmp3 = gen_reg_rtx (V16QImode);
> +  emit_move_insn (tmp3, CONST0_RTX (V16QImode));
> +  rtx tmp4 = gen_reg_rtx (V2DImode);
> +  emit_insn (gen_sse2_psadbw (tmp4, tmp2, tmp3));
> +  tmp4 = gen_lowpart (V16QImode, tmp4);
> +  emit_insn (gen_vec_extractv16qiqi (operands[0], tmp4, const0_rtx));
> +  DONE;
> +})
> +
>  (define_mode_iterator REDUC_PLUS_MODE
>   [(V4DF "TARGET_AVX") (V8SF "TARGET_AVX")
> -  (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")])
> +  (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
> +  (V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")])
>
>  (define_expand "reduc_plus_scal_<mode>"
>   [(plus:REDUC_PLUS_MODE
> @@ -2741,8 +2762,8 @@ (define_expand "reduc_plus_scal_<mode>"
>    rtx tmp = gen_reg_rtx (<ssehalfvecmode>mode);
>    emit_insn (gen_vec_extract_hi_<mode> (tmp, operands[1]));
>    rtx tmp2 = gen_reg_rtx (<ssehalfvecmode>mode);
> -  emit_insn (gen_add<ssehalfvecmodelower>3
> -    (tmp2, tmp, gen_lowpart (<ssehalfvecmode>mode, operands[1])));
> +  rtx tmp3 = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
> +  emit_insn (gen_add<ssehalfvecmodelower>3 (tmp2, tmp, tmp3));
>    emit_insn (gen_reduc_plus_scal_<ssehalfvecmodelower> (operands[0], tmp2));
>    DONE;
>  })
> --- gcc/testsuite/gcc.target/i386/sse2-pr91201.c.jj 2019-07-30
> 12:23:48.930913778 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-pr91201.c 2019-07-30
> 12:23:45.518964018 +0200
> @@ -0,0 +1,18 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -msse2 -mno-sse3" } */
> +/* { dg-final { scan-assembler "\tpsadbw\t" } } */
> +
> +unsigned char bytes[1024];
> +
> +unsigned char
> +sum (void)
> +{
> +  unsigned char r = 0;
> +  unsigned char *p = (unsigned char *) bytes;
> +  int n;
> +
> +  for (n = 0; n < sizeof (bytes); ++n)
> +    r += p[n];
> +  return r;
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-pr91201.c.jj 2019-07-30
> 12:24:05.199674228 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-pr91201.c 2019-07-30
> 12:24:34.544242142 +0200
> @@ -0,0 +1,6 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -mno-avx512f" } */
> +/* { dg-final { scan-assembler "\tvpsadbw\t" } } */
> +
> +#include "sse2-pr91201.c"
> --- gcc/testsuite/gcc.target/i386/avx512bw-pr91201.c.jj 2019-07-30
> 12:24:50.079013395 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512bw-pr91201.c 2019-07-30
> 12:25:10.685709971 +0200
> @@ -0,0 +1,6 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512bw -mprefer-vector-width=512" } */
> +/* { dg-final { scan-assembler "\tvpsadbw\t" } } */
> +
> +#include "sse2-pr91201.c"
>
> Jakub