On Wed, Jul 31, 2019 at 11:30 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> On Wed, Jul 31, 2019 at 10:51:22AM +0200, Uros Bizjak wrote:
> > OK.
>
> Thanks. This follow-up implements the same for mmx with sse for V8QImode,
> the testcase shows that it is useful too. The difference is quite large:
>
> -       movq    $0, -72(%rsp)
> -       movl    $bytes, %eax
>         movq    bytes(%rip), %xmm0
> +       movl    $bytes, %eax
> +       pxor    %xmm2, %xmm2
>         .p2align 4,,10
>         .p2align 3
> .L2:
>         movdqa  %xmm0, %xmm1
>         movq    8(%rax), %xmm0
> -       movq    -72(%rsp), %xmm2
>         addq    $8, %rax
>         paddb   %xmm0, %xmm1
>         paddb   %xmm0, %xmm2
>         movq    %xmm1, -8(%rax)
> -       movq    %xmm2, -72(%rsp)
>         cmpq    $bytes+1016, %rax
>         jne     .L2
> -       movq    -72(%rsp), %rcx
> -       movzbl  -72(%rsp), %eax
> -       movzbl  %ch, %edx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $16, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $24, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $32, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $40, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $48, %rdx
> -       addl    %eax, %edx
> -       movq    %rcx, %rax
> -       shrq    $56, %rax
> -       addl    %edx, %eax
> +       pxor    %xmm0, %xmm0
> +       movdqa  %xmm2, %xmm3
> +       psadbw  %xmm0, %xmm3
> +       movq    %xmm3, %rax
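
The removed tail above is the old byte-by-byte scalar reduction; the new
tail is a single psadbw of the accumulator against zero. In intrinsics
terms the new sequence is roughly the following (the helper name is
invented purely for illustration and is not part of the patch):

/* Rough intrinsics equivalent of the psadbw-based horizontal byte add;
   illustration only, not part of the patch.  */
#include <emmintrin.h>

static unsigned char
sum_v8qi (const unsigned char *p)
{
  /* Load 8 bytes into the low half of an XMM register.  */
  __m128i v = _mm_loadl_epi64 ((const __m128i *) p);
  /* psadbw against zero sums the eight bytes into the low 16 bits of
     each 64-bit half; the high half is zero here.  */
  __m128i s = _mm_sad_epu8 (v, _mm_setzero_si128 ());
  /* The byte sum fits in 16 bits; truncate to QImode just as the
     reduc_plus_scal_v8qi result is truncated.  */
  return (unsigned char) _mm_cvtsi128_si32 (s);
}
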
Excellent! IIRC, there are quite a few (integer) named patterns that can be
implemented using TARGET_MMX_WITH_SSE. I'm not at my keyboard right now, but
it looks like horizontal adds can be implemented using the same approach (a
rough V4HImode sketch is appended at the end of this message). I'm glad that
TARGET_MMX_WITH_SSE opens up such noticeable optimization opportunities.

> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2019-07-31  Jakub Jelinek  <ja...@redhat.com>
>
>         PR tree-optimization/91201
>         * config/i386/mmx.md (reduc_plus_scal_v8qi): New expander.
>
>         * gcc.target/i386/sse2-pr91201-2.c: New test.

OK.

Thanks,
Uros.

> --- gcc/config/i386/mmx.md.jj   2019-07-20 08:35:05.720255567 +0200
> +++ gcc/config/i386/mmx.md      2019-07-31 08:43:23.054776025 +0200
> @@ -1897,6 +1897,21 @@ (define_insn "mmx_psadbw"
>     (set_attr "type" "mmxshft,sseiadd,sseiadd")
>     (set_attr "mode" "DI,TI,TI")])
>
> +(define_expand "reduc_plus_scal_v8qi"
> +  [(plus:V8QI
> +     (match_operand:QI 0 "register_operand")
> +     (match_operand:V8QI 1 "register_operand"))]
> +  "TARGET_MMX_WITH_SSE"
> +{
> +  rtx tmp = gen_reg_rtx (V8QImode);
> +  emit_move_insn (tmp, CONST0_RTX (V8QImode));
> +  rtx tmp2 = gen_reg_rtx (V1DImode);
> +  emit_insn (gen_mmx_psadbw (tmp2, operands[1], tmp));
> +  tmp2 = gen_lowpart (V8QImode, tmp2);
> +  emit_insn (gen_vec_extractv8qiqi (operands[0], tmp2, const0_rtx));
> +  DONE;
> +})
> +
>  (define_insn_and_split "mmx_pmovmskb"
>    [(set (match_operand:SI 0 "register_operand" "=r,r")
>          (unspec:SI [(match_operand:V8QI 1 "register_operand" "y,x")]
> --- gcc/testsuite/gcc.target/i386/sse2-pr91201-2.c.jj   2019-07-31 08:45:19.553086849 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-pr91201-2.c      2019-07-31 08:46:52.556738334 +0200
> @@ -0,0 +1,21 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile { target lp64 } } */
> +/* { dg-options "-O3 -msse2 -mno-sse3" } */
> +/* { dg-final { scan-assembler "\tpsadbw\t" } } */
> +
> +unsigned char bytes[1024];
> +
> +unsigned char
> +sum (void)
> +{
> +  unsigned char r = 0;
> +  unsigned char *p = (unsigned char *) bytes;
> +  int n;
> +
> +  for (n = 8; n < sizeof (bytes); ++n)
> +    {
> +      p[n - 8] += p[n];
> +      r += p[n];
> +    }
> +  return r;
> +}
>
>
>         Jakub
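
Below is a rough sketch of what such a horizontal add could look like for
V4HImode, expressed as SSE2 intrinsics rather than as an expander; the
helper name and the exact lowering are invented for illustration and are
not part of the patch:

/* Possible SSE lowering of a V4HImode plus reduction, sketched as
   intrinsics; illustration only.  */
#include <emmintrin.h>

static short
sum_v4hi (const short *p)
{
  /* Load 4 words into the low half of an XMM register (high half zero).  */
  __m128i v = _mm_loadl_epi64 ((const __m128i *) p);
  /* pmaddwd against a vector of ones widens and adds adjacent words,
     leaving two 32-bit partial sums in the low two lanes.  */
  __m128i s = _mm_madd_epi16 (v, _mm_set1_epi16 (1));
  /* Fold the two partial sums together and truncate to HImode.  */
  s = _mm_add_epi32 (s, _mm_srli_si128 (s, 4));
  return (short) _mm_cvtsi128_si32 (s);
}

pmaddwd against a vector of ones is the usual SSE2 idiom for widening
adjacent 16-bit adds, so no intermediate sum can overflow before the final
truncation.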