On Wed, Jul 31, 2019 at 11:30 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> On Wed, Jul 31, 2019 at 10:51:22AM +0200, Uros Bizjak wrote:
> > OK.
>
> Thanks.  This follow-up implements the same for MMX with SSE for V8QImode;
> the testcase shows that it is useful too.  The difference is quite large:
>
> -       movq    $0, -72(%rsp)
> -       movl    $bytes, %eax
>         movq    bytes(%rip), %xmm0
> +       movl    $bytes, %eax
> +       pxor    %xmm2, %xmm2
>         .p2align 4,,10
>         .p2align 3
>  .L2:
>         movdqa  %xmm0, %xmm1
>         movq    8(%rax), %xmm0
> -       movq    -72(%rsp), %xmm2
>         addq    $8, %rax
>         paddb   %xmm0, %xmm1
>         paddb   %xmm0, %xmm2
>         movq    %xmm1, -8(%rax)
> -       movq    %xmm2, -72(%rsp)
>         cmpq    $bytes+1016, %rax
>         jne     .L2
> -       movq    -72(%rsp), %rcx
> -       movzbl  -72(%rsp), %eax
> -       movzbl  %ch, %edx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $16, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $24, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $32, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $40, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $48, %rdx
> -       addl    %eax, %edx
> -       movq    %rcx, %rax
> -       shrq    $56, %rax
> -       addl    %edx, %eax
> +       pxor    %xmm0, %xmm0
> +       movdqa  %xmm2, %xmm3
> +       psadbw  %xmm0, %xmm3
> +       movq    %xmm3, %rax

Excellent!

IIRC, there are quite a few (integer) named patterns that can be
implemented using TARGET_MMX_WITH_SSE. I'm not at my keyboard right
now, but it looks like horizontal adds can use the same approach. I'm
glad that TARGET_MMX_WITH_SSE opens up such noticeable optimization
opportunities.
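
For reference, the psadbw trick in the diff above boils down to something
like the following SSE2 intrinsics (a minimal sketch, not part of the
patch; the hsum_v8qi name is made up for illustration):

#include <emmintrin.h>

/* Horizontal sum of the 8 bytes in the low half of an SSE register:
   psadbw against zero leaves the 16-bit byte sum in the low word of
   the low 64-bit lane.  */
static unsigned char
hsum_v8qi (__m128i v)
{
  __m128i zero = _mm_setzero_si128 ();
  __m128i sad = _mm_sad_epu8 (v, zero);  /* psadbw */
  /* The result is truncated to unsigned char, so only the low byte
     of the sum matters.  */
  return (unsigned char) _mm_cvtsi128_si32 (sad);
}

which is essentially what the new reduc_plus_scal_v8qi expander below
emits: zero a register, psadbw, then extract the low QImode element.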

> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2019-07-31  Jakub Jelinek  <ja...@redhat.com>
>
>         PR tree-optimization/91201
>         * config/i386/mmx.md (reduc_plus_scal_v8qi): New expander.
>
>         * gcc.target/i386/sse2-pr91201-2.c: New test.

OK.

Thanks,
Uros.

> --- gcc/config/i386/mmx.md.jj   2019-07-20 08:35:05.720255567 +0200
> +++ gcc/config/i386/mmx.md      2019-07-31 08:43:23.054776025 +0200
> @@ -1897,6 +1897,21 @@ (define_insn "mmx_psadbw"
>     (set_attr "type" "mmxshft,sseiadd,sseiadd")
>     (set_attr "mode" "DI,TI,TI")])
>
> +(define_expand "reduc_plus_scal_v8qi"
> + [(plus:V8QI
> +    (match_operand:QI 0 "register_operand")
> +    (match_operand:V8QI 1 "register_operand"))]
> + "TARGET_MMX_WITH_SSE"
> +{
> +  rtx tmp = gen_reg_rtx (V8QImode);
> +  emit_move_insn (tmp, CONST0_RTX (V8QImode));
> +  rtx tmp2 = gen_reg_rtx (V1DImode);
> +  emit_insn (gen_mmx_psadbw (tmp2, operands[1], tmp));
> +  tmp2 = gen_lowpart (V8QImode, tmp2);
> +  emit_insn (gen_vec_extractv8qiqi (operands[0], tmp2, const0_rtx));
> +  DONE;
> +})
> +
>  (define_insn_and_split "mmx_pmovmskb"
>    [(set (match_operand:SI 0 "register_operand" "=r,r")
>         (unspec:SI [(match_operand:V8QI 1 "register_operand" "y,x")]
> --- gcc/testsuite/gcc.target/i386/sse2-pr91201-2.c.jj   2019-07-31 08:45:19.553086849 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-pr91201-2.c      2019-07-31 08:46:52.556738334 +0200
> @@ -0,0 +1,21 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile { target lp64 } } */
> +/* { dg-options "-O3 -msse2 -mno-sse3" } */
> +/* { dg-final { scan-assembler "\tpsadbw\t" } } */
> +
> +unsigned char bytes[1024];
> +
> +unsigned char
> +sum (void)
> +{
> +  unsigned char r = 0;
> +  unsigned char *p = (unsigned char *) bytes;
> +  int n;
> +
> +  for (n = 8; n < sizeof (bytes); ++n)
> +    {
> +      p[n - 8] += p[n];
> +      r += p[n];
> +    }
> +  return r;
> +}
>
>
>         Jakub
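
To see the effect locally (assuming a compiler built with this patch),
the testcase's dg-options can be reproduced by hand on x86_64 (the test
is restricted to lp64 targets):

  gcc -O3 -msse2 -mno-sse3 -S sse2-pr91201-2.c
  grep psadbw sse2-pr91201-2.s

With the new expander the reduction epilogue should use the psadbw
sequence shown at the top of this mail instead of the long scalar
shift-and-add chain.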
