On Tue, May 18, 2021 at 12:31 AM Bernd Edlinger <bernd.edlin...@hotmail.de> wrote: > > On 5/17/21 3:15 PM, H.J. Lu via Gcc-patches wrote: > > Changes in the v3 patches: > > > > 1. Split the TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE changes > > into the generic part and the x86 part. > > > > > > 1. Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support > > target instructions to duplicate QImode value to TImode/OImode/XImode > > value for memset. > > 2. x86: Avoid stack realignment when copying data > > 3. x86: Remove MAX_BITSIZE_MODE_ANY_INT. Only x86 backend defines it. > > 4. x86: Use TImode/OImode/XImode integers for piecewise move and store. > > 5. x86: Add tests for TImode/OImode/XImode for piecewise move and store. > > 6. x86: Adjust existing tests. > > > > On x86-64, SPEC CPU 2017 performance impact is neutral. Glibc code size > > differences with -O2 build are: > > > > Before After > > libc.so 1906572 1906444 > > > > Some code sequence differences in libc.so are: > > > > <svcudp_bufcreate@GLIBC_2.2.5>: > > ... 
> > jne <svcudp_bufcreate@GLIBC_2.2.5+0x318> | jne > > <svcudp_bufcreate@GLIBC_2.2.5+0x2a8> > > test %r15,%r15 test > > %r15,%r15 > > je <svcudp_bufcreate@GLIBC_2.2.5+0x318> | je > > <svcudp_bufcreate@GLIBC_2.2.5+0x2a8> > > mov %r13d,(%r14) mov > > %r13d,(%r14) > > lea 0x10(%r14),%rdi lea > > 0x10(%r14),%rdi > > mov $0x1,%ecx mov > > $0x1,%ecx > > mov %r13d,%edx mov > > %r13d,%edx > > mov %r15,0x40(%r12) mov > > %r15,0x40(%r12) > > mov %r15,%rsi mov > > %r15,%rsi > > call <xdrmem_create@GLIBC_2.2.5> call > > <xdrmem_create@GLIBC_2.2.5> > > lea 0xa2f9b(%rip),%rax # <svcudp_op> | lea > > 0xa2fab(%rip),%rax # <svcudp_op> > > xor %esi,%esi xor > > %esi,%esi > > mov %ebp,%edi mov > > %ebp,%edi > > mov %rax,0x8(%r12) mov > > %rax,0x8(%r12) > > movzwl 0x12(%rsp),%eax > > movzwl 0x12(%rsp),%eax > > mov $0x8,%edx < > > lea 0xc(%rsp),%rcx lea > > 0xc(%rsp),%rcx > > mov %r14,0x48(%r12) < > > add $0x40,%r14 < > > mov $0x4,%r8d mov > > $0x4,%r8d > > > movq > > $0x0,0x1d0(%r14) > > > mov > > $0x8,%edx > > rol $0x8,%ax rol > > $0x8,%ax > > mov %ebp,(%r12) | mov > > %r14,0x48(%r12) > > movq $0x0,0x190(%r14) | add > > $0x40,%r14 > > mov %ax,0x4(%r12) < > > mov %r14,0x30(%r12) mov > > %r14,0x30(%r12) > > > mov > > %ax,0x4(%r12) > > > mov > > %ebp,(%r12) > > movl $0x1,0xc(%rsp) movl > > $0x1,0xc(%rsp) > > call <setsockopt> call > > <setsockopt> > > mov %r12,%rdi mov > > %r12,%rdi > > movabs $0x101010101010101,%rdx < > > test %eax,%eax test > > %eax,%eax > > mov $0xff,%eax mov > > $0xff,%eax > > cmove %eax,%ebx cmove > > %eax,%ebx > > movzbl %bl,%eax | movd > > %ebx,%xmm0 > > mov %ebx,0xc(%rsp) mov > > %ebx,0xc(%rsp) > > mov %rax,%rsi | > > punpcklbw %xmm0,%xmm0 > > imul %rdx,%rsi | > > punpcklwd %xmm0,%xmm0 > > mul %rdx | > > pshufd $0x0,%xmm0,%xmm0 > > add %rsi,%rdx | > > movups %xmm0,0x50(%r12) > > mov %rax,0x50(%r12) | > > movups %xmm0,0x60(%r12) > > mov %rdx,0x58(%r12) | > > movups %xmm0,0x70(%r12) > > mov %rax,0x60(%r12) | > > movups %xmm0,0x80(%r12) > > mov %rdx,0x68(%r12) | > 
> movups %xmm0,0x90(%r12) > > mov %rax,0x70(%r12) | > > movups %xmm0,0xa0(%r12) > > mov %rdx,0x78(%r12) | > > movups %xmm0,0xb0(%r12) > > mov %rax,0x80(%r12) | > > movups %xmm0,0xc0(%r12) > > mov %rdx,0x88(%r12) | > > movups %xmm0,0xd0(%r12) > > mov %rax,0x90(%r12) | > > movups %xmm0,0xe0(%r12) > > mov %rdx,0x98(%r12) | > > movups %xmm0,0xf0(%r12) > > mov %rax,0xa0(%r12) | > > movups %xmm0,0x100(%r12) > > mov %rdx,0xa8(%r12) | > > movups %xmm0,0x110(%r12) > > mov %rax,0xb0(%r12) | > > movups %xmm0,0x120(%r12) > > mov %rdx,0xb8(%r12) | > > movups %xmm0,0x130(%r12) > > mov %rax,0xc0(%r12) | > > movups %xmm0,0x140(%r12) > > mov %rdx,0xc8(%r12) < > > mov %rax,0xd0(%r12) < > > mov %rdx,0xd8(%r12) < > > mov %rax,0xe0(%r12) < > > mov %rdx,0xe8(%r12) < > > mov %rax,0xf0(%r12) < > > mov %rdx,0xf8(%r12) < > > mov %rax,0x100(%r12) < > > mov %rdx,0x108(%r12) < > > mov %rax,0x110(%r12) < > > mov %rdx,0x118(%r12) < > > mov %rax,0x120(%r12) < > > mov %rdx,0x128(%r12) < > > mov %rax,0x130(%r12) < > > mov %rdx,0x138(%r12) < > > mov %rax,0x140(%r12) < > > mov %rdx,0x148(%r12) < > > call <xprt_register@GLIBC_2.2.5> call > > <xprt_register@GLIBC_2.2.5> > > add $0x28,%rsp add > > $0x28,%rsp > > mov %r12,%rax mov > > %r12,%rax > > pop %rbx pop > > %rbx > > pop %rbp pop > > %rbp > > pop %r12 pop > > %r12 > > pop %r13 pop > > %r13 > > pop %r14 pop > > %r14 > > pop %r15 pop > > %r15 > > ret ret > > > > > > H.J. 
Lu (12): > > Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE > > x86: Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE > > x86: Avoid stack realignment when copying data > > Remove MAX_BITSIZE_MODE_ANY_INT > > x86: Update piecewise move and store > > x86: Add AVX2 tests for PR middle-end/90773 > > x86: Add tests for piecewise move and store > > x86: Also pass -mno-avx to pr72839.c > > x86: Also pass -mno-avx to cold-attribute-1.c > > x86: Also pass -mno-avx to sw-1.c for ia32 > > x86: Update gcc.target/i386/incoming-11.c > > constructor: Check if it is faster to load constant from memory > > > > gcc/builtins.c | 47 +-- > > gcc/config/i386/i386-expand.c | 18 +- > > gcc/config/i386/i386-modes.def | 15 +- > > gcc/config/i386/i386-protos.h | 5 + > > gcc/config/i386/i386.c | 289 +++++++++++++++++- > > gcc/config/i386/i386.h | 35 ++- > > gcc/doc/tm.texi | 16 + > > gcc/doc/tm.texi.in | 4 + > > gcc/expr.c | 10 + > > gcc/target.def | 20 ++ > > gcc/targhooks.c | 56 ++++ > > gcc/targhooks.h | 4 + > > .../gcc.target/i386/cold-attribute-1.c | 2 +- > > gcc/testsuite/gcc.target/i386/eh_return-1.c | 26 ++ > > gcc/testsuite/gcc.target/i386/incoming-11.c | 2 +- > > .../gcc.target/i386/pieces-memcpy-10.c | 16 + > > .../gcc.target/i386/pieces-memcpy-11.c | 17 ++ > > .../gcc.target/i386/pieces-memcpy-12.c | 16 + > > .../gcc.target/i386/pieces-memcpy-13.c | 16 + > > .../gcc.target/i386/pieces-memcpy-14.c | 17 ++ > > .../gcc.target/i386/pieces-memcpy-15.c | 16 + > > .../gcc.target/i386/pieces-memcpy-16.c | 16 + > > .../gcc.target/i386/pieces-memcpy-7.c | 15 + > > .../gcc.target/i386/pieces-memcpy-8.c | 14 + > > .../gcc.target/i386/pieces-memcpy-9.c | 14 + > > .../gcc.target/i386/pieces-memset-1.c | 16 + > > .../gcc.target/i386/pieces-memset-10.c | 16 + > > .../gcc.target/i386/pieces-memset-11.c | 16 + > > .../gcc.target/i386/pieces-memset-12.c | 16 + > > .../gcc.target/i386/pieces-memset-13.c | 16 + > > .../gcc.target/i386/pieces-memset-14.c | 16 + > > 
.../gcc.target/i386/pieces-memset-15.c | 16 + > > .../gcc.target/i386/pieces-memset-16.c | 16 + > > .../gcc.target/i386/pieces-memset-17.c | 16 + > > .../gcc.target/i386/pieces-memset-18.c | 16 + > > .../gcc.target/i386/pieces-memset-19.c | 17 ++ > > .../gcc.target/i386/pieces-memset-2.c | 12 + > > .../gcc.target/i386/pieces-memset-20.c | 17 ++ > > .../gcc.target/i386/pieces-memset-21.c | 17 ++ > > .../gcc.target/i386/pieces-memset-22.c | 17 ++ > > .../gcc.target/i386/pieces-memset-23.c | 17 ++ > > .../gcc.target/i386/pieces-memset-24.c | 17 ++ > > .../gcc.target/i386/pieces-memset-25.c | 17 ++ > > .../gcc.target/i386/pieces-memset-26.c | 17 ++ > > .../gcc.target/i386/pieces-memset-27.c | 17 ++ > > .../gcc.target/i386/pieces-memset-28.c | 17 ++ > > .../gcc.target/i386/pieces-memset-29.c | 17 ++ > > .../gcc.target/i386/pieces-memset-3.c | 18 ++ > > .../gcc.target/i386/pieces-memset-30.c | 17 ++ > > .../gcc.target/i386/pieces-memset-31.c | 17 ++ > > .../gcc.target/i386/pieces-memset-32.c | 17 ++ > > .../gcc.target/i386/pieces-memset-33.c | 17 ++ > > .../gcc.target/i386/pieces-memset-34.c | 17 ++ > > .../gcc.target/i386/pieces-memset-35.c | 17 ++ > > .../gcc.target/i386/pieces-memset-36.c | 17 ++ > > .../gcc.target/i386/pieces-memset-37.c | 15 + > > .../gcc.target/i386/pieces-memset-38.c | 17 ++ > > .../gcc.target/i386/pieces-memset-39.c | 16 + > > .../gcc.target/i386/pieces-memset-4.c | 16 + > > .../gcc.target/i386/pieces-memset-40.c | 17 ++ > > .../gcc.target/i386/pieces-memset-41.c | 16 + > > .../gcc.target/i386/pieces-memset-42.c | 17 ++ > > .../gcc.target/i386/pieces-memset-43.c | 17 ++ > > .../gcc.target/i386/pieces-memset-5.c | 12 + > > .../gcc.target/i386/pieces-memset-6.c | 16 + > > .../gcc.target/i386/pieces-memset-7.c | 16 + > > .../gcc.target/i386/pieces-memset-8.c | 16 + > > .../gcc.target/i386/pieces-memset-9.c | 16 + > > gcc/testsuite/gcc.target/i386/pr72839.c | 2 +- > > gcc/testsuite/gcc.target/i386/pr90773-1.c | 10 +- > > 
gcc/testsuite/gcc.target/i386/pr90773-14.c | 2 +- > > gcc/testsuite/gcc.target/i386/pr90773-15.c | 14 + > > gcc/testsuite/gcc.target/i386/pr90773-16.c | 14 + > > gcc/testsuite/gcc.target/i386/pr90773-17.c | 14 + > > gcc/testsuite/gcc.target/i386/pr90773-18.c | 15 + > > gcc/testsuite/gcc.target/i386/pr90773-19.c | 14 + > > gcc/testsuite/gcc.target/i386/pr90773-20.c | 13 + > > gcc/testsuite/gcc.target/i386/pr90773-21.c | 13 + > > gcc/testsuite/gcc.target/i386/pr90773-22.c | 13 + > > gcc/testsuite/gcc.target/i386/pr90773-23.c | 13 + > > gcc/testsuite/gcc.target/i386/pr90773-24.c | 22 ++ > > gcc/testsuite/gcc.target/i386/pr90773-25.c | 20 ++ > > gcc/testsuite/gcc.target/i386/pr90773-4.c | 2 +- > > gcc/testsuite/gcc.target/i386/sw-1.c | 1 + > > 84 files changed, 1509 insertions(+), 82 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/eh_return-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-10.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-11.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-12.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-13.c > > create mode 100644 
gcc/testsuite/gcc.target/i386/pieces-memset-14.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-15.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-16.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-17.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-18.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-19.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-20.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-21.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-22.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-23.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-24.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-25.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-26.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-27.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-28.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-29.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-3.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-30.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-31.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-32.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-33.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-34.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-35.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-36.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-37.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-38.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-39.c > > 
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-4.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-40.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-41.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-42.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-43.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-5.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-6.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-7.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-8.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-9.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-15.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-16.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-17.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-18.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-19.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-20.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-21.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-22.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-23.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-24.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-25.c > > > > Hello, > > I've tried this patch series, and found it seems to cause the following > regression > on x86_64-pc-linux-gnu: > > FAIL: gnat.dg/opt87.adb scan-tree-dump store-merging "1 stores to replace old > one of 6 stores"
The problem is this change in the patch: "* config/i386/i386.h (MOVE_MAX): Set to 64." MOVE_MAX should be set to MOVE_MAX_PIECES, and MAX_MOVE_MAX should be defined as 64 instead. I am testing a fix. -- H.J.