On 5/17/21 3:15 PM, H.J. Lu via Gcc-patches wrote: > Changes in the v3 patches: > > 1. Split the TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE changes > into the generic part and the x86 part. > > > 1. Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support > target instructions to duplicate QImode value to TImode/OImode/XImode > value for memset. > 2. x86: Avoid stack realignment when copying data > 3. x86: Remove MAX_BITSIZE_MODE_ANY_INT. Only x86 backend defines it. > 4. x86: Use TImode/OImode/XImode integers for piecewise move and store. > 5. x86: Add tests for TImode/OImode/XImode for piecewise move and store. > 6. x86: Adjust existing tests. > > On x86-64, SPEC CPU 2017 performance impact is neutral. Glibc code size > differences with -O2 build are: > > Before After > libc.so 1906572 1906444 > > Some code sequence differences in libc.so are: > > <svcudp_bufcreate@GLIBC_2.2.5>: > ... > jne <svcudp_bufcreate@GLIBC_2.2.5+0x318> | jne > <svcudp_bufcreate@GLIBC_2.2.5+0x2a8> > test %r15,%r15 test > %r15,%r15 > je <svcudp_bufcreate@GLIBC_2.2.5+0x318> | je > <svcudp_bufcreate@GLIBC_2.2.5+0x2a8> > mov %r13d,(%r14) mov > %r13d,(%r14) > lea 0x10(%r14),%rdi lea > 0x10(%r14),%rdi > mov $0x1,%ecx mov > $0x1,%ecx > mov %r13d,%edx mov > %r13d,%edx > mov %r15,0x40(%r12) mov > %r15,0x40(%r12) > mov %r15,%rsi mov > %r15,%rsi > call <xdrmem_create@GLIBC_2.2.5> call > <xdrmem_create@GLIBC_2.2.5> > lea 0xa2f9b(%rip),%rax # <svcudp_op> | lea > 0xa2fab(%rip),%rax # <svcudp_op> > xor %esi,%esi xor > %esi,%esi > mov %ebp,%edi mov > %ebp,%edi > mov %rax,0x8(%r12) mov > %rax,0x8(%r12) > movzwl 0x12(%rsp),%eax movzwl > 0x12(%rsp),%eax > mov $0x8,%edx < > lea 0xc(%rsp),%rcx lea > 0xc(%rsp),%rcx > mov %r14,0x48(%r12) < > add $0x40,%r14 < > mov $0x4,%r8d mov > $0x4,%r8d > > movq > $0x0,0x1d0(%r14) > > mov > $0x8,%edx > rol $0x8,%ax rol > $0x8,%ax > mov %ebp,(%r12) | mov > %r14,0x48(%r12) > movq $0x0,0x190(%r14) | add > $0x40,%r14 > mov %ax,0x4(%r12) < > mov 
%r14,0x30(%r12) mov > %r14,0x30(%r12) > > mov > %ax,0x4(%r12) > > mov > %ebp,(%r12) > movl $0x1,0xc(%rsp) movl > $0x1,0xc(%rsp) > call <setsockopt> call > <setsockopt> > mov %r12,%rdi mov > %r12,%rdi > movabs $0x101010101010101,%rdx < > test %eax,%eax test > %eax,%eax > mov $0xff,%eax mov > $0xff,%eax > cmove %eax,%ebx cmove > %eax,%ebx > movzbl %bl,%eax | movd > %ebx,%xmm0 > mov %ebx,0xc(%rsp) mov > %ebx,0xc(%rsp) > mov %rax,%rsi | > punpcklbw %xmm0,%xmm0 > imul %rdx,%rsi | > punpcklwd %xmm0,%xmm0 > mul %rdx | pshufd > $0x0,%xmm0,%xmm0 > add %rsi,%rdx | movups > %xmm0,0x50(%r12) > mov %rax,0x50(%r12) | movups > %xmm0,0x60(%r12) > mov %rdx,0x58(%r12) | movups > %xmm0,0x70(%r12) > mov %rax,0x60(%r12) | movups > %xmm0,0x80(%r12) > mov %rdx,0x68(%r12) | movups > %xmm0,0x90(%r12) > mov %rax,0x70(%r12) | movups > %xmm0,0xa0(%r12) > mov %rdx,0x78(%r12) | movups > %xmm0,0xb0(%r12) > mov %rax,0x80(%r12) | movups > %xmm0,0xc0(%r12) > mov %rdx,0x88(%r12) | movups > %xmm0,0xd0(%r12) > mov %rax,0x90(%r12) | movups > %xmm0,0xe0(%r12) > mov %rdx,0x98(%r12) | movups > %xmm0,0xf0(%r12) > mov %rax,0xa0(%r12) | movups > %xmm0,0x100(%r12) > mov %rdx,0xa8(%r12) | movups > %xmm0,0x110(%r12) > mov %rax,0xb0(%r12) | movups > %xmm0,0x120(%r12) > mov %rdx,0xb8(%r12) | movups > %xmm0,0x130(%r12) > mov %rax,0xc0(%r12) | movups > %xmm0,0x140(%r12) > mov %rdx,0xc8(%r12) < > mov %rax,0xd0(%r12) < > mov %rdx,0xd8(%r12) < > mov %rax,0xe0(%r12) < > mov %rdx,0xe8(%r12) < > mov %rax,0xf0(%r12) < > mov %rdx,0xf8(%r12) < > mov %rax,0x100(%r12) < > mov %rdx,0x108(%r12) < > mov %rax,0x110(%r12) < > mov %rdx,0x118(%r12) < > mov %rax,0x120(%r12) < > mov %rdx,0x128(%r12) < > mov %rax,0x130(%r12) < > mov %rdx,0x138(%r12) < > mov %rax,0x140(%r12) < > mov %rdx,0x148(%r12) < > call <xprt_register@GLIBC_2.2.5> call > <xprt_register@GLIBC_2.2.5> > add $0x28,%rsp add > $0x28,%rsp > mov %r12,%rax mov > %r12,%rax > pop %rbx pop > %rbx > pop %rbp pop > %rbp > pop %r12 pop > %r12 > pop %r13 pop > %r13 > pop %r14 pop 
> %r14 > pop %r15 pop > %r15 > ret ret > > > H.J. Lu (12): > Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE > x86: Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE > x86: Avoid stack realignment when copying data > Remove MAX_BITSIZE_MODE_ANY_INT > x86: Update piecewise move and store > x86: Add AVX2 tests for PR middle-end/90773 > x86: Add tests for piecewise move and store > x86: Also pass -mno-avx to pr72839.c > x86: Also pass -mno-avx to cold-attribute-1.c > x86: Also pass -mno-avx to sw-1.c for ia32 > x86: Update gcc.target/i386/incoming-11.c > constructor: Check if it is faster to load constant from memory > > gcc/builtins.c | 47 +-- > gcc/config/i386/i386-expand.c | 18 +- > gcc/config/i386/i386-modes.def | 15 +- > gcc/config/i386/i386-protos.h | 5 + > gcc/config/i386/i386.c | 289 +++++++++++++++++- > gcc/config/i386/i386.h | 35 ++- > gcc/doc/tm.texi | 16 + > gcc/doc/tm.texi.in | 4 + > gcc/expr.c | 10 + > gcc/target.def | 20 ++ > gcc/targhooks.c | 56 ++++ > gcc/targhooks.h | 4 + > .../gcc.target/i386/cold-attribute-1.c | 2 +- > gcc/testsuite/gcc.target/i386/eh_return-1.c | 26 ++ > gcc/testsuite/gcc.target/i386/incoming-11.c | 2 +- > .../gcc.target/i386/pieces-memcpy-10.c | 16 + > .../gcc.target/i386/pieces-memcpy-11.c | 17 ++ > .../gcc.target/i386/pieces-memcpy-12.c | 16 + > .../gcc.target/i386/pieces-memcpy-13.c | 16 + > .../gcc.target/i386/pieces-memcpy-14.c | 17 ++ > .../gcc.target/i386/pieces-memcpy-15.c | 16 + > .../gcc.target/i386/pieces-memcpy-16.c | 16 + > .../gcc.target/i386/pieces-memcpy-7.c | 15 + > .../gcc.target/i386/pieces-memcpy-8.c | 14 + > .../gcc.target/i386/pieces-memcpy-9.c | 14 + > .../gcc.target/i386/pieces-memset-1.c | 16 + > .../gcc.target/i386/pieces-memset-10.c | 16 + > .../gcc.target/i386/pieces-memset-11.c | 16 + > .../gcc.target/i386/pieces-memset-12.c | 16 + > .../gcc.target/i386/pieces-memset-13.c | 16 + > .../gcc.target/i386/pieces-memset-14.c | 16 + > .../gcc.target/i386/pieces-memset-15.c | 16 + > 
.../gcc.target/i386/pieces-memset-16.c | 16 + > .../gcc.target/i386/pieces-memset-17.c | 16 + > .../gcc.target/i386/pieces-memset-18.c | 16 + > .../gcc.target/i386/pieces-memset-19.c | 17 ++ > .../gcc.target/i386/pieces-memset-2.c | 12 + > .../gcc.target/i386/pieces-memset-20.c | 17 ++ > .../gcc.target/i386/pieces-memset-21.c | 17 ++ > .../gcc.target/i386/pieces-memset-22.c | 17 ++ > .../gcc.target/i386/pieces-memset-23.c | 17 ++ > .../gcc.target/i386/pieces-memset-24.c | 17 ++ > .../gcc.target/i386/pieces-memset-25.c | 17 ++ > .../gcc.target/i386/pieces-memset-26.c | 17 ++ > .../gcc.target/i386/pieces-memset-27.c | 17 ++ > .../gcc.target/i386/pieces-memset-28.c | 17 ++ > .../gcc.target/i386/pieces-memset-29.c | 17 ++ > .../gcc.target/i386/pieces-memset-3.c | 18 ++ > .../gcc.target/i386/pieces-memset-30.c | 17 ++ > .../gcc.target/i386/pieces-memset-31.c | 17 ++ > .../gcc.target/i386/pieces-memset-32.c | 17 ++ > .../gcc.target/i386/pieces-memset-33.c | 17 ++ > .../gcc.target/i386/pieces-memset-34.c | 17 ++ > .../gcc.target/i386/pieces-memset-35.c | 17 ++ > .../gcc.target/i386/pieces-memset-36.c | 17 ++ > .../gcc.target/i386/pieces-memset-37.c | 15 + > .../gcc.target/i386/pieces-memset-38.c | 17 ++ > .../gcc.target/i386/pieces-memset-39.c | 16 + > .../gcc.target/i386/pieces-memset-4.c | 16 + > .../gcc.target/i386/pieces-memset-40.c | 17 ++ > .../gcc.target/i386/pieces-memset-41.c | 16 + > .../gcc.target/i386/pieces-memset-42.c | 17 ++ > .../gcc.target/i386/pieces-memset-43.c | 17 ++ > .../gcc.target/i386/pieces-memset-5.c | 12 + > .../gcc.target/i386/pieces-memset-6.c | 16 + > .../gcc.target/i386/pieces-memset-7.c | 16 + > .../gcc.target/i386/pieces-memset-8.c | 16 + > .../gcc.target/i386/pieces-memset-9.c | 16 + > gcc/testsuite/gcc.target/i386/pr72839.c | 2 +- > gcc/testsuite/gcc.target/i386/pr90773-1.c | 10 +- > gcc/testsuite/gcc.target/i386/pr90773-14.c | 2 +- > gcc/testsuite/gcc.target/i386/pr90773-15.c | 14 + > gcc/testsuite/gcc.target/i386/pr90773-16.c | 14 + > 
gcc/testsuite/gcc.target/i386/pr90773-17.c | 14 + > gcc/testsuite/gcc.target/i386/pr90773-18.c | 15 + > gcc/testsuite/gcc.target/i386/pr90773-19.c | 14 + > gcc/testsuite/gcc.target/i386/pr90773-20.c | 13 + > gcc/testsuite/gcc.target/i386/pr90773-21.c | 13 + > gcc/testsuite/gcc.target/i386/pr90773-22.c | 13 + > gcc/testsuite/gcc.target/i386/pr90773-23.c | 13 + > gcc/testsuite/gcc.target/i386/pr90773-24.c | 22 ++ > gcc/testsuite/gcc.target/i386/pr90773-25.c | 20 ++ > gcc/testsuite/gcc.target/i386/pr90773-4.c | 2 +- > gcc/testsuite/gcc.target/i386/sw-1.c | 1 + > 84 files changed, 1509 insertions(+), 82 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/eh_return-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-10.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-11.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-12.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-13.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-14.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-15.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-16.c > create mode 100644 
gcc/testsuite/gcc.target/i386/pieces-memset-17.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-18.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-19.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-20.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-21.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-22.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-23.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-24.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-25.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-26.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-27.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-28.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-29.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-30.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-31.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-32.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-33.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-34.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-35.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-36.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-37.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-38.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-39.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-4.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-40.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-41.c > create mode 100644 
gcc/testsuite/gcc.target/i386/pieces-memset-42.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-43.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-5.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-6.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-7.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-8.c > create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-9.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-15.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-16.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-17.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-18.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-19.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-20.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-21.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-22.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-23.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-24.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-25.c >
Hello, I tried this patch series and found that it seems to cause the following regression on x86_64-pc-linux-gnu: FAIL: gnat.dg/opt87.adb scan-tree-dump store-merging "1 stores to replace old one of 6 stores" Bernd.