On Wed, Aug 10, 2016 at 8:55 AM, Uros Bizjak <ubiz...@gmail.com> wrote: > On Wed, Aug 10, 2016 at 5:32 PM, H.J. Lu <hongjiu...@intel.com> wrote: >> Use TImode for piecewise move in 64-bit mode. When vector register >> is used for piecewise move, we don't increase stack_alignment_needed >> since vector register spill isn't required for piecewise move. Since >> stack_realign_needed is set to true by checking stack_alignment_estimated >> set by pseudo vector register usage, we also need to check >> stack_realign_needed to eliminate frame pointer. > > Why only in 64-bit mode? We can use SSE moves also in 32-bit mode.
I will extend it to 32-bit mode. > I don't think we can handle crtl->stack_realign_needed in this way. If > there are other insns with 32byte vector registers in use in the same > (large) function as converted __builtin_memcpy, we will *still* need > realigned stack. We don't align stack in leaf functions using SSE/AVX/AVX512 vector registers if there is no spill when __builtin_memcpy isn't optimized with SSE load/store. My change just extends it to __builtin_memcpy optimized with SSE load/store with or without 32byte vector registers in use. > Uros. > > >> Tested on x86-64. OK for trunk? >> >> H.J. >> --- >> gcc/ >> >> * config/i386/i386.c (ix86_finalize_stack_realign_flags): Also >> check stack_realign_needed for stack realignment. >> * config/i386/i386.h (MOVE_MAX_PIECES): Set to 16 in 64-bit mode >> if unaligned SSE load and store are optimal. >> >> gcc/testsuite/ >> >> * gcc.target/i386/pieces-memcpy-1.c: New test. >> * gcc.target/i386/pieces-memcpy-2.c: Likewise. >> * gcc.target/i386/pieces-memcpy-3.c: Likewise. >> * gcc.target/i386/pieces-memcpy-4.c: Likewise. >> * gcc.target/i386/pieces-memcpy-5.c: Likewise. >> * gcc.target/i386/pieces-memcpy-6.c: Likewise. >> --- >> gcc/config/i386/i386.c | 11 +++++++++-- >> gcc/config/i386/i386.h | 6 +++++- >> gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c | 17 +++++++++++++++++ >> gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c | 17 +++++++++++++++++ >> gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c | 17 +++++++++++++++++ >> gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c | 17 +++++++++++++++++ >> gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c | 17 +++++++++++++++++ >> gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c | 17 +++++++++++++++++ >> 8 files changed, 116 insertions(+), 3 deletions(-) >> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c >> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c >> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c >> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c >> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c >> create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c >> >> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c >> index 93eaab1..60dc160 100644 >> --- a/gcc/config/i386/i386.c >> +++ b/gcc/config/i386/i386.c >> @@ -13286,8 +13286,15 @@ ix86_finalize_stack_realign_flags (void) >> /* If the only reason for frame_pointer_needed is that we conservatively >> assumed stack realignment might be needed, but in the end nothing that >> needed the stack alignment had been spilled, clear frame_pointer_needed >> - and say we don't need stack realignment. */ >> - if (stack_realign >> + and say we don't need stack realignment. >> + >> + When vector register is used for piecewise move and store, we don't >> + increase stack_alignment_needed as there is no register spill for >> + piecewise move and store. Since stack_realign_needed is set to true >> + by checking stack_alignment_estimated which is updated by pseudo >> + vector register usage, we also need to check stack_realign_needed to >> + eliminate frame pointer. */ >> + if ((stack_realign || crtl->stack_realign_needed) >> && frame_pointer_needed >> && crtl->is_leaf >> && flag_omit_frame_pointer >> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h >> index 9b66264..24db855 100644 >> --- a/gcc/config/i386/i386.h >> +++ b/gcc/config/i386/i386.h >> @@ -1951,7 +1951,11 @@ typedef struct ix86_args { >> /* MOVE_MAX_PIECES is the number of bytes at a time which we can >> move efficiently, as opposed to MOVE_MAX which is the maximum >> number of bytes we can move with a single instruction. */ >> -#define MOVE_MAX_PIECES UNITS_PER_WORD >> +#define MOVE_MAX_PIECES \ >> + ((TARGET_64BIT \ >> + && TARGET_SSE2 \ >> + && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \ >> + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) ? 16 : UNITS_PER_WORD) >> >> /* If a memory-to-memory move would take MOVE_RATIO or more simple >> move-instruction pairs, we will do a movmem or libcall instead. >> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c >> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c >> new file mode 100644 >> index 0000000..adc0aa8 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-1.c >> @@ -0,0 +1,17 @@ >> +/* { dg-do compile { target { ! ia32 } } } */ >> +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ >> + >> +extern char *dst, *src; >> + >> +void >> +foo (void) >> +{ >> + __builtin_memcpy (dst, src, 64); >> +} >> + >> +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */ >> +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */ >> +/* No need to dynamically realign the stack here. */ >> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ >> +/* Nor use a frame pointer. */ >> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ >> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c >> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c >> new file mode 100644 >> index 0000000..c52c1d9 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-2.c >> @@ -0,0 +1,17 @@ >> +/* { dg-do compile { target { ! ia32 } } } */ >> +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ >> + >> +extern char *dst, *src; >> + >> +void >> +foo (void) >> +{ >> + __builtin_memcpy (dst, src, 33); >> +} >> + >> +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */ >> +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */ >> +/* No need to dynamically realign the stack here. */ >> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ >> +/* Nor use a frame pointer. */ >> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ >> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c >> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c >> new file mode 100644 >> index 0000000..c532bbd >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-3.c >> @@ -0,0 +1,17 @@ >> +/* { dg-do compile { target { ! ia32 } } } */ >> +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */ >> + >> +extern char *dst, *src; >> + >> +void >> +foo (void) >> +{ >> + __builtin_memcpy (dst, src, 17); >> +} >> + >> +/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */ >> +/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */ >> +/* No need to dynamically realign the stack here. */ >> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ >> +/* Nor use a frame pointer. */ >> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ >> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c >> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c >> new file mode 100644 >> index 0000000..4ef763d >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-4.c >> @@ -0,0 +1,17 @@ >> +/* { dg-do compile { target { ! ia32 } } } */ >> +/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */ >> + >> +extern char *dst, *src; >> + >> +void >> +foo (void) >> +{ >> + __builtin_memcpy (dst, src, 18); >> +} >> + >> +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */ >> +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 1 } } */ >> +/* No need to dynamically realign the stack here. */ >> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ >> +/* Nor use a frame pointer. */ >> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ >> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c >> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c >> new file mode 100644 >> index 0000000..2687560 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-5.c >> @@ -0,0 +1,17 @@ >> +/* { dg-do compile { target { ! ia32 } } } */ >> +/* { dg-options "-O2 -mavx512f -mtune=generic" } */ >> + >> +extern char *dst, *src; >> + >> +void >> +foo (void) >> +{ >> + __builtin_memcpy (dst, src, 19); >> +} >> + >> +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */ >> +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 1 } } */ >> +/* No need to dynamically realign the stack here. */ >> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ >> +/* Nor use a frame pointer. */ >> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ >> diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c >> b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c >> new file mode 100644 >> index 0000000..a205f83 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c >> @@ -0,0 +1,17 @@ >> +/* { dg-do compile { target { ! ia32 } } } */ >> +/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ >> + >> +extern char *dst, *src; >> + >> +void >> +foo (void) >> +{ >> + __builtin_memcpy (dst, src, 33); >> +} >> + >> +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */ >> +/* { dg-final { scan-assembler-times "vmovups\[ \\t\]+\[^\n\]*%xmm" 2 } } */ >> +/* No need to dynamically realign the stack here. */ >> +/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */ >> +/* Nor use a frame pointer. */ >> +/* { dg-final { scan-assembler-not "%\[re\]bp" } } */ >> -- >> 2.7.4 >> -- H.J.