Since MOVE_MAX defines the maximum number of bytes that an instruction can move quickly between memory and registers, use it to get the widest vector mode for the vector loop when inlining memcpy and memset.
gcc/ PR target/120708 * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): Use MOVE_MAX to get the widest vector mode in vector loop. gcc/testsuite/ PR target/120708 * gcc.target/i386/memcpy-pr120708-1.c: New test. * gcc.target/i386/memcpy-pr120708-2.c: Likewise. * gcc.target/i386/memcpy-pr120708-3.c: Likewise. * gcc.target/i386/memcpy-pr120708-4.c: Likewise. * gcc.target/i386/memcpy-pr120708-5.c: Likewise. * gcc.target/i386/memcpy-pr120708-6.c: Likewise. * gcc.target/i386/memset-pr120708-1.c: Likewise. * gcc.target/i386/memset-pr120708-2.c: Likewise. * gcc.target/i386/memcpy-strategy-1.c: Drop dg-skip-if. Replace -march=atom with -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores. * gcc.target/i386/memcpy-strategy-2.c: Likewise. * gcc.target/i386/memcpy-vector_loop-1.c: Likewise. * gcc.target/i386/memcpy-vector_loop-2.c: Likewise. * gcc.target/i386/memset-vector_loop-1.c: Likewise. * gcc.target/i386/memset-vector_loop-2.c: Likewise. -- H.J.
From 875b530cca71982b970900df31907dbb26c2833c Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.to...@gmail.com> Date: Thu, 19 Jun 2025 05:03:48 +0800 Subject: [PATCH] x86: Get the widest vector mode from MOVE_MAX Since MOVE_MAX defines the maximum number of bytes that an instruction can move quickly between memory and registers, use it to get the widest vector mode in vector loop when inlining memcpy and memset. gcc/ PR target/120708 * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): Use MOVE_MAX to get the widest vector mode in vector loop. gcc/testsuite/ PR target/120708 * gcc.target/i386/memcpy-pr120708-1.c: New test. * gcc.target/i386/memcpy-pr120708-2.c: Likewise. * gcc.target/i386/memcpy-pr120708-3.c: Likewise. * gcc.target/i386/memcpy-pr120708-4.c: Likewise. * gcc.target/i386/memcpy-pr120708-5.c: Likewise. * gcc.target/i386/memcpy-pr120708-6.c: Likewise. * gcc.target/i386/memset-pr120708-1.c: Likewise. * gcc.target/i386/memset-pr120708-2.c: Likewise. * gcc.target/i386/memcpy-strategy-1.c: Drop dg-skip-if. Replace -march=atom with -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores. * gcc.target/i386/memcpy-strategy-2.c: Likewise. * gcc.target/i386/memcpy-vector_loop-1.c: Likewise. * gcc.target/i386/memcpy-vector_loop-2.c: Likewise. * gcc.target/i386/memset-vector_loop-1.c: Likewise. * gcc.target/i386/memset-vector_loop-2.c: Likewise. Signed-off-by: H.J. 
Lu <hjl.to...@gmail.com> --- gcc/config/i386/i386-expand.cc | 31 ++++++------------- .../gcc.target/i386/memcpy-pr120708-1.c | 11 +++++++ .../gcc.target/i386/memcpy-pr120708-2.c | 11 +++++++ .../gcc.target/i386/memcpy-pr120708-3.c | 11 +++++++ .../gcc.target/i386/memcpy-pr120708-4.c | 11 +++++++ .../gcc.target/i386/memcpy-pr120708-5.c | 15 +++++++++ .../gcc.target/i386/memcpy-pr120708-6.c | 15 +++++++++ .../gcc.target/i386/memcpy-strategy-1.c | 3 +- .../gcc.target/i386/memcpy-strategy-2.c | 3 +- .../gcc.target/i386/memcpy-vector_loop-1.c | 3 +- .../gcc.target/i386/memcpy-vector_loop-2.c | 5 ++- .../gcc.target/i386/memset-pr120708-1.c | 10 ++++++ .../gcc.target/i386/memset-pr120708-2.c | 10 ++++++ .../gcc.target/i386/memset-vector_loop-1.c | 3 +- .../gcc.target/i386/memset-vector_loop-2.c | 3 +- 15 files changed, 110 insertions(+), 35 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120708-1.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120708-2.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120708-3.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120708-4.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120708-5.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120708-6.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120708-1.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120708-2.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 4946f87a131..423fc632003 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -9351,7 +9351,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, bool need_zero_guard = false; bool noalign; machine_mode move_mode = VOIDmode; - machine_mode wider_mode; int unroll_factor = 1; /* TODO: Once value ranges are available, fill in proper data. 
*/ unsigned HOST_WIDE_INT min_size = 0; @@ -9427,6 +9426,7 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, unroll_factor = 1; move_mode = word_mode; + int nunits; switch (alg) { case libcall: @@ -9447,27 +9447,14 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, case vector_loop: need_zero_guard = true; unroll_factor = 4; - /* Find the widest supported mode. */ - move_mode = word_mode; - while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode) - && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing) - move_mode = wider_mode; - - if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128) - move_mode = TImode; - if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256) - move_mode = OImode; - - /* Find the corresponding vector mode with the same size as MOVE_MODE. - MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ - if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) - { - int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); - if (!mode_for_vector (word_mode, nunits).exists (&move_mode) - || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing) - move_mode = word_mode; - } - gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing); + /* Get the vector mode to move MOVE_MAX bytes. 
*/ + nunits = MOVE_MAX / GET_MODE_SIZE (word_mode); + if (nunits > 1) + { + move_mode = mode_for_vector (word_mode, nunits).require (); + gcc_assert (optab_handler (mov_optab, move_mode) + != CODE_FOR_nothing); + } break; case rep_prefix_8_byte: move_mode = DImode; diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-1.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-1.c new file mode 100644 index 00000000000..d4fe2adc7ff --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ +/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-not "movdqa" } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-2.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-2.c new file mode 100644 index 00000000000..9a6fcfd171b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-2.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ +/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-not "movdqa" } } */ + +char *a; +char *b; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-3.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-3.c new file mode 100644 index 00000000000..010ac24d50f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-3.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ +/* { dg-final { scan-assembler-not "movdqa" } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} diff --git 
a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-4.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-4.c new file mode 100644 index 00000000000..87a58ef369a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-4.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-not "movdqa" } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-5.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-5.c new file mode 100644 index 00000000000..19e060075cf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-5.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mprefer-vector-width=128 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */ + +#define SIZE (16 + 1) * 16 + +char dest[SIZE]; +char src[SIZE]; + +void +foo (void) +{ + __builtin_memcpy (dest, src, SIZE); +} + +/* { dg-final { scan-assembler-times "vmovdqa\[ \t]\+\[^\n\r]*%xmm\[0-9\]\+" 10 } } */ diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120708-6.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-6.c new file mode 100644 index 00000000000..17b101f130e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120708-6.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mprefer-vector-width=256 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */ + +#define SIZE (16 + 1) * 32 + +char dest[SIZE]; +char src[SIZE]; + +void +foo (void) +{ + __builtin_memcpy (dest, src, SIZE); +} + +/* { dg-final { scan-assembler-times "vmovdqa\[ \t]\+\[^\n\r]*%ymm\[0-9\]\+" 10 } } */ diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c index 6ac80c91053..b2986738892 100644 --- 
a/gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c @@ -1,6 +1,5 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -mmemcpy-strategy=vector_loop:-1:align" } */ /* { dg-final { scan-assembler-times "movdqa" 8 } } */ char a[2048]; diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c index c103896a110..18e260b0191 100644 --- a/gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c @@ -1,6 +1,5 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ /* { dg-final { scan-assembler-times "movdqa" 8 } } */ char a[2048]; diff --git a/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-1.c b/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-1.c index 93f428acc85..cec8c90e565 100644 --- a/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-1.c +++ b/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-1.c @@ -1,6 +1,5 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -minline-all-stringops -mstringop-strategy=vector_loop" } */ /* { dg-final { scan-assembler-times "movdqa" 8 } } */ char a[2048]; diff --git a/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-2.c 
b/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-2.c index ab235401972..314eb3d5b53 100644 --- a/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-2.c +++ b/gcc/testsuite/gcc.target/i386/memcpy-vector_loop-2.c @@ -1,7 +1,6 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ -/* { dg-final { scan-assembler-times "movdqa" 4} } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-final { scan-assembler-times "movdqa" 4 } } */ char *a; char *b; diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120708-1.c b/gcc/testsuite/gcc.target/i386/memset-pr120708-1.c new file mode 100644 index 00000000000..fba05883db5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120708-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mprefer-vector-width=128 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 254); +} + +/* { dg-final { scan-assembler "vmovdqu\[ \t]\+%xmm\[0-9\]+, \\(\[^\n\r]*\\)" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120708-2.c b/gcc/testsuite/gcc.target/i386/memset-pr120708-2.c new file mode 100644 index 00000000000..d9a3e7ebc67 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-pr120708-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -mprefer-vector-width=256 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 254); +} + +/* { dg-final { scan-assembler "vmovdqu\[ \t]\+%ymm\[0-9\]+, \\(\[^\n\r]*\\)" } } */ diff --git a/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c b/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c index d6fdc981908..5bb30a844ea 100644 --- 
a/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c +++ b/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c @@ -1,6 +1,5 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -minline-all-stringops -mstringop-strategy=vector_loop" } */ /* { dg-final { scan-assembler-times "movdqa" 4 } } */ char a[2048]; diff --git a/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c b/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c index bce8be0ffae..6e31070ee86 100644 --- a/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c +++ b/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c @@ -1,6 +1,5 @@ /* { dg-do compile } */ -/* { dg-skip-if "" { *-*-* } { "-march=*" } { "-march=atom" } } */ -/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */ +/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -mtune-ctrl=^sse_typeless_stores -mstringop-strategy=vector_loop" } */ /* { dg-final { scan-assembler-times "movdqa" 4} } */ char *a; -- 2.49.0