From: "hongtao.liu" <[email protected]>
Update in V3:
#define MOVE_MAX \
((TARGET_AVX512F \
&& (ix86_move_max == PVW_AVX512 \
|| ix86_store_max == PVW_AVX512)) \
? 64 \
Since MOVE_MAX in x86 is also related to ix86_store_max, I'll still
remove SPR/GNR/DMR from avx512_store_by_pieces.
And why do we have ix86_store_max in MOVE_MAX?
I guess, it's because:
In the middle-end, the real size used for memset is decided by MIN
(MOVE_MAX_PIECES(alignment request), STORE_MAX_PIECES)
Maybe we should remove the option -mstore-max= and the tunes
{avx512,avx256}_store_by_pieces, since they eventually have the same
impact as just setting ix86_move_max.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.
Align move_max with prefer_vector_width for SPR/GNR/DMR, similar to
the commit below.
commit 6ea25c041964bf63014fcf7bb68fb1f5a0a4e123
Author: liuhongt <[email protected]>
Date: Thu Aug 15 12:54:07 2024 +0800
Align ix86_{move_max,store_max} with vectorizer.
When none of mprefer-vector-width, avx256_optimal/avx128_optimal,
avx256_store_by_pieces/avx512_store_by_pieces is specified, GCC will
set ix86_{move_max,store_max} as max available vector length except
for AVX part.
if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
&& TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_move_max = PVW_AVX512;
else
opts->x_ix86_move_max = PVW_AVX128;
So for -mavx2, vectorizer will choose 256-bit for vectorization, but
128-bit is used for struct copy, there could be a potential STLF issue
due to this "misalign".
gcc/ChangeLog:
* config/i386/x86-tune.def (X86_TUNE_AVX512_MOVE_BY_PIECES):
Remove SPR/GNR/DMR.
(X86_TUNE_AVX512_STORE_BY_PIECES): Ditto.
gcc/testsuite/ChangeLog:
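* gcc.target/i386/pieces-memcpy-18.c: Use -march=znver5
instead of -march=sapphirerapids.
* gcc.target/i386/pieces-memcpy-21.c: Use -mtune=znver5
instead of -mtune=sapphirerapids.
* gcc.target/i386/pieces-memset-46.c: Use -march=znver5
instead of -march=sapphirerapids.
* gcc.target/i386/pieces-memset-49.c: Use -mtune=znver5
instead of -mtune=sapphirerapids.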
---
gcc/config/i386/x86-tune.def | 8 ++++----
gcc/testsuite/gcc.target/i386/pieces-memcpy-18.c | 2 +-
gcc/testsuite/gcc.target/i386/pieces-memcpy-21.c | 2 +-
gcc/testsuite/gcc.target/i386/pieces-memset-46.c | 2 +-
gcc/testsuite/gcc.target/i386/pieces-memset-49.c | 2 +-
5 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index a86cbad281c..255ea4a16cc 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -612,6 +612,8 @@ DEF_TUNE (X86_TUNE_AVX256_AVOID_VEC_PERM,
/* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX512 ops are split into two AVX256
ops. */
DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)
+/* It's better to align MOVE_MAX with prefer_vector_width to reduce
+ risk of STLF stalls (small store followed by big load).  */
/* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
@@ -625,14 +627,12 @@ DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES,
"avx256_store_by_pieces",
/* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
- m_SAPPHIRERAPIDS | m_GRANITERAPIDS | m_GRANITERAPIDS_D
- | m_DIAMONDRAPIDS | m_ZNVER4 | m_ZNVER5)
+ m_ZNVER4 | m_ZNVER5)
/* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
- m_SAPPHIRERAPIDS | m_GRANITERAPIDS | m_GRANITERAPIDS_D
- | m_DIAMONDRAPIDS | m_ZNVER4 | m_ZNVER5)
+ m_ZNVER4 | m_ZNVER5)
/* X86_TUNE_AVX512_TWO_EPILOGUES: Use two vector epilogues for 512-bit
vectorized loops. */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-18.c
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-18.c
index b15a0db9ff0..b4995ac0598 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-18.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-18.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -march=sapphirerapids" } */
+/* { dg-options "-O2 -march=znver5" } */
extern char *dst, *src;
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-21.c
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-21.c
index ef439f20f74..804a2989d64 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-21.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-21.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -mtune=sapphirerapids -march=x86-64 -mavx2" } */
+/* { dg-options "-O2 -mtune=znver5 -march=x86-64 -mavx2" } */
extern char *dst, *src;
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-46.c
b/gcc/testsuite/gcc.target/i386/pieces-memset-46.c
index be1b054eed2..43d636ee3ff 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-46.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-46.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -march=sapphirerapids" } */
+/* { dg-options "-O2 -march=znver5" } */
extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-49.c
b/gcc/testsuite/gcc.target/i386/pieces-memset-49.c
index ad43f89a9bd..ca4933ac1d8 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-49.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-49.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -mtune=sapphirerapids -march=x86-64 -mavx2" } */
+/* { dg-options "-O2 -mtune=znver5 -march=x86-64 -mavx2" } */
extern char *dst;
--
2.34.1