Simplify memcpy and memset inline strategies to avoid branches for
-mtune=generic:
1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
load and store for up to 16 * 16 (256) bytes when the data size is
fixed and known.
2. Inline only if data size is known to be <= 256.
a. Use "rep movsb/stosb" with a simple code sequence if the data size
is a constant.
b. Use a loop if the data size is not a constant.
3. Use the memcpy/memset library function if the data size is unknown or > 256.
With -mtune=generic -O2,
1. On Ice Lake processor,
Performance impacts on SPEC CPU 2017:
500.perlbench_r 0.51%
502.gcc_r 0.55%
505.mcf_r 0.38%
520.omnetpp_r -0.74%
523.xalancbmk_r -0.35%
525.x264_r 2.99%
531.deepsjeng_r -0.17%
541.leela_r -0.98%
548.exchange2_r 0.89%
557.xz_r 0.70%
Geomean 0.37%
503.bwaves_r 0.04%
507.cactuBSSN_r -0.01%
508.namd_r -0.45%
510.parest_r -0.09%
511.povray_r -1.37%
519.lbm_r 0.00%
521.wrf_r -2.56%
526.blender_r -0.01%
527.cam4_r -0.05%
538.imagick_r 0.36%
544.nab_r 0.08%
549.fotonik3d_r -0.06%
554.roms_r 0.05%
Geomean -0.34%
Significant impacts on eembc benchmarks:
eembc/nnet_test 14.85%
eembc/mp2decoddata2 13.57%
2. On Cascadelake processor,
Performance impacts on SPEC CPU 2017:
500.perlbench_r -0.02%
502.gcc_r 0.10%
505.mcf_r -1.14%
520.omnetpp_r -0.22%
523.xalancbmk_r 0.21%
525.x264_r 0.94%
531.deepsjeng_r -0.37%
541.leela_r -0.46%
548.exchange2_r -0.40%
557.xz_r 0.60%
Geomean -0.08%
503.bwaves_r -0.50%
507.cactuBSSN_r 0.05%
508.namd_r -0.02%
510.parest_r 0.09%
511.povray_r -1.35%
519.lbm_r 0.00%
521.wrf_r -0.03%
526.blender_r -0.83%
527.cam4_r 1.23%
538.imagick_r 0.97%
544.nab_r -0.02%
549.fotonik3d_r -0.12%
554.roms_r 0.55%
Geomean 0.00%
Significant impacts on eembc benchmarks:
eembc/nnet_test 9.90%
eembc/mp2decoddata2 16.42%
eembc/textv2data3 -4.86%
eembc/qos 12.90%
3. On Znver3 processor,
Performance impacts on SPEC CPU 2017:
500.perlbench_r -0.96%
502.gcc_r -1.06%
505.mcf_r -0.01%
520.omnetpp_r -1.45%
523.xalancbmk_r 2.89%
525.x264_r 4.98%
531.deepsjeng_r 0.18%
541.leela_r -1.54%
548.exchange2_r -1.25%
557.xz_r -0.01%
Geomean 0.16%
503.bwaves_r 0.04%
507.cactuBSSN_r 0.85%
508.namd_r -0.13%
510.parest_r 0.39%
511.povray_r 0.00%
519.lbm_r 0.00%
521.wrf_r 0.28%
526.blender_r -0.10%
527.cam4_r -0.58%
538.imagick_r 0.69%
544.nab_r -0.04%
549.fotonik3d_r -0.04%
554.roms_r 0.40%
Geomean 0.15%
Significant impacts on eembc benchmarks:
eembc/aifftr01 13.95%
eembc/idctrn01 8.41%
eembc/nnet_test 30.25%
eembc/mp2decoddata2 5.05%
eembc/textv2data3 6.43%
eembc/qos -5.79%
gcc/
* config/i386/x86-tune-costs.h (generic_memcpy): Updated.
(generic_memset): Likewise.
(generic_cost): Change CLEAR_RATIO to 17.
* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
Add m_GENERIC.
gcc/testsuite/
* gcc.target/i386/memcpy-strategy-12.c: New test.
* gcc.target/i386/memcpy-strategy-13.c: Likewise.
* gcc.target/i386/memset-strategy-10.c: Likewise.
* gcc.target/i386/memset-strategy-11.c: Likewise.
* gcc.target/i386/shrink_wrap_1.c: Also pass
-mmemset-strategy=rep_8byte:-1:align.
* gcc.target/i386/sw-1.c: Also pass -mstringop-strategy=rep_byte.
---
gcc/config/i386/x86-tune-costs.h | 31 ++++++++++++-------
gcc/config/i386/x86-tune.def | 2 +-
.../gcc.target/i386/memcpy-strategy-12.c | 9 ++++++
.../gcc.target/i386/memcpy-strategy-13.c | 11 +++++++
.../gcc.target/i386/memset-strategy-10.c | 11 +++++++
.../gcc.target/i386/memset-strategy-11.c | 9 ++++++
gcc/testsuite/gcc.target/i386/shrink_wrap_1.c | 2 +-
gcc/testsuite/gcc.target/i386/sw-1.c | 2 +-
8 files changed, 63 insertions(+), 14 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-10.c
create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-11.c
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index ffe810f2bcb..30e7c3e4261 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2844,19 +2844,28 @@ struct processor_costs intel_cost = {
"16", /* Func alignment. */
};
-/* Generic should produce code tuned for Core-i7 (and newer chips)
- and btver1 (and newer chips). */
+/* Generic should produce code tuned for Haswell (and newer chips)
+ and znver1 (and newer chips). NB: rep_prefix_1_byte is used only
+ for known size. */
static stringop_algs generic_memcpy[2] = {
- {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
- {-1, libcall, false}}},
- {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}},
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}}};
static stringop_algs generic_memset[2] = {
- {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
- {-1, libcall, false}}},
- {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
- {-1, libcall, false}}}};
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}},
+ {libcall,
+ {{256, rep_prefix_1_byte, true},
+ {256, loop, false},
+ {-1, libcall, false}}}};
static const
struct processor_costs generic_cost = {
{
@@ -2913,7 +2922,7 @@ struct processor_costs generic_cost = {
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
- 6, /* CLEAR_RATIO */
+ 17, /* CLEAR_RATIO */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index eb057a67750..fd9c011a3f5 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop",
m_386 | m_P4_NOCONA)
move/set sequences of bytes with known size. */
DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
"prefer_known_rep_movsb_stosb",
- m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
+ m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512 | m_GENERIC)
/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
compact prologues and epilogues by issuing a misaligned moves. This
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
new file mode 100644
index 00000000000..87f03352736
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-final { scan-assembler "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+ __builtin_memcpy (dest, src, 249);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
new file mode 100644
index 00000000000..cfc3cfba623
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+ __builtin_memcpy (dest, src, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-10.c b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
new file mode 100644
index 00000000000..ade5e8da42c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-11.c b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
new file mode 100644
index 00000000000..d1b86152474
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-final { scan-assembler "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 253);
+}
diff --git a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
index 94dadd6cdbd..44fe7d2836e 100644
--- a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
+++ b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
@@ -1,5 +1,5 @@
/* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-O2 -fdump-rtl-pro_and_epilogue" } */
+/* { dg-options "-O2 -mmemset-strategy=rep_8byte:-1:align -fdump-rtl-pro_and_epilogue" } */
enum machine_mode
{
diff --git a/gcc/testsuite/gcc.target/i386/sw-1.c b/gcc/testsuite/gcc.target/i386/sw-1.c
index aec095eda62..f61621e42bf 100644
--- a/gcc/testsuite/gcc.target/i386/sw-1.c
+++ b/gcc/testsuite/gcc.target/i386/sw-1.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -mtune=generic -fshrink-wrap -fdump-rtl-pro_and_epilogue" } */
+/* { dg-options "-O2 -mtune=generic -mstringop-strategy=rep_byte -fshrink-wrap -fdump-rtl-pro_and_epilogue" } */
/* { dg-skip-if "No shrink-wrapping preformed" { x86_64-*-mingw* } } */
#include <string.h>
--
2.30.2