Simplify memcpy and memset inline strategies to avoid branches for Skylake family CPUs:
1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector load and store for up to 16 * 16 (256) bytes when the data size is fixed and known. 2. Inline only if data size is known to be <= 256. a. Use "rep movsb/stosb" with simple code sequence if the data size is a constant. b. Use loop if data size is not a constant. 3. Use memcpy/memset library function if data size is unknown or > 256. On Cascadelake processor with -march=native -Ofast -flto, 1. Performance impacts of SPEC CPU 2017 rate are: 500.perlbench_r 0.17% 502.gcc_r -0.36% 505.mcf_r 0.00% 520.omnetpp_r 0.08% 523.xalancbmk_r -0.62% 525.x264_r 1.04% 531.deepsjeng_r 0.11% 541.leela_r -1.09% 548.exchange2_r -0.25% 557.xz_r 0.17% Geomean -0.08% 503.bwaves_r 0.00% 507.cactuBSSN_r 0.69% 508.namd_r -0.07% 510.parest_r 1.12% 511.povray_r 1.82% 519.lbm_r 0.00% 521.wrf_r -1.32% 526.blender_r -0.47% 527.cam4_r 0.23% 538.imagick_r -1.72% 544.nab_r -0.56% 549.fotonik3d_r 0.12% 554.roms_r 0.43% Geomean 0.02% 2. Significant impacts on eembc benchmarks are: eembc/idctrn01 9.23% eembc/nnet_test 29.26% gcc/ * config/i386/x86-tune-costs.h (skylake_memcpy): Updated. (skylake_memset): Likewise. (skylake_cost): Change CLEAR_RATIO to 17. * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB): Replace m_CANNONLAKE, m_ICELAKE_CLIENT, m_ICELAKE_SERVER, m_TIGERLAKE and m_SAPPHIRERAPIDS with m_SKYLAKE and m_CORE_AVX512. gcc/testsuite/ * gcc.target/i386/memcpy-strategy-9.c: New test. * gcc.target/i386/memcpy-strategy-10.c: Likewise. * gcc.target/i386/memcpy-strategy-11.c: Likewise. * gcc.target/i386/memset-strategy-7.c: Likewise. * gcc.target/i386/memset-strategy-8.c: Likewise. * gcc.target/i386/memset-strategy-9.c: Likewise.
--- gcc/config/i386/x86-tune-costs.h | 27 ++++++++++++------- gcc/config/i386/x86-tune.def | 3 +-- .../gcc.target/i386/memcpy-strategy-10.c | 11 ++++++++ .../gcc.target/i386/memcpy-strategy-11.c | 18 +++++++++++++ .../gcc.target/i386/memcpy-strategy-9.c | 9 +++++++ .../gcc.target/i386/memset-strategy-7.c | 11 ++++++++ .../gcc.target/i386/memset-strategy-8.c | 9 +++++++ .../gcc.target/i386/memset-strategy-9.c | 17 ++++++++++++ 8 files changed, 93 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-7.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-8.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-9.c diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 0e00ff99df3..ffe810f2bcb 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1822,17 +1822,24 @@ struct processor_costs znver3_cost = { /* skylake_cost should produce code tuned for Skylake familly of CPUs. 
*/ static stringop_algs skylake_memcpy[2] = { - {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, - {libcall, {{16, loop, false}, {512, unrolled_loop, false}, - {-1, libcall, false}}}}; + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; static stringop_algs skylake_memset[2] = { - {libcall, {{6, loop_1_byte, true}, - {24, loop, true}, - {8192, rep_prefix_4_byte, true}, - {-1, libcall, false}}}, - {libcall, {{24, loop, true}, {512, unrolled_loop, false}, - {-1, libcall, false}}}}; + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}, + {libcall, + {{256, rep_prefix_1_byte, true}, + {256, loop, false}, + {-1, libcall, false}}}}; static const struct processor_costs skylake_cost = { @@ -1889,7 +1896,7 @@ struct processor_costs skylake_cost = { COSTS_N_INSNS (0), /* cost of movzx */ 8, /* "large" insn */ 17, /* MOVE_RATIO */ - 6, /* CLEAR_RATIO */ + 17, /* CLEAR_RATIO */ {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 134916cc972..eb057a67750 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -273,8 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA) move/set sequences of bytes with known size. */ DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, "prefer_known_rep_movsb_stosb", - m_CANNONLAKE | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_TIGERLAKE - | m_ALDERLAKE | m_SAPPHIRERAPIDS) + m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512) /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of compact prologues and epilogues by issuing a misaligned moves. 
This diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c new file mode 100644 index 00000000000..970aa741971 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-10.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake -mno-sse" } */ +/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "rep movsb" } } */ + +void +foo (char *dest, char *src) +{ + __builtin_memcpy (dest, src, 257); +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c new file mode 100644 index 00000000000..b6041944630 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-11.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake" } */ +/* { dg-final { scan-assembler-not "jmp\tmemcpy" { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-not "call\tmemcpy" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "rep movsb" } } */ + +typedef unsigned char e_u8; + +#define MAXBC 8 + +void MixColumn(e_u8 a[4][MAXBC], e_u8 BC) +{ + e_u8 b[4][MAXBC]; + int i, j; + + for(i = 0; i < 4; i++) + for(j = 0; j < BC; j++) a[i][j] = b[i][j]; +} diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c new file mode 100644 index 00000000000..b0dc7484d09 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-9.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake -mno-sse" } */ +/* { dg-final { scan-assembler "rep movsb" } } */ + +void +foo (char *dest, char *src) +{ + __builtin_memcpy (dest, src, 256); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-7.c b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c new file mode 100644 index 00000000000..07c2816910c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-7.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake -mno-sse" } */ +/* { dg-final { scan-assembler "jmp\tmemset" { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "rep stosb" } } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 257); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-8.c b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c new file mode 100644 index 00000000000..52ea882c814 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-8.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake -mno-sse" } */ +/* { dg-final { scan-assembler "rep stosb" } } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 256); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-9.c b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c new file mode 100644 index 00000000000..d4db031958f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-9.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake" } */ +/* { dg-final { scan-assembler-not "jmp\tmemset" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-not "call\tmemset" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "rep stosb" } } */ + +typedef unsigned char e_u8; + +#define MAXBC 8 + +void MixColumn(e_u8 a[4][MAXBC], e_u8 BC) +{ + int i, j; + + for(i = 0; i < 4; i++) + for(j = 0; j < BC; j++) a[i][j] = 1; +} -- 2.30.2