On Mon, Mar 22, 2021 at 2:19 PM H.J. Lu via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Simplify memcpy and memset inline strategies to avoid branches for
> -mtune=generic:
>
> 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
>    load and store for up to 16 * 16 (256) bytes when the data size is
>    fixed and known.
> 2. Inline only if data size is known to be <= 256.
>    a. Use "rep movsb/stosb" with simple code sequence if the data size
>       is a constant.
>    b. Use loop if data size is not a constant.
> 3. Use memcpy/memset library function if data size is unknown or > 256.
>
> With -mtune=generic -O2,

Is there any visible code-size effect of increasing CLEAR_RATIO on
SPEC/eembc?  Did you play with other values of MOVE/CLEAR_RATIO?
17 memory-to-memory/memory-clear insns looks like quite a lot.

> 1. On Ice Lake processor,
>
> Performance impacts on SPEC CPU 2017:
>
> 500.perlbench_r  0.51%
> 502.gcc_r        0.55%
> 505.mcf_r        0.38%
> 520.omnetpp_r   -0.74%
> 523.xalancbmk_r -0.35%
> 525.x264_r       2.99%
> 531.deepsjeng_r -0.17%
> 541.leela_r     -0.98%
> 548.exchange2_r  0.89%
> 557.xz_r         0.70%
> Geomean          0.37%
>
> 503.bwaves_r     0.04%
> 507.cactuBSSN_r -0.01%
> 508.namd_r      -0.45%
> 510.parest_r    -0.09%
> 511.povray_r    -1.37%
> 519.lbm_r        0.00%
> 521.wrf_r       -2.56%
> 526.blender_r   -0.01%
> 527.cam4_r      -0.05%
> 538.imagick_r    0.36%
> 544.nab_r        0.08%
> 549.fotonik3d_r -0.06%
> 554.roms_r       0.05%
> Geomean         -0.34%
>
> Significant impacts on eembc benchmarks:
>
> eembc/nnet_test      14.85%
> eembc/mp2decoddata2  13.57%
>
> 2. On Cascadelake processor,
>
> Performance impacts on SPEC CPU 2017:
>
> 500.perlbench_r -0.02%
> 502.gcc_r        0.10%
> 505.mcf_r       -1.14%
> 520.omnetpp_r   -0.22%
> 523.xalancbmk_r  0.21%
> 525.x264_r       0.94%
> 531.deepsjeng_r -0.37%
> 541.leela_r     -0.46%
> 548.exchange2_r -0.40%
> 557.xz_r         0.60%
> Geomean         -0.08%
>
> 503.bwaves_r    -0.50%
> 507.cactuBSSN_r  0.05%
> 508.namd_r      -0.02%
> 510.parest_r     0.09%
> 511.povray_r    -1.35%
> 519.lbm_r        0.00%
> 521.wrf_r       -0.03%
> 526.blender_r   -0.83%
> 527.cam4_r       1.23%
> 538.imagick_r    0.97%
> 544.nab_r       -0.02%
> 549.fotonik3d_r -0.12%
> 554.roms_r       0.55%
> Geomean          0.00%
>
> Significant impacts on eembc benchmarks:
>
> eembc/nnet_test      9.90%
> eembc/mp2decoddata2  16.42%
> eembc/textv2data3   -4.86%
> eembc/qos            12.90%
>
> 3. On Znver3 processor,
>
> Performance impacts on SPEC CPU 2017:
>
> 500.perlbench_r -0.96%
> 502.gcc_r       -1.06%
> 505.mcf_r       -0.01%
> 520.omnetpp_r   -1.45%
> 523.xalancbmk_r  2.89%
> 525.x264_r       4.98%
> 531.deepsjeng_r  0.18%
> 541.leela_r     -1.54%
> 548.exchange2_r -1.25%
> 557.xz_r        -0.01%
> Geomean          0.16%
>
> 503.bwaves_r     0.04%
> 507.cactuBSSN_r  0.85%
> 508.namd_r      -0.13%
> 510.parest_r     0.39%
> 511.povray_r     0.00%
> 519.lbm_r        0.00%
> 521.wrf_r        0.28%
> 526.blender_r   -0.10%
> 527.cam4_r      -0.58%
> 538.imagick_r    0.69%
> 544.nab_r       -0.04%
> 549.fotonik3d_r -0.04%
> 554.roms_r       0.40%
> Geomean          0.15%
>
> Significant impacts on eembc benchmarks:
>
> eembc/aifftr01       13.95%
> eembc/idctrn01       8.41%
> eembc/nnet_test      30.25%
> eembc/mp2decoddata2  5.05%
> eembc/textv2data3    6.43%
> eembc/qos           -5.79%
>
> gcc/
>
>         * config/i386/x86-tune-costs.h (generic_memcpy): Updated.
>         (generic_memset): Likewise.
>         (generic_cost): Change CLEAR_RATIO to 17.
>         * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
>         Add m_GENERIC.
>
> gcc/testsuite/
>
>         * gcc.target/i386/memcpy-strategy-12.c: New test.
>         * gcc.target/i386/memcpy-strategy-13.c: Likewise.
>         * gcc.target/i386/memset-strategy-10.c: Likewise.
>         * gcc.target/i386/memset-strategy-11.c: Likewise.
>         * gcc.target/i386/shrink_wrap_1.c: Also pass
>         -mmemset-strategy=rep_8byte:-1:align.
>         * gcc.target/i386/sw-1.c: Also pass -mstringop-strategy=rep_byte.
> ---
>  gcc/config/i386/x86-tune-costs.h              | 31 ++++++++++++-------
>  gcc/config/i386/x86-tune.def                  |  2 +-
>  .../gcc.target/i386/memcpy-strategy-12.c      |  9 ++++++
>  .../gcc.target/i386/memcpy-strategy-13.c      | 11 +++++++
>  .../gcc.target/i386/memset-strategy-10.c      | 11 +++++++
>  .../gcc.target/i386/memset-strategy-11.c      |  9 ++++++
>  gcc/testsuite/gcc.target/i386/shrink_wrap_1.c |  2 +-
>  gcc/testsuite/gcc.target/i386/sw-1.c          |  2 +-
>  8 files changed, 63 insertions(+), 14 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-10.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-11.c
>
> diff --git a/gcc/config/i386/x86-tune-costs.h 
> b/gcc/config/i386/x86-tune-costs.h
> index ffe810f2bcb..30e7c3e4261 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -2844,19 +2844,28 @@ struct processor_costs intel_cost = {
>    "16",                                        /* Func alignment.  */
>  };
>
> -/* Generic should produce code tuned for Core-i7 (and newer chips)
> -   and btver1 (and newer chips).  */
> +/* Generic should produce code tuned for Haswell (and newer chips)
> +   and znver1 (and newer chips).  NB: rep_prefix_1_byte is used only
> +   for known size.  */
>
>  static stringop_algs generic_memcpy[2] = {
> -  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
> -             {-1, libcall, false}}},
> -  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
> -             {-1, libcall, false}}}};
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}},
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}}};
>  static stringop_algs generic_memset[2] = {
> -  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
> -             {-1, libcall, false}}},
> -  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
> -             {-1, libcall, false}}}};
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}},
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}}};
>  static const
>  struct processor_costs generic_cost = {
>    {
> @@ -2913,7 +2922,7 @@ struct processor_costs generic_cost = {
>    COSTS_N_INSNS (1),                   /* cost of movzx */
>    8,                                   /* "large" insn */
>    17,                                  /* MOVE_RATIO */
> -  6,                                   /* CLEAR_RATIO */
> +  17,                                  /* CLEAR_RATIO */
>    {6, 6, 6},                           /* cost of loading integer registers
>                                            in QImode, HImode and SImode.
>                                            Relative to reg-reg move (2).  */
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index eb057a67750..fd9c011a3f5 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", 
> m_386 | m_P4_NOCONA)
>     move/set sequences of bytes with known size.  */
>  DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
>           "prefer_known_rep_movsb_stosb",
> -         m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
> +         m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512 | m_GENERIC)
>
>  /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
>     compact prologues and epilogues by issuing a misaligned moves.  This
> diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c 
> b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
> new file mode 100644
> index 00000000000..87f03352736
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler "rep movsb" } } */
> +
> +void
> +foo (char *dest, char *src)
> +{
> +  __builtin_memcpy (dest, src, 249);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c 
> b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
> new file mode 100644
> index 00000000000..cfc3cfba623
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "rep movsb" } } */
> +
> +void
> +foo (char *dest, char *src)
> +{
> +  __builtin_memcpy (dest, src, 257);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-10.c 
> b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
> new file mode 100644
> index 00000000000..ade5e8da42c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "rep stosb" } } */
> +
> +void
> +foo (char *dest)
> +{
> +  __builtin_memset (dest, 0, 257);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-11.c 
> b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
> new file mode 100644
> index 00000000000..d1b86152474
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler "rep stosb" } } */
> +
> +void
> +foo (char *dest)
> +{
> +  __builtin_memset (dest, 0, 253);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c 
> b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
> index 94dadd6cdbd..44fe7d2836e 100644
> --- a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
> +++ b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile { target { ! ia32 } } } */
> -/* { dg-options "-O2 -fdump-rtl-pro_and_epilogue" } */
> +/* { dg-options "-O2 -mmemset-strategy=rep_8byte:-1:align 
> -fdump-rtl-pro_and_epilogue" } */
>
>  enum machine_mode
>  {
> diff --git a/gcc/testsuite/gcc.target/i386/sw-1.c 
> b/gcc/testsuite/gcc.target/i386/sw-1.c
> index aec095eda62..f61621e42bf 100644
> --- a/gcc/testsuite/gcc.target/i386/sw-1.c
> +++ b/gcc/testsuite/gcc.target/i386/sw-1.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -mtune=generic -fshrink-wrap 
> -fdump-rtl-pro_and_epilogue" } */
> +/* { dg-options "-O2 -mtune=generic -mstringop-strategy=rep_byte 
> -fshrink-wrap -fdump-rtl-pro_and_epilogue" } */
>  /* { dg-skip-if "No shrink-wrapping preformed" { x86_64-*-mingw* } } */
>
>  #include <string.h>
> --
> 2.30.2
>

Reply via email to