r259399, which added PROCESSOR_SKYLAKE, disabled many x86 optimizations which are enabled by PROCESSOR_HASWELL. As a result, -mtune=skylake generates slower code on Skylake than before. The same also applies to Cannonlake and Icelake tuning.
This patch changes -mtune={skylake|cannonlake|icelake} to tune like -mtune=haswell until their tuning is properly adjusted. It also enables -mprefer-vector-width=256 for -mtune=haswell, which has no impact on codegen when AVX512 isn't enabled. Performance impacts on SPEC CPU 2017 rate with 1 copy using -march=native -mfpmath=sse -O2 -m64 are 1. On Broadwell server: 500.perlbench_r -0.56% 502.gcc_r -0.18% 505.mcf_r 0.24% 520.omnetpp_r 0.00% 523.xalancbmk_r -0.32% 525.x264_r -0.17% 531.deepsjeng_r 0.00% 541.leela_r 0.00% 548.exchange2_r 0.12% 557.xz_r 0.00% geomean 0.00% 503.bwaves_r 0.00% 507.cactuBSSN_r 0.21% 508.namd_r 0.00% 510.parest_r 0.19% 511.povray_r -0.48% 519.lbm_r 0.00% 521.wrf_r 0.28% 526.blender_r 0.19% 527.cam4_r 0.39% 538.imagick_r 0.00% 544.nab_r -0.36% 549.fotonik3d_r 0.51% 554.roms_r 0.00% geomean 0.17% On Skylake client: 500.perlbench_r 0.96% 502.gcc_r 0.13% 505.mcf_r -1.03% 520.omnetpp_r -1.11% 523.xalancbmk_r 1.02% 525.x264_r 0.50% 531.deepsjeng_r 2.97% 541.leela_r 0.50% 548.exchange2_r -0.95% 557.xz_r 2.41% geomean 0.56% 503.bwaves_r 0.49% 507.cactuBSSN_r 3.17% 508.namd_r 4.05% 510.parest_r 0.15% 511.povray_r 0.80% 519.lbm_r 3.15% 521.wrf_r 10.56% 526.blender_r 2.97% 527.cam4_r 2.36% 538.imagick_r 46.40% 544.nab_r 2.04% 549.fotonik3d_r 0.00% 554.roms_r 1.27% geomean 5.49% On Skylake server: 500.perlbench_r 0.71% 502.gcc_r -0.51% 505.mcf_r -1.06% 520.omnetpp_r -0.33% 523.xalancbmk_r -0.22% 525.x264_r 1.72% 531.deepsjeng_r -0.26% 541.leela_r 0.57% 548.exchange2_r -0.75% 557.xz_r -1.28% geomean -0.21% 503.bwaves_r 0.00% 507.cactuBSSN_r 2.66% 508.namd_r 3.67% 510.parest_r 1.25% 511.povray_r 2.26% 519.lbm_r 1.69% 521.wrf_r 11.03% 526.blender_r 3.39% 527.cam4_r 1.69% 538.imagick_r 64.59% 544.nab_r -0.54% 549.fotonik3d_r 2.68% 554.roms_r 0.00% geomean 6.19% This patch improves -march=native performance on Skylake by up to 60% and leaves -march=native performance unchanged on Haswell. OK for trunk? Thanks. H.J. --- gcc/ 2018-07-12 H.J. 
Lu <hongjiu...@intel.com> Sunil K Pandey <sunil.k.pan...@intel.com> PR target/84413 * config/i386/i386.c (m_HASWELL): Add PROCESSOR_SKYLAKE, PROCESSOR_SKYLAKE_AVX512, PROCESSOR_CANNONLAKE, PROCESSOR_ICELAKE_CLIENT and PROCESSOR_ICELAKE_SERVER. (m_SKYLAKE): Set to 0. (m_SKYLAKE_AVX512): Likewise. (m_CANNONLAKE): Likewise. (m_ICELAKE_CLIENT): Likewise. (m_ICELAKE_SERVER): Likewise. * config/i386/x86-tune.def (avx256_optimal): Also enabled for m_HASWELL. gcc/testsuite/ 2018-07-12 H.J. Lu <hongjiu...@intel.com> Sunil K Pandey <sunil.k.pan...@intel.com> PR target/84413 * gcc.target/i386/pr84413-1.c: New test. * gcc.target/i386/pr84413-2.c: Likewise. * gcc.target/i386/pr84413-3.c: Likewise. * gcc.target/i386/pr84413-4.c: Likewise. --- gcc/config/i386/i386.c | 17 +++++++++++------ gcc/config/i386/x86-tune.def | 9 ++++++--- gcc/testsuite/gcc.target/i386/pr84413-1.c | 17 +++++++++++++++++ gcc/testsuite/gcc.target/i386/pr84413-2.c | 17 +++++++++++++++++ gcc/testsuite/gcc.target/i386/pr84413-3.c | 17 +++++++++++++++++ gcc/testsuite/gcc.target/i386/pr84413-4.c | 17 +++++++++++++++++ 6 files changed, 85 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr84413-4.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 9e46b7b136f..762ab89fc9e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -137,17 +137,22 @@ const struct processor_costs *ix86_cost = NULL; #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2) #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM) #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE) -#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) +#define m_HASWELL ((HOST_WIDE_INT_1U<<PROCESSOR_HASWELL) \ + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) \ + | (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) \ + | 
(HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) \ + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) \ + | (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)) #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) #define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL) #define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT) #define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL) #define m_KNM (HOST_WIDE_INT_1U<<PROCESSOR_KNM) -#define m_SKYLAKE (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE) -#define m_SKYLAKE_AVX512 (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512) -#define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE) -#define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT) -#define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER) +#define m_SKYLAKE 0 +#define m_SKYLAKE_AVX512 0 +#define m_CANNONLAKE 0 +#define m_ICELAKE_CLIENT 0 +#define m_ICELAKE_SERVER 0 #define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT) #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS) #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 8a8d5ab2440..c8abaedad8c 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -444,9 +444,12 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal" DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2 | m_ZNVER1) -/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX - instructions in the auto-vectorizer. */ -DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512) +/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit + AVX instructions in the auto-vectorizer. NB: This is also enabled for + -mtune=haswell so that we can tune Skylake, Cannonlake and Icelake as + Haswell. 
*/ +DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512 + | m_HASWELL) /*****************************************************************************/ /* Historical relics: tuning flags that helps a specific old CPU designs */ diff --git a/gcc/testsuite/gcc.target/i386/pr84413-1.c b/gcc/testsuite/gcc.target/i386/pr84413-1.c new file mode 100644 index 00000000000..1c94d7715cf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=skylake-avx512" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; i < N; i++) + c[i] = a[i] * b[i]; +} diff --git a/gcc/testsuite/gcc.target/i386/pr84413-2.c b/gcc/testsuite/gcc.target/i386/pr84413-2.c new file mode 100644 index 00000000000..adf9b527cd6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-2.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=cannonlake" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; i < N; i++) + c[i] = a[i] * b[i]; +} diff --git a/gcc/testsuite/gcc.target/i386/pr84413-3.c b/gcc/testsuite/gcc.target/i386/pr84413-3.c new file mode 100644 index 00000000000..76bf25fc56b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-3.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=icelake-server" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; i < N; i++) + c[i] = a[i] * b[i]; +} diff --git 
a/gcc/testsuite/gcc.target/i386/pr84413-4.c b/gcc/testsuite/gcc.target/i386/pr84413-4.c new file mode 100644 index 00000000000..031ef0c8916 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr84413-4.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=haswell -mavx512f" } */ +/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */ +/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */ + +#define N 1024 + +double a[N], b[N], c[N]; + +void +avx512f_test (void) +{ + int i; + + for (i = 0; i < N; i++) + c[i] = a[i] * b[i]; +} -- 2.17.1