Hi:
  This patch sets X86_TUNE_AVX128_OPTIMAL as the default for all
AVX targets because we found there is still a performance gap between
128-bit auto-vectorization and 256-bit auto-vectorization, even with
the epilogue vectorized.
  The performance impact of setting avx128_optimal as the default on
SPEC2017 with options `-march=native -funroll-loops -Ofast -flto` on
CLX is as below:

    INT rate
    500.perlbench_r         -0.32%
    502.gcc_r                       -1.32%
    505.mcf_r                       -0.12%
    520.omnetpp_r                   -0.34%
    523.xalancbmk_r         -0.65%
    525.x264_r                      2.23%
    531.deepsjeng_r         0.81%
    541.leela_r                     -0.02%
    548.exchange2_r         10.89%  ----------> big improvement
    557.xz_r                        0.38%
    geomean for intrate             1.10%

    FP rate
    503.bwaves_r                    1.41%
    507.cactuBSSN_r         -0.14%
    508.namd_r                      1.54%
    510.parest_r                    -0.87%
    511.povray_r                    0.28%
    519.lbm_r                       0.32%
    521.wrf_r                       -0.54%
    526.blender_r                   0.59%
    527.cam4_r                      -2.70%
    538.imagick_r                   3.92%
    544.nab_r                       0.59%
    549.fotonik3d_r         -5.44%  -------------> regression
    554.roms_r                      -2.34%
    geomean for fprate              -0.28%

The 10% improvement of 548.exchange2_r is because it contains a 9-layer
nested loop, and the loop count for the innermost layer is small (enough
for 128-bit vectorization, but not for 256-bit vectorization).
Since the loop count cannot be determined statically, the vectorizer
chooses 256-bit vectorization, which is then never triggered at runtime.
Vectorizing the epilogue introduces some extra instructions; normally
that wins back some performance, but since this is a 9-layer nested
loop, the cost of the extra instructions outweighs the gain.

The 5.44% regression of 549.fotonik3d_r is because 256-bit
vectorization is better than 128-bit vectorization there. Generally,
enabling 256-bit or 512-bit vectorization reduces the instruction
clock ticks but also reduces the frequency. When the frequency
reduction is smaller than the clock-tick reduction, the longer vector
width is better than the shorter one; otherwise the opposite holds.
The regression of 549.fotonik3d_r is due to this, and similarly for
554.roms_r and 527.cam4_r; for those three benchmarks, 512-bit
vectorization is best.

Bootstrapped and regression tested on i386 with no issues.
Ok for trunk?

Changelog
    gcc/
            * config/i386/i386-options.c (m_CORE_AVX): New macro.
            * config/i386/x86-tune.def: Enable 128_optimal for avx and
            replace m_SANDYBRIDGE | m_CORE_AVX2 with m_CORE_AVX.
            * testsuite/gcc.target/i386/pr84413-1.c: Adjust testcase.
            * testsuite/gcc.target/i386/pr84413-2.c: Ditto.
            * testsuite/gcc.target/i386/pr84413-3.c: Ditto.
            * testsuite/gcc.target/i386/pr70021.c: Ditto.
            * testsuite/gcc.target/i386/pr90579.c: New test.


-- 
BR,
Hongtao
From a02d5c896600c4c80765f375d531c5412a778145 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao....@intel.com>
Date: Wed, 6 Nov 2019 09:36:57 +0800
Subject: [PATCH] Enable 128-bit auto-vectorization for AVX

Performance impact test on CLX8280 with best perf option
-Ofast -march=native -funroll-loops -flto -mfpmath=sse.

INT rate
500.perlbench_r		-0.32%
502.gcc_r			-1.32%
505.mcf_r			-0.12%
520.omnetpp_r			-0.34%
523.xalancbmk_r		-0.65%
525.x264_r			2.23%
531.deepsjeng_r		0.81%
541.leela_r			-0.02%
548.exchange2_r		10.89%
557.xz_r			0.38%
geomean for intrate		1.10%

FP rate
503.bwaves_r			1.41%
507.cactuBSSN_r		-0.14%
508.namd_r			1.54%
510.parest_r			-0.87%
511.povray_r			0.28%
519.lbm_r			0.32%
521.wrf_r			-0.54%
526.blender_r			0.59%
527.cam4_r			-2.70%
538.imagick_r			3.92%
544.nab_r			0.59%
549.fotonik3d_r		-5.44%
554.roms_r			-2.34%
geomean for fprate		-0.28%

Changelog
gcc/
	* config/i386/i386-options.c (m_CORE_AVX): New macro.
	* config/i386/x86-tune.def: Enable 128_optimal for avx and
	replace m_SANDYBRIDGE | m_CORE_AVX2 with m_CORE_AVX.
	* testsuite/gcc.target/i386/pr84413-1.c: Adjust testcase.
	* testsuite/gcc.target/i386/pr84413-2.c: Ditto.
	* testsuite/gcc.target/i386/pr84413-3.c: Ditto.
	* testsuite/gcc.target/i386/pr70021.c: Ditto.
	* testsuite/gcc.target/i386/pr90579.c: New test.
---
 gcc/config/i386/i386-options.c            |  1 +
 gcc/config/i386/x86-tune.def              | 24 +++++++++++------------
 gcc/testsuite/gcc.target/i386/pr70021.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr84413-1.c |  4 ++--
 gcc/testsuite/gcc.target/i386/pr84413-2.c |  4 ++--
 gcc/testsuite/gcc.target/i386/pr84413-3.c |  4 ++--
 gcc/testsuite/gcc.target/i386/pr90579.c   | 20 +++++++++++++++++++
 7 files changed, 40 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90579.c

diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index dfc8ae23ba0..7277f74e360 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -127,6 +127,7 @@ along with GCC; see the file COPYING3.  If not see
 		       | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \
 		       | m_TIGERLAKE | m_COOPERLAKE)
 #define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512)
+#define m_CORE_AVX (m_SANDYBRIDGE | m_CORE_AVX2)
 #define m_CORE_ALL (m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE | m_CORE_AVX2)
 #define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT)
 #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS)
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index e289efdf2e0..4f5d82eaed0 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -49,8 +49,8 @@ DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
    over partial stores.  For example preffer MOVZBL or MOVQ to load 8bit
    value over movb.  */
 DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
-          m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE | m_CORE_AVX2
-	  | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
+          m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_CORE_AVX | m_BONNELL
+	  | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
 	  | m_KNL | m_KNM | m_AMD_MULTIPLE | m_TREMONT
 	  | m_GENERIC)
 
@@ -85,10 +85,10 @@ DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
    partial dependencies.  */
 DEF_TUNE (X86_TUNE_MOVX, "movx",
-          m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE
+          m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM
 	  | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL
 	  | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE
-	  | m_CORE_AVX2 | m_TREMONT | m_GENERIC)
+	  | m_CORE_AVX | m_TREMONT | m_GENERIC)
 
 /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
    full sized loads.  */
@@ -105,21 +105,21 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
 /* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
    conditional jump instruction for TARGET_64BIT.  */
 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
-	  m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
+	  m_NEHALEM | m_CORE_AVX | m_BDVER
 	  | m_ZNVER | m_GENERIC)
 
 /* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
    subsequent conditional jump instruction when the condition jump
    check sign flag (SF) or overflow flag (OF).  */
 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
-	  m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
+	  m_NEHALEM | m_CORE_AVX | m_BDVER
 	  | m_ZNVER | m_GENERIC)
 
 /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
    jump instruction when the alu instruction produces the CCFLAG consumed by
    the conditional jump instruction. */
 DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
-          m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
+          m_CORE_AVX | m_GENERIC)
 
 
 /*****************************************************************************/
@@ -299,7 +299,7 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
 /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
    for bit-manipulation instructions.  */
 DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
-	  m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
+	  m_CORE_AVX | m_GENERIC)
 
 /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
    on hardware capabilities. Bdver3 hardware has a loop buffer which makes
@@ -355,14 +355,14 @@ DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
    of a sequence loading registers by parts.  */
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
-	  m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
+	  m_NEHALEM | m_CORE_AVX | m_SILVERMONT | m_KNL | m_KNM
 	  | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
 	  | m_TREMONT | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_GENERIC)
 
 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
    of a sequence loading registers by parts.  */
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
-	  m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
+	  m_NEHALEM | m_CORE_AVX | m_SILVERMONT | m_KNL | m_KNM
 	  | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
 	  | m_TREMONT | m_BDVER | m_ZNVER | m_GENERIC)
 
@@ -456,11 +456,11 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal"
 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
    the auto-vectorizer.  */
 DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
-	  | m_ZNVER1)
+	  | m_ZNVER1 | m_CORE_AVX)
 
 /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
    instructions in the auto-vectorizer.  */
-DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
+DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", 0U)
 
 /*****************************************************************************/
 /* Historical relics: tuning flags that helps a specific old CPU designs     */
diff --git a/gcc/testsuite/gcc.target/i386/pr70021.c b/gcc/testsuite/gcc.target/i386/pr70021.c
index 6562c0f2bd0..de6da345119 100644
--- a/gcc/testsuite/gcc.target/i386/pr70021.c
+++ b/gcc/testsuite/gcc.target/i386/pr70021.c
@@ -1,7 +1,7 @@
 /* PR target/70021 */
 /* { dg-do run } */
 /* { dg-require-effective-target avx2 } */
-/* { dg-options "-O2 -ftree-vectorize -mavx2 -fdump-tree-vect-details -mtune=skylake" } */
+/* { dg-options "-O2 -ftree-vectorize -mavx2 -fdump-tree-vect-details" } */
 
 #include "avx2-check.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/pr84413-1.c b/gcc/testsuite/gcc.target/i386/pr84413-1.c
index 1c94d7715cf..e87115ee921 100644
--- a/gcc/testsuite/gcc.target/i386/pr84413-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr84413-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O3 -march=skylake-avx512" } */
-/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */
+/* { dg-final { scan-assembler-not "%\[yz\]mm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%xmm\[0-9\]+" } } */
 
 #define N 1024
 
diff --git a/gcc/testsuite/gcc.target/i386/pr84413-2.c b/gcc/testsuite/gcc.target/i386/pr84413-2.c
index adf9b527cd6..e31e3f4281a 100644
--- a/gcc/testsuite/gcc.target/i386/pr84413-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr84413-2.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O3 -march=cannonlake" } */
-/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */
+/* { dg-final { scan-assembler-not "%\[yz\]mm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%xmm\[0-9\]+" } } */
 
 #define N 1024
 
diff --git a/gcc/testsuite/gcc.target/i386/pr84413-3.c b/gcc/testsuite/gcc.target/i386/pr84413-3.c
index 76bf25fc56b..75180a85ee2 100644
--- a/gcc/testsuite/gcc.target/i386/pr84413-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr84413-3.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O3 -march=icelake-server" } */
-/* { dg-final { scan-assembler-not "%zmm\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" } } */
+/* { dg-final { scan-assembler-not "%\[yz\]mm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vmulpd\[ \\t\]+\[^\n\]*%xmm\[0-9\]+" } } */
 
 #define N 1024
 
diff --git a/gcc/testsuite/gcc.target/i386/pr90579.c b/gcc/testsuite/gcc.target/i386/pr90579.c
new file mode 100644
index 00000000000..5f81de412fe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90579.c
@@ -0,0 +1,20 @@
+/* PR target/90579 */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -funroll-loops -march=skylake-avx512 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-not "vmov\[^\{\n\]*\[ \\t\]+\[^\n\{\]*%rsp" } } */
+
+/* There is no need for a load/store of r[i].
+   Setting -mprefer-vector-width=128 by default works around the real issue.  */
+extern double a[];
+double
+store_forward_stall (int k, double x, double y)
+{
+  int i;
+  double t=0;
+  double r[6];
+  for (i=0;i<6;i++)
+    r[i] = x * a[i + k];
+  for (i=0;i<6;i++)
+    t += y * r[5-i];
+  return t;
+}
-- 
2.19.1

Reply via email to