https://gcc.gnu.org/g:9eaecce3d8c1d9349adbf8c2cdaf8d87672ed29c

commit r15-4234-g9eaecce3d8c1d9349adbf8c2cdaf8d87672ed29c
Author: liuhongt <hongtao....@intel.com>
Date:   Wed Sep 25 13:11:11 2024 +0800

    Add a new tune avx256_avoid_vec_perm for SRF.
    
    According to the Intel SOM[1], for Crestmont, most 256-bit Intel AVX2
    instructions can be decomposed into two independent 128-bit
    micro-operations, except for a subset of Intel AVX2 instructions
    known as cross-lane operations, which can only compute the result for
    an element by utilizing one or more sources belonging to other
    elements.
    
    The 256-bit instructions listed below use more operand sources than
    can be natively supported by a single reservation station within these
    microarchitectures. They are decomposed into two μops, where the first
    μop resolves a subset of operand dependencies across two cycles. The
    dependent second μop executes the 256-bit operation by using a single
    128-bit execution port for two consecutive cycles with a five-cycle
    latency for a total latency of seven cycles.
    
    VPERM2I128 ymm1, ymm2, ymm3/m256, imm8
    VPERM2F128 ymm1, ymm2, ymm3/m256, imm8
    VPERMPD ymm1, ymm2/m256, imm8
    VPERMPS ymm1, ymm2, ymm3/m256
    VPERMD ymm1, ymm2, ymm3/m256
    VPERMQ ymm1, ymm2/m256, imm8
    
    Instead of setting tune avx128_optimal for SRF, the patch adds a new
    tune avx256_avoid_vec_perm for it. So by default, the vectorizer
    still uses a 256-bit VF if the cost is profitable, but lowers to
    128-bit whenever a 256-bit vec_perm is needed for auto-vectorization.
    Without vec_perm, performance of 256-bit vectorization should be
    similar to that of 128-bit (some benchmark results show it's even
    better than 128-bit vectorization since it enables more parallelism
    for convert cases).
    
    [1] 
https://www.intel.com/content/www/us/en/content-details/814198/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html
    
    gcc/ChangeLog:
    
            * config/i386/i386.cc (ix86_vector_costs::ix86_vector_costs):
            Add new member m_num_avx256_vec_perm.
            (ix86_vector_costs::add_stmt_cost): Record 256-bit vec_perm.
            (ix86_vector_costs::finish_cost): Prevent vectorization for
            TARGET_AVX256_AVOID_VEC_PERM when there's a 256-bit vec_perm
            instruction.
            * config/i386/i386.h (TARGET_AVX256_AVOID_VEC_PERM): New
            macro.
            * config/i386/x86-tune.def (X86_TUNE_AVX256_SPLIT_REGS): Add
            m_CORE_ATOM.
            (X86_TUNE_AVX256_AVOID_VEC_PERM): New tune.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/avx256_avoid_vec_perm.c: New test.

Diff:
---
 gcc/config/i386/i386.cc                            | 14 +++++++++++++-
 gcc/config/i386/i386.h                             |  2 ++
 gcc/config/i386/x86-tune.def                       |  7 ++++++-
 .../gcc.target/i386/avx256_avoid_vec_perm.c        | 22 ++++++++++++++++++++++
 4 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 90a564b2ffaa..ab0ade3790f2 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25025,12 +25025,15 @@ private:
      where we know it's not loaded from memory.  */
   unsigned m_num_gpr_needed[3];
   unsigned m_num_sse_needed[3];
+  /* Number of 256-bit vector permutations.  */
+  unsigned m_num_avx256_vec_perm[3];
 };
 
 ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
   : vector_costs (vinfo, costing_for_scalar),
     m_num_gpr_needed (),
-    m_num_sse_needed ()
+    m_num_sse_needed (),
+    m_num_avx256_vec_perm ()
 {
 }
 
@@ -25264,6 +25267,10 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
   if (stmt_cost == -1)
     stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
+  if (kind == vec_perm && vectype
+      && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
+    m_num_avx256_vec_perm[where]++;
+
   /* Penalize DFmode vector operations for Bonnell.  */
   if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
       && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
@@ -25333,6 +25340,11 @@ ix86_vector_costs::finish_cost (const vector_costs 
*scalar_costs)
 
   ix86_vect_estimate_reg_pressure ();
 
+  for (int i = 0; i != 3; i++)
+    if (m_num_avx256_vec_perm[i]
+       && TARGET_AVX256_AVOID_VEC_PERM)
+      m_costs[i] = INT_MAX;
+
   vector_costs::finish_cost (scalar_costs);
 }
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index d5d54ee66040..f5204aa1ed23 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -439,6 +439,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
        ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
 #define TARGET_AVX256_SPLIT_REGS \
        ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS]
+#define TARGET_AVX256_AVOID_VEC_PERM \
+       ix86_tune_features[X86_TUNE_AVX256_AVOID_VEC_PERM]
 #define TARGET_AVX512_SPLIT_REGS \
        ix86_tune_features[X86_TUNE_AVX512_SPLIT_REGS]
 #define TARGET_GENERAL_REGS_SSE_SPILL \
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index b815b6dc255b..6ebb2fd3414e 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -558,7 +558,7 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, 
"256_unaligned_store_optimal"
 
 /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 
ops.  */
 DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",m_BDVER | m_BTVER2
-         | m_ZNVER1)
+         | m_ZNVER1 | m_CORE_ATOM)
 
 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
    the auto-vectorizer.  */
@@ -569,6 +569,11 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", 
m_BDVER | m_BTVER2
    instructions in the auto-vectorizer.  */
 DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
 
+/* X86_TUNE_AVX256_AVOID_VEC_PERM: Avoid using 256-bit cross-lane
+   vector permutation instructions in the auto-vectorizer.  */
+DEF_TUNE (X86_TUNE_AVX256_AVOID_VEC_PERM,
+        "avx256_avoid_vec_perm", m_CORE_ATOM)
+
 /* X86_TUNE_AVX512_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 
ops.  */
 DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)
 
diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c 
b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
new file mode 100644
index 000000000000..d4f00b3fb520
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=sierraforest -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" 
} } */
+
+int a[256], b[256];
+
+void __attribute__((noinline))
+foo (void)
+{
+  int i;
+  for (i = 0; i < 32; ++i)
+    {
+      b[i*8+0] = a[i*8+0];
+      b[i*8+1] = a[i*8+0];
+      b[i*8+2] = a[i*8+3];
+      b[i*8+3] = a[i*8+3];
+      b[i*8+4] = a[i*8+4];
+      b[i*8+5] = a[i*8+6];
+      b[i*8+6] = a[i*8+4];
+      b[i*8+7] = a[i*8+6];
+    }
+}

Reply via email to