According to the Intel SOM [1], for Crestmont most 256-bit Intel AVX2
instructions can be decomposed into two independent 128-bit
micro-operations, except for a subset of Intel AVX2 instructions known
as cross-lane operations, which can only compute the result for an
element by utilizing one or more sources belonging to other elements.
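As a rough illustration (not part of the patch), the lane-split
property is visible with intrinsics: a lane-local operation such as
VPADDD reads each 128-bit half independently, while a cross-lane
permutation such as VPERMD may pull any source element into any
destination element:

  #include <immintrin.h>

  /* Lane-local: each 128-bit half of the result depends only on the
     same half of the sources, so the hardware can split this into
     two independent 128-bit micro-operations.  */
  __m256i
  lane_local (__m256i a, __m256i b)
  {
    return _mm256_add_epi32 (a, b);			/* VPADDD */
  }

  /* Cross-lane: any of the eight result elements may come from
     either 128-bit half of the source, so the two halves cannot
     execute independently.  */
  __m256i
  cross_lane (__m256i a, __m256i idx)
  {
    return _mm256_permutevar8x32_epi32 (a, idx);	/* VPERMD */
  }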
The 256-bit instructions listed below use more operand sources than
can be natively supported by a single reservation station within these
microarchitectures.  They are decomposed into two μops, where the
first μop resolves a subset of operand dependencies across two cycles.
The dependent second μop executes the 256-bit operation by using a
single 128-bit execution port for two consecutive cycles with a
five-cycle latency, for a total latency of seven cycles.

VPERM2I128 ymm1, ymm2, ymm3/m256, imm8
VPERM2F128 ymm1, ymm2, ymm3/m256, imm8
VPERMPD ymm1, ymm2/m256, imm8
VPERMPS ymm1, ymm2, ymm3/m256
VPERMD ymm1, ymm2, ymm3/m256
VPERMQ ymm1, ymm2/m256, imm8

Instead of setting the avx128_optimal tune for SRF, the patch adds a
new tune, avx256_avoid_vec_perm.  By default the vectorizer still uses
a 256-bit VF when the cost is profitable, but it falls back to 128-bit
whenever a 256-bit vec_perm would be needed for auto-vectorization.
Without vec_perm, the performance of 256-bit vectorization should be
similar to that of 128-bit (some benchmark results show it is even
better, since it enables more parallelism for conversion cases); see
the illustrative loop after the patch.

[1] https://www.intel.com/content/www/us/en/content-details/814198/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

gcc/ChangeLog:

	* config/i386/i386.cc (ix86_vector_costs::ix86_vector_costs):
	Add new member m_num_avx256_vec_perm.
	(ix86_vector_costs::add_stmt_cost): Record 256-bit vec_perm.
	(ix86_vector_costs::finish_cost): Prevent vectorization for
	TARGET_AVX256_AVOID_VEC_PERM when there's a 256-bit vec_perm
	instruction.
	* config/i386/i386.h (TARGET_AVX256_AVOID_VEC_PERM): New macro.
	* config/i386/x86-tune.def (X86_TUNE_AVX256_SPLIT_REGS): Add
	m_CORE_ATOM.
	(X86_TUNE_AVX256_AVOID_VEC_PERM): New tune.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx256_avoid_vec_perm.c: New test.
---
 gcc/config/i386/i386.cc                      | 14 +++++++++++++-
 gcc/config/i386/i386.h                       |  2 ++
 gcc/config/i386/x86-tune.def                 |  7 +++++-
 .../gcc.target/i386/avx256_avoid_vec_perm.c  | 22 +++++++++++++++++++
 4 files changed, 43 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 7dbae1d72e3..77567b233d5 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24816,12 +24816,15 @@ private:
      where we know it's not loaded from memory.  */
   unsigned m_num_gpr_needed[3];
   unsigned m_num_sse_needed[3];
+  /* Number of 256-bit vector permutations.  */
+  unsigned m_num_avx256_vec_perm[3];
 };
 
 ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
   : vector_costs (vinfo, costing_for_scalar),
     m_num_gpr_needed (),
-    m_num_sse_needed ()
+    m_num_sse_needed (),
+    m_num_avx256_vec_perm ()
 {
 }
 
@@ -25055,6 +25058,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   if (stmt_cost == -1)
     stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
+  if (kind == vec_perm && vectype
+      && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
+    m_num_avx256_vec_perm[where]++;
+
   /* Penalize DFmode vector operations for Bonnell.  */
   if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
       && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
@@ -25124,6 +25131,11 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
 
   ix86_vect_estimate_reg_pressure ();
 
+  for (int i = 0; i != 3; i++)
+    if (m_num_avx256_vec_perm[i]
+	&& TARGET_AVX256_AVOID_VEC_PERM)
+      m_costs[i] = INT_MAX;
+
   vector_costs::finish_cost (scalar_costs);
 }
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index f01f31d208a..d57a1ca3e5c 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -439,6 +439,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 	ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
 #define TARGET_AVX256_SPLIT_REGS \
 	ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS]
+#define TARGET_AVX256_AVOID_VEC_PERM \
+	ix86_tune_features[X86_TUNE_AVX256_AVOID_VEC_PERM]
 #define TARGET_AVX512_SPLIT_REGS \
 	ix86_tune_features[X86_TUNE_AVX512_SPLIT_REGS]
 #define TARGET_GENERAL_REGS_SSE_SPILL \
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index b815b6dc255..6ebb2fd3414 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -558,7 +558,7 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal"
 /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two
    AVX128 ops.  */
 DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",m_BDVER | m_BTVER2
-	  | m_ZNVER1)
+	  | m_ZNVER1 | m_CORE_ATOM)
 
 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
    the auto-vectorizer.  */
@@ -569,6 +569,11 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
    instructions in the auto-vectorizer.  */
 DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
 
+/* X86_TUNE_AVX256_AVOID_VEC_PERM: Avoid using 256-bit cross-lane
+   vector permutation instructions in the auto-vectorizer.  */
+DEF_TUNE (X86_TUNE_AVX256_AVOID_VEC_PERM,
+	  "avx256_avoid_vec_perm", m_CORE_ATOM)
+
 /* X86_TUNE_AVX512_SPLIT_REGS: if true, AVX512 ops are split into two
    AVX256 ops.  */
 DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)
diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
new file mode 100644
index 00000000000..d4f00b3fb52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=sierraforest -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */
+
+int a[256], b[256];
+
+void __attribute__((noinline))
+foo (void)
+{
+  int i;
+  for (i = 0; i < 32; ++i)
+    {
+      b[i*8+0] = a[i*8+0];
+      b[i*8+1] = a[i*8+0];
+      b[i*8+2] = a[i*8+3];
+      b[i*8+3] = a[i*8+3];
+      b[i*8+4] = a[i*8+4];
+      b[i*8+5] = a[i*8+6];
+      b[i*8+6] = a[i*8+4];
+      b[i*8+7] = a[i*8+6];
+    }
+}
-- 
2.31.1
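For contrast with the testcase above, here is a hypothetical companion
loop (not part of the patch): its accesses are purely contiguous, so
no vec_perm is generated and the new tune should leave it eligible for
32-byte vectors on SRF, assuming the 256-bit cost model still finds
them profitable:

  int a[256], b[256];

  /* Contiguous accesses: no permutation is required, so
     X86_TUNE_AVX256_AVOID_VEC_PERM does not force a 128-bit VF.  */
  void
  bar (void)
  {
    for (int i = 0; i < 256; ++i)
      b[i] = a[i] + 1;
  }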