The following adds a new target hook, targetm.vectorize.split_reduction,
which allows the target to specify a preferred mode in which to perform the
final reduction using either vector shifts or scalar extractions.  Until
that mode is reached the vector reduction result is reduced by recursively
combining lowparts and highparts.  This avoids lane-crossing operations
when doing AVX256 on Zen and Bulldozer and also speeds things up on Haswell
(I verified a ~20% speedup on Broadwell).
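For illustration, this is roughly the epilogue shape the hook is aiming for
on an AVX2 target, written with intrinsics instead of the GIMPLE the
vectorizer actually emits (the helper name is made up and not part of the
patch):

/* Illustrative only: the reduction epilogue the patch is after on an
   AVX2 target, expressed with intrinsics.  */
#include <immintrin.h>

static inline int
reduce_add_v8si (__m256i v)
{
  /* Split the 256-bit vector into low and high 128-bit halves and
     combine them - no lane-crossing permute needed.  */
  __m128i lo = _mm256_castsi256_si128 (v);
  __m128i hi = _mm256_extracti128_si256 (v, 1);
  __m128i sum = _mm_add_epi32 (lo, hi);
  /* Finish with whole-vector shifts inside the SSE-sized register.  */
  sum = _mm_add_epi32 (sum, _mm_srli_si128 (sum, 8));
  sum = _mm_add_epi32 (sum, _mm_srli_si128 (sum, 4));
  return _mm_cvtsi128_si32 (sum);
}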
Thus the patch implements the target hook on x86 to _always_ prefer SSE
modes for the final reduction.

For the testcase in the bugzilla

int sumint(const int arr[]) {
  arr = __builtin_assume_aligned(arr, 64);
  int sum=0;
  for (int i=0 ; i<1024 ; i++)
    sum+=arr[i];
  return sum;
}

this changes -O3 -mavx512f code from

sumint:
.LFB0:
        .cfi_startproc
        vpxord  %zmm0, %zmm0, %zmm0
        leaq    4096(%rdi), %rax
        .p2align 4,,10
        .p2align 3
.L2:
        vpaddd  (%rdi), %zmm0, %zmm0
        addq    $64, %rdi
        cmpq    %rdi, %rax
        jne     .L2
        vpxord  %zmm1, %zmm1, %zmm1
        vshufi32x4      $78, %zmm1, %zmm0, %zmm2
        vpaddd  %zmm2, %zmm0, %zmm0
        vmovdqa64       .LC0(%rip), %zmm2
        vpermi2d        %zmm1, %zmm0, %zmm2
        vpaddd  %zmm2, %zmm0, %zmm0
        vmovdqa64       .LC1(%rip), %zmm2
        vpermi2d        %zmm1, %zmm0, %zmm2
        vpaddd  %zmm2, %zmm0, %zmm0
        vmovdqa64       .LC2(%rip), %zmm2
        vpermi2d        %zmm1, %zmm0, %zmm2
        vpaddd  %zmm2, %zmm0, %zmm0
        vmovd   %xmm0, %eax

to

sumint:
.LFB0:
        .cfi_startproc
        vpxord  %zmm0, %zmm0, %zmm0
        leaq    4096(%rdi), %rax
        .p2align 4,,10
        .p2align 3
.L2:
        vpaddd  (%rdi), %zmm0, %zmm0
        addq    $64, %rdi
        cmpq    %rdi, %rax
        jne     .L2
        vextracti64x4   $0x1, %zmm0, %ymm1
        vpaddd  %ymm0, %ymm1, %ymm1
        vmovdqa %xmm1, %xmm0
        vextracti128    $1, %ymm1, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vpsrldq $8, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vpsrldq $4, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vmovd   %xmm0, %eax

and for -O3 -mavx2 from

sumint:
.LFB0:
        .cfi_startproc
        vpxor   %xmm0, %xmm0, %xmm0
        leaq    4096(%rdi), %rax
        .p2align 4,,10
        .p2align 3
.L2:
        vpaddd  (%rdi), %ymm0, %ymm0
        addq    $32, %rdi
        cmpq    %rdi, %rax
        jne     .L2
        vpxor   %xmm1, %xmm1, %xmm1
        vperm2i128      $33, %ymm1, %ymm0, %ymm2
        vpaddd  %ymm2, %ymm0, %ymm0
        vperm2i128      $33, %ymm1, %ymm0, %ymm2
        vpalignr        $8, %ymm0, %ymm2, %ymm2
        vpaddd  %ymm2, %ymm0, %ymm0
        vperm2i128      $33, %ymm1, %ymm0, %ymm1
        vpalignr        $4, %ymm0, %ymm1, %ymm1
        vpaddd  %ymm1, %ymm0, %ymm0
        vmovd   %xmm0, %eax

to

sumint:
.LFB0:
        .cfi_startproc
        vpxor   %xmm0, %xmm0, %xmm0
        leaq    4096(%rdi), %rax
        .p2align 4,,10
        .p2align 3
.L2:
        vpaddd  (%rdi), %ymm0, %ymm0
        addq    $32, %rdi
        cmpq    %rdi, %rax
        jne     .L2
        vmovdqa %xmm0, %xmm1
        vextracti128    $1, %ymm0, %xmm0
        vpaddd  %xmm0, %xmm1, %xmm0
        vpsrldq $8, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vpsrldq $4, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vmovd   %xmm0, %eax
        vzeroupper
        ret

which besides being faster is also smaller (fewer prefixes).

SPEC 2k6 results on Haswell (thus AVX2) are neutral.  As the patch merely
affects reduction vectorization epilogues I didn't expect big effects
except for loops that do not run many iterations (more likely with AVX512).

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Ok for trunk?

The PR mentions some more tricks to optimize the sequence but those look
like backend-only optimizations.

Thanks,
Richard.

2017-11-28  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/80846
        * target.def (split_reduction): New target hook.
        * targhooks.c (default_split_reduction): New function.
        * targhooks.h (default_split_reduction): Declare.
        * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
        target requests first reduce vectors by combining low and high
        parts.
        * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
        (get_vectype_for_scalar_type_and_size): Export.
        * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
        * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
        * doc/tm.texi: Regenerate.

        i386/
        * config/i386/i386.c (ix86_split_reduction): Implement
        TARGET_VECTORIZE_SPLIT_REDUCTION.

        * gcc.target/i386/pr80846-1.c: New testcase.
        * gcc.target/i386/pr80846-2.c: Likewise.
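For the AVX512 case the same recursive halving can be sketched with GNU C
generic vectors; the types and the helper below are purely illustrative
(the patch itself builds the equivalent GIMPLE via BIT_FIELD_REFs in
vect_create_epilog_for_reduction):

/* Illustrative sketch only: recursive low/high halving of a V16SI
   accumulator down to the preferred V4SI mode, then whole-vector
   shifts, written with GNU C vector extensions.  */
typedef int v16si __attribute__ ((vector_size (64)));
typedef int v8si  __attribute__ ((vector_size (32)));
typedef int v4si  __attribute__ ((vector_size (16)));

static int
reduce_add_v16si (v16si acc)
{
  /* Combine lower and upper halves until the SSE-sized mode is
     reached; this mirrors the lowpart/highpart extraction the new
     epilogue code performs.  */
  union { v16si v; v8si h[2]; } u16 = { acc };
  v8si s8 = u16.h[0] + u16.h[1];
  union { v8si v; v4si h[2]; } u8 = { s8 };
  v4si s4 = u8.h[0] + u8.h[1];
  /* Final reduction with whole-vector shifts in the V4SI vector.  */
  v4si m1 = { 2, 3, 2, 3 }, m2 = { 1, 1, 1, 1 };
  s4 += __builtin_shuffle (s4, m1);
  s4 += __builtin_shuffle (s4, m2);
  return s4[0];
}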
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c      (revision 255197)
+++ gcc/config/i386/i386.c      (working copy)
@@ -48864,6 +48864,36 @@ ix86_preferred_simd_mode (scalar_mode mo
     }
 }
 
+/* All CPUs prefer to avoid cross-lane operations so perform reductions
+   upper against lower halves up to SSE reg size.  */
+
+static machine_mode
+ix86_split_reduction (machine_mode mode)
+{
+  /* Reduce lowpart against highpart until we reach SSE reg width to
+     avoid cross-lane operations.  */
+  switch (mode)
+    {
+    case E_V16SImode:
+    case E_V8SImode:
+      return V4SImode;
+    case E_V32HImode:
+    case E_V16HImode:
+      return V8HImode;
+    case E_V64QImode:
+    case E_V32QImode:
+      return V16QImode;
+    case E_V16SFmode:
+    case E_V8SFmode:
+      return V4SFmode;
+    case E_V8DFmode:
+    case E_V4DFmode:
+      return V2DFmode;
+    default:
+      return mode;
+    }
+}
+
 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
    vectors.  If AVX512F is enabled then try vectorizing with 512bit,
    256bit and 128bit vectors.  */
@@ -50486,6 +50516,9 @@ ix86_run_selftests (void)
 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
   ix86_preferred_simd_mode
+#undef TARGET_VECTORIZE_SPLIT_REDUCTION
+#define TARGET_VECTORIZE_SPLIT_REDUCTION \
+  ix86_split_reduction
 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
   ix86_autovectorize_vector_sizes
Index: gcc/doc/tm.texi
===================================================================
--- gcc/doc/tm.texi     (revision 255197)
+++ gcc/doc/tm.texi     (working copy)
@@ -5844,6 +5844,13 @@ equal to @code{word_mode}, because the v
 transformations even in absence of specialized @acronym{SIMD} hardware.
 @end deftypefn
 
+@deftypefn {Target Hook} machine_mode TARGET_VECTORIZE_SPLIT_REDUCTION (machine_mode)
+This hook should return the preferred mode to split the final reduction
+step on @var{mode} to.  The reduction is then carried out reducing upper
+against lower halves of vectors recursively until the specified mode is
+reached.  The default is @var{mode} which means no splitting.
+@end deftypefn
+
 @deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (void)
 This hook should return a mask of sizes that should be iterated over
 after trying to autovectorize using the vector size derived from the
Index: gcc/doc/tm.texi.in
===================================================================
--- gcc/doc/tm.texi.in  (revision 255197)
+++ gcc/doc/tm.texi.in  (working copy)
@@ -4091,6 +4091,8 @@ address;  but often a machine-dependent
 
 @hook TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 
+@hook TARGET_VECTORIZE_SPLIT_REDUCTION
+
 @hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
 
 @hook TARGET_VECTORIZE_GET_MASK_MODE
Index: gcc/target.def
===================================================================
--- gcc/target.def      (revision 255197)
+++ gcc/target.def      (working copy)
@@ -1875,6 +1875,17 @@ transformations even in absence of speci
  (scalar_mode mode),
  default_preferred_simd_mode)
 
+/* Returns the preferred mode for splitting SIMD reductions to.  */
+DEFHOOK
+(split_reduction,
+ "This hook should return the preferred mode to split the final reduction\n\
+step on @var{mode} to.  The reduction is then carried out reducing upper\n\
+against lower halves of vectors recursively until the specified mode is\n\
+reached.  The default is @var{mode} which means no splitting.",
+ machine_mode,
+ (machine_mode),
+ default_split_reduction)
+
 /* Returns a mask of vector sizes to iterate over when auto-vectorizing
    after processing the preferred one derived from preferred_simd_mode.  */
 DEFHOOK
Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c     (revision 255197)
+++ gcc/targhooks.c     (working copy)
@@ -1281,6 +1281,14 @@ default_preferred_simd_mode (scalar_mode
   return word_mode;
 }
 
+/* By default do not split reductions further.  */
+
+machine_mode
+default_split_reduction (machine_mode mode)
+{
+  return mode;
+}
+
 /* By default only the size derived from the preferred vector mode
    is tried.  */
Index: gcc/targhooks.h
===================================================================
--- gcc/targhooks.h     (revision 255197)
+++ gcc/targhooks.h     (working copy)
@@ -108,6 +108,7 @@ default_builtin_support_vector_misalignm
                                              const_tree, int, bool);
 extern machine_mode default_preferred_simd_mode (scalar_mode mode);
+extern machine_mode default_split_reduction (machine_mode);
 extern unsigned int default_autovectorize_vector_sizes (void);
 extern opt_machine_mode default_get_mask_mode (unsigned, unsigned);
 extern void *default_init_cost (struct loop *);
Index: gcc/testsuite/gcc.target/i386/pr80846-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr80846-1.c   (nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr80846-1.c   (working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f" } */
+
+int sumint(const int arr[]) {
+  arr = __builtin_assume_aligned(arr, 64);
+  int sum=0;
+  for (int i=0 ; i<1024 ; i++)
+    sum+=arr[i];
+  return sum;
+}
+
+/* { dg-final { scan-assembler-times "vextracti" 2 } } */
Index: gcc/testsuite/gcc.target/i386/pr80846-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr80846-2.c   (nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr80846-2.c   (working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2" } */
+
+int sumint(const int arr[]) {
+  arr = __builtin_assume_aligned(arr, 64);
+  int sum=0;
+  for (int i=0 ; i<1024 ; i++)
+    sum+=arr[i];
+  return sum;
+}
+
+/* { dg-final { scan-assembler-times "vextracti" 1 } } */
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c        (revision 255197)
+++ gcc/tree-vect-loop.c        (working copy)
@@ -4994,38 +4994,126 @@ vect_create_epilog_for_reduction (vec<tr
     }
   else
     {
-      bool reduce_with_shift = have_whole_vector_shift (mode);
-      int element_bitsize = tree_to_uhwi (bitsize);
-      int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+      bool reduce_with_shift;
       tree vec_temp;
-
+
       /* COND reductions all do the final reduction with MAX_EXPR.  */
       if (code == COND_EXPR)
        code = MAX_EXPR;
 
-      /* Regardless of whether we have a whole vector shift, if we're
-        emulating the operation via tree-vect-generic, we don't want
-        to use it.  Only the first round of the reduction is likely
-        to still be profitable via emulation.  */
-      /* ??? It might be better to emit a reduction tree code here, so that
-        tree-vect-generic can expand the first round via bit tricks.  */
-      if (!VECTOR_MODE_P (mode))
-       reduce_with_shift = false;
+      /* See if the target wants to do the final (shift) reduction
+        in a vector mode of smaller size and first reduce upper/lower
+        halves against each other.  */
+      enum machine_mode mode1 = mode;
+      tree vectype1 = vectype;
+      unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+      unsigned sz1 = sz;
+      if (!slp_reduc
+         && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
+       sz1 = GET_MODE_SIZE (mode1);
+
+      vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
+      reduce_with_shift = have_whole_vector_shift (mode1);
+      if (!VECTOR_MODE_P (mode1))
+       reduce_with_shift = false;
       else
-       {
-         optab optab = optab_for_tree_code (code, vectype, optab_default);
-         if (optab_handler (optab, mode) == CODE_FOR_nothing)
-           reduce_with_shift = false;
-       }
+       {
+         optab optab = optab_for_tree_code (code, vectype1, optab_default);
+         if (optab_handler (optab, mode1) == CODE_FOR_nothing)
+           reduce_with_shift = false;
+       }
+
+      /* First reduce the vector to the desired vector size we should
+        do shift reduction on by combining upper and lower halves.  */
+      new_temp = new_phi_result;
+      while (sz > sz1)
+       {
+         gcc_assert (!slp_reduc);
+         sz /= 2;
+         vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
+
+         /* The target has to make sure we support lowpart/highpart
+            extraction, either via direct vector extract or through
+            an integer mode punning.  */
+         tree dst1, dst2;
+         if (convert_optab_handler (vec_extract_optab,
+                                    TYPE_MODE (TREE_TYPE (new_temp)),
+                                    TYPE_MODE (vectype1))
+             != CODE_FOR_nothing)
+           {
+             /* Extract sub-vectors directly once vec_extract becomes
+                a conversion optab.  */
+             dst1 = make_ssa_name (vectype1);
+             epilog_stmt
+               = gimple_build_assign (dst1, BIT_FIELD_REF,
+                                      build3 (BIT_FIELD_REF, vectype1,
+                                              new_temp, TYPE_SIZE (vectype1),
+                                              bitsize_int (0)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             dst2 = make_ssa_name (vectype1);
+             epilog_stmt
+               = gimple_build_assign (dst2, BIT_FIELD_REF,
+                                      build3 (BIT_FIELD_REF, vectype1,
+                                              new_temp, TYPE_SIZE (vectype1),
+                                              bitsize_int (sz * BITS_PER_UNIT)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+           }
+         else
+           {
+             /* Extract via punning to appropriately sized integer mode
+                vector.  */
+             tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
+                                                           1);
+             tree etype = build_vector_type (eltype, 2);
+             gcc_assert (convert_optab_handler (vec_extract_optab,
+                                                TYPE_MODE (etype),
+                                                TYPE_MODE (eltype))
+                         != CODE_FOR_nothing);
+             tree tem = make_ssa_name (etype);
+             epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
+                                                build1 (VIEW_CONVERT_EXPR,
+                                                        etype, new_temp));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             new_temp = tem;
+             tem = make_ssa_name (eltype);
+             epilog_stmt
+               = gimple_build_assign (tem, BIT_FIELD_REF,
+                                      build3 (BIT_FIELD_REF, eltype,
+                                              new_temp, TYPE_SIZE (eltype),
+                                              bitsize_int (0)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             dst1 = make_ssa_name (vectype1);
+             epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
+                                                build1 (VIEW_CONVERT_EXPR,
+                                                        vectype1, tem));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             tem = make_ssa_name (eltype);
+             epilog_stmt
+               = gimple_build_assign (tem, BIT_FIELD_REF,
+                                      build3 (BIT_FIELD_REF, eltype,
+                                              new_temp, TYPE_SIZE (eltype),
+                                              bitsize_int (sz * BITS_PER_UNIT)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             dst2 = make_ssa_name (vectype1);
+             epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
+                                                build1 (VIEW_CONVERT_EXPR,
+                                                        vectype1, tem));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+           }
+
+         new_temp = make_ssa_name (vectype1);
+         epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
+         gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+       }
 
       if (reduce_with_shift && !slp_reduc)
        {
-         int nelements = vec_size_in_bits / element_bitsize;
+         int nelements = TYPE_VECTOR_SUBPARTS (vectype1);
          auto_vec_perm_indices sel (nelements);
          int elt_offset;
 
-         tree zero_vec = build_zero_cst (vectype);
+         tree zero_vec = build_zero_cst (vectype1);
          /* Case 2: Create:
             for (offset = nelements/2; offset >= 1; offset/=2)
                {
@@ -5039,15 +5127,15 @@ vect_create_epilog_for_reduction (vec<tr
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Reduce using vector shifts\n");
 
-         vec_dest = vect_create_destination_var (scalar_dest, vectype);
-         new_temp = new_phi_result;
+         mode1 = TYPE_MODE (vectype1);
+         vec_dest = vect_create_destination_var (scalar_dest, vectype1);
          for (elt_offset = nelements / 2;
               elt_offset >= 1;
               elt_offset /= 2)
            {
              sel.truncate (0);
-             calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
-             tree mask = vect_gen_perm_mask_any (vectype, sel);
+             calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
+             tree mask = vect_gen_perm_mask_any (vectype1, sel);
              epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
                                                 new_temp, zero_vec, mask);
              new_name = make_ssa_name (vec_dest, epilog_stmt);
@@ -5092,7 +5180,8 @@ vect_create_epilog_for_reduction (vec<tr
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Reduce using scalar code.\n");
 
-      vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+      int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
+      int element_bitsize = tree_to_uhwi (bitsize);
       FOR_EACH_VEC_ELT (new_phis, i, new_phi)
        {
          int bit_offset;
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c       (revision 255197)
+++ gcc/tree-vect-stmts.c       (working copy)
@@ -6514,7 +6514,7 @@ vect_gen_perm_mask_any (tree vectype, ve
   mask_elt_type = lang_hooks.types.type_for_mode
     (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
-  mask_type = get_vectype_for_scalar_type (mask_elt_type);
+  mask_type = get_same_sized_vectype (mask_elt_type, vectype);
 
   auto_vec<tree, 32> mask_elts (nunits);
   for (unsigned int i = 0; i < nunits; ++i)
@@ -9065,7 +9065,7 @@ free_stmt_vec_info (gimple *stmt)
    Returns the vector type corresponding to SCALAR_TYPE and SIZE
    as supported by the target.  */
 
-static tree
+tree
 get_vectype_for_scalar_type_and_size (tree scalar_type, unsigned size)
 {
   tree orig_scalar_type = scalar_type;
Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h       (revision 255197)
+++ gcc/tree-vectorizer.h       (working copy)
@@ -1151,6 +1151,7 @@ extern bool vect_can_advance_ivs_p (loop
 /* In tree-vect-stmts.c.  */
 extern unsigned int current_vector_size;
 extern tree get_vectype_for_scalar_type (tree);
+extern tree get_vectype_for_scalar_type_and_size (tree, unsigned);
 extern tree get_mask_type_for_scalar_type (tree);
 extern tree get_same_sized_vectype (tree, tree);
 extern bool vect_is_simple_use (tree, vec_info *, gimple **,