On Wed, 10 Jan 2018, Richard Biener wrote:

> On Tue, 9 Jan 2018, Jeff Law wrote:
> 
> > On 01/05/2018 02:01 AM, Richard Biener wrote:
> > > On Tue, 28 Nov 2017, Richard Biener wrote:
> > > 
> > >> 
> > >> The following adds a new target hook, targetm.vectorize.split_reduction,
> > >> which allows the target to specify a preferred mode to perform the
> > >> final reduction on using either vector shifts or scalar extractions.
> > >> Up to that mode the vector reduction result is reduced by combining
> > >> lowparts and highparts recursively.  This avoids lane-crossing operations
> > >> when doing AVX256 on Zen and Bulldozer and also speeds up things on
> > >> Haswell (I verified ~20% speedup on Broadwell).
> > >> 
> > >> Thus the patch implements the target hook on x86 to _always_ prefer
> > >> SSE modes for the final reduction.
> > >> 
> > >> For the testcase in the bugzilla
> > >> 
> > >> int sumint(const int arr[]) {
> > >>     arr = __builtin_assume_aligned(arr, 64);
> > >>     int sum=0;
> > >>     for (int i=0 ; i<1024 ; i++)
> > >>       sum+=arr[i];
> > >>     return sum;
> > >> }
> > >> 
> > >> this changes -O3 -mavx512f code from
> > >> 
> > >> sumint:
> > >> .LFB0:
> > >>         .cfi_startproc
> > >>         vpxord  %zmm0, %zmm0, %zmm0
> > >>         leaq    4096(%rdi), %rax
> > >>         .p2align 4,,10
> > >>         .p2align 3
> > >> .L2:
> > >>         vpaddd  (%rdi), %zmm0, %zmm0
> > >>         addq    $64, %rdi
> > >>         cmpq    %rdi, %rax
> > >>         jne     .L2
> > >>         vpxord  %zmm1, %zmm1, %zmm1
> > >>         vshufi32x4      $78, %zmm1, %zmm0, %zmm2
> > >>         vpaddd  %zmm2, %zmm0, %zmm0
> > >>         vmovdqa64       .LC0(%rip), %zmm2
> > >>         vpermi2d        %zmm1, %zmm0, %zmm2
> > >>         vpaddd  %zmm2, %zmm0, %zmm0
> > >>         vmovdqa64       .LC1(%rip), %zmm2
> > >>         vpermi2d        %zmm1, %zmm0, %zmm2
> > >>         vpaddd  %zmm2, %zmm0, %zmm0
> > >>         vmovdqa64       .LC2(%rip), %zmm2
> > >>         vpermi2d        %zmm1, %zmm0, %zmm2
> > >>         vpaddd  %zmm2, %zmm0, %zmm0
> > >>         vmovd   %xmm0, %eax
> > >> 
> > >> to
> > >> 
> > >> sumint:
> > >> .LFB0:
> > >>         .cfi_startproc
> > >>         vpxord  %zmm0, %zmm0, %zmm0
> > >>         leaq    4096(%rdi), %rax
> > >>         .p2align 4,,10
> > >>         .p2align 3
> > >> .L2:
> > >>         vpaddd  (%rdi), %zmm0, %zmm0
> > >>         addq    $64, %rdi
> > >>         cmpq    %rdi, %rax
> > >>         jne     .L2
> > >>         vextracti64x4   $0x1, %zmm0, %ymm1
> > >>         vpaddd  %ymm0, %ymm1, %ymm1
> > >>         vmovdqa %xmm1, %xmm0
> > >>         vextracti128    $1, %ymm1, %xmm1
> > >>         vpaddd  %xmm1, %xmm0, %xmm0
> > >>         vpsrldq $8, %xmm0, %xmm1
> > >>         vpaddd  %xmm1, %xmm0, %xmm0
> > >>         vpsrldq $4, %xmm0, %xmm1
> > >>         vpaddd  %xmm1, %xmm0, %xmm0
> > >>         vmovd   %xmm0, %eax
> > >> 
> > >> and for -O3 -mavx2 from
> > >> 
> > >> sumint:
> > >> .LFB0:
> > >>         .cfi_startproc
> > >>         vpxor   %xmm0, %xmm0, %xmm0
> > >>         leaq    4096(%rdi), %rax
> > >>         .p2align 4,,10
> > >>         .p2align 3
> > >> .L2:
> > >>         vpaddd  (%rdi), %ymm0, %ymm0
> > >>         addq    $32, %rdi
> > >>         cmpq    %rdi, %rax
> > >>         jne     .L2
> > >>         vpxor   %xmm1, %xmm1, %xmm1
> > >>         vperm2i128      $33, %ymm1, %ymm0, %ymm2
> > >>         vpaddd  %ymm2, %ymm0, %ymm0
> > >>         vperm2i128      $33, %ymm1, %ymm0, %ymm2
> > >>         vpalignr        $8, %ymm0, %ymm2, %ymm2
> > >>         vpaddd  %ymm2, %ymm0, %ymm0
> > >>         vperm2i128      $33, %ymm1, %ymm0, %ymm1
> > >>         vpalignr        $4, %ymm0, %ymm1, %ymm1
> > >>         vpaddd  %ymm1, %ymm0, %ymm0
> > >>         vmovd   %xmm0, %eax
> > >> 
> > >> to
> > >> 
> > >> sumint:
> > >> .LFB0:
> > >>         .cfi_startproc
> > >>         vpxor   %xmm0, %xmm0, %xmm0
> > >>         leaq    4096(%rdi), %rax
> > >>         .p2align 4,,10
> > >>         .p2align 3
> > >> .L2:
> > >>         vpaddd  (%rdi), %ymm0, %ymm0
> > >>         addq    $32, %rdi
> > >>         cmpq    %rdi, %rax
> > >>         jne     .L2
> > >>         vmovdqa %xmm0, %xmm1
> > >>         vextracti128    $1, %ymm0, %xmm0
> > >>         vpaddd  %xmm0, %xmm1, %xmm0
> > >>         vpsrldq $8, %xmm0, %xmm1
> > >>         vpaddd  %xmm1, %xmm0, %xmm0
> > >>         vpsrldq $4, %xmm0, %xmm1
> > >>         vpaddd  %xmm1, %xmm0, %xmm0
> > >>         vmovd   %xmm0, %eax
> > >>         vzeroupper
> > >>         ret
> > >> 
> > >> which besides being faster is also smaller (fewer prefixes).
> > >> 
> > >> SPEC 2k6 results on Haswell (thus AVX2) are neutral.  As it merely
> > >> affects reduction vectorization epilogues I didn't expect big effects
> > >> except for loops that do not run much (more likely with AVX512).
> > >> 
> > >> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> > >> 
> > >> Ok for trunk?
> > > 
> > > Ping?
> > > 
> > > Richard.
> > > 
> > >> The PR mentions some more tricks to optimize the sequence but
> > >> those look like backend-only optimizations.
> > >> 
> > >> Thanks,
> > >> Richard.
> > >> 
> > >> 2017-11-28  Richard Biener  <rguent...@suse.de>
> > >> 
> > >>         PR tree-optimization/80846
> > >>         * target.def (split_reduction): New target hook.
> > >>         * targhooks.c (default_split_reduction): New function.
> > >>         * targhooks.h (default_split_reduction): Declare.
> > >>         * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
> > >>         target requests, first reduce vectors by combining low and high
> > >>         parts.
> > >>         * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
> > >>         (get_vectype_for_scalar_type_and_size): Export.
> > >>         * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
> > >> 
> > >>         * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
> > >>         * doc/tm.texi: Regenerate.
> > >> 
> > >>         i386/
> > >>         * config/i386/i386.c (ix86_split_reduction): Implement
> > >>         TARGET_VECTORIZE_SPLIT_REDUCTION.
> > >> 
> > >>         * gcc.target/i386/pr80846-1.c: New testcase.
> > >>         * gcc.target/i386/pr80846-2.c: Likewise.
> > 
> > I've got no objections here and you know this code far better than I.
> 
> I was really looking for x86 maintainer ack for the target hook
> implementation which I just quote here for reference again:
> 
> +/* All CPUs prefer to avoid cross-lane operations so reduce
> +   upper against lower halves up to SSE reg size.  */
> +
> +static machine_mode
> +ix86_split_reduction (machine_mode mode)
> +{
> +  /* Reduce lowpart against highpart until we reach SSE reg width to
> +     avoid cross-lane operations.  */
> +  switch (mode)
> +    {
> +    case E_V16SImode:
> +    case E_V8SImode:
> +      return V4SImode;
> +    case E_V32HImode:
> +    case E_V16HImode:
> +      return V8HImode;
> +    case E_V64QImode:
> +    case E_V32QImode:
> +      return V16QImode;
> +    case E_V16SFmode:
> +    case E_V8SFmode:
> +      return V4SFmode;
> +    case E_V8DFmode:
> +    case E_V4DFmode:
> +      return V2DFmode;
> +    default:
> +      return mode;
> +    }
> +}
> 
> this means we'll do [zmm -> ymm] -> xmm (looks like I forgot VnDImode
> in the above list, consider that added).
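
As an illustration only (not part of the patch, and the helper name below
is made up), the zmm -> ymm -> xmm epilogue this produces for a V16SImode
accumulator corresponds roughly to the following intrinsics sketch:

  #include <immintrin.h>

  /* Roughly the split-reduction epilogue for a V16SImode accumulator:
     reduce zmm -> ymm -> xmm by adding upper and lower halves, then
     finish within the SSE register via byte shifts.  Needs -mavx512f.  */
  static int
  reduce_v16si (__m512i acc)
  {
    /* Combine upper and lower 256-bit halves.  */
    __m256i lo256 = _mm512_castsi512_si256 (acc);
    __m256i hi256 = _mm512_extracti64x4_epi64 (acc, 1);
    __m256i sum256 = _mm256_add_epi32 (lo256, hi256);
    /* Combine upper and lower 128-bit halves.  */
    __m128i lo128 = _mm256_castsi256_si128 (sum256);
    __m128i hi128 = _mm256_extracti128_si256 (sum256, 1);
    __m128i sum128 = _mm_add_epi32 (lo128, hi128);
    /* Final reduction with in-register shifts.  */
    sum128 = _mm_add_epi32 (sum128, _mm_srli_si128 (sum128, 8));
    sum128 = _mm_add_epi32 (sum128, _mm_srli_si128 (sum128, 4));
    return _mm_cvtsi128_si32 (sum128);
  }
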
For the record, here's what I am re-testing (crossed with some SVE
changes again...).  Also, we have regressed in code generation for the
pr80846-1.c testcase I am going to add and for some odd reason have
started spilling the accumulator:

        vmovdqa64       %zmm4, -64(%rsp)
        .p2align 4,,10
        .p2align 3
.L2:
        vmovdqa64       -64(%rsp), %zmm3
        vpaddd  (%rdi), %zmm3, %zmm2
        addq    $64, %rdi
        vmovdqa64       %zmm2, -64(%rsp)
        cmpq    %rdi, %rax
        jne     .L2
        vmovdqa -32(%rsp), %ymm5
        vpaddd  -64(%rsp), %ymm5, %ymm0
        vmovdqa %xmm0, %xmm1
        vextracti128    $1, %ymm0, %xmm0

the load to %ymm5 should have been a vextract instead of a load from
the stack.  The RTL before ira/lra is

(insn 16 15 18 4 (set (reg:V8SI 107)
        (plus:V8SI (subreg:V8SI (reg:V16SI 94 [ vect_sum_11.4 ]) 32)
            (subreg:V8SI (reg:V16SI 94 [ vect_sum_11.4 ]) 0))) 2998 {*addv8si3}
     (expr_list:REG_DEAD (reg:V16SI 94 [ vect_sum_11.4 ])
        (nil)))

and that doesn't get handled properly for some reason anymore (it worked
a few months ago), so we spill reg:V16SI 94.  We should sort this out
given the vectorizer creates such subregs elsewhere already, so I'm going
to ignore this "regression" and leave the testcase in FAIL state.

The vectorizer produces

  _20 = BIT_FIELD_REF <vect_sum_11.4_5, 256, 0>;
  _13 = BIT_FIELD_REF <vect_sum_11.4_5, 256, 256>;
  _18 = _20 + _13;

here after checking

          if (convert_optab_handler (vec_extract_optab,
                                     TYPE_MODE (TREE_TYPE (new_temp)),
                                     TYPE_MODE (vectype1))
              != CODE_FOR_nothing)
            {

and it does the v8si -> v4si step via

  _19 = VIEW_CONVERT_EXPR<vector(2) uint128_t>(_18);
  _25 = BIT_FIELD_REF <_19, 128, 0>;
  _26 = VIEW_CONVERT_EXPR<vector(4) int>(_25);
  _27 = BIT_FIELD_REF <_19, 128, 128>;
  _28 = VIEW_CONVERT_EXPR<vector(4) int>(_27);
  _29 = _26 + _28;

where the target doesn't handle the vec_extract with vector modes.
So something broke that avx512 upper/lower half extraction in the
target (or the RTL that feeds it).

Re-bootstrap & regtest running on x86_64-unknown-linux-gnu.

I'm going to commit this with Jeff's approval as the x86 maintainers
don't seem to care too much here.  I've done the benchmarking after all.

Will open a bug for the above after checkin.

Thanks,
Richard.

2018-01-11  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/80846
        * target.def (split_reduction): New target hook.
        * targhooks.c (default_split_reduction): New function.
        * targhooks.h (default_split_reduction): Declare.
        * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
        target requests, first reduce vectors by combining low and high
        parts.
        * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
        (get_vectype_for_scalar_type_and_size): Export.
        * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.

        * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
        * doc/tm.texi: Regenerate.

        i386/
        * config/i386/i386.c (ix86_split_reduction): Implement
        TARGET_VECTORIZE_SPLIT_REDUCTION.

        * gcc.target/i386/pr80846-1.c: New testcase.
        * gcc.target/i386/pr80846-2.c: Likewise.

Index: gcc/target.def
===================================================================
--- gcc/target.def      (revision 256526)
+++ gcc/target.def      (working copy)
@@ -1890,6 +1890,17 @@ transformations even in absence of speci
  (scalar_mode mode),
  default_preferred_simd_mode)
 
+/* Returns the preferred mode for splitting SIMD reductions to.  */
+DEFHOOK
+(split_reduction,
+ "This hook should return the preferred mode to split the final reduction\n\
+step on @var{mode} to.  The reduction is then carried out reducing upper\n\
+against lower halves of vectors recursively until the specified mode is\n\
+reached.  The default is @var{mode} which means no splitting.",
+ machine_mode,
+ (machine_mode),
+ default_split_reduction)
+
 /* Returns a mask of vector sizes to iterate over when auto-vectorizing
    after processing the preferred one derived from preferred_simd_mode.  */
 DEFHOOK
Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c     (revision 256526)
+++ gcc/targhooks.c     (working copy)
@@ -1283,6 +1283,14 @@ default_preferred_simd_mode (scalar_mode
   return word_mode;
 }
 
+/* By default do not split reductions further.  */
+
+machine_mode
+default_split_reduction (machine_mode mode)
+{
+  return mode;
+}
+
 /* By default only the size derived from the preferred vector mode
    is tried.  */
 
Index: gcc/targhooks.h
===================================================================
--- gcc/targhooks.h     (revision 256526)
+++ gcc/targhooks.h     (working copy)
@@ -108,6 +108,7 @@ default_builtin_support_vector_misalignm
                                                      const_tree, int, bool);
 
 extern machine_mode default_preferred_simd_mode (scalar_mode mode);
+extern machine_mode default_split_reduction (machine_mode);
 extern void default_autovectorize_vector_sizes (vector_sizes *);
 extern opt_machine_mode default_get_mask_mode (poly_uint64, poly_uint64);
 extern void *default_init_cost (struct loop *);
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c        (revision 256526)
+++ gcc/tree-vect-loop.c        (working copy)
@@ -5062,12 +5062,7 @@ vect_create_epilog_for_reduction (vec<tr
     }
   else
     {
-      bool reduce_with_shift = have_whole_vector_shift (mode);
-      int element_bitsize = tree_to_uhwi (bitsize);
-      /* Enforced by vectorizable_reduction, which disallows SLP reductions
-        for variable-length vectors and also requires direct target support
-        for loop reductions.  */
-      int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+      bool reduce_with_shift;
       tree vec_temp;
 
       /* COND reductions all do the final reduction with MAX_EXPR
@@ -5081,30 +5076,125 @@ vect_create_epilog_for_reduction (vec<tr
          code = MAX_EXPR;
        }
 
-      /* Regardless of whether we have a whole vector shift, if we're
-        emulating the operation via tree-vect-generic, we don't want
-        to use it.  Only the first round of the reduction is likely
-        to still be profitable via emulation.  */
-      /* ??? It might be better to emit a reduction tree code here, so that
-        tree-vect-generic can expand the first round via bit tricks.  */
-      if (!VECTOR_MODE_P (mode))
-       reduce_with_shift = false;
+      /* See if the target wants to do the final (shift) reduction
+        in a vector mode of smaller size and first reduce upper/lower
+        halves against each other.  */
+      enum machine_mode mode1 = mode;
+      tree vectype1 = vectype;
+      unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+      unsigned sz1 = sz;
+      if (!slp_reduc
+         && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
+       sz1 = GET_MODE_SIZE (mode1).to_constant ();
+
+      vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
+      reduce_with_shift = have_whole_vector_shift (mode1);
+      if (!VECTOR_MODE_P (mode1))
+       reduce_with_shift = false;
       else
-       {
-          optab optab = optab_for_tree_code (code, vectype, optab_default);
-          if (optab_handler (optab, mode) == CODE_FOR_nothing)
-            reduce_with_shift = false;
-        }
+       {
+         optab optab = optab_for_tree_code (code, vectype1, optab_default);
+         if (optab_handler (optab, mode1) == CODE_FOR_nothing)
+           reduce_with_shift = false;
+       }
+
+      /* First reduce the vector to the desired vector size we should
+        do shift reduction on by combining upper and lower halves.  */
+      new_temp = new_phi_result;
+      while (sz > sz1)
+       {
+         gcc_assert (!slp_reduc);
+         sz /= 2;
+         vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
+
+         /* The target has to make sure we support lowpart/highpart
+            extraction, either via direct vector extract or through
+            an integer mode punning.  */
+         tree dst1, dst2;
+         if (convert_optab_handler (vec_extract_optab,
+                                    TYPE_MODE (TREE_TYPE (new_temp)),
+                                    TYPE_MODE (vectype1))
+             != CODE_FOR_nothing)
+           {
+             /* Extract sub-vectors directly once vec_extract becomes
+                a conversion optab.  */
+             dst1 = make_ssa_name (vectype1);
+             epilog_stmt
+                 = gimple_build_assign (dst1, BIT_FIELD_REF,
+                                        build3 (BIT_FIELD_REF, vectype1,
+                                                new_temp, TYPE_SIZE (vectype1),
+                                                bitsize_int (0)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             dst2 = make_ssa_name (vectype1);
+             epilog_stmt
+                 = gimple_build_assign (dst2, BIT_FIELD_REF,
+                                        build3 (BIT_FIELD_REF, vectype1,
+                                                new_temp, TYPE_SIZE (vectype1),
+                                                bitsize_int (sz * BITS_PER_UNIT)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+           }
+         else
+           {
+             /* Extract via punning to appropriately sized integer mode
+                vector.  */
+             tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
+                                                           1);
+             tree etype = build_vector_type (eltype, 2);
+             gcc_assert (convert_optab_handler (vec_extract_optab,
+                                                TYPE_MODE (etype),
+                                                TYPE_MODE (eltype))
+                         != CODE_FOR_nothing);
+             tree tem = make_ssa_name (etype);
+             epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
+                                                build1 (VIEW_CONVERT_EXPR,
+                                                        etype, new_temp));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             new_temp = tem;
+             tem = make_ssa_name (eltype);
+             epilog_stmt
+                 = gimple_build_assign (tem, BIT_FIELD_REF,
+                                        build3 (BIT_FIELD_REF, eltype,
+                                                new_temp, TYPE_SIZE (eltype),
+                                                bitsize_int (0)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             dst1 = make_ssa_name (vectype1);
+             epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
+                                                build1 (VIEW_CONVERT_EXPR,
+                                                        vectype1, tem));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             tem = make_ssa_name (eltype);
+             epilog_stmt
+                 = gimple_build_assign (tem, BIT_FIELD_REF,
+                                        build3 (BIT_FIELD_REF, eltype,
+                                                new_temp, TYPE_SIZE (eltype),
+                                                bitsize_int (sz * BITS_PER_UNIT)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             dst2 = make_ssa_name (vectype1);
+             epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
+                                                build1 (VIEW_CONVERT_EXPR,
+                                                        vectype1, tem));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+           }
+
+         new_temp = make_ssa_name (vectype1);
+         epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
+         gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+       }
 
       if (reduce_with_shift && !slp_reduc)
-        {
-          int nelements = vec_size_in_bits / element_bitsize;
+        {
+         int element_bitsize = tree_to_uhwi (bitsize);
+         /* Enforced by vectorizable_reduction, which disallows SLP reductions
+            for variable-length vectors and also requires direct target support
+            for loop reductions.  */
+         int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
+         int nelements = vec_size_in_bits / element_bitsize;
          vec_perm_builder sel;
          vec_perm_indices indices;
          int elt_offset;
 
-          tree zero_vec = build_zero_cst (vectype);
+          tree zero_vec = build_zero_cst (vectype1);
          /* Case 2: Create:
             for (offset = nelements/2; offset >= 1; offset/=2)
                {
@@ -5118,15 +5208,15 @@ vect_create_epilog_for_reduction (vec<tr
           dump_printf_loc (MSG_NOTE, vect_location,
                           "Reduce using vector shifts\n");
 
-          vec_dest = vect_create_destination_var (scalar_dest, vectype);
-          new_temp = new_phi_result;
+         mode1 = TYPE_MODE (vectype1);
+          vec_dest = vect_create_destination_var (scalar_dest, vectype1);
          for (elt_offset = nelements / 2;
               elt_offset >= 1;
               elt_offset /= 2)
            {
              calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
              indices.new_vector (sel, 2, nelements);
-             tree mask = vect_gen_perm_mask_any (vectype, indices);
+             tree mask = vect_gen_perm_mask_any (vectype1, indices);
              epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
                                                 new_temp, zero_vec, mask);
              new_name = make_ssa_name (vec_dest, epilog_stmt);
@@ -5171,7 +5261,8 @@ vect_create_epilog_for_reduction (vec<tr
           dump_printf_loc (MSG_NOTE, vect_location,
                           "Reduce using scalar code.\n");
 
-          vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+         int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
+         int element_bitsize = tree_to_uhwi (bitsize);
          FOR_EACH_VEC_ELT (new_phis, i, new_phi)
            {
              int bit_offset;
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c       (revision 256526)
+++ gcc/tree-vect-stmts.c       (working copy)
@@ -9068,7 +9068,7 @@ free_stmt_vec_info (gimple *stmt)
    Returns the vector type corresponding to SCALAR_TYPE and SIZE as supported
    by the target.  */
 
-static tree
+tree
 get_vectype_for_scalar_type_and_size (tree scalar_type, poly_uint64 size)
 {
   tree orig_scalar_type = scalar_type;
Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h       (revision 256526)
+++ gcc/tree-vectorizer.h       (working copy)
@@ -1209,6 +1209,7 @@ extern bool vect_can_advance_ivs_p (loop
 /* In tree-vect-stmts.c.  */
 extern poly_uint64 current_vector_size;
 extern tree get_vectype_for_scalar_type (tree);
+extern tree get_vectype_for_scalar_type_and_size (tree, poly_uint64);
 extern tree get_mask_type_for_scalar_type (tree);
 extern tree get_same_sized_vectype (tree, tree);
 extern bool vect_is_simple_use (tree, vec_info *, gimple **,
Index: gcc/doc/tm.texi.in
===================================================================
--- gcc/doc/tm.texi.in  (revision 256526)
+++ gcc/doc/tm.texi.in  (working copy)
@@ -4093,6 +4093,8 @@ address;  but often a machine-dependent
 
 @hook TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 
+@hook TARGET_VECTORIZE_SPLIT_REDUCTION
+
 @hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
 
 @hook TARGET_VECTORIZE_GET_MASK_MODE
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c      (revision 256526)
+++ gcc/config/i386/i386.c      (working copy)
@@ -48969,6 +48969,39 @@ ix86_preferred_simd_mode (scalar_mode mo
     }
 }
 
+/* All CPUs prefer to avoid cross-lane operations so reduce
+   upper against lower halves up to SSE reg size.  */
+
+static machine_mode
+ix86_split_reduction (machine_mode mode)
+{
+  /* Reduce lowpart against highpart until we reach SSE reg width to
+     avoid cross-lane operations.  */
+  switch (mode)
+    {
+    case E_V8DImode:
+    case E_V4DImode:
+      return V2DImode;
+    case E_V16SImode:
+    case E_V8SImode:
+      return V4SImode;
+    case E_V32HImode:
+    case E_V16HImode:
+      return V8HImode;
+    case E_V64QImode:
+    case E_V32QImode:
+      return V16QImode;
+    case E_V16SFmode:
+    case E_V8SFmode:
+      return V4SFmode;
+    case E_V8DFmode:
+    case E_V4DFmode:
+      return V2DFmode;
+    default:
+      return mode;
+    }
+}
+
 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
    vectors.  If AVX512F is enabled then try vectorizing with 512bit,
    256bit and 128bit vectors.  */
@@ -50601,6 +50634,9 @@ ix86_run_selftests (void)
 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
   ix86_preferred_simd_mode
+#undef TARGET_VECTORIZE_SPLIT_REDUCTION
+#define TARGET_VECTORIZE_SPLIT_REDUCTION \
+  ix86_split_reduction
 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
   ix86_autovectorize_vector_sizes
Index: gcc/testsuite/gcc.target/i386/pr80846-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr80846-1.c   (nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr80846-1.c   (working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f" } */
+
+int sumint(const int arr[]) {
+    arr = __builtin_assume_aligned(arr, 64);
+    int sum=0;
+    for (int i=0 ; i<1024 ; i++)
+      sum+=arr[i];
+    return sum;
+}
+
+/* { dg-final { scan-assembler-times "vextracti" 2 } } */
Index: gcc/testsuite/gcc.target/i386/pr80846-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr80846-2.c   (nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr80846-2.c   (working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2" } */
+
+int sumint(const int arr[]) {
+    arr = __builtin_assume_aligned(arr, 64);
+    int sum=0;
+    for (int i=0 ; i<1024 ; i++)
+      sum+=arr[i];
+    return sum;
+}
+
+/* { dg-final { scan-assembler-times "vextracti" 1 } } */
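
For anyone wanting to exercise just the new testcases from a built
objdir, the usual DejaGnu invocation should work, something like:

  make check-gcc RUNTESTFLAGS="i386.exp=pr80846*"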