The following adds a new target hook, targetm.vectorize.split_reduction,
which allows the target to specify a preferred mode in which to perform the
final reduction using either vector shifts or scalar extractions.  Until
that mode is reached the vector reduction result is reduced by recursively
combining lowparts and highparts.  This avoids lane-crossing operations
when doing AVX256 on Zen and Bulldozer and also speeds things up on Haswell
(I verified a ~20% speedup on Broadwell).
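For illustration, this is roughly the epilogue shape the hook is aiming for
on an AVX2 target, written with intrinsics instead of the GIMPLE the
vectorizer actually emits (the helper name is made up and not part of the
patch):

/* Illustrative only: the reduction epilogue the patch is after on an
   AVX2 target, expressed with intrinsics.  */
#include <immintrin.h>

static inline int
reduce_add_v8si (__m256i v)
{
  /* Split the 256-bit vector into low and high 128-bit halves and
     combine them - no lane-crossing permute needed.  */
  __m128i lo = _mm256_castsi256_si128 (v);
  __m128i hi = _mm256_extracti128_si256 (v, 1);
  __m128i sum = _mm_add_epi32 (lo, hi);
  /* Finish with whole-vector shifts inside the SSE-sized register.  */
  sum = _mm_add_epi32 (sum, _mm_srli_si128 (sum, 8));
  sum = _mm_add_epi32 (sum, _mm_srli_si128 (sum, 4));
  return _mm_cvtsi128_si32 (sum);
}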
Thus the patch implements the target hook on x86 to _always_ prefer SSE
modes for the final reduction.

For the testcase in the bugzilla

int sumint(const int arr[]) {
  arr = __builtin_assume_aligned(arr, 64);
  int sum=0;
  for (int i=0 ; i<1024 ; i++)
    sum+=arr[i];
  return sum;
}

this changes -O3 -mavx512f code from

sumint:
.LFB0:
        .cfi_startproc
        vpxord  %zmm0, %zmm0, %zmm0
        leaq    4096(%rdi), %rax
        .p2align 4,,10
        .p2align 3
.L2:
        vpaddd  (%rdi), %zmm0, %zmm0
        addq    $64, %rdi
        cmpq    %rdi, %rax
        jne     .L2
        vpxord  %zmm1, %zmm1, %zmm1
        vshufi32x4      $78, %zmm1, %zmm0, %zmm2
        vpaddd  %zmm2, %zmm0, %zmm0
        vmovdqa64       .LC0(%rip), %zmm2
        vpermi2d        %zmm1, %zmm0, %zmm2
        vpaddd  %zmm2, %zmm0, %zmm0
        vmovdqa64       .LC1(%rip), %zmm2
        vpermi2d        %zmm1, %zmm0, %zmm2
        vpaddd  %zmm2, %zmm0, %zmm0
        vmovdqa64       .LC2(%rip), %zmm2
        vpermi2d        %zmm1, %zmm0, %zmm2
        vpaddd  %zmm2, %zmm0, %zmm0
        vmovd   %xmm0, %eax

to

sumint:
.LFB0:
        .cfi_startproc
        vpxord  %zmm0, %zmm0, %zmm0
        leaq    4096(%rdi), %rax
        .p2align 4,,10
        .p2align 3
.L2:
        vpaddd  (%rdi), %zmm0, %zmm0
        addq    $64, %rdi
        cmpq    %rdi, %rax
        jne     .L2
        vextracti64x4   $0x1, %zmm0, %ymm1
        vpaddd  %ymm0, %ymm1, %ymm1
        vmovdqa %xmm1, %xmm0
        vextracti128    $1, %ymm1, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vpsrldq $8, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vpsrldq $4, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vmovd   %xmm0, %eax

and for -O3 -mavx2 from

sumint:
.LFB0:
        .cfi_startproc
        vpxor   %xmm0, %xmm0, %xmm0
        leaq    4096(%rdi), %rax
        .p2align 4,,10
        .p2align 3
.L2:
        vpaddd  (%rdi), %ymm0, %ymm0
        addq    $32, %rdi
        cmpq    %rdi, %rax
        jne     .L2
        vpxor   %xmm1, %xmm1, %xmm1
        vperm2i128      $33, %ymm1, %ymm0, %ymm2
        vpaddd  %ymm2, %ymm0, %ymm0
        vperm2i128      $33, %ymm1, %ymm0, %ymm2
        vpalignr        $8, %ymm0, %ymm2, %ymm2
        vpaddd  %ymm2, %ymm0, %ymm0
        vperm2i128      $33, %ymm1, %ymm0, %ymm1
        vpalignr        $4, %ymm0, %ymm1, %ymm1
        vpaddd  %ymm1, %ymm0, %ymm0
        vmovd   %xmm0, %eax

to

sumint:
.LFB0:
        .cfi_startproc
        vpxor   %xmm0, %xmm0, %xmm0
        leaq    4096(%rdi), %rax
        .p2align 4,,10
        .p2align 3
.L2:
        vpaddd  (%rdi), %ymm0, %ymm0
        addq    $32, %rdi
        cmpq    %rdi, %rax
        jne     .L2
        vmovdqa %xmm0, %xmm1
        vextracti128    $1, %ymm0, %xmm0
        vpaddd  %xmm0, %xmm1, %xmm0
        vpsrldq $8, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vpsrldq $4, %xmm0, %xmm1
        vpaddd  %xmm1, %xmm0, %xmm0
        vmovd   %xmm0, %eax
        vzeroupper
        ret

which besides being faster is also smaller (fewer prefixes).

SPEC 2k6 results on Haswell (thus AVX2) are neutral.  As the patch merely
affects reduction vectorization epilogues I didn't expect big effects
except for loops that do not run many iterations (more likely with AVX512).

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Ok for trunk?

The PR mentions some more tricks to optimize the sequence but those look
like backend-only optimizations.

Thanks,
Richard.

2017-11-28  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/80846
        * target.def (split_reduction): New target hook.
        * targhooks.c (default_split_reduction): New function.
        * targhooks.h (default_split_reduction): Declare.
        * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
        target requests first reduce vectors by combining low and high
        parts.
        * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
        (get_vectype_for_scalar_type_and_size): Export.
        * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
        * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
        * doc/tm.texi: Regenerate.

        i386/
        * config/i386/i386.c (ix86_split_reduction): Implement
        TARGET_VECTORIZE_SPLIT_REDUCTION.

        * gcc.target/i386/pr80846-1.c: New testcase.
        * gcc.target/i386/pr80846-2.c: Likewise.
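For the AVX512 case the same recursive halving can be sketched with GNU C
generic vectors; the types and the helper below are purely illustrative
(the patch itself builds the equivalent GIMPLE via BIT_FIELD_REFs in
vect_create_epilog_for_reduction):

/* Illustrative sketch only: recursive low/high halving of a V16SI
   accumulator down to the preferred V4SI mode, then whole-vector
   shifts, written with GNU C vector extensions.  */
typedef int v16si __attribute__ ((vector_size (64)));
typedef int v8si  __attribute__ ((vector_size (32)));
typedef int v4si  __attribute__ ((vector_size (16)));

static int
reduce_add_v16si (v16si acc)
{
  /* Combine lower and upper halves until the SSE-sized mode is
     reached; this mirrors the lowpart/highpart extraction the new
     epilogue code performs.  */
  union { v16si v; v8si h[2]; } u16 = { acc };
  v8si s8 = u16.h[0] + u16.h[1];
  union { v8si v; v4si h[2]; } u8 = { s8 };
  v4si s4 = u8.h[0] + u8.h[1];
  /* Final reduction with whole-vector shifts in the V4SI vector.  */
  v4si m1 = { 2, 3, 2, 3 }, m2 = { 1, 1, 1, 1 };
  s4 += __builtin_shuffle (s4, m1);
  s4 += __builtin_shuffle (s4, m2);
  return s4[0];
}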
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c      (revision 255197)
+++ gcc/config/i386/i386.c      (working copy)
@@ -48864,6 +48864,36 @@ ix86_preferred_simd_mode (scalar_mode mo
     }
 }
 
+/* All CPUs prefer to avoid cross-lane operations so perform reductions
+   upper against lower halves up to SSE reg size.  */
+
+static machine_mode
+ix86_split_reduction (machine_mode mode)
+{
+  /* Reduce lowpart against highpart until we reach SSE reg width to
+     avoid cross-lane operations.  */
+  switch (mode)
+    {
+    case E_V16SImode:
+    case E_V8SImode:
+      return V4SImode;
+    case E_V32HImode:
+    case E_V16HImode:
+      return V8HImode;
+    case E_V64QImode:
+    case E_V32QImode:
+      return V16QImode;
+    case E_V16SFmode:
+    case E_V8SFmode:
+      return V4SFmode;
+    case E_V8DFmode:
+    case E_V4DFmode:
+      return V2DFmode;
+    default:
+      return mode;
+    }
+}
+
 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
    vectors.  If AVX512F is enabled then try vectorizing with 512bit,
    256bit and 128bit vectors.  */
@@ -50486,6 +50516,9 @@ ix86_run_selftests (void)
 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
   ix86_preferred_simd_mode
+#undef TARGET_VECTORIZE_SPLIT_REDUCTION
+#define TARGET_VECTORIZE_SPLIT_REDUCTION \
+  ix86_split_reduction
 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
   ix86_autovectorize_vector_sizes
Index: gcc/doc/tm.texi
===================================================================
--- gcc/doc/tm.texi     (revision 255197)
+++ gcc/doc/tm.texi     (working copy)
@@ -5844,6 +5844,13 @@ equal to @code{word_mode}, because the v
 transformations even in absence of specialized @acronym{SIMD} hardware.
 @end deftypefn
 
+@deftypefn {Target Hook} machine_mode TARGET_VECTORIZE_SPLIT_REDUCTION (machine_mode)
+This hook should return the preferred mode to split the final reduction
+step on @var{mode} to.  The reduction is then carried out reducing upper
+against lower halves of vectors recursively until the specified mode is
+reached.  The default is @var{mode} which means no splitting.
+@end deftypefn
+
 @deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (void)
 This hook should return a mask of sizes that should be iterated over
 after trying to autovectorize using the vector size derived from the
Index: gcc/doc/tm.texi.in
===================================================================
--- gcc/doc/tm.texi.in  (revision 255197)
+++ gcc/doc/tm.texi.in  (working copy)
@@ -4091,6 +4091,8 @@ address;  but often a machine-dependent
 
 @hook TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 
+@hook TARGET_VECTORIZE_SPLIT_REDUCTION
+
 @hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
 
 @hook TARGET_VECTORIZE_GET_MASK_MODE
Index: gcc/target.def
===================================================================
--- gcc/target.def      (revision 255197)
+++ gcc/target.def      (working copy)
@@ -1875,6 +1875,17 @@ transformations even in absence of speci
  (scalar_mode mode),
  default_preferred_simd_mode)
 
+/* Returns the preferred mode for splitting SIMD reductions to.  */
+DEFHOOK
+(split_reduction,
+ "This hook should return the preferred mode to split the final reduction\n\
+step on @var{mode} to.  The reduction is then carried out reducing upper\n\
+against lower halves of vectors recursively until the specified mode is\n\
+reached.  The default is @var{mode} which means no splitting.",
+ machine_mode,
+ (machine_mode),
+ default_split_reduction)
+
 /* Returns a mask of vector sizes to iterate over when auto-vectorizing
    after processing the preferred one derived from preferred_simd_mode.  */
 DEFHOOK
Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c     (revision 255197)
+++ gcc/targhooks.c     (working copy)
@@ -1281,6 +1281,14 @@ default_preferred_simd_mode (scalar_mode
   return word_mode;
 }
 
+/* By default do not split reductions further.  */
+
+machine_mode
+default_split_reduction (machine_mode mode)
+{
+  return mode;
+}
+
 /* By default only the size derived from the preferred vector mode
    is tried.  */
Index: gcc/targhooks.h
===================================================================
--- gcc/targhooks.h     (revision 255197)
+++ gcc/targhooks.h     (working copy)
@@ -108,6 +108,7 @@ default_builtin_support_vector_misalignm
                                              const_tree, int, bool);
 extern machine_mode default_preferred_simd_mode (scalar_mode mode);
+extern machine_mode default_split_reduction (machine_mode);
 extern unsigned int default_autovectorize_vector_sizes (void);
 extern opt_machine_mode default_get_mask_mode (unsigned, unsigned);
 extern void *default_init_cost (struct loop *);
Index: gcc/testsuite/gcc.target/i386/pr80846-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr80846-1.c   (nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr80846-1.c   (working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f" } */
+
+int sumint(const int arr[]) {
+  arr = __builtin_assume_aligned(arr, 64);
+  int sum=0;
+  for (int i=0 ; i<1024 ; i++)
+    sum+=arr[i];
+  return sum;
+}
+
+/* { dg-final { scan-assembler-times "vextracti" 2 } } */
Index: gcc/testsuite/gcc.target/i386/pr80846-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr80846-2.c   (nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr80846-2.c   (working copy)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2" } */
+
+int sumint(const int arr[]) {
+  arr = __builtin_assume_aligned(arr, 64);
+  int sum=0;
+  for (int i=0 ; i<1024 ; i++)
+    sum+=arr[i];
+  return sum;
+}
+
+/* { dg-final { scan-assembler-times "vextracti" 1 } } */
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c        (revision 255197)
+++ gcc/tree-vect-loop.c        (working copy)
@@ -4994,38 +4994,126 @@ vect_create_epilog_for_reduction (vec<tr
     }
   else
     {
-      bool reduce_with_shift = have_whole_vector_shift (mode);
-      int element_bitsize = tree_to_uhwi (bitsize);
-      int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+      bool reduce_with_shift;
       tree vec_temp;
-
+
       /* COND reductions all do the final reduction with MAX_EXPR.  */
       if (code == COND_EXPR)
        code = MAX_EXPR;
 
-      /* Regardless of whether we have a whole vector shift, if we're
-        emulating the operation via tree-vect-generic, we don't want
-        to use it.  Only the first round of the reduction is likely
-        to still be profitable via emulation.  */
-      /* ??? It might be better to emit a reduction tree code here, so that
-        tree-vect-generic can expand the first round via bit tricks.  */
-      if (!VECTOR_MODE_P (mode))
-       reduce_with_shift = false;
+      /* See if the target wants to do the final (shift) reduction
+        in a vector mode of smaller size and first reduce upper/lower
+        halves against each other.  */
+      enum machine_mode mode1 = mode;
+      tree vectype1 = vectype;
+      unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+      unsigned sz1 = sz;
+      if (!slp_reduc
+         && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
+       sz1 = GET_MODE_SIZE (mode1);
+
+      vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
+      reduce_with_shift = have_whole_vector_shift (mode1);
+      if (!VECTOR_MODE_P (mode1))
+       reduce_with_shift = false;
       else
-       {
-         optab optab = optab_for_tree_code (code, vectype, optab_default);
-         if (optab_handler (optab, mode) == CODE_FOR_nothing)
-           reduce_with_shift = false;
-       }
+       {
+         optab optab = optab_for_tree_code (code, vectype1, optab_default);
+         if (optab_handler (optab, mode1) == CODE_FOR_nothing)
+           reduce_with_shift = false;
+       }
+
+      /* First reduce the vector to the desired vector size we should
+        do shift reduction on by combining upper and lower halves.  */
+      new_temp = new_phi_result;
+      while (sz > sz1)
+       {
+         gcc_assert (!slp_reduc);
+         sz /= 2;
+         vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
+
+         /* The target has to make sure we support lowpart/highpart
+            extraction, either via direct vector extract or through
+            an integer mode punning.  */
+         tree dst1, dst2;
+         if (convert_optab_handler (vec_extract_optab,
+                                    TYPE_MODE (TREE_TYPE (new_temp)),
+                                    TYPE_MODE (vectype1))
+             != CODE_FOR_nothing)
+           {
+             /* Extract sub-vectors directly once vec_extract becomes
+                a conversion optab.  */
+             dst1 = make_ssa_name (vectype1);
+             epilog_stmt
+               = gimple_build_assign (dst1, BIT_FIELD_REF,
+                                      build3 (BIT_FIELD_REF, vectype1,
+                                              new_temp, TYPE_SIZE (vectype1),
+                                              bitsize_int (0)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             dst2 = make_ssa_name (vectype1);
+             epilog_stmt
+               = gimple_build_assign (dst2, BIT_FIELD_REF,
+                                      build3 (BIT_FIELD_REF, vectype1,
+                                              new_temp, TYPE_SIZE (vectype1),
+                                              bitsize_int (sz * BITS_PER_UNIT)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+           }
+         else
+           {
+             /* Extract via punning to appropriately sized integer mode
+                vector.  */
+             tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
+                                                           1);
+             tree etype = build_vector_type (eltype, 2);
+             gcc_assert (convert_optab_handler (vec_extract_optab,
+                                                TYPE_MODE (etype),
+                                                TYPE_MODE (eltype))
+                         != CODE_FOR_nothing);
+             tree tem = make_ssa_name (etype);
+             epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
+                                                build1 (VIEW_CONVERT_EXPR,
+                                                        etype, new_temp));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             new_temp = tem;
+             tem = make_ssa_name (eltype);
+             epilog_stmt
+               = gimple_build_assign (tem, BIT_FIELD_REF,
+                                      build3 (BIT_FIELD_REF, eltype,
+                                              new_temp, TYPE_SIZE (eltype),
+                                              bitsize_int (0)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             dst1 = make_ssa_name (vectype1);
+             epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
+                                                build1 (VIEW_CONVERT_EXPR,
+                                                        vectype1, tem));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             tem = make_ssa_name (eltype);
+             epilog_stmt
+               = gimple_build_assign (tem, BIT_FIELD_REF,
+                                      build3 (BIT_FIELD_REF, eltype,
+                                              new_temp, TYPE_SIZE (eltype),
+                                              bitsize_int (sz * BITS_PER_UNIT)));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+             dst2 = make_ssa_name (vectype1);
+             epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
+                                                build1 (VIEW_CONVERT_EXPR,
+                                                        vectype1, tem));
+             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+           }
+
+         new_temp = make_ssa_name (vectype1);
+         epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
+         gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+       }
 
       if (reduce_with_shift && !slp_reduc)
        {
-         int nelements = vec_size_in_bits / element_bitsize;
+         int nelements = TYPE_VECTOR_SUBPARTS (vectype1);
          auto_vec_perm_indices sel (nelements);
          int elt_offset;
 
-         tree zero_vec = build_zero_cst (vectype);
+         tree zero_vec = build_zero_cst (vectype1);
          /* Case 2: Create:
             for (offset = nelements/2; offset >= 1; offset/=2)
                {
@@ -5039,15 +5127,15 @@ vect_create_epilog_for_reduction (vec<tr
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Reduce using vector shifts\n");
 
-         vec_dest = vect_create_destination_var (scalar_dest, vectype);
-         new_temp = new_phi_result;
+         mode1 = TYPE_MODE (vectype1);
+         vec_dest = vect_create_destination_var (scalar_dest, vectype1);
          for (elt_offset = nelements / 2;
               elt_offset >= 1;
               elt_offset /= 2)
            {
              sel.truncate (0);
-             calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
-             tree mask = vect_gen_perm_mask_any (vectype, sel);
+             calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
+             tree mask = vect_gen_perm_mask_any (vectype1, sel);
              epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
                                                 new_temp, zero_vec, mask);
              new_name = make_ssa_name (vec_dest, epilog_stmt);
@@ -5092,7 +5180,8 @@ vect_create_epilog_for_reduction (vec<tr
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Reduce using scalar code.\n");
 
-      vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+      int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
+      int element_bitsize = tree_to_uhwi (bitsize);
       FOR_EACH_VEC_ELT (new_phis, i, new_phi)
        {
          int bit_offset;
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c       (revision 255197)
+++ gcc/tree-vect-stmts.c       (working copy)
@@ -6514,7 +6514,7 @@ vect_gen_perm_mask_any (tree vectype, ve
   mask_elt_type = lang_hooks.types.type_for_mode
     (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
-  mask_type = get_vectype_for_scalar_type (mask_elt_type);
+  mask_type = get_same_sized_vectype (mask_elt_type, vectype);
 
   auto_vec<tree, 32> mask_elts (nunits);
   for (unsigned int i = 0; i < nunits; ++i)
@@ -9065,7 +9065,7 @@ free_stmt_vec_info (gimple *stmt)
    Returns the vector type corresponding to SCALAR_TYPE and SIZE
    as supported by the target.  */
 
-static tree
+tree
 get_vectype_for_scalar_type_and_size (tree scalar_type, unsigned size)
 {
   tree orig_scalar_type = scalar_type;
Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h       (revision 255197)
+++ gcc/tree-vectorizer.h       (working copy)
@@ -1151,6 +1151,7 @@ extern bool vect_can_advance_ivs_p (loop
 /* In tree-vect-stmts.c.  */
 extern unsigned int current_vector_size;
 extern tree get_vectype_for_scalar_type (tree);
+extern tree get_vectype_for_scalar_type_and_size (tree, unsigned);
 extern tree get_mask_type_for_scalar_type (tree);
 extern tree get_same_sized_vectype (tree, tree);
 extern bool vect_is_simple_use (tree, vec_info *, gimple **,