This is the vectorizer part of the fix.  Currently, when we need to
permute a load in contiguous accesses, we also load the "gap" between
two instances of a group.  When the gap is large this causes quite
excessive code generation (it is fixed up by DCE / forwprop later,
but it confuses intermediate passes and hurts compile time).
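
Purely for illustration (this is not the PR testcase, and the names
are made up): a grouped access of the following shape has a gap
between group instances that is far larger than any vector, so
generating loads just to cover the gap yields nothing but a long
chain of pointless IV increments.

  struct S { double x, y, pad[1022]; };

  void
  copy_swapped (struct S *s, double *out, int n)
  {
    for (int i = 0; i < n; i++)
      {
	/* Only x and y are accessed; pad[] is the "gap".  The swapped
	   order is meant to require a load permutation if this SLPs.  */
	out[2 * i] = s[i].y;
	out[2 * i + 1] = s[i].x;
      }
  }

Here consecutive group instances are 1024 doubles apart while only two
elements per instance are actually needed.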

The following addresses this in the SLP case by simply skipping code
generation for such loads.  This avoids the huge IV increment chain
that causes all of the follow-up issues.
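
As a rough standalone model of the new skip in vectorizable_load below
(the names group_size, group_gap, nloads, lnel and group_el mirror the
locals in that function, but this is only a sketch of the arithmetic,
not GCC code):

  #include <stdio.h>

  /* Sketch of the condition added below: a chunk of nloads * lnel group
     elements is skipped (a NULL_TREE placeholder is pushed instead of
     emitting loads) when its first element lies past
     group_size - group_gap and the chunk does not reach the end of the
     group.  */
  static void
  model (unsigned group_size, unsigned group_gap,
	 unsigned nloads, unsigned lnel, unsigned chunks)
  {
    unsigned group_el = 0;
    for (unsigned c = 0; c < chunks; c++)
      {
	unsigned el = group_el % group_size;
	int skip = el > group_size - group_gap
		   && el + nloads * lnel < group_size;
	printf ("chunk %u (group_el %u): %s\n",
		c, el, skip ? "skipped, gap only" : "loads emitted");
	group_el += nloads * lnel;
      }
  }

  int
  main (void)
  {
    /* Hypothetical numbers: group stride of 8 elements with a trailing
       gap of 6, one element loaded per chunk.  */
    model (8, 6, 1, 1, 16);
    return 0;
  }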

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2019-07-17  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/91178
        * tree-vect-stmts.c (get_group_load_store_type): For SLP
        loads with a gap larger than the vector size always use
        VMAT_STRIDED_SLP.
        (vectorizable_load): For VMAT_STRIDED_SLP with a permutation
        avoid loading vectors that are only contained in the gap
        and thus are not needed.

        * gcc.dg/torture/pr91178.c: New testcase.

Index: gcc/testsuite/gcc.dg/torture/pr91178.c
===================================================================
--- gcc/testsuite/gcc.dg/torture/pr91178.c      (nonexistent)
+++ gcc/testsuite/gcc.dg/torture/pr91178.c      (working copy)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+
+int a;
+extern int f[10][91125];
+int b[50];
+void c()
+{
+  for (int d = 6; d <= a; d++)
+    for (int e = 16; e <= 24; e++)
+      b[e] -= f[d][d];
+}
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c       (revision 273520)
+++ gcc/tree-vect-stmts.c       (working copy)
@@ -2267,6 +2267,14 @@ get_group_load_store_type (stmt_vec_info
                        / vect_get_scalar_dr_size (first_dr_info)))
            overrun_p = false;
 
+         /* If the gap at the end of the group exceeds a whole vector
+            in size use the strided SLP code which can skip code-generation
+            for the gap.  */
+         if (vls_type == VLS_LOAD && known_gt (gap, nunits))
+           *memory_access_type = VMAT_STRIDED_SLP;
+         else
+           *memory_access_type = VMAT_CONTIGUOUS;
+
          /* If the gap splits the vector in half and the target
             can do half-vector operations avoid the epilogue peeling
             by simply loading half of the vector only.  Usually
@@ -2274,7 +2282,8 @@ get_group_load_store_type (stmt_vec_info
          dr_alignment_support alignment_support_scheme;
          scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
          machine_mode vmode;
-         if (overrun_p
+         if (*memory_access_type == VMAT_CONTIGUOUS
+             && overrun_p
              && !masked_p
              && (((alignment_support_scheme
                      = vect_supportable_dr_alignment (first_dr_info, false)))
@@ -2297,7 +2306,6 @@ get_group_load_store_type (stmt_vec_info
                                 "Peeling for outer loop is not supported\n");
              return false;
            }
-         *memory_access_type = VMAT_CONTIGUOUS;
        }
     }
   else
@@ -8732,6 +8740,7 @@ vectorizable_load (stmt_vec_info stmt_in
       /* Checked by get_load_store_type.  */
       unsigned int const_nunits = nunits.to_constant ();
       unsigned HOST_WIDE_INT cst_offset = 0;
+      unsigned int group_gap = 0;
 
       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
       gcc_assert (!nested_in_vect_loop);
@@ -8749,6 +8758,7 @@ vectorizable_load (stmt_vec_info stmt_in
       if (slp && grouped_load)
        {
          group_size = DR_GROUP_SIZE (first_stmt_info);
+         group_gap = DR_GROUP_GAP (first_stmt_info);
          ref_type = get_group_alias_ptr_type (first_stmt_info);
        }
       else
@@ -8892,6 +8902,14 @@ vectorizable_load (stmt_vec_info stmt_in
          if (nloads > 1)
            vec_alloc (v, nloads);
          stmt_vec_info new_stmt_info = NULL;
+         if (slp && slp_perm
+             && (group_el % group_size) > group_size - group_gap
+             && (group_el % group_size) + nloads * lnel < group_size)
+           {
+             dr_chain.quick_push (NULL_TREE);
+             group_el += nloads * lnel;
+             continue;
+           }
          for (i = 0; i < nloads; i++)
            {
              tree this_off = build_int_cst (TREE_TYPE (alias_off),
