[PATCH 4/5] Remove load interleaving code

Richard Biener Fri, 25 Jul 2025 01:30:04 -0700

The following removes the non-SLP load interleaving code which was
almost unused.


        * tree-vectorizer.h (vect_grouped_load_supported): Remove.
        (vect_transform_grouped_load): Likewise.
        (vect_record_grouped_load_vectors): Likewise.
        * tree-vect-data-refs.cc (vect_grouped_load_supported): Remove.
        (vect_permute_load_chain): Likewise.
        (vect_shift_permute_load_chain): Likewise.
        (vect_transform_grouped_load): Likewise.
        (vect_record_grouped_load_vectors): Likewise.
        * tree-vect-loop.cc (vect_analyze_loop_2): Do not check for
        load interleaving support when disregarding single-lane SLP.
        * tree-vect-stmts.cc (vectorizable_load): Remove comments
        about load interleaving.
---
 gcc/tree-vect-data-refs.cc | 747 -------------------------------------
 gcc/tree-vect-loop.cc      |   7 +-
 gcc/tree-vect-stmts.cc     |  35 +-
 gcc/tree-vectorizer.h      |   5 -
 4 files changed, 3 insertions(+), 791 deletions(-)

diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 3ff8be44b59..b38eecd7901 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -6581,126 +6581,6 @@ vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
 }
 
 
-/* Function vect_grouped_load_supported.
-
-   COUNT is the size of the load group (the number of statements plus the
-   number of gaps).  SINGLE_ELEMENT_P is true if there is actually
-   only one statement, with a gap of COUNT - 1.
-
-   Returns true if a suitable permute exists.  */
-
-bool
-vect_grouped_load_supported (tree vectype, bool single_element_p,
-                            unsigned HOST_WIDE_INT count)
-{
-  machine_mode mode = TYPE_MODE (vectype);
-
-  /* If this is single-element interleaving with an element distance
-     that leaves unused vector loads around punt - we at least create
-     very sub-optimal code in that case (and blow up memory,
-     see PR65518).  */
-  if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "single-element interleaving not supported "
-                        "for not adjacent vector loads\n");
-      return false;
-    }
-
-  /* vect_permute_load_chain requires the group size to be equal to 3 or
-     be a power of two.  */
-  if (count != 3 && exact_log2 (count) == -1)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "the size of the group of accesses"
-                        " is not a power of 2 or not equal to 3\n");
-      return false;
-    }
-
-  /* Check that the permutation is supported.  */
-  if (VECTOR_MODE_P (mode))
-    {
-      unsigned int i, j;
-      if (count == 3)
-       {
-         unsigned int nelt;
-         if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
-           {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "cannot handle groups of 3 loads for"
-                                " variable-length vectors\n");
-             return false;
-           }
-
-         vec_perm_builder sel (nelt, nelt, 1);
-         sel.quick_grow (nelt);
-         vec_perm_indices indices;
-         unsigned int k;
-         for (k = 0; k < 3; k++)
-           {
-             for (i = 0; i < nelt; i++)
-               if (3 * i + k < 2 * nelt)
-                 sel[i] = 3 * i + k;
-               else
-                 sel[i] = 0;
-             indices.new_vector (sel, 2, nelt);
-             if (!can_vec_perm_const_p (mode, mode, indices))
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "shuffle of 3 loads is not supported by"
-                                    " target\n");
-                 return false;
-               }
-             for (i = 0, j = 0; i < nelt; i++)
-               if (3 * i + k < 2 * nelt)
-                 sel[i] = i;
-               else
-                 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
-             indices.new_vector (sel, 2, nelt);
-             if (!can_vec_perm_const_p (mode, mode, indices))
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "shuffle of 3 loads is not supported by"
-                                    " target\n");
-                 return false;
-               }
-           }
-         return true;
-       }
-      else
-       {
-         /* If length is not equal to 3 then only power of 2 is supported.  */
-         gcc_assert (pow2p_hwi (count));
-         poly_uint64 nelt = GET_MODE_NUNITS (mode);
-
-         /* The encoding has a single stepped pattern.  */
-         vec_perm_builder sel (nelt, 1, 3);
-         sel.quick_grow (3);
-         for (i = 0; i < 3; i++)
-           sel[i] = i * 2;
-         vec_perm_indices indices (sel, 2, nelt);
-         if (can_vec_perm_const_p (mode, mode, indices))
-           {
-             for (i = 0; i < 3; i++)
-               sel[i] = i * 2 + 1;
-             indices.new_vector (sel, 2, nelt);
-             if (can_vec_perm_const_p (mode, mode, indices))
-               return true;
-           }
-        }
-    }
-
-  if (dump_enabled_p ())
-    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                    "extract even/odd not supported by target\n");
-  return false;
-}
-
 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
    of type VECTYPE.  MASKED_P says whether the masked form is needed.
    If it is available and ELSVALS is nonzero store the possible else values
@@ -6730,633 +6610,6 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
   return IFN_LAST;
 }
 
-/* Function vect_permute_load_chain.
-
-   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
-   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
-   the input data correctly.  Return the final references for loads in
-   RESULT_CHAIN.
-
-   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
-   The input is 4 vectors each containing 8 elements. We assign a number to each
-   element, the input sequence is:
-
-   1st vec:   0  1  2  3  4  5  6  7
-   2nd vec:   8  9 10 11 12 13 14 15
-   3rd vec:  16 17 18 19 20 21 22 23
-   4th vec:  24 25 26 27 28 29 30 31
-
-   The output sequence should be:
-
-   1st vec:  0 4  8 12 16 20 24 28
-   2nd vec:  1 5  9 13 17 21 25 29
-   3rd vec:  2 6 10 14 18 22 26 30
-   4th vec:  3 7 11 15 19 23 27 31
-
-   i.e., the first output vector should contain the first elements of each
-   interleaving group, etc.
-
-   We use extract_even/odd instructions to create such output.  The input of
-   each extract_even/odd operation is two vectors
-   1st vec    2nd vec
-   0 1 2 3    4 5 6 7
-
-   and the output is the vector of extracted even/odd elements.  The output of
-   extract_even will be:   0 2 4 6
-   and of extract_odd:     1 3 5 7
-
-
-   The permutation is done in log LENGTH stages.  In each stage extract_even
-   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
-   their order.  In our example,
-
-   E1: extract_even (1st vec, 2nd vec)
-   E2: extract_odd (1st vec, 2nd vec)
-   E3: extract_even (3rd vec, 4th vec)
-   E4: extract_odd (3rd vec, 4th vec)
-
-   The output for the first stage will be:
-
-   E1:  0  2  4  6  8 10 12 14
-   E2:  1  3  5  7  9 11 13 15
-   E3: 16 18 20 22 24 26 28 30
-   E4: 17 19 21 23 25 27 29 31
-
-   In order to proceed and create the correct sequence for the next stage (or
-   for the correct output, if the second stage is the last one, as in our
-   example), we first put the output of extract_even operation and then the
-   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
-   The input for the second stage is:
-
-   1st vec (E1):  0  2  4  6  8 10 12 14
-   2nd vec (E3): 16 18 20 22 24 26 28 30
-   3rd vec (E2):  1  3  5  7  9 11 13 15
-   4th vec (E4): 17 19 21 23 25 27 29 31
-
-   The output of the second stage:
-
-   E1: 0 4  8 12 16 20 24 28
-   E2: 2 6 10 14 18 22 26 30
-   E3: 1 5  9 13 17 21 25 29
-   E4: 3 7 11 15 19 23 27 31
-
-   And RESULT_CHAIN after reordering:
-
-   1st vec (E1):  0 4  8 12 16 20 24 28
-   2nd vec (E3):  1 5  9 13 17 21 25 29
-   3rd vec (E2):  2 6 10 14 18 22 26 30
-   4th vec (E4):  3 7 11 15 19 23 27 31.  */
-
-static void
-vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
-                        unsigned int length,
-                        stmt_vec_info stmt_info,
-                        gimple_stmt_iterator *gsi,
-                        vec<tree> *result_chain)
-{
-  tree data_ref, first_vect, second_vect;
-  tree perm_mask_even, perm_mask_odd;
-  tree perm3_mask_low, perm3_mask_high;
-  gimple *perm_stmt;
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  unsigned int i, j, log_length = exact_log2 (length);
-
-  result_chain->quick_grow (length);
-  memcpy (result_chain->address (), dr_chain.address (),
-         length * sizeof (tree));
-
-  if (length == 3)
-    {
-      /* vect_grouped_load_supported ensures that this is constant.  */
-      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
-      unsigned int k;
-
-      vec_perm_builder sel (nelt, nelt, 1);
-      sel.quick_grow (nelt);
-      vec_perm_indices indices;
-      for (k = 0; k < 3; k++)
-       {
-         for (i = 0; i < nelt; i++)
-           if (3 * i + k < 2 * nelt)
-             sel[i] = 3 * i + k;
-           else
-             sel[i] = 0;
-         indices.new_vector (sel, 2, nelt);
-         perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
-
-         for (i = 0, j = 0; i < nelt; i++)
-           if (3 * i + k < 2 * nelt)
-             sel[i] = i;
-           else
-             sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
-         indices.new_vector (sel, 2, nelt);
-         perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
-
-         first_vect = dr_chain[0];
-         second_vect = dr_chain[1];
-
-         /* Create interleaving stmt (low part of):
-            low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
-                                                            ...}>  */
-         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
-         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
-                                          second_vect, perm3_mask_low);
-         vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-
-         /* Create interleaving stmt (high part of):
-            high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
-                                                             ...}>  */
-         first_vect = data_ref;
-         second_vect = dr_chain[2];
-         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
-         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
-                                          second_vect, perm3_mask_high);
-         vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-         (*result_chain)[k] = data_ref;
-       }
-    }
-  else
-    {
-      /* If length is not equal to 3 then only power of 2 is supported.  */
-      gcc_assert (pow2p_hwi (length));
-
-      /* The encoding has a single stepped pattern.  */
-      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
-      vec_perm_builder sel (nelt, 1, 3);
-      sel.quick_grow (3);
-      for (i = 0; i < 3; ++i)
-       sel[i] = i * 2;
-      vec_perm_indices indices (sel, 2, nelt);
-      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
-
-      for (i = 0; i < 3; ++i)
-       sel[i] = i * 2 + 1;
-      indices.new_vector (sel, 2, nelt);
-      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
-
-      for (i = 0; i < log_length; i++)
-       {
-         for (j = 0; j < length; j += 2)
-           {
-             first_vect = dr_chain[j];
-             second_vect = dr_chain[j+1];
-
-             /* data_ref = permute_even (first_data_ref, second_data_ref);  */
-             data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
-             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                              first_vect, second_vect,
-                                              perm_mask_even);
-             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-             (*result_chain)[j/2] = data_ref;
-
-             /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
-             data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
-             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                              first_vect, second_vect,
-                                              perm_mask_odd);
-             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-             (*result_chain)[j/2+length/2] = data_ref;
-           }
-         memcpy (dr_chain.address (), result_chain->address (),
-                 length * sizeof (tree));
-       }
-    }
-}
-
-/* Function vect_shift_permute_load_chain.
-
-   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
-   sequence of stmts to reorder the input data accordingly.
-   Return the final references for loads in RESULT_CHAIN.
-   Return true if successed, false otherwise.
-
-   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
-   The input is 3 vectors each containing 8 elements.  We assign a
-   number to each element, the input sequence is:
-
-   1st vec:   0  1  2  3  4  5  6  7
-   2nd vec:   8  9 10 11 12 13 14 15
-   3rd vec:  16 17 18 19 20 21 22 23
-
-   The output sequence should be:
-
-   1st vec:  0 3 6  9 12 15 18 21
-   2nd vec:  1 4 7 10 13 16 19 22
-   3rd vec:  2 5 8 11 14 17 20 23
-
-   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
-
-   First we shuffle all 3 vectors to get correct elements order:
-
-   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
-   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
-   3rd vec:  (16 19 22) (17 20 23) (18 21)
-
-   Next we unite and shift vector 3 times:
-
-   1st step:
-     shift right by 6 the concatenation of:
-     "1st vec" and  "2nd vec"
-       ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
-     "2nd vec" and  "3rd vec"
-       ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
-     "3rd vec" and  "1st vec"
-       (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
-                            | New vectors                   |
-
-     So that now new vectors are:
-
-     1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
-     2nd vec:  (10 13) (16 19 22) (17 20 23)
-     3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
-
-   2nd step:
-     shift right by 5 the concatenation of:
-     "1st vec" and  "3rd vec"
-       ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
-     "2nd vec" and  "1st vec"
-       (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
-     "3rd vec" and  "2nd vec"
-       (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
-                         | New vectors                   |
-
-     So that now new vectors are:
-
-     1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
-     2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
-     3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
-
-   3rd step:
-     shift right by 5 the concatenation of:
-     "1st vec" and  "1st vec"
-       ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
-     shift right by 3 the concatenation of:
-     "2nd vec" and  "2nd vec"
-               (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
-                         | New vectors                   |
-
-     So that now all vectors are READY:
-     1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
-     2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
-     3rd vec:  ( 1  4  7) (10 13) (16 19 22)
-
-   This algorithm is faster than one in vect_permute_load_chain if:
-     1.  "shift of a concatination" is faster than general permutation.
-        This is usually so.
-     2.  The TARGET machine can't execute vector instructions in parallel.
-        This is because each step of the algorithm depends on previous.
-        The algorithm in vect_permute_load_chain is much more parallel.
-
-   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
-*/
-
-static bool
-vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
-                              unsigned int length,
-                              stmt_vec_info stmt_info,
-                              gimple_stmt_iterator *gsi,
-                              vec<tree> *result_chain)
-{
-  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
-  tree perm2_mask1, perm2_mask2, perm3_mask;
-  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
-  gimple *perm_stmt;
-
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  machine_mode vmode = TYPE_MODE (vectype);
-  unsigned int i;
-  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
-
-  unsigned HOST_WIDE_INT nelt, vf;
-  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
-      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
-    /* Not supported for variable-length vectors.  */
-    return false;
-
-  vec_perm_builder sel (nelt, nelt, 1);
-  sel.quick_grow (nelt);
-
-  result_chain->quick_grow (length);
-  memcpy (result_chain->address (), dr_chain.address (),
-         length * sizeof (tree));
-
-  if (pow2p_hwi (length) && vf > 4)
-    {
-      unsigned int j, log_length = exact_log2 (length);
-      for (i = 0; i < nelt / 2; ++i)
-       sel[i] = i * 2;
-      for (i = 0; i < nelt / 2; ++i)
-       sel[nelt / 2 + i] = i * 2 + 1;
-      vec_perm_indices indices (sel, 2, nelt);
-      if (!can_vec_perm_const_p (vmode, vmode, indices))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "shuffle of 2 fields structure is not \
-                             supported by target\n");
-         return false;
-       }
-      perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
-
-      for (i = 0; i < nelt / 2; ++i)
-       sel[i] = i * 2 + 1;
-      for (i = 0; i < nelt / 2; ++i)
-       sel[nelt / 2 + i] = i * 2;
-      indices.new_vector (sel, 2, nelt);
-      if (!can_vec_perm_const_p (vmode, vmode, indices))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "shuffle of 2 fields structure is not \
-                             supported by target\n");
-         return false;
-       }
-      perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
-
-      /* Generating permutation constant to shift all elements.
-        For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
-      for (i = 0; i < nelt; i++)
-       sel[i] = nelt / 2 + i;
-      indices.new_vector (sel, 2, nelt);
-      if (!can_vec_perm_const_p (vmode, vmode, indices))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "shift permutation is not supported by target\n");
-         return false;
-       }
-      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
-
-      /* Generating permutation constant to select vector from 2.
-        For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
-      for (i = 0; i < nelt / 2; i++)
-       sel[i] = i;
-      for (i = nelt / 2; i < nelt; i++)
-       sel[i] = nelt + i;
-      indices.new_vector (sel, 2, nelt);
-      if (!can_vec_perm_const_p (vmode, vmode, indices))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "select is not supported by target\n");
-         return false;
-       }
-      select_mask = vect_gen_perm_mask_checked (vectype, indices);
-
-      for (i = 0; i < log_length; i++)
-       {
-         for (j = 0; j < length; j += 2)
-           {
-             first_vect = dr_chain[j];
-             second_vect = dr_chain[j + 1];
-
-             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
-             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                              first_vect, first_vect,
-                                              perm2_mask1);
-             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-             vect[0] = data_ref;
-
-             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
-             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                              second_vect, second_vect,
-                                              perm2_mask2);
-             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-             vect[1] = data_ref;
-
-             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
-             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                              vect[0], vect[1], shift1_mask);
-             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-             (*result_chain)[j/2 + length/2] = data_ref;
-
-             data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
-             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                              vect[0], vect[1], select_mask);
-             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-             (*result_chain)[j/2] = data_ref;
-           }
-         memcpy (dr_chain.address (), result_chain->address (),
-                 length * sizeof (tree));
-       }
-      return true;
-    }
-  if (length == 3 && vf > 2)
-    {
-      unsigned int k = 0, l = 0;
-
-      /* Generating permutation constant to get all elements in rigth order.
-        For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
-      for (i = 0; i < nelt; i++)
-       {
-         if (3 * k + (l % 3) >= nelt)
-           {
-             k = 0;
-             l += (3 - (nelt % 3));
-           }
-         sel[i] = 3 * k + (l % 3);
-         k++;
-       }
-      vec_perm_indices indices (sel, 2, nelt);
-      if (!can_vec_perm_const_p (vmode, vmode, indices))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "shuffle of 3 fields structure is not \
-                             supported by target\n");
-         return false;
-       }
-      perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
-
-      /* Generating permutation constant to shift all elements.
-        For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
-      for (i = 0; i < nelt; i++)
-       sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
-      indices.new_vector (sel, 2, nelt);
-      if (!can_vec_perm_const_p (vmode, vmode, indices))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "shift permutation is not supported by target\n");
-         return false;
-       }
-      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
-
-      /* Generating permutation constant to shift all elements.
-        For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
-      for (i = 0; i < nelt; i++)
-       sel[i] = 2 * (nelt / 3) + 1 + i;
-      indices.new_vector (sel, 2, nelt);
-      if (!can_vec_perm_const_p (vmode, vmode, indices))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "shift permutation is not supported by target\n");
-         return false;
-       }
-      shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
-
-      /* Generating permutation constant to shift all elements.
-        For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
-      for (i = 0; i < nelt; i++)
-       sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
-      indices.new_vector (sel, 2, nelt);
-      if (!can_vec_perm_const_p (vmode, vmode, indices))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "shift permutation is not supported by target\n");
-         return false;
-       }
-      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
-
-      /* Generating permutation constant to shift all elements.
-        For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
-      for (i = 0; i < nelt; i++)
-       sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
-      indices.new_vector (sel, 2, nelt);
-      if (!can_vec_perm_const_p (vmode, vmode, indices))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "shift permutation is not supported by target\n");
-         return false;
-       }
-      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
-
-      for (k = 0; k < 3; k++)
-       {
-         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
-         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                          dr_chain[k], dr_chain[k],
-                                          perm3_mask);
-         vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-         vect[k] = data_ref;
-       }
-
-      for (k = 0; k < 3; k++)
-       {
-         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
-         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                          vect[k % 3], vect[(k + 1) % 3],
-                                          shift1_mask);
-         vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-         vect_shift[k] = data_ref;
-       }
-
-      for (k = 0; k < 3; k++)
-       {
-         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
-         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                          vect_shift[(4 - k) % 3],
-                                          vect_shift[(3 - k) % 3],
-                                          shift2_mask);
-         vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-         vect[k] = data_ref;
-       }
-
-      (*result_chain)[3 - (nelt % 3)] = vect[2];
-
-      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
-      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
-                                      vect[0], shift3_mask);
-      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-      (*result_chain)[nelt % 3] = data_ref;
-
-      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
-      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
-                                      vect[1], shift4_mask);
-      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
-      (*result_chain)[0] = data_ref;
-      return true;
-    }
-  return false;
-}
-
-/* Function vect_transform_grouped_load.
-
-   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
-   to perform their permutation and ascribe the result vectorized statements to
-   the scalar statements.
-*/
-
-void
-vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
-                            vec<tree> dr_chain,
-                            int size, gimple_stmt_iterator *gsi)
-{
-  machine_mode mode;
-  vec<tree> result_chain = vNULL;
-
-  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
-     RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
-     vectors, that are ready for vector computation.  */
-  result_chain.create (size);
-
-  /* If reassociation width for vector type is 2 or greater target machine can
-     execute 2 or more vector instructions in parallel.  Otherwise try to
-     get chain for loads group using vect_shift_permute_load_chain.  */
-  mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
-  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
-      || pow2p_hwi (size)
-      || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
-                                        gsi, &result_chain))
-    vect_permute_load_chain (vinfo, dr_chain,
-                            size, stmt_info, gsi, &result_chain);
-  vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
-  result_chain.release ();
-}
-
-/* RESULT_CHAIN contains the output of a group of grouped loads that were
-   generated as part of the vectorization of STMT_INFO.  Assign the statement
-   for each vector to the associated scalar statement.  */
-
-void
-vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
-                                 vec<tree> result_chain)
-{
-  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
-  unsigned int i, gap_count;
-  tree tmp_data_ref;
-
-  /* Put a permuted data-ref in the VECTORIZED_STMT field.
-     Since we scan the chain starting from it's first node, their order
-     corresponds the order of data-refs in RESULT_CHAIN.  */
-  stmt_vec_info next_stmt_info = first_stmt_info;
-  gap_count = 1;
-  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
-    {
-      if (!next_stmt_info)
-       break;
-
-      /* Skip the gaps.  Loads created for the gaps will be removed by dead
-       code elimination pass later.  No need to check for the first stmt in
-       the group, since it always exists.
-       DR_GROUP_GAP is the number of steps in elements from the previous
-       access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
-       correspond to the gaps.  */
-      if (next_stmt_info != first_stmt_info
-         && gap_count < DR_GROUP_GAP (next_stmt_info))
-       {
-         gap_count++;
-         continue;
-       }
-
-      /* ???  The following needs cleanup after the removal of
-         DR_GROUP_SAME_DR_STMT.  */
-      if (next_stmt_info)
-        {
-         gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
-         /* We assume that if VEC_STMT is not NULL, this is a case of multiple
-            copies, and we put the new vector statement last.  */
-         STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
-
-         next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
-         gap_count = 1;
-        }
-    }
-}
-
 /* Function vect_force_dr_alignment_p.
 
    Returns whether the alignment of a DECL can be forced to be aligned
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 1d5ef9b7e9e..cb315e6bbf9 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2866,7 +2866,7 @@ again:
     return ok;
 
   /* Likewise if the grouped loads or stores in the SLP cannot be handled
-     via interleaving or lane instructions.  */
+     via lane instructions.  */
   slp_instance instance;
   slp_tree node;
   unsigned i, j;
@@ -2893,12 +2893,9 @@ again:
          if (STMT_VINFO_GROUPED_ACCESS (vinfo))
            {
              vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
-             bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
              size = DR_GROUP_SIZE (vinfo);
              vectype = STMT_VINFO_VECTYPE (vinfo);
-             if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
-                 && ! vect_grouped_load_supported (vectype, single_element_p,
-                                                   size))
+             if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST)
                return opt_result::failure_at (vinfo->stmt,
                                               "unsupported grouped load\n");
            }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 366753216eb..3b8b98978d3 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -10213,39 +10213,6 @@ vectorizable_load (vec_info *vinfo,
         S2:     z = x + 1       -               -
   */
 
-  /* In case of interleaving (non-unit grouped access):
-
-     S1:  x2 = &base + 2
-     S2:  x0 = &base
-     S3:  x1 = &base + 1
-     S4:  x3 = &base + 3
-
-     Vectorized loads are created in the order of memory accesses
-     starting from the access of the first stmt of the chain:
-
-     VS1: vx0 = &base
-     VS2: vx1 = &base + vec_size*1
-     VS3: vx3 = &base + vec_size*2
-     VS4: vx4 = &base + vec_size*3
-
-     Then permutation statements are generated:
-
-     VS5: vx5 = VEC_PERM_EXPR < vx0, vx1, { 0, 2, ..., i*2 } >
-     VS6: vx6 = VEC_PERM_EXPR < vx0, vx1, { 1, 3, ..., i*2+1 } >
-       ...
-
-     And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
-     (the order of the data-refs in the output of vect_permute_load_chain
-     corresponds to the order of scalar stmts in the interleaving chain - see
-     the documentation of vect_permute_load_chain()).
-     The generation of permutation stmts and recording them in
-     STMT_VINFO_VEC_STMT is done in vect_transform_grouped_load().
-
-     In case of both multiple types and interleaving, the vector loads and
-     permutation stmts above are created for every copy.  The result vector
-     stmts are put in STMT_VINFO_VEC_STMT for the first copy and in the
-     corresponding STMT_VINFO_RELATED_STMT for the next copies.  */
-
   /* If the data reference is aligned (dr_aligned) or potentially unaligned
      on a target that supports unaligned accesses (dr_unaligned_supported)
      we generate the following code:
@@ -11421,7 +11388,7 @@ vectorizable_load (vec_info *vinfo,
        }
 
       /* Collect vector loads and later create their permutation in
-        vect_transform_grouped_load ().  */
+        vect_transform_slp_perm_load.  */
       if (!costing_p && (grouped_load || slp_perm))
        dr_chain.quick_push (new_temp);
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1d70332114d..082e27c04d4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2569,7 +2569,6 @@ extern void vect_copy_ref_info (tree, tree);
 extern tree vect_create_destination_var (tree, tree);
 extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT);
 extern internal_fn vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
-extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT);
 extern internal_fn vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT,
                                              bool, vec<int> * = nullptr);
 extern void vect_permute_store_chain (vec_info *, vec<tree> &,
@@ -2579,10 +2578,6 @@ extern tree vect_setup_realignment (vec_info *,
                                    stmt_vec_info, gimple_stmt_iterator *,
                                    tree *, enum dr_alignment_support, tree,
                                    class loop **);
-extern void vect_transform_grouped_load (vec_info *, stmt_vec_info, vec<tree>,
-                                        int, gimple_stmt_iterator *);
-extern void vect_record_grouped_load_vectors (vec_info *,
-                                             stmt_vec_info, vec<tree>);
 extern tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
 extern tree vect_get_new_ssa_name (tree, enum vect_var_kind,
                                   const char * = NULL);
-- 
2.43.0

[PATCH 4/5] Remove load interleaving code

Reply via email to