The following removes the non-SLP load interleaving code, which was almost unused.
* tree-vectorizer.h (vect_grouped_load_supported): Remove. (vect_transform_grouped_load): Likewise. (vect_record_grouped_load_vectors): Likewise. * tree-vect-data-refs.cc (vect_grouped_load_supported): Remove. (vect_permute_load_chain): Likewise. (vect_shift_permute_load_chain): Likewise. (vect_transform_grouped_load): Likewise. (vect_record_grouped_load_vectors): Likewise. * tree-vect-loop.cc (vect_analyze_loop_2): Do not check for load interleaving support when disregarding single-lane SLP. * tree-vect-stmts.cc (vectorizable_load): Remove comments about load interleaving. --- gcc/tree-vect-data-refs.cc | 747 ------------------------------------- gcc/tree-vect-loop.cc | 7 +- gcc/tree-vect-stmts.cc | 35 +- gcc/tree-vectorizer.h | 5 - 4 files changed, 3 insertions(+), 791 deletions(-) diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index 3ff8be44b59..b38eecd7901 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -6581,126 +6581,6 @@ vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info, } -/* Function vect_grouped_load_supported. - - COUNT is the size of the load group (the number of statements plus the - number of gaps). SINGLE_ELEMENT_P is true if there is actually - only one statement, with a gap of COUNT - 1. - - Returns true if a suitable permute exists. */ - -bool -vect_grouped_load_supported (tree vectype, bool single_element_p, - unsigned HOST_WIDE_INT count) -{ - machine_mode mode = TYPE_MODE (vectype); - - /* If this is single-element interleaving with an element distance - that leaves unused vector loads around punt - we at least create - very sub-optimal code in that case (and blow up memory, - see PR65518). 
*/ - if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "single-element interleaving not supported " - "for not adjacent vector loads\n"); - return false; - } - - /* vect_permute_load_chain requires the group size to be equal to 3 or - be a power of two. */ - if (count != 3 && exact_log2 (count) == -1) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2 or not equal to 3\n"); - return false; - } - - /* Check that the permutation is supported. */ - if (VECTOR_MODE_P (mode)) - { - unsigned int i, j; - if (count == 3) - { - unsigned int nelt; - if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "cannot handle groups of 3 loads for" - " variable-length vectors\n"); - return false; - } - - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - vec_perm_indices indices; - unsigned int k; - for (k = 0; k < 3; k++) - { - for (i = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = 3 * i + k; - else - sel[i] = 0; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (mode, mode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 3 loads is not supported by" - " target\n"); - return false; - } - for (i = 0, j = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = i; - else - sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (mode, mode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 3 loads is not supported by" - " target\n"); - return false; - } - } - return true; - } - else - { - /* If length is not equal to 3 then only power of 2 is supported. 
*/ - gcc_assert (pow2p_hwi (count)); - poly_uint64 nelt = GET_MODE_NUNITS (mode); - - /* The encoding has a single stepped pattern. */ - vec_perm_builder sel (nelt, 1, 3); - sel.quick_grow (3); - for (i = 0; i < 3; i++) - sel[i] = i * 2; - vec_perm_indices indices (sel, 2, nelt); - if (can_vec_perm_const_p (mode, mode, indices)) - { - for (i = 0; i < 3; i++) - sel[i] = i * 2 + 1; - indices.new_vector (sel, 2, nelt); - if (can_vec_perm_const_p (mode, mode, indices)) - return true; - } - } - } - - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "extract even/odd not supported by target\n"); - return false; -} - /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors of type VECTYPE. MASKED_P says whether the masked form is needed. If it is available and ELSVALS is nonzero store the possible else values @@ -6730,633 +6610,6 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, return IFN_LAST; } -/* Function vect_permute_load_chain. - - Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be - a power of 2 or equal to 3, generate extract_even/odd stmts to reorder - the input data correctly. Return the final references for loads in - RESULT_CHAIN. - - E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. - The input is 4 vectors each containing 8 elements. We assign a number to each - element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - 4th vec: 24 25 26 27 28 29 30 31 - - The output sequence should be: - - 1st vec: 0 4 8 12 16 20 24 28 - 2nd vec: 1 5 9 13 17 21 25 29 - 3rd vec: 2 6 10 14 18 22 26 30 - 4th vec: 3 7 11 15 19 23 27 31 - - i.e., the first output vector should contain the first elements of each - interleaving group, etc. - - We use extract_even/odd instructions to create such output. 
The input of - each extract_even/odd operation is two vectors - 1st vec 2nd vec - 0 1 2 3 4 5 6 7 - - and the output is the vector of extracted even/odd elements. The output of - extract_even will be: 0 2 4 6 - and of extract_odd: 1 3 5 7 - - - The permutation is done in log LENGTH stages. In each stage extract_even - and extract_odd stmts are created for each pair of vectors in DR_CHAIN in - their order. In our example, - - E1: extract_even (1st vec, 2nd vec) - E2: extract_odd (1st vec, 2nd vec) - E3: extract_even (3rd vec, 4th vec) - E4: extract_odd (3rd vec, 4th vec) - - The output for the first stage will be: - - E1: 0 2 4 6 8 10 12 14 - E2: 1 3 5 7 9 11 13 15 - E3: 16 18 20 22 24 26 28 30 - E4: 17 19 21 23 25 27 29 31 - - In order to proceed and create the correct sequence for the next stage (or - for the correct output, if the second stage is the last one, as in our - example), we first put the output of extract_even operation and then the - output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). - The input for the second stage is: - - 1st vec (E1): 0 2 4 6 8 10 12 14 - 2nd vec (E3): 16 18 20 22 24 26 28 30 - 3rd vec (E2): 1 3 5 7 9 11 13 15 - 4th vec (E4): 17 19 21 23 25 27 29 31 - - The output of the second stage: - - E1: 0 4 8 12 16 20 24 28 - E2: 2 6 10 14 18 22 26 30 - E3: 1 5 9 13 17 21 25 29 - E4: 3 7 11 15 19 23 27 31 - - And RESULT_CHAIN after reordering: - - 1st vec (E1): 0 4 8 12 16 20 24 28 - 2nd vec (E3): 1 5 9 13 17 21 25 29 - 3rd vec (E2): 2 6 10 14 18 22 26 30 - 4th vec (E4): 3 7 11 15 19 23 27 31. 
*/ - -static void -vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, - unsigned int length, - stmt_vec_info stmt_info, - gimple_stmt_iterator *gsi, - vec<tree> *result_chain) -{ - tree data_ref, first_vect, second_vect; - tree perm_mask_even, perm_mask_odd; - tree perm3_mask_low, perm3_mask_high; - gimple *perm_stmt; - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - unsigned int i, j, log_length = exact_log2 (length); - - result_chain->quick_grow (length); - memcpy (result_chain->address (), dr_chain.address (), - length * sizeof (tree)); - - if (length == 3) - { - /* vect_grouped_load_supported ensures that this is constant. */ - unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); - unsigned int k; - - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - vec_perm_indices indices; - for (k = 0; k < 3; k++) - { - for (i = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = 3 * i + k; - else - sel[i] = 0; - indices.new_vector (sel, 2, nelt); - perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0, j = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = i; - else - sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); - indices.new_vector (sel, 2, nelt); - perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); - - first_vect = dr_chain[0]; - second_vect = dr_chain[1]; - - /* Create interleaving stmt (low part of): - low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, - ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, - second_vect, perm3_mask_low); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - - /* Create interleaving stmt (high part of): - high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, - ...}> */ - first_vect = data_ref; - second_vect = dr_chain[2]; - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); - perm_stmt = 
gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, - second_vect, perm3_mask_high); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[k] = data_ref; - } - } - else - { - /* If length is not equal to 3 then only power of 2 is supported. */ - gcc_assert (pow2p_hwi (length)); - - /* The encoding has a single stepped pattern. */ - poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); - vec_perm_builder sel (nelt, 1, 3); - sel.quick_grow (3); - for (i = 0; i < 3; ++i) - sel[i] = i * 2; - vec_perm_indices indices (sel, 2, nelt); - perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < 3; ++i) - sel[i] = i * 2 + 1; - indices.new_vector (sel, 2, nelt); - perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < log_length; i++) - { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j+1]; - - /* data_ref = permute_even (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - first_vect, second_vect, - perm_mask_even); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; - - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - first_vect, second_vect, - perm_mask_odd); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2+length/2] = data_ref; - } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); - } - } -} - -/* Function vect_shift_permute_load_chain. - - Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate - sequence of stmts to reorder the input data accordingly. - Return the final references for loads in RESULT_CHAIN. - Return true if successed, false otherwise. 
- - E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. - The input is 3 vectors each containing 8 elements. We assign a - number to each element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - - The output sequence should be: - - 1st vec: 0 3 6 9 12 15 18 21 - 2nd vec: 1 4 7 10 13 16 19 22 - 3rd vec: 2 5 8 11 14 17 20 23 - - We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. - - First we shuffle all 3 vectors to get correct elements order: - - 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) - 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) - 3rd vec: (16 19 22) (17 20 23) (18 21) - - Next we unite and shift vector 3 times: - - 1st step: - shift right by 6 the concatenation of: - "1st vec" and "2nd vec" - ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) - "2nd vec" and "3rd vec" - ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) - "3rd vec" and "1st vec" - (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) - | New vectors | - - So that now new vectors are: - - 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) - 2nd vec: (10 13) (16 19 22) (17 20 23) - 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) - - 2nd step: - shift right by 5 the concatenation of: - "1st vec" and "3rd vec" - ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) - "2nd vec" and "1st vec" - (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) - "3rd vec" and "2nd vec" - (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) - | New vectors | - - So that now new vectors are: - - 1st vec: ( 9 12 15) (18 21) ( 0 3 6) - 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) - 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY - - 3rd step: - shift right by 5 the concatenation of: - "1st vec" and "1st vec" - ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) - shift right by 3 the concatenation of: - "2nd vec" and "2nd vec" - (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) - | New vectors | 
- - So that now all vectors are READY: - 1st vec: ( 0 3 6) ( 9 12 15) (18 21) - 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) - 3rd vec: ( 1 4 7) (10 13) (16 19 22) - - This algorithm is faster than one in vect_permute_load_chain if: - 1. "shift of a concatination" is faster than general permutation. - This is usually so. - 2. The TARGET machine can't execute vector instructions in parallel. - This is because each step of the algorithm depends on previous. - The algorithm in vect_permute_load_chain is much more parallel. - - The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. -*/ - -static bool -vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain, - unsigned int length, - stmt_vec_info stmt_info, - gimple_stmt_iterator *gsi, - vec<tree> *result_chain) -{ - tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; - tree perm2_mask1, perm2_mask2, perm3_mask; - tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; - gimple *perm_stmt; - - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - machine_mode vmode = TYPE_MODE (vectype); - unsigned int i; - loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); - - unsigned HOST_WIDE_INT nelt, vf; - if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt) - || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf)) - /* Not supported for variable-length vectors. 
*/ - return false; - - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - - result_chain->quick_grow (length); - memcpy (result_chain->address (), dr_chain.address (), - length * sizeof (tree)); - - if (pow2p_hwi (length) && vf > 4) - { - unsigned int j, log_length = exact_log2 (length); - for (i = 0; i < nelt / 2; ++i) - sel[i] = i * 2; - for (i = 0; i < nelt / 2; ++i) - sel[nelt / 2 + i] = i * 2 + 1; - vec_perm_indices indices (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 2 fields structure is not \ - supported by target\n"); - return false; - } - perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < nelt / 2; ++i) - sel[i] = i * 2 + 1; - for (i = 0; i < nelt / 2; ++i) - sel[nelt / 2 + i] = i * 2; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 2 fields structure is not \ - supported by target\n"); - return false; - } - perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {4 5 6 7 8 9 10 11}. */ - for (i = 0; i < nelt; i++) - sel[i] = nelt / 2 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift1_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to select vector from 2. - For vector length 8 it is {0 1 2 3 12 13 14 15}. 
*/ - for (i = 0; i < nelt / 2; i++) - sel[i] = i; - for (i = nelt / 2; i < nelt; i++) - sel[i] = nelt + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "select is not supported by target\n"); - return false; - } - select_mask = vect_gen_perm_mask_checked (vectype, indices); - - for (i = 0; i < log_length; i++) - { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j + 1]; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - first_vect, first_vect, - perm2_mask1); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[0] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - second_vect, second_vect, - perm2_mask2); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[1] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect[0], vect[1], shift1_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2 + length/2] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect[0], vect[1], select_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; - } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); - } - return true; - } - if (length == 3 && vf > 2) - { - unsigned int k = 0, l = 0; - - /* Generating permutation constant to get all elements in rigth order. - For vector length 8 it is {0 3 6 1 4 7 2 5}. 
*/ - for (i = 0; i < nelt; i++) - { - if (3 * k + (l % 3) >= nelt) - { - k = 0; - l += (3 - (nelt % 3)); - } - sel[i] = 3 * k + (l % 3); - k++; - } - vec_perm_indices indices (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 3 fields structure is not \ - supported by target\n"); - return false; - } - perm3_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {6 7 8 9 10 11 12 13}. */ - for (i = 0; i < nelt; i++) - sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift1_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {5 6 7 8 9 10 11 12}. */ - for (i = 0; i < nelt; i++) - sel[i] = 2 * (nelt / 3) + 1 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift2_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {3 4 5 6 7 8 9 10}. 
*/ - for (i = 0; i < nelt; i++) - sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift3_mask = vect_gen_perm_mask_checked (vectype, indices); - - /* Generating permutation constant to shift all elements. - For vector length 8 it is {5 6 7 8 9 10 11 12}. */ - for (i = 0; i < nelt; i++) - sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; - indices.new_vector (sel, 2, nelt); - if (!can_vec_perm_const_p (vmode, vmode, indices)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shift permutation is not supported by target\n"); - return false; - } - shift4_mask = vect_gen_perm_mask_checked (vectype, indices); - - for (k = 0; k < 3; k++) - { - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - dr_chain[k], dr_chain[k], - perm3_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[k] = data_ref; - } - - for (k = 0; k < 3; k++) - { - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect[k % 3], vect[(k + 1) % 3], - shift1_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect_shift[k] = data_ref; - } - - for (k = 0; k < 3; k++) - { - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - vect_shift[(4 - k) % 3], - vect_shift[(3 - k) % 3], - shift2_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - vect[k] = data_ref; - } - - (*result_chain)[3 - (nelt % 3)] = vect[2]; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], - 
vect[0], shift3_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[nelt % 3] = data_ref; - - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], - vect[1], shift4_mask); - vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); - (*result_chain)[0] = data_ref; - return true; - } - return false; -} - -/* Function vect_transform_grouped_load. - - Given a chain of input interleaved data-refs (in DR_CHAIN), build statements - to perform their permutation and ascribe the result vectorized statements to - the scalar statements. -*/ - -void -vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info, - vec<tree> dr_chain, - int size, gimple_stmt_iterator *gsi) -{ - machine_mode mode; - vec<tree> result_chain = vNULL; - - /* DR_CHAIN contains input data-refs that are a part of the interleaving. - RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted - vectors, that are ready for vector computation. */ - result_chain.create (size); - - /* If reassociation width for vector type is 2 or greater target machine can - execute 2 or more vector instructions in parallel. Otherwise try to - get chain for loads group using vect_shift_permute_load_chain. */ - mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); - if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 - || pow2p_hwi (size) - || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info, - gsi, &result_chain)) - vect_permute_load_chain (vinfo, dr_chain, - size, stmt_info, gsi, &result_chain); - vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain); - result_chain.release (); -} - -/* RESULT_CHAIN contains the output of a group of grouped loads that were - generated as part of the vectorization of STMT_INFO. Assign the statement - for each vector to the associated scalar statement. 
*/ - -void -vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info, - vec<tree> result_chain) -{ - stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); - unsigned int i, gap_count; - tree tmp_data_ref; - - /* Put a permuted data-ref in the VECTORIZED_STMT field. - Since we scan the chain starting from it's first node, their order - corresponds the order of data-refs in RESULT_CHAIN. */ - stmt_vec_info next_stmt_info = first_stmt_info; - gap_count = 1; - FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref) - { - if (!next_stmt_info) - break; - - /* Skip the gaps. Loads created for the gaps will be removed by dead - code elimination pass later. No need to check for the first stmt in - the group, since it always exists. - DR_GROUP_GAP is the number of steps in elements from the previous - access (if there is no gap DR_GROUP_GAP is 1). We skip loads that - correspond to the gaps. */ - if (next_stmt_info != first_stmt_info - && gap_count < DR_GROUP_GAP (next_stmt_info)) - { - gap_count++; - continue; - } - - /* ??? The following needs cleanup after the removal of - DR_GROUP_SAME_DR_STMT. */ - if (next_stmt_info) - { - gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref); - /* We assume that if VEC_STMT is not NULL, this is a case of multiple - copies, and we put the new vector statement last. */ - STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt); - - next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); - gap_count = 1; - } - } -} - /* Function vect_force_dr_alignment_p. Returns whether the alignment of a DECL can be forced to be aligned diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 1d5ef9b7e9e..cb315e6bbf9 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2866,7 +2866,7 @@ again: return ok; /* Likewise if the grouped loads or stores in the SLP cannot be handled - via interleaving or lane instructions. */ + via lane instructions. 
*/ slp_instance instance; slp_tree node; unsigned i, j; @@ -2893,12 +2893,9 @@ again: if (STMT_VINFO_GROUPED_ACCESS (vinfo)) { vinfo = DR_GROUP_FIRST_ELEMENT (vinfo); - bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo); size = DR_GROUP_SIZE (vinfo); vectype = STMT_VINFO_VECTYPE (vinfo); - if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST - && ! vect_grouped_load_supported (vectype, single_element_p, - size)) + if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST) return opt_result::failure_at (vinfo->stmt, "unsupported grouped load\n"); } diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 366753216eb..3b8b98978d3 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -10213,39 +10213,6 @@ vectorizable_load (vec_info *vinfo, S2: z = x + 1 - - */ - /* In case of interleaving (non-unit grouped access): - - S1: x2 = &base + 2 - S2: x0 = &base - S3: x1 = &base + 1 - S4: x3 = &base + 3 - - Vectorized loads are created in the order of memory accesses - starting from the access of the first stmt of the chain: - - VS1: vx0 = &base - VS2: vx1 = &base + vec_size*1 - VS3: vx3 = &base + vec_size*2 - VS4: vx4 = &base + vec_size*3 - - Then permutation statements are generated: - - VS5: vx5 = VEC_PERM_EXPR < vx0, vx1, { 0, 2, ..., i*2 } > - VS6: vx6 = VEC_PERM_EXPR < vx0, vx1, { 1, 3, ..., i*2+1 } > - ... - - And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts - (the order of the data-refs in the output of vect_permute_load_chain - corresponds to the order of scalar stmts in the interleaving chain - see - the documentation of vect_permute_load_chain()). - The generation of permutation stmts and recording them in - STMT_VINFO_VEC_STMT is done in vect_transform_grouped_load(). - - In case of both multiple types and interleaving, the vector loads and - permutation stmts above are created for every copy. 
The result vector - stmts are put in STMT_VINFO_VEC_STMT for the first copy and in the - corresponding STMT_VINFO_RELATED_STMT for the next copies. */ - /* If the data reference is aligned (dr_aligned) or potentially unaligned on a target that supports unaligned accesses (dr_unaligned_supported) we generate the following code: @@ -11421,7 +11388,7 @@ vectorizable_load (vec_info *vinfo, } /* Collect vector loads and later create their permutation in - vect_transform_grouped_load (). */ + vect_transform_slp_perm_load. */ if (!costing_p && (grouped_load || slp_perm)) dr_chain.quick_push (new_temp); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 1d70332114d..082e27c04d4 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2569,7 +2569,6 @@ extern void vect_copy_ref_info (tree, tree); extern tree vect_create_destination_var (tree, tree); extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT); extern internal_fn vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool); -extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT); extern internal_fn vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool, vec<int> * = nullptr); extern void vect_permute_store_chain (vec_info *, vec<tree> &, @@ -2579,10 +2578,6 @@ extern tree vect_setup_realignment (vec_info *, stmt_vec_info, gimple_stmt_iterator *, tree *, enum dr_alignment_support, tree, class loop **); -extern void vect_transform_grouped_load (vec_info *, stmt_vec_info, vec<tree>, - int, gimple_stmt_iterator *); -extern void vect_record_grouped_load_vectors (vec_info *, - stmt_vec_info, vec<tree>); extern tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *); extern tree vect_get_new_ssa_name (tree, enum vect_var_kind, const char * = NULL); -- 2.43.0