This removes the remaining non-SLP paths from vectorizable_load, propagates out ncopies == 1, and elides the now-trivial loops from 0 to ncopies.
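Most hunks follow the same mechanical shape; roughly (a simplified sketch, not a verbatim hunk from the patch):

  /* Before: the non-SLP path could need multiple copies.  */
  for (j = 0; j < ncopies; j++)
    for (i = 0; i < vec_num; i++)
      vec_mask = vec_masks[vec_num * j + i];

  /* After: with SLP, ncopies is always 1, so the outer loop and the
     vec_num * j + i indexing go away.  */
  for (i = 0; i < vec_num; i++)
    vec_mask = vec_masks[i];
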
Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed. * tree-vect-stmts.cc (vectorizable_load): Remove non-SLP paths and propagate out ncopies == 1. --- gcc/tree-vect-stmts.cc | 1935 ++++++++++++++++++---------------------- 1 file changed, 876 insertions(+), 1059 deletions(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index f699d808e68..db1b539b6c7 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -9836,7 +9836,6 @@ vectorizable_load (vec_info *vinfo, tree dataref_ptr = NULL_TREE; tree dataref_offset = NULL_TREE; gimple *ptr_incr = NULL; - int ncopies; int i, j; unsigned int group_size; poly_uint64 group_gap_adj; @@ -9850,7 +9849,6 @@ vectorizable_load (vec_info *vinfo, bool compute_in_loop = false; class loop *at_loop; int vec_num; - bool slp = (slp_node != NULL); bool slp_perm = false; bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo); poly_uint64 vf; @@ -9909,7 +9907,7 @@ vectorizable_load (vec_info *vinfo, return false; mask_index = internal_fn_mask_index (ifn); - if (mask_index >= 0 && slp_node) + if (mask_index >= 0) mask_index = vect_slp_child_index_for_operand (call, mask_index, STMT_VINFO_GATHER_SCATTER_P (stmt_info)); if (mask_index >= 0 @@ -9918,7 +9916,7 @@ vectorizable_load (vec_info *vinfo, return false; els_index = internal_fn_else_index (ifn); - if (els_index >= 0 && slp_node) + if (els_index >= 0) els_index = vect_slp_child_index_for_operand (call, els_index, STMT_VINFO_GATHER_SCATTER_P (stmt_info)); if (els_index >= 0 @@ -9939,19 +9937,9 @@ vectorizable_load (vec_info *vinfo, else vf = 1; - /* Multiple types in SLP are handled by creating the appropriate number of - vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in - case of SLP. */ - if (slp) - ncopies = 1; - else - ncopies = vect_get_num_copies (loop_vinfo, vectype); - - gcc_assert (ncopies >= 1); - /* FORNOW. This restriction should be relaxed. */ if (nested_in_vect_loop - && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1))) + && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -9959,20 +9947,6 @@ vectorizable_load (vec_info *vinfo, return false; } - /* Invalidate assumptions made by dependence analysis when vectorization - on the unrolled body effectively re-orders stmts. */ - if (ncopies > 1 - && STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0 - && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo), - STMT_VINFO_MIN_NEG_DIST (stmt_info))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "cannot perform implicit CSE when unrolling " - "with negative dependence distance\n"); - return false; - } - elem_type = TREE_TYPE (vectype); mode = TYPE_MODE (vectype); @@ -9997,15 +9971,6 @@ vectorizable_load (vec_info *vinfo, first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); group_size = DR_GROUP_SIZE (first_stmt_info); - /* Refuse non-SLP vectorization of SLP-only groups. */ - if (!slp && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "cannot vectorize load in non-SLP mode.\n"); - return false; - } - /* Invalidate assumptions made by dependence analysis when vectorization on the unrolled body effectively re-orders stmts. 
*/ if (STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0 @@ -10031,7 +9996,7 @@ vectorizable_load (vec_info *vinfo, int maskload_elsval = 0; bool need_zeroing = false; if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, VLS_LOAD, - ncopies, &memory_access_type, &poffset, + 1, &memory_access_type, &poffset, &alignment_support_scheme, &misalignment, &gs_info, &lanes_ifn, &elsvals)) return false; @@ -10046,8 +10011,7 @@ vectorizable_load (vec_info *vinfo, /* ??? The following checks should really be part of get_group_load_store_type. */ - if (slp - && SLP_TREE_LOAD_PERMUTATION (slp_node).exists () + if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists () && !((memory_access_type == VMAT_ELEMENTWISE || memory_access_type == VMAT_GATHER_SCATTER) && SLP_TREE_LANES (slp_node) == 1)) @@ -10090,8 +10054,7 @@ vectorizable_load (vec_info *vinfo, } } - if (slp_node - && slp_node->ldst_lanes + if (slp_node->ldst_lanes && memory_access_type != VMAT_LOAD_STORE_LANES) { if (dump_enabled_p ()) @@ -10142,8 +10105,7 @@ vectorizable_load (vec_info *vinfo, if (costing_p) /* transformation not required. */ { - if (slp_node - && mask + if (mask && !vect_maybe_update_slp_op_vectype (slp_op, mask_vectype)) { @@ -10153,10 +10115,7 @@ vectorizable_load (vec_info *vinfo, return false; } - if (!slp) - STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type; - else - SLP_TREE_MEMORY_ACCESS_TYPE (slp_node) = memory_access_type; + SLP_TREE_MEMORY_ACCESS_TYPE (slp_node) = memory_access_type; if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) @@ -10210,16 +10169,10 @@ vectorizable_load (vec_info *vinfo, if (elsvals.length ()) maskload_elsval = *elsvals.begin (); - if (!slp) - gcc_assert (memory_access_type - == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)); - else - gcc_assert (memory_access_type - == SLP_TREE_MEMORY_ACCESS_TYPE (slp_node)); + gcc_assert (memory_access_type == SLP_TREE_MEMORY_ACCESS_TYPE (slp_node)); if (dump_enabled_p () && !costing_p) - dump_printf_loc (MSG_NOTE, vect_location, - "transform load. ncopies = %d\n", ncopies); + dump_printf_loc (MSG_NOTE, vect_location, "transform load.\n"); /* Transform. */ @@ -10289,15 +10242,8 @@ vectorizable_load (vec_info *vinfo, vectype, &gsi2); } gimple *new_stmt = SSA_NAME_DEF_STMT (new_temp); - if (slp) - for (j = 0; j < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ++j) - slp_node->push_vec_def (new_stmt); - else - { - for (j = 0; j < ncopies; ++j) - STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); - *vec_stmt = new_stmt; - } + for (j = 0; j < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ++j) + slp_node->push_vec_def (new_stmt); return true; } @@ -10474,6 +10420,7 @@ vectorizable_load (vec_info *vinfo, /* For SLP permutation support we need to load the whole group, not only the number of vector stmts the permutation result fits in. */ + int ncopies; if (slp_perm) { /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for @@ -10615,12 +10562,11 @@ vectorizable_load (vec_info *vinfo, return true; } - if (memory_access_type == VMAT_GATHER_SCATTER - || (!slp && memory_access_type == VMAT_CONTIGUOUS)) + if (memory_access_type == VMAT_GATHER_SCATTER) grouped_load = false; if (grouped_load - || (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())) + || SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) { if (grouped_load) { @@ -10634,7 +10580,7 @@ vectorizable_load (vec_info *vinfo, } /* For SLP vectorization we directly vectorize a subchain without permutation. */ - if (slp && ! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) + if (! 
SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0]; /* For BB vectorization always use the first stmt to base the data ref pointer on. */ @@ -10642,60 +10588,39 @@ vectorizable_load (vec_info *vinfo, first_stmt_info_for_drptr = vect_find_first_scalar_stmt_in_slp (slp_node); - /* Check if the chain of loads is already vectorized. */ - if (STMT_VINFO_VEC_STMTS (first_stmt_info).exists () - /* For SLP we would need to copy over SLP_TREE_VEC_DEFS. - ??? But we can only do so if there is exactly one - as we have no way to get at the rest. Leave the CSE - opportunity alone. - ??? With the group load eventually participating - in multiple different permutations (having multiple - slp nodes which refer to the same group) the CSE - is even wrong code. See PR56270. */ - && !slp) - { - *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; - return true; - } first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info); group_gap_adj = 0; /* VEC_NUM is the number of vect stmts to be created for this group. */ - if (slp) - { - grouped_load = false; - /* If an SLP permutation is from N elements to N elements, - and if one vector holds a whole number of N, we can load - the inputs to the permutation in the same way as an - unpermuted sequence. In other cases we need to load the - whole group, not only the number of vector stmts the - permutation result fits in. */ - unsigned scalar_lanes = SLP_TREE_LANES (slp_node); - if (nested_in_vect_loop) - /* We do not support grouped accesses in a nested loop, - instead the access is contiguous but it might be - permuted. No gap adjustment is needed though. */ - vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - else if (slp_perm - && (group_size != scalar_lanes - || !multiple_p (nunits, group_size))) - { - /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for - variable VF; see vect_transform_slp_perm_load. */ - unsigned int const_vf = vf.to_constant (); - unsigned int const_nunits = nunits.to_constant (); - vec_num = CEIL (group_size * const_vf, const_nunits); - group_gap_adj = vf * group_size - nunits * vec_num; - } - else - { - vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - group_gap_adj - = group_size - scalar_lanes; - } - } + grouped_load = false; + /* If an SLP permutation is from N elements to N elements, + and if one vector holds a whole number of N, we can load + the inputs to the permutation in the same way as an + unpermuted sequence. In other cases we need to load the + whole group, not only the number of vector stmts the + permutation result fits in. */ + unsigned scalar_lanes = SLP_TREE_LANES (slp_node); + if (nested_in_vect_loop) + /* We do not support grouped accesses in a nested loop, + instead the access is contiguous but it might be + permuted. No gap adjustment is needed though. */ + vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + else if (slp_perm + && (group_size != scalar_lanes + || !multiple_p (nunits, group_size))) + { + /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for + variable VF; see vect_transform_slp_perm_load. 
*/ + unsigned int const_vf = vf.to_constant (); + unsigned int const_nunits = nunits.to_constant (); + vec_num = CEIL (group_size * const_vf, const_nunits); + group_gap_adj = vf * group_size - nunits * vec_num; + } else - vec_num = group_size; + { + vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + group_gap_adj = group_size - scalar_lanes; + } ref_type = get_group_alias_ptr_type (first_stmt_info); } @@ -10706,8 +10631,7 @@ vectorizable_load (vec_info *vinfo, group_size = vec_num = 1; group_gap_adj = 0; ref_type = reference_alias_ptr_type (DR_REF (first_dr_info->dr)); - if (slp) - vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); } gcc_assert (alignment_support_scheme); @@ -10909,14 +10833,8 @@ vectorizable_load (vec_info *vinfo, auto_vec<tree> vec_offsets; auto_vec<tree> vec_masks; if (mask && !costing_p) - { - if (slp_node) - vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[mask_index], - &vec_masks); - else - vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask, - &vec_masks, mask_vectype); - } + vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[mask_index], + &vec_masks); tree vec_mask = NULL_TREE; tree vec_els = NULL_TREE; @@ -10929,8 +10847,7 @@ vectorizable_load (vec_info *vinfo, /* For costing some adjacent vector loads, we'd like to cost with the total number of them once instead of cost each one by one. */ unsigned int n_adjacent_loads = 0; - if (slp_node) - ncopies = slp_node->vec_stmts_size / group_size; + int ncopies = slp_node->vec_stmts_size / group_size; for (j = 0; j < ncopies; j++) { if (costing_p) @@ -11053,32 +10970,17 @@ vectorizable_load (vec_info *vinfo, gimple_call_set_nothrow (call, true); vect_finish_stmt_generation (vinfo, stmt_info, call, gsi); - if (!slp) - dr_chain.create (group_size); /* Extract each vector into an SSA_NAME. */ for (unsigned i = 0; i < group_size; i++) { new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest, vec_array, i, need_zeroing, final_mask); - if (slp) - slp_node->push_vec_def (new_temp); - else - dr_chain.quick_push (new_temp); + slp_node->push_vec_def (new_temp); } - if (!slp) - /* Record the mapping between SSA_NAMEs and statements. */ - vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain); - /* Record that VEC_ARRAY is now dead. */ vect_clobber_variable (vinfo, stmt_info, gsi, vec_array); - - if (!slp) - dr_chain.release (); - - if (!slp_node) - *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; } if (costing_p) @@ -11105,364 +11007,328 @@ vectorizable_load (vec_info *vinfo, gcc_assert (!grouped_load && !slp_perm); unsigned int inside_cost = 0, prologue_cost = 0; - for (j = 0; j < ncopies; j++) + + /* 1. Create the vector or array pointer update chain. */ + if (!costing_p) { - /* 1. Create the vector or array pointer update chain. 
*/ - if (j == 0 && !costing_p) - { - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) - vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info, - slp_node, &gs_info, &dataref_ptr, - &vec_offsets); - else - dataref_ptr - = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type, - at_loop, offset, &dummy, gsi, - &ptr_incr, false, bump); - } - else if (!costing_p) + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info, + slp_node, &gs_info, &dataref_ptr, + &vec_offsets); + else + dataref_ptr + = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type, + at_loop, offset, &dummy, gsi, + &ptr_incr, false, bump); + } + + gimple *new_stmt = NULL; + for (i = 0; i < vec_num; i++) + { + tree final_mask = NULL_TREE; + tree final_len = NULL_TREE; + tree bias = NULL_TREE; + if (!costing_p) { - gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)); - if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + if (mask) + vec_mask = vec_masks[i]; + if (loop_masks) + final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, + vec_num, vectype, i); + if (vec_mask) + final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, + final_mask, vec_mask, gsi); + + if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, stmt_info, bump); } - gimple *new_stmt = NULL; - for (i = 0; i < vec_num; i++) + /* 2. Create the vector-load in the loop. */ + unsigned HOST_WIDE_INT align; + if (gs_info.ifn != IFN_LAST) { - tree final_mask = NULL_TREE; - tree final_len = NULL_TREE; - tree bias = NULL_TREE; - if (!costing_p) + if (costing_p) { - if (mask) - vec_mask = vec_masks[vec_num * j + i]; - if (loop_masks) - final_mask - = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, - vec_num * ncopies, vectype, - vec_num * j + i); - if (vec_mask) - final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, - final_mask, vec_mask, gsi); - - if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) - dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, - gsi, stmt_info, bump); + unsigned int cnunits = vect_nunits_for_cost (vectype); + inside_cost + = record_stmt_cost (cost_vec, cnunits, scalar_load, + slp_node, 0, vect_body); + continue; } + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + vec_offset = vec_offsets[i]; + tree zero = build_zero_cst (vectype); + tree scale = size_int (gs_info.scale); - /* 2. Create the vector-load in the loop. 
*/ - unsigned HOST_WIDE_INT align; - if (gs_info.ifn != IFN_LAST) + if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD) { - if (costing_p) - { - unsigned int cnunits = vect_nunits_for_cost (vectype); - inside_cost - = record_stmt_cost (cost_vec, cnunits, scalar_load, - slp_node, 0, vect_body); - continue; - } - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) - vec_offset = vec_offsets[vec_num * j + i]; - tree zero = build_zero_cst (vectype); - tree scale = size_int (gs_info.scale); - - if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD) + if (loop_lens) + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens, + vec_num, vectype, i, 1); + else + final_len = build_int_cst (sizetype, + TYPE_VECTOR_SUBPARTS (vectype)); + signed char biasval + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); + bias = build_int_cst (intQI_type_node, biasval); + if (!final_mask) { - if (loop_lens) - final_len - = vect_get_loop_len (loop_vinfo, gsi, loop_lens, - vec_num * ncopies, vectype, - vec_num * j + i, 1); - else - final_len - = build_int_cst (sizetype, - TYPE_VECTOR_SUBPARTS (vectype)); - signed char biasval - = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); - bias = build_int_cst (intQI_type_node, biasval); - if (!final_mask) - { - mask_vectype = truth_type_for (vectype); - final_mask = build_minus_one_cst (mask_vectype); - } + mask_vectype = truth_type_for (vectype); + final_mask = build_minus_one_cst (mask_vectype); } + } - if (final_mask) - { - vec_els = vect_get_mask_load_else - (maskload_elsval, vectype); - if (type_mode_padding_p - && maskload_elsval != MASK_LOAD_ELSE_ZERO) - need_zeroing = true; - } + if (final_mask) + { + vec_els = vect_get_mask_load_else (maskload_elsval, vectype); + if (type_mode_padding_p + && maskload_elsval != MASK_LOAD_ELSE_ZERO) + need_zeroing = true; + } - gcall *call; - if (final_len && final_mask) - { - if (VECTOR_TYPE_P (TREE_TYPE (vec_offset))) - call = gimple_build_call_internal ( - IFN_MASK_LEN_GATHER_LOAD, 8, dataref_ptr, vec_offset, - scale, zero, final_mask, vec_els, final_len, bias); - else - /* Non-vector offset indicates that prefer to take - MASK_LEN_STRIDED_LOAD instead of the - MASK_LEN_GATHER_LOAD with direct stride arg. */ - call = gimple_build_call_internal ( - IFN_MASK_LEN_STRIDED_LOAD, 7, dataref_ptr, vec_offset, - zero, final_mask, vec_els, final_len, bias); - } - else if (final_mask) - call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, - 6, dataref_ptr, - vec_offset, scale, - zero, final_mask, - vec_els); + gcall *call; + if (final_len && final_mask) + { + if (VECTOR_TYPE_P (TREE_TYPE (vec_offset))) + call = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, + 8, dataref_ptr, + vec_offset, scale, zero, + final_mask, vec_els, + final_len, bias); else - call = gimple_build_call_internal (IFN_GATHER_LOAD, 4, - dataref_ptr, vec_offset, - scale, zero); - gimple_call_set_nothrow (call, true); - new_stmt = call; + /* Non-vector offset indicates that prefer to take + MASK_LEN_STRIDED_LOAD instead of the + MASK_LEN_GATHER_LOAD with direct stride arg. 
*/ + call = gimple_build_call_internal + (IFN_MASK_LEN_STRIDED_LOAD, 7, dataref_ptr, + vec_offset, zero, final_mask, vec_els, final_len, + bias); + } + else if (final_mask) + call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, + 6, dataref_ptr, + vec_offset, scale, + zero, final_mask, vec_els); + else + call = gimple_build_call_internal (IFN_GATHER_LOAD, 4, + dataref_ptr, vec_offset, + scale, zero); + gimple_call_set_nothrow (call, true); + new_stmt = call; + data_ref = NULL_TREE; + } + else if (gs_info.decl) + { + /* The builtin decls path for gather is legacy, x86 only. */ + gcc_assert (!final_len && nunits.is_constant ()); + if (costing_p) + { + unsigned int cnunits = vect_nunits_for_cost (vectype); + inside_cost + = record_stmt_cost (cost_vec, cnunits, scalar_load, + slp_node, 0, vect_body); + continue; + } + poly_uint64 offset_nunits + = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype); + if (known_eq (nunits, offset_nunits)) + { + new_stmt = vect_build_one_gather_load_call + (vinfo, stmt_info, gsi, &gs_info, + dataref_ptr, vec_offsets[i], final_mask); data_ref = NULL_TREE; } - else if (gs_info.decl) + else if (known_eq (nunits, offset_nunits * 2)) { - /* The builtin decls path for gather is legacy, x86 only. */ - gcc_assert (!final_len && nunits.is_constant ()); - if (costing_p) - { - unsigned int cnunits = vect_nunits_for_cost (vectype); - inside_cost - = record_stmt_cost (cost_vec, cnunits, scalar_load, - slp_node, 0, vect_body); - continue; - } - poly_uint64 offset_nunits - = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype); - if (known_eq (nunits, offset_nunits)) - { - new_stmt = vect_build_one_gather_load_call - (vinfo, stmt_info, gsi, &gs_info, - dataref_ptr, vec_offsets[vec_num * j + i], - final_mask); - data_ref = NULL_TREE; - } - else if (known_eq (nunits, offset_nunits * 2)) - { - /* We have a offset vector with half the number of - lanes but the builtins will produce full vectype - data with just the lower lanes filled. */ - new_stmt = vect_build_one_gather_load_call - (vinfo, stmt_info, gsi, &gs_info, - dataref_ptr, vec_offsets[2 * vec_num * j + 2 * i], - final_mask); - tree low = make_ssa_name (vectype); - gimple_set_lhs (new_stmt, low); - vect_finish_stmt_generation (vinfo, stmt_info, - new_stmt, gsi); - - /* now put upper half of final_mask in final_mask low. 
*/ - if (final_mask - && !SCALAR_INT_MODE_P - (TYPE_MODE (TREE_TYPE (final_mask)))) - { - int count = nunits.to_constant (); - vec_perm_builder sel (count, count, 1); - sel.quick_grow (count); - for (int i = 0; i < count; ++i) - sel[i] = i | (count / 2); - vec_perm_indices indices (sel, 2, count); - tree perm_mask = vect_gen_perm_mask_checked - (TREE_TYPE (final_mask), indices); - new_stmt = gimple_build_assign (NULL_TREE, - VEC_PERM_EXPR, - final_mask, - final_mask, - perm_mask); - final_mask = make_ssa_name (TREE_TYPE (final_mask)); - gimple_set_lhs (new_stmt, final_mask); - vect_finish_stmt_generation (vinfo, stmt_info, - new_stmt, gsi); - } - else if (final_mask) - { - new_stmt = gimple_build_assign (NULL_TREE, - VEC_UNPACK_HI_EXPR, - final_mask); - final_mask = make_ssa_name - (truth_type_for (gs_info.offset_vectype)); - gimple_set_lhs (new_stmt, final_mask); - vect_finish_stmt_generation (vinfo, stmt_info, - new_stmt, gsi); - } - - new_stmt = vect_build_one_gather_load_call - (vinfo, stmt_info, gsi, &gs_info, - dataref_ptr, - vec_offsets[2 * vec_num * j + 2 * i + 1], - final_mask); - tree high = make_ssa_name (vectype); - gimple_set_lhs (new_stmt, high); - vect_finish_stmt_generation (vinfo, stmt_info, - new_stmt, gsi); + /* We have a offset vector with half the number of + lanes but the builtins will produce full vectype + data with just the lower lanes filled. */ + new_stmt = vect_build_one_gather_load_call + (vinfo, stmt_info, gsi, &gs_info, + dataref_ptr, vec_offsets[2 * i], final_mask); + tree low = make_ssa_name (vectype); + gimple_set_lhs (new_stmt, low); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); - /* compose low + high. */ + /* now put upper half of final_mask in final_mask low. */ + if (final_mask + && !SCALAR_INT_MODE_P (TYPE_MODE (TREE_TYPE (final_mask)))) + { int count = nunits.to_constant (); vec_perm_builder sel (count, count, 1); sel.quick_grow (count); for (int i = 0; i < count; ++i) - sel[i] = i < count / 2 ? i : i + count / 2; + sel[i] = i | (count / 2); vec_perm_indices indices (sel, 2, count); - tree perm_mask - = vect_gen_perm_mask_checked (vectype, indices); - new_stmt = gimple_build_assign (NULL_TREE, - VEC_PERM_EXPR, - low, high, perm_mask); - data_ref = NULL_TREE; + tree perm_mask = vect_gen_perm_mask_checked + (TREE_TYPE (final_mask), indices); + new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR, + final_mask, final_mask, + perm_mask); + final_mask = make_ssa_name (TREE_TYPE (final_mask)); + gimple_set_lhs (new_stmt, final_mask); + vect_finish_stmt_generation (vinfo, stmt_info, + new_stmt, gsi); } - else if (known_eq (nunits * 2, offset_nunits)) + else if (final_mask) { - /* We have a offset vector with double the number of - lanes. Select the low/high part accordingly. 
*/ - vec_offset = vec_offsets[(vec_num * j + i) / 2]; - if ((vec_num * j + i) & 1) - { - int count = offset_nunits.to_constant (); - vec_perm_builder sel (count, count, 1); - sel.quick_grow (count); - for (int i = 0; i < count; ++i) - sel[i] = i | (count / 2); - vec_perm_indices indices (sel, 2, count); - tree perm_mask = vect_gen_perm_mask_checked - (TREE_TYPE (vec_offset), indices); - new_stmt = gimple_build_assign (NULL_TREE, - VEC_PERM_EXPR, - vec_offset, - vec_offset, - perm_mask); - vec_offset = make_ssa_name (TREE_TYPE (vec_offset)); - gimple_set_lhs (new_stmt, vec_offset); - vect_finish_stmt_generation (vinfo, stmt_info, - new_stmt, gsi); - } - new_stmt = vect_build_one_gather_load_call - (vinfo, stmt_info, gsi, &gs_info, - dataref_ptr, vec_offset, final_mask); - data_ref = NULL_TREE; + new_stmt = gimple_build_assign (NULL_TREE, + VEC_UNPACK_HI_EXPR, + final_mask); + final_mask = make_ssa_name + (truth_type_for (gs_info.offset_vectype)); + gimple_set_lhs (new_stmt, final_mask); + vect_finish_stmt_generation (vinfo, stmt_info, + new_stmt, gsi); } - else - gcc_unreachable (); + + new_stmt = vect_build_one_gather_load_call + (vinfo, stmt_info, gsi, &gs_info, dataref_ptr, + vec_offsets[2 * i + 1], final_mask); + tree high = make_ssa_name (vectype); + gimple_set_lhs (new_stmt, high); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + + /* compose low + high. */ + int count = nunits.to_constant (); + vec_perm_builder sel (count, count, 1); + sel.quick_grow (count); + for (int i = 0; i < count; ++i) + sel[i] = i < count / 2 ? i : i + count / 2; + vec_perm_indices indices (sel, 2, count); + tree perm_mask + = vect_gen_perm_mask_checked (vectype, indices); + new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR, + low, high, perm_mask); + data_ref = NULL_TREE; } - else + else if (known_eq (nunits * 2, offset_nunits)) { - /* Emulated gather-scatter. */ - gcc_assert (!final_mask); - unsigned HOST_WIDE_INT const_nunits = nunits.to_constant (); - if (costing_p) + /* We have a offset vector with double the number of + lanes. Select the low/high part accordingly. */ + vec_offset = vec_offsets[i / 2]; + if (i & 1) { - /* For emulated gathers N offset vector element - offset add is consumed by the load). */ - inside_cost = record_stmt_cost (cost_vec, const_nunits, - vec_to_scalar, - slp_node, 0, vect_body); - /* N scalar loads plus gathering them into a - vector. */ - inside_cost - = record_stmt_cost (cost_vec, const_nunits, scalar_load, - slp_node, 0, vect_body); - inside_cost - = record_stmt_cost (cost_vec, 1, vec_construct, - slp_node, 0, vect_body); - continue; - } - unsigned HOST_WIDE_INT const_offset_nunits - = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype) - .to_constant (); - vec<constructor_elt, va_gc> *ctor_elts; - vec_alloc (ctor_elts, const_nunits); - gimple_seq stmts = NULL; - /* We support offset vectors with more elements - than the data vector for now. 
*/ - unsigned HOST_WIDE_INT factor - = const_offset_nunits / const_nunits; - vec_offset = vec_offsets[(vec_num * j + i) / factor]; - unsigned elt_offset - = ((vec_num * j + i) % factor) * const_nunits; - tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset)); - tree scale = size_int (gs_info.scale); - align = get_object_alignment (DR_REF (first_dr_info->dr)); - tree ltype = build_aligned_type (TREE_TYPE (vectype), align); - for (unsigned k = 0; k < const_nunits; ++k) - { - tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type), - bitsize_int (k + elt_offset)); - tree idx - = gimple_build (&stmts, BIT_FIELD_REF, idx_type, - vec_offset, TYPE_SIZE (idx_type), boff); - idx = gimple_convert (&stmts, sizetype, idx); - idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx, - scale); - tree ptr = gimple_build (&stmts, PLUS_EXPR, - TREE_TYPE (dataref_ptr), - dataref_ptr, idx); - ptr = gimple_convert (&stmts, ptr_type_node, ptr); - tree elt = make_ssa_name (TREE_TYPE (vectype)); - tree ref = build2 (MEM_REF, ltype, ptr, - build_int_cst (ref_type, 0)); - new_stmt = gimple_build_assign (elt, ref); - gimple_set_vuse (new_stmt, gimple_vuse (gsi_stmt (*gsi))); - gimple_seq_add_stmt (&stmts, new_stmt); - CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt); + int count = offset_nunits.to_constant (); + vec_perm_builder sel (count, count, 1); + sel.quick_grow (count); + for (int i = 0; i < count; ++i) + sel[i] = i | (count / 2); + vec_perm_indices indices (sel, 2, count); + tree perm_mask = vect_gen_perm_mask_checked + (TREE_TYPE (vec_offset), indices); + new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR, + vec_offset, vec_offset, + perm_mask); + vec_offset = make_ssa_name (TREE_TYPE (vec_offset)); + gimple_set_lhs (new_stmt, vec_offset); + vect_finish_stmt_generation (vinfo, stmt_info, + new_stmt, gsi); } - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - new_stmt = gimple_build_assign ( - NULL_TREE, build_constructor (vectype, ctor_elts)); + new_stmt = vect_build_one_gather_load_call + (vinfo, stmt_info, gsi, &gs_info, + dataref_ptr, vec_offset, final_mask); data_ref = NULL_TREE; } - - vec_dest = vect_create_destination_var (scalar_dest, vectype); - /* DATA_REF is null if we've already built the statement. */ - if (data_ref) + else + gcc_unreachable (); + } + else + { + /* Emulated gather-scatter. */ + gcc_assert (!final_mask); + unsigned HOST_WIDE_INT const_nunits = nunits.to_constant (); + if (costing_p) { - vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); - new_stmt = gimple_build_assign (vec_dest, data_ref); + /* For emulated gathers N offset vector element + offset add is consumed by the load). */ + inside_cost = record_stmt_cost (cost_vec, const_nunits, + vec_to_scalar, + slp_node, 0, vect_body); + /* N scalar loads plus gathering them into a + vector. */ + inside_cost + = record_stmt_cost (cost_vec, const_nunits, scalar_load, + slp_node, 0, vect_body); + inside_cost + = record_stmt_cost (cost_vec, 1, vec_construct, + slp_node, 0, vect_body); + continue; } - new_temp = need_zeroing - ? make_ssa_name (vectype) - : make_ssa_name (vec_dest, new_stmt); - gimple_set_lhs (new_stmt, new_temp); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); - - /* If we need to explicitly zero inactive elements emit a - VEC_COND_EXPR that does so. 
*/ - if (need_zeroing) + unsigned HOST_WIDE_INT const_offset_nunits + = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype) .to_constant (); + vec<constructor_elt, va_gc> *ctor_elts; + vec_alloc (ctor_elts, const_nunits); + gimple_seq stmts = NULL; + /* We support offset vectors with more elements + than the data vector for now. */ + unsigned HOST_WIDE_INT factor + = const_offset_nunits / const_nunits; + vec_offset = vec_offsets[i / factor]; + unsigned elt_offset = (i % factor) * const_nunits; + tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset)); + tree scale = size_int (gs_info.scale); + align = get_object_alignment (DR_REF (first_dr_info->dr)); + tree ltype = build_aligned_type (TREE_TYPE (vectype), align); + for (unsigned k = 0; k < const_nunits; ++k) { - vec_els = vect_get_mask_load_else (MASK_LOAD_ELSE_ZERO, - vectype); - - tree new_temp2 = make_ssa_name (vec_dest, new_stmt); - new_stmt - = gimple_build_assign (new_temp2, VEC_COND_EXPR, - final_mask, new_temp, vec_els); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, - gsi); - new_temp = new_temp2; + tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type), + bitsize_int (k + elt_offset)); + tree idx = gimple_build (&stmts, BIT_FIELD_REF, idx_type, + vec_offset, TYPE_SIZE (idx_type), + boff); + idx = gimple_convert (&stmts, sizetype, idx); + idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx, scale); + tree ptr = gimple_build (&stmts, PLUS_EXPR, + TREE_TYPE (dataref_ptr), + dataref_ptr, idx); + ptr = gimple_convert (&stmts, ptr_type_node, ptr); + tree elt = make_ssa_name (TREE_TYPE (vectype)); + tree ref = build2 (MEM_REF, ltype, ptr, + build_int_cst (ref_type, 0)); + new_stmt = gimple_build_assign (elt, ref); + gimple_set_vuse (new_stmt, gimple_vuse (gsi_stmt (*gsi))); + gimple_seq_add_stmt (&stmts, new_stmt); + CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt); } - - /* Store vector loads in the corresponding SLP_NODE. */ - if (slp) - slp_node->push_vec_def (new_stmt); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + new_stmt = gimple_build_assign (NULL_TREE, + build_constructor (vectype, + ctor_elts)); + data_ref = NULL_TREE; } - if (!slp && !costing_p) - STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); - } + vec_dest = vect_create_destination_var (scalar_dest, vectype); + /* DATA_REF is null if we've already built the statement. */ + if (data_ref) + { + vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); + new_stmt = gimple_build_assign (vec_dest, data_ref); + } + new_temp = (need_zeroing + ? make_ssa_name (vectype) + : make_ssa_name (vec_dest, new_stmt)); + gimple_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); - if (!slp && !costing_p) - *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; + /* If we need to explicitly zero inactive elements emit a + VEC_COND_EXPR that does so. */ + if (need_zeroing) + { + vec_els = vect_get_mask_load_else (MASK_LOAD_ELSE_ZERO, + vectype); + + tree new_temp2 = make_ssa_name (vec_dest, new_stmt); + new_stmt = gimple_build_assign (new_temp2, VEC_COND_EXPR, + final_mask, new_temp, vec_els); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + new_temp = new_temp2; + } + + /* Store vector loads in the corresponding SLP_NODE. 
*/ + slp_node->push_vec_def (new_stmt); + } if (costing_p && dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -11477,273 +11343,293 @@ vectorizable_load (vec_info *vinfo, /* For costing some adjacent vector loads, we'd like to cost with the total number of them once instead of cost each one by one. */ unsigned int n_adjacent_loads = 0; - for (j = 0; j < ncopies; j++) + + /* 1. Create the vector or array pointer update chain. */ + if (!costing_p) { - /* 1. Create the vector or array pointer update chain. */ - if (j == 0 && !costing_p) - { - bool simd_lane_access_p - = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) != 0; - if (simd_lane_access_p - && TREE_CODE (DR_BASE_ADDRESS (first_dr_info->dr)) == ADDR_EXPR - && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (first_dr_info->dr), 0)) - && integer_zerop (get_dr_vinfo_offset (vinfo, first_dr_info)) - && integer_zerop (DR_INIT (first_dr_info->dr)) - && alias_sets_conflict_p (get_alias_set (aggr_type), - get_alias_set (TREE_TYPE (ref_type))) - && (alignment_support_scheme == dr_aligned - || alignment_support_scheme == dr_unaligned_supported)) - { - dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr)); - dataref_offset = build_int_cst (ref_type, 0); - } - else if (diff_first_stmt_info) - { - dataref_ptr - = vect_create_data_ref_ptr (vinfo, first_stmt_info_for_drptr, - aggr_type, at_loop, offset, &dummy, - gsi, &ptr_incr, simd_lane_access_p, - bump); - /* Adjust the pointer by the difference to first_stmt. */ - data_reference_p ptrdr - = STMT_VINFO_DATA_REF (first_stmt_info_for_drptr); - tree diff - = fold_convert (sizetype, - size_binop (MINUS_EXPR, - DR_INIT (first_dr_info->dr), - DR_INIT (ptrdr))); - dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, - stmt_info, diff); - if (alignment_support_scheme == dr_explicit_realign) - { - msq = vect_setup_realignment (vinfo, - first_stmt_info_for_drptr, gsi, - &realignment_token, - alignment_support_scheme, - dataref_ptr, &at_loop); - gcc_assert (!compute_in_loop); - } + bool simd_lane_access_p + = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) != 0; + if (simd_lane_access_p + && TREE_CODE (DR_BASE_ADDRESS (first_dr_info->dr)) == ADDR_EXPR + && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (first_dr_info->dr), 0)) + && integer_zerop (get_dr_vinfo_offset (vinfo, first_dr_info)) + && integer_zerop (DR_INIT (first_dr_info->dr)) + && alias_sets_conflict_p (get_alias_set (aggr_type), + get_alias_set (TREE_TYPE (ref_type))) + && (alignment_support_scheme == dr_aligned + || alignment_support_scheme == dr_unaligned_supported)) + { + dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr)); + dataref_offset = build_int_cst (ref_type, 0); + } + else if (diff_first_stmt_info) + { + dataref_ptr + = vect_create_data_ref_ptr (vinfo, first_stmt_info_for_drptr, + aggr_type, at_loop, offset, &dummy, + gsi, &ptr_incr, simd_lane_access_p, + bump); + /* Adjust the pointer by the difference to first_stmt. 
*/ + data_reference_p ptrdr + = STMT_VINFO_DATA_REF (first_stmt_info_for_drptr); + tree diff = fold_convert (sizetype, + size_binop (MINUS_EXPR, + DR_INIT (first_dr_info->dr), + DR_INIT (ptrdr))); + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, + stmt_info, diff); + if (alignment_support_scheme == dr_explicit_realign) + { + msq = vect_setup_realignment (vinfo, + first_stmt_info_for_drptr, gsi, + &realignment_token, + alignment_support_scheme, + dataref_ptr, &at_loop); + gcc_assert (!compute_in_loop); } - else - dataref_ptr - = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type, - at_loop, - offset, &dummy, gsi, &ptr_incr, - simd_lane_access_p, bump); - } - else if (!costing_p) - { - gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)); - if (dataref_offset) - dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, - bump); - else - dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, - stmt_info, bump); } + else + dataref_ptr + = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type, + at_loop, + offset, &dummy, gsi, &ptr_incr, + simd_lane_access_p, bump); + } + else if (!costing_p) + { + gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)); + if (dataref_offset) + dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump); + else + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, + stmt_info, bump); + } - if (grouped_load || slp_perm) - dr_chain.create (vec_num); + if (grouped_load || slp_perm) + dr_chain.create (vec_num); - gimple *new_stmt = NULL; - for (i = 0; i < vec_num; i++) + gimple *new_stmt = NULL; + for (i = 0; i < vec_num; i++) + { + tree final_mask = NULL_TREE; + tree final_len = NULL_TREE; + tree bias = NULL_TREE; + + if (!costing_p) { - tree final_mask = NULL_TREE; - tree final_len = NULL_TREE; - tree bias = NULL_TREE; + if (mask) + vec_mask = vec_masks[i]; + if (loop_masks) + final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, + vec_num, vectype, i); + if (vec_mask) + final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, + final_mask, vec_mask, gsi); - if (!costing_p) - { - if (mask) - vec_mask = vec_masks[vec_num * j + i]; - if (loop_masks) - final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, - vec_num * ncopies, vectype, - vec_num * j + i); - if (vec_mask) - final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, - final_mask, vec_mask, gsi); + if (i > 0) + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, + gsi, stmt_info, bump); + } - if (i > 0) - dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, - gsi, stmt_info, bump); - } + /* 2. Create the vector-load in the loop. */ + switch (alignment_support_scheme) + { + case dr_aligned: + case dr_unaligned_supported: + { + if (costing_p) + break; - /* 2. Create the vector-load in the loop. 
*/ - switch (alignment_support_scheme) - { - case dr_aligned: - case dr_unaligned_supported: + unsigned int misalign; + unsigned HOST_WIDE_INT align; + align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info)); + if (alignment_support_scheme == dr_aligned) + misalign = 0; + else if (misalignment == DR_MISALIGNMENT_UNKNOWN) { - if (costing_p) - break; - - unsigned int misalign; - unsigned HOST_WIDE_INT align; - align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info)); - if (alignment_support_scheme == dr_aligned) - misalign = 0; - else if (misalignment == DR_MISALIGNMENT_UNKNOWN) - { - align - = dr_alignment (vect_dr_behavior (vinfo, first_dr_info)); - misalign = 0; - } - else - misalign = misalignment; - if (dataref_offset == NULL_TREE - && TREE_CODE (dataref_ptr) == SSA_NAME) - set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, - misalign); - align = least_bit_hwi (misalign | align); - - /* Compute IFN when LOOP_LENS or final_mask valid. */ - machine_mode vmode = TYPE_MODE (vectype); - machine_mode new_vmode = vmode; - internal_fn partial_ifn = IFN_LAST; - if (loop_lens) - { - opt_machine_mode new_ovmode - = get_len_load_store_mode (vmode, true, &partial_ifn); - new_vmode = new_ovmode.require (); - unsigned factor - = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode); - final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens, - vec_num * ncopies, vectype, - vec_num * j + i, factor); - } - else if (final_mask) - { - if (!can_vec_mask_load_store_p ( - vmode, TYPE_MODE (TREE_TYPE (final_mask)), true, - &partial_ifn)) - gcc_unreachable (); - } + align = dr_alignment (vect_dr_behavior (vinfo, first_dr_info)); + misalign = 0; + } + else + misalign = misalignment; + if (dataref_offset == NULL_TREE + && TREE_CODE (dataref_ptr) == SSA_NAME) + set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, + misalign); + align = least_bit_hwi (misalign | align); + + /* Compute IFN when LOOP_LENS or final_mask valid. */ + machine_mode vmode = TYPE_MODE (vectype); + machine_mode new_vmode = vmode; + internal_fn partial_ifn = IFN_LAST; + if (loop_lens) + { + opt_machine_mode new_ovmode + = get_len_load_store_mode (vmode, true, &partial_ifn); + new_vmode = new_ovmode.require (); + unsigned factor + = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode); + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens, + vec_num, vectype, i, factor); + } + else if (final_mask) + { + if (!can_vec_mask_load_store_p (vmode, + TYPE_MODE + (TREE_TYPE (final_mask)), + true, &partial_ifn)) + gcc_unreachable (); + } - if (partial_ifn == IFN_MASK_LEN_LOAD) + if (partial_ifn == IFN_MASK_LEN_LOAD) + { + if (!final_len) { - if (!final_len) - { - /* Pass VF value to 'len' argument of - MASK_LEN_LOAD if LOOP_LENS is invalid. */ - final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype)); - } - if (!final_mask) - { - /* Pass all ones value to 'mask' argument of - MASK_LEN_LOAD if final_mask is invalid. */ - mask_vectype = truth_type_for (vectype); - final_mask = build_minus_one_cst (mask_vectype); - } + /* Pass VF value to 'len' argument of + MASK_LEN_LOAD if LOOP_LENS is invalid. */ + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype)); } - if (final_len) + if (!final_mask) { - signed char biasval - = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); - - bias = build_int_cst (intQI_type_node, biasval); + /* Pass all ones value to 'mask' argument of + MASK_LEN_LOAD if final_mask is invalid. 
*/ + mask_vectype = truth_type_for (vectype); + final_mask = build_minus_one_cst (mask_vectype); } + } + if (final_len) + { + signed char biasval + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); + bias = build_int_cst (intQI_type_node, biasval); + } - tree vec_els; + tree vec_els; - if (final_len) - { - tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT); - gcall *call; - if (partial_ifn == IFN_MASK_LEN_LOAD) - { - vec_els = vect_get_mask_load_else - (maskload_elsval, vectype); - if (type_mode_padding_p - && maskload_elsval != MASK_LOAD_ELSE_ZERO) - need_zeroing = true; - call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, - 6, dataref_ptr, ptr, - final_mask, vec_els, - final_len, bias); - } - else - call = gimple_build_call_internal (IFN_LEN_LOAD, 4, - dataref_ptr, ptr, - final_len, bias); - gimple_call_set_nothrow (call, true); - new_stmt = call; - data_ref = NULL_TREE; - - /* Need conversion if it's wrapped with VnQI. */ - if (vmode != new_vmode) - { - tree new_vtype = build_vector_type_for_mode ( - unsigned_intQI_type_node, new_vmode); - tree var - = vect_get_new_ssa_name (new_vtype, vect_simple_var); - gimple_set_lhs (call, var); - vect_finish_stmt_generation (vinfo, stmt_info, call, - gsi); - tree op = build1 (VIEW_CONVERT_EXPR, vectype, var); - new_stmt = gimple_build_assign (vec_dest, - VIEW_CONVERT_EXPR, op); - } - } - else if (final_mask) + if (final_len) + { + tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT); + gcall *call; + if (partial_ifn == IFN_MASK_LEN_LOAD) { - tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT); - vec_els = vect_get_mask_load_else - (maskload_elsval, vectype); + vec_els = vect_get_mask_load_else (maskload_elsval, + vectype); if (type_mode_padding_p && maskload_elsval != MASK_LOAD_ELSE_ZERO) need_zeroing = true; - gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 4, - dataref_ptr, ptr, - final_mask, - vec_els); - gimple_call_set_nothrow (call, true); - new_stmt = call; - data_ref = NULL_TREE; + call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, + 6, dataref_ptr, ptr, + final_mask, vec_els, + final_len, bias); } else + call = gimple_build_call_internal (IFN_LEN_LOAD, 4, + dataref_ptr, ptr, + final_len, bias); + gimple_call_set_nothrow (call, true); + new_stmt = call; + data_ref = NULL_TREE; + + /* Need conversion if it's wrapped with VnQI. */ + if (vmode != new_vmode) { - tree ltype = vectype; - tree new_vtype = NULL_TREE; - unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info); - unsigned HOST_WIDE_INT dr_size - = vect_get_scalar_dr_size (first_dr_info); - poly_int64 off = 0; - if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) - off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size; - unsigned int vect_align - = vect_known_alignment_in_bytes (first_dr_info, vectype, - off); - /* Try to use a single smaller load when we are about - to load excess elements compared to the unrolled - scalar loop. 
*/ - if (known_gt ((vec_num * j + i + 1) * nunits, - (group_size * vf - gap))) + tree new_vtype + = build_vector_type_for_mode (unsigned_intQI_type_node, + new_vmode); + tree var = vect_get_new_ssa_name (new_vtype, + vect_simple_var); + gimple_set_lhs (call, var); + vect_finish_stmt_generation (vinfo, stmt_info, call, + gsi); + tree op = build1 (VIEW_CONVERT_EXPR, vectype, var); + new_stmt = gimple_build_assign (vec_dest, + VIEW_CONVERT_EXPR, op); + } + } + else if (final_mask) + { + tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT); + vec_els = vect_get_mask_load_else (maskload_elsval, vectype); + if (type_mode_padding_p + && maskload_elsval != MASK_LOAD_ELSE_ZERO) + need_zeroing = true; + gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 4, + dataref_ptr, ptr, + final_mask, + vec_els); + gimple_call_set_nothrow (call, true); + new_stmt = call; + data_ref = NULL_TREE; + } + else + { + tree ltype = vectype; + tree new_vtype = NULL_TREE; + unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info); + unsigned HOST_WIDE_INT dr_size + = vect_get_scalar_dr_size (first_dr_info); + poly_int64 off = 0; + if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) + off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size; + unsigned int vect_align + = vect_known_alignment_in_bytes (first_dr_info, vectype, + off); + /* Try to use a single smaller load when we are about + to load excess elements compared to the unrolled + scalar loop. */ + if (known_gt ((i + 1) * nunits, + (group_size * vf - gap))) + { + poly_uint64 remain = ((group_size * vf - gap) - i * nunits); + if (known_ge ((i + 1) * nunits - (group_size * vf - gap), + nunits)) + /* DR will be unused. */ + ltype = NULL_TREE; + else if (known_ge (vect_align, + tree_to_poly_uint64 + (TYPE_SIZE_UNIT (vectype)))) + /* Aligned access to excess elements is OK if + at least one element is accessed in the + scalar loop. */ + ; + else if (known_gt (vect_align, + ((nunits - remain) * dr_size))) + /* Aligned access to the gap area when there's + at least one element in it is OK. */ + ; + else { - poly_uint64 remain = ((group_size * vf - gap) - - (vec_num * j + i) * nunits); - if (known_ge ((vec_num * j + i + 1) * nunits - - (group_size * vf - gap), nunits)) - /* DR will be unused. */ - ltype = NULL_TREE; - else if (known_ge (vect_align, - tree_to_poly_uint64 - (TYPE_SIZE_UNIT (vectype)))) - /* Aligned access to excess elements is OK if - at least one element is accessed in the - scalar loop. */ - ; - else if (known_gt (vect_align, - ((nunits - remain) * dr_size))) - /* Aligned access to the gap area when there's - at least one element in it is OK. */ - ; - else + /* remain should now be > 0 and < nunits. */ + unsigned num; + if (known_ne (remain, 0u) + && constant_multiple_p (nunits, remain, &num)) + { + tree ptype; + new_vtype + = vector_vector_composition_type (vectype, num, + &ptype); + if (new_vtype) + ltype = ptype; + } + /* Else use multiple loads or a masked load? */ + /* For loop vectorization we now should have + an alternate type or LOOP_VINFO_PEELING_FOR_GAPS + set. */ + if (loop_vinfo) + gcc_assert (new_vtype + || LOOP_VINFO_PEELING_FOR_GAPS + (loop_vinfo)); + /* But still reduce the access size to the next + required power-of-two so peeling a single + scalar iteration is sufficient. */ + unsigned HOST_WIDE_INT cremain; + if (remain.is_constant (&cremain)) { - /* remain should now be > 0 and < nunits. 
*/ - unsigned num; - if (known_ne (remain, 0u) - && constant_multiple_p (nunits, remain, &num)) + unsigned HOST_WIDE_INT cpart_size + = 1 << ceil_log2 (cremain); + if (known_gt (nunits, cpart_size) + && constant_multiple_p (nunits, cpart_size, + &num)) { tree ptype; new_vtype @@ -11753,333 +11639,289 @@ vectorizable_load (vec_info *vinfo, if (new_vtype) ltype = ptype; } - /* Else use multiple loads or a masked load? */ - /* For loop vectorization we now should have - an alternate type or LOOP_VINFO_PEELING_FOR_GAPS - set. */ - if (loop_vinfo) - gcc_assert (new_vtype - || LOOP_VINFO_PEELING_FOR_GAPS - (loop_vinfo)); - /* But still reduce the access size to the next - required power-of-two so peeling a single - scalar iteration is sufficient. */ - unsigned HOST_WIDE_INT cremain; - if (remain.is_constant (&cremain)) - { - unsigned HOST_WIDE_INT cpart_size - = 1 << ceil_log2 (cremain); - if (known_gt (nunits, cpart_size) - && constant_multiple_p (nunits, cpart_size, - &num)) - { - tree ptype; - new_vtype - = vector_vector_composition_type (vectype, - num, - &ptype); - if (new_vtype) - ltype = ptype; - } - } } } - tree offset - = (dataref_offset ? dataref_offset - : build_int_cst (ref_type, 0)); - if (!ltype) + } + tree offset = (dataref_offset ? dataref_offset + : build_int_cst (ref_type, 0)); + if (!ltype) + ; + else if (ltype != vectype + && memory_access_type == VMAT_CONTIGUOUS_REVERSE) + { + poly_uint64 gap_offset + = (tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype)) + - tree_to_poly_uint64 (TYPE_SIZE_UNIT (ltype))); + tree gapcst = build_int_cstu (ref_type, gap_offset); + offset = size_binop (PLUS_EXPR, offset, gapcst); + } + if (ltype) + { + data_ref = fold_build2 (MEM_REF, ltype, + dataref_ptr, offset); + if (alignment_support_scheme == dr_aligned) ; - else if (ltype != vectype - && memory_access_type == VMAT_CONTIGUOUS_REVERSE) + else + TREE_TYPE (data_ref) + = build_aligned_type (TREE_TYPE (data_ref), + align * BITS_PER_UNIT); + } + if (!ltype) + data_ref = build_constructor (vectype, NULL); + else if (ltype != vectype) + { + vect_copy_ref_info (data_ref, + DR_REF (first_dr_info->dr)); + tree tem = make_ssa_name (ltype); + new_stmt = gimple_build_assign (tem, data_ref); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, + gsi); + data_ref = NULL; + vec<constructor_elt, va_gc> *v; + /* We've computed 'num' above to statically two + or via constant_multiple_p. 
*/ + unsigned num + = (exact_div (tree_to_poly_uint64 + (TYPE_SIZE_UNIT (vectype)), + tree_to_poly_uint64 + (TYPE_SIZE_UNIT (ltype))) + .to_constant ()); + vec_alloc (v, num); + if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) { - poly_uint64 gap_offset - = (tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype)) - - tree_to_poly_uint64 (TYPE_SIZE_UNIT (ltype))); - tree gapcst = build_int_cstu (ref_type, gap_offset); - offset = size_binop (PLUS_EXPR, offset, gapcst); + while (--num) + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, + build_zero_cst (ltype)); + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); } - if (ltype) + else { - data_ref - = fold_build2 (MEM_REF, ltype, dataref_ptr, offset); - if (alignment_support_scheme == dr_aligned) - ; - else - TREE_TYPE (data_ref) - = build_aligned_type (TREE_TYPE (data_ref), - align * BITS_PER_UNIT); + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); + while (--num) + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, + build_zero_cst (ltype)); } - if (!ltype) - data_ref = build_constructor (vectype, NULL); - else if (ltype != vectype) + gcc_assert (new_vtype != NULL_TREE); + if (new_vtype == vectype) + new_stmt + = gimple_build_assign (vec_dest, + build_constructor (vectype, v)); + else { - vect_copy_ref_info (data_ref, - DR_REF (first_dr_info->dr)); - tree tem = make_ssa_name (ltype); - new_stmt = gimple_build_assign (tem, data_ref); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, - gsi); - data_ref = NULL; - vec<constructor_elt, va_gc> *v; - /* We've computed 'num' above to statically two - or via constant_multiple_p. */ - unsigned num - = (exact_div (tree_to_poly_uint64 - (TYPE_SIZE_UNIT (vectype)), - tree_to_poly_uint64 - (TYPE_SIZE_UNIT (ltype))) - .to_constant ()); - vec_alloc (v, num); - if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) - { - while (--num) - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, - build_zero_cst (ltype)); - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); - } - else - { - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); - while (--num) - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, - build_zero_cst (ltype)); - } - gcc_assert (new_vtype != NULL_TREE); - if (new_vtype == vectype) - new_stmt = gimple_build_assign ( - vec_dest, build_constructor (vectype, v)); - else - { - tree new_vname = make_ssa_name (new_vtype); - new_stmt = gimple_build_assign ( - new_vname, build_constructor (new_vtype, v)); - vect_finish_stmt_generation (vinfo, stmt_info, - new_stmt, gsi); - new_stmt = gimple_build_assign ( - vec_dest, - build1 (VIEW_CONVERT_EXPR, vectype, new_vname)); - } + tree new_vname = make_ssa_name (new_vtype); + new_stmt + = gimple_build_assign (new_vname, + build_constructor (new_vtype, + v)); + vect_finish_stmt_generation (vinfo, stmt_info, + new_stmt, gsi); + new_stmt + = gimple_build_assign (vec_dest, + build1 (VIEW_CONVERT_EXPR, + vectype, new_vname)); } } - break; } - case dr_explicit_realign: - { - if (costing_p) - break; - tree ptr, bump; - - tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype)); - - if (compute_in_loop) - msq = vect_setup_realignment (vinfo, first_stmt_info, gsi, - &realignment_token, - dr_explicit_realign, - dataref_ptr, NULL); + break; + } + case dr_explicit_realign: + { + if (costing_p) + break; + tree ptr, bump; + + tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype)); + + if (compute_in_loop) + msq = vect_setup_realignment (vinfo, first_stmt_info, gsi, + &realignment_token, + dr_explicit_realign, + dataref_ptr, NULL); + + if (TREE_CODE (dataref_ptr) == SSA_NAME) + ptr = copy_ssa_name (dataref_ptr); + else + ptr = make_ssa_name (TREE_TYPE 
(dataref_ptr)); + // For explicit realign the target alignment should be + // known at compile time. + unsigned HOST_WIDE_INT align + = DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); + new_stmt = gimple_build_assign (ptr, BIT_AND_EXPR, dataref_ptr, + build_int_cst + (TREE_TYPE (dataref_ptr), + -(HOST_WIDE_INT) align)); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + data_ref = build2 (MEM_REF, vectype, + ptr, build_int_cst (ref_type, 0)); + vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); + vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_stmt = gimple_build_assign (vec_dest, data_ref); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); + gimple_move_vops (new_stmt, stmt_info->stmt); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + msq = new_temp; + + bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type)); + bump = size_binop (MINUS_EXPR, bump, size_one_node); + ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info, + bump); + new_stmt = gimple_build_assign (NULL_TREE, BIT_AND_EXPR, ptr, + build_int_cst (TREE_TYPE (ptr), + -(HOST_WIDE_INT) align)); + if (TREE_CODE (ptr) == SSA_NAME) + ptr = copy_ssa_name (ptr, new_stmt); + else + ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt); + gimple_assign_set_lhs (new_stmt, ptr); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + data_ref = build2 (MEM_REF, vectype, + ptr, build_int_cst (ref_type, 0)); + break; + } + case dr_explicit_realign_optimized: + { + if (costing_p) + break; + if (TREE_CODE (dataref_ptr) == SSA_NAME) + new_temp = copy_ssa_name (dataref_ptr); + else + new_temp = make_ssa_name (TREE_TYPE (dataref_ptr)); + // We should only be doing this if we know the target + // alignment at compile time. + unsigned HOST_WIDE_INT align + = DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); + new_stmt = gimple_build_assign (new_temp, BIT_AND_EXPR, dataref_ptr, + build_int_cst (TREE_TYPE (dataref_ptr), + -(HOST_WIDE_INT) align)); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + data_ref = build2 (MEM_REF, vectype, new_temp, + build_int_cst (ref_type, 0)); + break; + } + default: + gcc_unreachable (); + } - if (TREE_CODE (dataref_ptr) == SSA_NAME) - ptr = copy_ssa_name (dataref_ptr); - else - ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); - // For explicit realign the target alignment should be - // known at compile time. 
- unsigned HOST_WIDE_INT align - = DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); - new_stmt = gimple_build_assign ( - ptr, BIT_AND_EXPR, dataref_ptr, - build_int_cst (TREE_TYPE (dataref_ptr), - -(HOST_WIDE_INT) align)); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); - data_ref - = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0)); - vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); - vec_dest = vect_create_destination_var (scalar_dest, vectype); - new_stmt = gimple_build_assign (vec_dest, data_ref); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_assign_set_lhs (new_stmt, new_temp); - gimple_move_vops (new_stmt, stmt_info->stmt); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); - msq = new_temp; - - bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type)); - bump = size_binop (MINUS_EXPR, bump, size_one_node); - ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info, - bump); - new_stmt = gimple_build_assign ( - NULL_TREE, BIT_AND_EXPR, ptr, - build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align)); - if (TREE_CODE (ptr) == SSA_NAME) - ptr = copy_ssa_name (ptr, new_stmt); - else - ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt); - gimple_assign_set_lhs (new_stmt, ptr); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); - data_ref - = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0)); - break; - } - case dr_explicit_realign_optimized: - { - if (costing_p) - break; - if (TREE_CODE (dataref_ptr) == SSA_NAME) - new_temp = copy_ssa_name (dataref_ptr); - else - new_temp = make_ssa_name (TREE_TYPE (dataref_ptr)); - // We should only be doing this if we know the target - // alignment at compile time. - unsigned HOST_WIDE_INT align - = DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); - new_stmt = gimple_build_assign ( - new_temp, BIT_AND_EXPR, dataref_ptr, - build_int_cst (TREE_TYPE (dataref_ptr), - -(HOST_WIDE_INT) align)); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); - data_ref = build2 (MEM_REF, vectype, new_temp, - build_int_cst (ref_type, 0)); - break; - } - default: - gcc_unreachable (); + /* One common place to cost the above vect load for different + alignment support schemes. */ + if (costing_p) + { + /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we + only need to take care of the first stmt, whose + stmt_info is first_stmt_info, vec_num iterating on it + will cover the cost for the remaining, it's consistent + with transforming. For the prologue cost for realign, + we only need to count it once for the whole group. */ + bool first_stmt_info_p = first_stmt_info == stmt_info; + bool add_realign_cost = first_stmt_info_p && i == 0; + if (memory_access_type == VMAT_CONTIGUOUS + || memory_access_type == VMAT_CONTIGUOUS_REVERSE + || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE + && (!grouped_load || first_stmt_info_p))) + { + /* Leave realign cases alone to keep them simple. */ + if (alignment_support_scheme == dr_explicit_realign_optimized + || alignment_support_scheme == dr_explicit_realign) + vect_get_load_cost (vinfo, stmt_info, slp_node, 1, + alignment_support_scheme, misalignment, + add_realign_cost, &inside_cost, + &prologue_cost, cost_vec, cost_vec, + true); + else + n_adjacent_loads++; } - - /* One common place to cost the above vect load for different - alignment support schemes. */ - if (costing_p) + } + else + { + vec_dest = vect_create_destination_var (scalar_dest, vectype); + /* DATA_REF is null if we've already built the statement. 
*/ + if (data_ref) { - /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we - only need to take care of the first stmt, whose - stmt_info is first_stmt_info, vec_num iterating on it - will cover the cost for the remaining, it's consistent - with transforming. For the prologue cost for realign, - we only need to count it once for the whole group. */ - bool first_stmt_info_p = first_stmt_info == stmt_info; - bool add_realign_cost = first_stmt_info_p && i == 0; - if (memory_access_type == VMAT_CONTIGUOUS - || memory_access_type == VMAT_CONTIGUOUS_REVERSE - || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE - && (!grouped_load || first_stmt_info_p))) - { - /* Leave realign cases alone to keep them simple. */ - if (alignment_support_scheme == dr_explicit_realign_optimized - || alignment_support_scheme == dr_explicit_realign) - vect_get_load_cost (vinfo, stmt_info, slp_node, 1, - alignment_support_scheme, misalignment, - add_realign_cost, &inside_cost, - &prologue_cost, cost_vec, cost_vec, - true); - else - n_adjacent_loads++; - } + vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); + new_stmt = gimple_build_assign (vec_dest, data_ref); } - else - { - vec_dest = vect_create_destination_var (scalar_dest, vectype); - /* DATA_REF is null if we've already built the statement. */ - if (data_ref) - { - vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); - new_stmt = gimple_build_assign (vec_dest, data_ref); - } - new_temp = need_zeroing - ? make_ssa_name (vectype) - : make_ssa_name (vec_dest, new_stmt); - gimple_set_lhs (new_stmt, new_temp); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + new_temp = (need_zeroing + ? make_ssa_name (vectype) + : make_ssa_name (vec_dest, new_stmt)); + gimple_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); - /* If we need to explicitly zero inactive elements emit a - VEC_COND_EXPR that does so. */ - if (need_zeroing) - { - vec_els = vect_get_mask_load_else (MASK_LOAD_ELSE_ZERO, - vectype); + /* If we need to explicitly zero inactive elements emit a + VEC_COND_EXPR that does so. */ + if (need_zeroing) + { + vec_els = vect_get_mask_load_else (MASK_LOAD_ELSE_ZERO, + vectype); - tree new_temp2 = make_ssa_name (vec_dest, new_stmt); - new_stmt - = gimple_build_assign (new_temp2, VEC_COND_EXPR, - final_mask, new_temp, vec_els); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, - gsi); - new_temp = new_temp2; - } + tree new_temp2 = make_ssa_name (vec_dest, new_stmt); + new_stmt = gimple_build_assign (new_temp2, VEC_COND_EXPR, + final_mask, new_temp, vec_els); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, + gsi); + new_temp = new_temp2; } + } - /* 3. Handle explicit realignment if necessary/supported. - Create in loop: - vec_dest = realign_load (msq, lsq, realignment_token) */ - if (!costing_p - && (alignment_support_scheme == dr_explicit_realign_optimized - || alignment_support_scheme == dr_explicit_realign)) - { - lsq = gimple_assign_lhs (new_stmt); - if (!realignment_token) - realignment_token = dataref_ptr; - vec_dest = vect_create_destination_var (scalar_dest, vectype); - new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq, - lsq, realignment_token); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_assign_set_lhs (new_stmt, new_temp); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + /* 3. Handle explicit realignment if necessary/supported. 
+ Create in loop: + vec_dest = realign_load (msq, lsq, realignment_token) */ + if (!costing_p + && (alignment_support_scheme == dr_explicit_realign_optimized + || alignment_support_scheme == dr_explicit_realign)) + { + lsq = gimple_assign_lhs (new_stmt); + if (!realignment_token) + realignment_token = dataref_ptr; + vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq, + lsq, realignment_token); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); - if (alignment_support_scheme == dr_explicit_realign_optimized) - { - gcc_assert (phi); - if (i == vec_num - 1 && j == ncopies - 1) - add_phi_arg (phi, lsq, loop_latch_edge (containing_loop), - UNKNOWN_LOCATION); - msq = lsq; - } + if (alignment_support_scheme == dr_explicit_realign_optimized) + { + gcc_assert (phi); + if (i == vec_num - 1) + add_phi_arg (phi, lsq, loop_latch_edge (containing_loop), + UNKNOWN_LOCATION); + msq = lsq; } + } - if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) + if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) + { + if (costing_p) + inside_cost = record_stmt_cost (cost_vec, 1, vec_perm, + slp_node, 0, vect_body); + else { - if (costing_p) - inside_cost = record_stmt_cost (cost_vec, 1, vec_perm, - slp_node, 0, vect_body); - else - { - tree perm_mask = perm_mask_for_reverse (vectype); - new_temp = permute_vec_elements (vinfo, new_temp, new_temp, - perm_mask, stmt_info, gsi); - new_stmt = SSA_NAME_DEF_STMT (new_temp); - } + tree perm_mask = perm_mask_for_reverse (vectype); + new_temp = permute_vec_elements (vinfo, new_temp, new_temp, + perm_mask, stmt_info, gsi); + new_stmt = SSA_NAME_DEF_STMT (new_temp); } + } - /* Collect vector loads and later create their permutation in - vect_transform_grouped_load (). */ - if (!costing_p && (grouped_load || slp_perm)) - dr_chain.quick_push (new_temp); + /* Collect vector loads and later create their permutation in + vect_transform_grouped_load (). */ + if (!costing_p && (grouped_load || slp_perm)) + dr_chain.quick_push (new_temp); - /* Store vector loads in the corresponding SLP_NODE. */ - if (!costing_p && slp && !slp_perm) - slp_node->push_vec_def (new_stmt); + /* Store vector loads in the corresponding SLP_NODE. */ + if (!costing_p && !slp_perm) + slp_node->push_vec_def (new_stmt); - /* With SLP permutation we load the gaps as well, without - we need to skip the gaps after we manage to fully load - all elements. group_gap_adj is DR_GROUP_SIZE here. */ - group_elt += nunits; - if (!costing_p - && maybe_ne (group_gap_adj, 0U) - && !slp_perm - && known_eq (group_elt, group_size - group_gap_adj)) - { - poly_wide_int bump_val - = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj); - if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) - == -1) - bump_val = -bump_val; - tree bump = wide_int_to_tree (sizetype, bump_val); - dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, - stmt_info, bump); - group_elt = 0; - } - } - /* Bump the vector pointer to account for a gap or for excess - elements loaded for a permuted SLP load. */ + /* With SLP permutation we load the gaps as well, without + we need to skip the gaps after we manage to fully load + all elements. group_gap_adj is DR_GROUP_SIZE here. 
*/ + group_elt += nunits; if (!costing_p && maybe_ne (group_gap_adj, 0U) - && slp_perm) + && !slp_perm + && known_eq (group_elt, group_size - group_gap_adj)) { poly_wide_int bump_val = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj); @@ -12088,72 +11930,47 @@ vectorizable_load (vec_info *vinfo, tree bump = wide_int_to_tree (sizetype, bump_val); dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, stmt_info, bump); + group_elt = 0; } + } + /* Bump the vector pointer to account for a gap or for excess + elements loaded for a permuted SLP load. */ + if (!costing_p + && maybe_ne (group_gap_adj, 0U) + && slp_perm) + { + poly_wide_int bump_val + = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj); + if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1) + bump_val = -bump_val; + tree bump = wide_int_to_tree (sizetype, bump_val); + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, + stmt_info, bump); + } - if (slp && !slp_perm) - continue; - - if (slp_perm) + if (slp_perm) + { + unsigned n_perms; + /* For SLP we know we've seen all possible uses of dr_chain so + direct vect_transform_slp_perm_load to DCE the unused parts. + ??? This is a hack to prevent compile-time issues as seen + in PR101120 and friends. */ + if (costing_p) { - unsigned n_perms; - /* For SLP we know we've seen all possible uses of dr_chain so - direct vect_transform_slp_perm_load to DCE the unused parts. - ??? This is a hack to prevent compile-time issues as seen - in PR101120 and friends. */ - if (costing_p) - { - vect_transform_slp_perm_load (vinfo, slp_node, vNULL, nullptr, vf, - true, &n_perms, nullptr); - inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm, - slp_node, 0, vect_body); - } - else - { - bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, - gsi, vf, false, &n_perms, - nullptr, true); - gcc_assert (ok); - } + vect_transform_slp_perm_load (vinfo, slp_node, vNULL, nullptr, vf, + true, &n_perms, nullptr); + inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm, + slp_node, 0, vect_body); } else { - if (grouped_load) - { - gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE); - /* We assume that the cost of a single load-lanes instruction - is equivalent to the cost of DR_GROUP_SIZE separate loads. - If a grouped access is instead being provided by a - load-and-permute operation, include the cost of the - permutes. */ - if (costing_p && first_stmt_info == stmt_info) - { - /* Uses an even and odd extract operations or shuffle - operations for each needed permute. */ - int group_size = DR_GROUP_SIZE (first_stmt_info); - int nstmts = ceil_log2 (group_size) * group_size; - inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm, - slp_node, 0, vect_body); - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "vect_model_load_cost: " - "strided group_size = %d .\n", - group_size); - } - else if (!costing_p) - { - vect_transform_grouped_load (vinfo, stmt_info, dr_chain, - group_size, gsi); - *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; - } - } - else if (!costing_p) - STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); + bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, + gsi, vf, false, &n_perms, + nullptr, true); + gcc_assert (ok); } dr_chain.release (); } - if (!slp && !costing_p) - *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; if (costing_p) { -- 2.43.0
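For readers who want the shape of the simplification in isolation, here is a small standalone C++ sketch. It is not part of the patch, and slp_node_t, emit_loads_hybrid and emit_loads_slp_only are invented names rather than GCC internals; it only illustrates the pattern applied throughout vectorizable_load above: once every load is reached through an SLP node, the separate ncopies loop of the non-SLP path disappears and the number of emitted vector defs comes from the SLP node itself.

// Standalone sketch, assuming nothing about GCC internals: slp_node_t and
// the emit_* functions below are invented for illustration only.

#include <cstdio>
#include <vector>

struct vec_def { int id; };

struct slp_node_t
{
  unsigned number_of_vec_stmts;   // stand-in for SLP_TREE_NUMBER_OF_VEC_STMTS
  std::vector<vec_def> vec_defs;  // collected vector definitions
  void push_vec_def (vec_def d) { vec_defs.push_back (d); }
};

// Old shape: a hybrid routine keeps an ncopies loop for the non-SLP path
// next to the SLP loop.
static void
emit_loads_hybrid (slp_node_t *slp_node, unsigned ncopies)
{
  if (slp_node)
    for (unsigned j = 0; j < slp_node->number_of_vec_stmts; ++j)
      slp_node->push_vec_def (vec_def { (int) j });
  else
    for (unsigned j = 0; j < ncopies; ++j)
      {
	// Non-SLP bookkeeping would go here; with SLP-only vectorization
	// this branch is dead code.
      }
}

// New shape: ncopies is always 1 with SLP, so only the SLP loop survives.
static void
emit_loads_slp_only (slp_node_t &slp_node)
{
  for (unsigned j = 0; j < slp_node.number_of_vec_stmts; ++j)
    slp_node.push_vec_def (vec_def { (int) j });
}

int
main ()
{
  slp_node_t node { 4, {} };
  emit_loads_hybrid (nullptr, 2);   // old non-SLP path, now removed
  emit_loads_slp_only (node);       // SLP-only path
  std::printf ("emitted %zu vector defs\n", node.vec_defs.size ());
  return 0;
}

The design point the sketch tries to capture is that the per-statement copy count no longer needs to be derived separately by the caller: the SLP node already records how many vector statements to produce, which is what lets the patch drop ncopies and the loops over it.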