On Mon, Aug 14, 2023 at 2:49 PM Kewen.Lin <li...@linux.ibm.com> wrote: > > Hi Richi, > > on 2023/8/14 20:04, Richard Biener wrote: > > On Mon, Aug 14, 2023 at 10:54 AM Kewen.Lin <li...@linux.ibm.com> wrote: > >> > >> Hi, > >> > >> Following Richi's suggestion [1], this patch is to move the > >> handlings on VMAT_LOAD_STORE_LANES in the final loop nest > >> of function vectorizable_load to its own loop. Basically > >> it duplicates the final loop nest, clean up some useless > >> set up code for the case of VMAT_LOAD_STORE_LANES, remove > >> some unreachable code. Also remove the corresponding > >> handlings in the final loop nest. > >> > >> Bootstrapped and regtested on x86_64-redhat-linux, > >> aarch64-linux-gnu and powerpc64{,le}-linux-gnu. > > > > OK (I guess the big diff is mostly because of re-indenting). > > Thanks! Yes, there is some code in the original final loop nest like > > if (memory_access_type == VMAT_LOAD_STORE_LANES) > { > ... > } > else > { > ... > } > > Then the else arm is fully re-indented. > > The other patch on VMAT_GATHER_SCATTER looks a bit better since > it doesn't need re-indenting.
Yes, that's also because VMAT_LOAD_STORE_LANES isn't for SLP so it even makes more sense to split that case out. Richard. > BR, > Kewen > > > > > Thanks, > > Richard. > > > >> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html > >> > >> gcc/ChangeLog: > >> > >> * tree-vect-stmts.cc (vectorizable_load): Move the handlings on > >> VMAT_LOAD_STORE_LANES in the final loop nest to its own loop, > >> and update the final nest accordingly. > >> --- > >> gcc/tree-vect-stmts.cc | 1275 ++++++++++++++++++++-------------------- > >> 1 file changed, 634 insertions(+), 641 deletions(-) > >> > >> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > >> index 4f2d088484c..c361e16cb7b 100644 > >> --- a/gcc/tree-vect-stmts.cc > >> +++ b/gcc/tree-vect-stmts.cc > >> @@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo, > >> vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask, > >> &vec_masks, mask_vectype); > >> } > >> + > >> tree vec_mask = NULL_TREE; > >> + if (memory_access_type == VMAT_LOAD_STORE_LANES) > >> + { > >> + gcc_assert (alignment_support_scheme == dr_aligned > >> + || alignment_support_scheme == dr_unaligned_supported); > >> + gcc_assert (grouped_load && !slp); > >> + > >> + unsigned int inside_cost = 0, prologue_cost = 0; > >> + for (j = 0; j < ncopies; j++) > >> + { > >> + if (costing_p) > >> + { > >> + /* An IFN_LOAD_LANES will load all its vector results, > >> + regardless of which ones we actually need. Account > >> + for the cost of unused results. */ > >> + if (first_stmt_info == stmt_info) > >> + { > >> + unsigned int gaps = DR_GROUP_SIZE (first_stmt_info); > >> + stmt_vec_info next_stmt_info = first_stmt_info; > >> + do > >> + { > >> + gaps -= 1; > >> + next_stmt_info = DR_GROUP_NEXT_ELEMENT > >> (next_stmt_info); > >> + } > >> + while (next_stmt_info); > >> + if (gaps) > >> + { > >> + if (dump_enabled_p ()) > >> + dump_printf_loc (MSG_NOTE, vect_location, > >> + "vect_model_load_cost: %d " > >> + "unused vectors.\n", > >> + gaps); > >> + vect_get_load_cost (vinfo, stmt_info, gaps, > >> + alignment_support_scheme, > >> + misalignment, false, > >> &inside_cost, > >> + &prologue_cost, cost_vec, > >> cost_vec, > >> + true); > >> + } > >> + } > >> + vect_get_load_cost (vinfo, stmt_info, 1, > >> alignment_support_scheme, > >> + misalignment, false, &inside_cost, > >> + &prologue_cost, cost_vec, cost_vec, > >> true); > >> + continue; > >> + } > >> + > >> + /* 1. Create the vector or array pointer update chain. */ > >> + if (j == 0) > >> + dataref_ptr > >> + = vect_create_data_ref_ptr (vinfo, first_stmt_info, > >> aggr_type, > >> + at_loop, offset, &dummy, gsi, > >> + &ptr_incr, false, bump); > >> + else > >> + { > >> + gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)); > >> + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, > >> gsi, > >> + stmt_info, bump); > >> + } > >> + if (mask) > >> + vec_mask = vec_masks[j]; > >> + > >> + tree vec_array = create_vector_array (vectype, vec_num); > >> + > >> + tree final_mask = NULL_TREE; > >> + if (loop_masks) > >> + final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, > >> + ncopies, vectype, j); > >> + if (vec_mask) > >> + final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, > >> final_mask, > >> + vec_mask, gsi); > >> + > >> + gcall *call; > >> + if (final_mask) > >> + { > >> + /* Emit: > >> + VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR, > >> + VEC_MASK). 
*/ > >> + unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype)); > >> + tree alias_ptr = build_int_cst (ref_type, align); > >> + call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3, > >> + dataref_ptr, alias_ptr, > >> + final_mask); > >> + } > >> + else > >> + { > >> + /* Emit: > >> + VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). > >> */ > >> + data_ref = create_array_ref (aggr_type, dataref_ptr, > >> ref_type); > >> + call = gimple_build_call_internal (IFN_LOAD_LANES, 1, > >> data_ref); > >> + } > >> + gimple_call_set_lhs (call, vec_array); > >> + gimple_call_set_nothrow (call, true); > >> + vect_finish_stmt_generation (vinfo, stmt_info, call, gsi); > >> + > >> + dr_chain.create (vec_num); > >> + /* Extract each vector into an SSA_NAME. */ > >> + for (i = 0; i < vec_num; i++) > >> + { > >> + new_temp = read_vector_array (vinfo, stmt_info, gsi, > >> scalar_dest, > >> + vec_array, i); > >> + dr_chain.quick_push (new_temp); > >> + } > >> + > >> + /* Record the mapping between SSA_NAMEs and statements. */ > >> + vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain); > >> + > >> + /* Record that VEC_ARRAY is now dead. */ > >> + vect_clobber_variable (vinfo, stmt_info, gsi, vec_array); > >> + > >> + dr_chain.release (); > >> + > >> + *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; > >> + } > >> + > >> + if (costing_p && dump_enabled_p ()) > >> + dump_printf_loc (MSG_NOTE, vect_location, > >> + "vect_model_load_cost: inside_cost = %u, " > >> + "prologue_cost = %u .\n", > >> + inside_cost, prologue_cost); > >> + > >> + return true; > >> + } > >> + > >> poly_uint64 group_elt = 0; > >> unsigned int inside_cost = 0, prologue_cost = 0; > >> for (j = 0; j < ncopies; j++) > >> @@ -10414,685 +10538,558 @@ vectorizable_load (vec_info *vinfo, > >> dr_chain.create (vec_num); > >> > >> gimple *new_stmt = NULL; > >> - if (memory_access_type == VMAT_LOAD_STORE_LANES) > >> + for (i = 0; i < vec_num; i++) > >> { > >> - if (costing_p) > >> - { > >> - /* An IFN_LOAD_LANES will load all its vector results, > >> - regardless of which ones we actually need. Account > >> - for the cost of unused results. 
*/ > >> - if (grouped_load && first_stmt_info == stmt_info) > >> - { > >> - unsigned int gaps = DR_GROUP_SIZE (first_stmt_info); > >> - stmt_vec_info next_stmt_info = first_stmt_info; > >> - do > >> - { > >> - gaps -= 1; > >> - next_stmt_info = DR_GROUP_NEXT_ELEMENT > >> (next_stmt_info); > >> - } > >> - while (next_stmt_info); > >> - if (gaps) > >> - { > >> - if (dump_enabled_p ()) > >> - dump_printf_loc (MSG_NOTE, vect_location, > >> - "vect_model_load_cost: %d " > >> - "unused vectors.\n", > >> - gaps); > >> - vect_get_load_cost (vinfo, stmt_info, gaps, > >> - alignment_support_scheme, > >> - misalignment, false, > >> &inside_cost, > >> - &prologue_cost, cost_vec, > >> cost_vec, > >> - true); > >> - } > >> - } > >> - vect_get_load_cost (vinfo, stmt_info, 1, > >> alignment_support_scheme, > >> - misalignment, false, &inside_cost, > >> - &prologue_cost, cost_vec, cost_vec, > >> true); > >> - continue; > >> - } > >> - tree vec_array; > >> - > >> - vec_array = create_vector_array (vectype, vec_num); > >> - > >> tree final_mask = NULL_TREE; > >> - if (loop_masks) > >> - final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, > >> - ncopies, vectype, j); > >> - if (vec_mask) > >> - final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, > >> - final_mask, vec_mask, gsi); > >> - > >> - gcall *call; > >> - if (final_mask) > >> - { > >> - /* Emit: > >> - VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR, > >> - VEC_MASK). */ > >> - unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype)); > >> - tree alias_ptr = build_int_cst (ref_type, align); > >> - call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3, > >> - dataref_ptr, alias_ptr, > >> - final_mask); > >> - } > >> - else > >> + tree final_len = NULL_TREE; > >> + tree bias = NULL_TREE; > >> + if (!costing_p) > >> { > >> - /* Emit: > >> - VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). > >> */ > >> - data_ref = create_array_ref (aggr_type, dataref_ptr, > >> ref_type); > >> - call = gimple_build_call_internal (IFN_LOAD_LANES, 1, > >> data_ref); > >> - } > >> - gimple_call_set_lhs (call, vec_array); > >> - gimple_call_set_nothrow (call, true); > >> - vect_finish_stmt_generation (vinfo, stmt_info, call, gsi); > >> - new_stmt = call; > >> + if (loop_masks) > >> + final_mask = vect_get_loop_mask (loop_vinfo, gsi, > >> loop_masks, > >> + vec_num * ncopies, > >> vectype, > >> + vec_num * j + i); > >> + if (vec_mask) > >> + final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, > >> + final_mask, vec_mask, gsi); > >> > >> - /* Extract each vector into an SSA_NAME. */ > >> - for (i = 0; i < vec_num; i++) > >> - { > >> - new_temp = read_vector_array (vinfo, stmt_info, gsi, > >> scalar_dest, > >> - vec_array, i); > >> - dr_chain.quick_push (new_temp); > >> + if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > >> + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, > >> ptr_incr, > >> + gsi, stmt_info, bump); > >> } > >> > >> - /* Record the mapping between SSA_NAMEs and statements. */ > >> - vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain); > >> - > >> - /* Record that VEC_ARRAY is now dead. */ > >> - vect_clobber_variable (vinfo, stmt_info, gsi, vec_array); > >> - } > >> - else > >> - { > >> - for (i = 0; i < vec_num; i++) > >> + /* 2. Create the vector-load in the loop. 
*/ > >> + switch (alignment_support_scheme) > >> { > >> - tree final_mask = NULL_TREE; > >> - tree final_len = NULL_TREE; > >> - tree bias = NULL_TREE; > >> - if (!costing_p) > >> - { > >> - if (loop_masks) > >> - final_mask > >> - = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, > >> - vec_num * ncopies, vectype, > >> - vec_num * j + i); > >> - if (vec_mask) > >> - final_mask = prepare_vec_mask (loop_vinfo, > >> mask_vectype, > >> - final_mask, vec_mask, > >> gsi); > >> - > >> - if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > >> - dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, > >> ptr_incr, > >> - gsi, stmt_info, bump); > >> - } > >> + case dr_aligned: > >> + case dr_unaligned_supported: > >> + { > >> + unsigned int misalign; > >> + unsigned HOST_WIDE_INT align; > >> > >> - /* 2. Create the vector-load in the loop. */ > >> - switch (alignment_support_scheme) > >> - { > >> - case dr_aligned: > >> - case dr_unaligned_supported: > >> + if (memory_access_type == VMAT_GATHER_SCATTER > >> + && gs_info.ifn != IFN_LAST) > >> { > >> - unsigned int misalign; > >> - unsigned HOST_WIDE_INT align; > >> - > >> - if (memory_access_type == VMAT_GATHER_SCATTER > >> - && gs_info.ifn != IFN_LAST) > >> + if (costing_p) > >> { > >> - if (costing_p) > >> - { > >> - unsigned int cnunits > >> - = vect_nunits_for_cost (vectype); > >> - inside_cost > >> - = record_stmt_cost (cost_vec, cnunits, > >> - scalar_load, stmt_info, > >> 0, > >> - vect_body); > >> - break; > >> - } > >> - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > >> - vec_offset = vec_offsets[vec_num * j + i]; > >> - tree zero = build_zero_cst (vectype); > >> - tree scale = size_int (gs_info.scale); > >> - > >> - if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD) > >> - { > >> - if (loop_lens) > >> - final_len > >> - = vect_get_loop_len (loop_vinfo, gsi, > >> loop_lens, > >> - vec_num * ncopies, > >> vectype, > >> - vec_num * j + i, 1); > >> - else > >> - final_len = build_int_cst (sizetype, > >> - > >> TYPE_VECTOR_SUBPARTS ( > >> - vectype)); > >> - signed char biasval > >> - = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS > >> (loop_vinfo); > >> - bias = build_int_cst (intQI_type_node, > >> biasval); > >> - if (!final_mask) > >> - { > >> - mask_vectype = truth_type_for (vectype); > >> - final_mask = build_minus_one_cst > >> (mask_vectype); > >> - } > >> - } > >> - > >> - gcall *call; > >> - if (final_len && final_mask) > >> - call = gimple_build_call_internal ( > >> - IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, > >> - vec_offset, scale, zero, final_mask, final_len, > >> - bias); > >> - else if (final_mask) > >> - call = gimple_build_call_internal > >> - (IFN_MASK_GATHER_LOAD, 5, dataref_ptr, > >> - vec_offset, scale, zero, final_mask); > >> - else > >> - call = gimple_build_call_internal > >> - (IFN_GATHER_LOAD, 4, dataref_ptr, > >> - vec_offset, scale, zero); > >> - gimple_call_set_nothrow (call, true); > >> - new_stmt = call; > >> - data_ref = NULL_TREE; > >> + unsigned int cnunits = vect_nunits_for_cost > >> (vectype); > >> + inside_cost > >> + = record_stmt_cost (cost_vec, cnunits, > >> scalar_load, > >> + stmt_info, 0, vect_body); > >> break; > >> } > >> - else if (memory_access_type == VMAT_GATHER_SCATTER) > >> + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > >> + vec_offset = vec_offsets[vec_num * j + i]; > >> + tree zero = build_zero_cst (vectype); > >> + tree scale = size_int (gs_info.scale); > >> + > >> + if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD) > >> { > >> - /* Emulated gather-scatter. 
*/ > >> - gcc_assert (!final_mask); > >> - unsigned HOST_WIDE_INT const_nunits > >> - = nunits.to_constant (); > >> - if (costing_p) > >> - { > >> - /* For emulated gathers N offset vector element > >> - offset add is consumed by the load). */ > >> - inside_cost > >> - = record_stmt_cost (cost_vec, const_nunits, > >> - vec_to_scalar, > >> stmt_info, 0, > >> - vect_body); > >> - /* N scalar loads plus gathering them into a > >> - vector. */ > >> - inside_cost > >> - = record_stmt_cost (cost_vec, const_nunits, > >> - scalar_load, stmt_info, > >> 0, > >> - vect_body); > >> - inside_cost > >> - = record_stmt_cost (cost_vec, 1, > >> vec_construct, > >> - stmt_info, 0, vect_body); > >> - break; > >> - } > >> - unsigned HOST_WIDE_INT const_offset_nunits > >> - = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype) > >> - .to_constant (); > >> - vec<constructor_elt, va_gc> *ctor_elts; > >> - vec_alloc (ctor_elts, const_nunits); > >> - gimple_seq stmts = NULL; > >> - /* We support offset vectors with more elements > >> - than the data vector for now. */ > >> - unsigned HOST_WIDE_INT factor > >> - = const_offset_nunits / const_nunits; > >> - vec_offset = vec_offsets[j / factor]; > >> - unsigned elt_offset = (j % factor) * const_nunits; > >> - tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset)); > >> - tree scale = size_int (gs_info.scale); > >> - align > >> - = get_object_alignment (DR_REF > >> (first_dr_info->dr)); > >> - tree ltype = build_aligned_type (TREE_TYPE > >> (vectype), > >> - align); > >> - for (unsigned k = 0; k < const_nunits; ++k) > >> + if (loop_lens) > >> + final_len > >> + = vect_get_loop_len (loop_vinfo, gsi, > >> loop_lens, > >> + vec_num * ncopies, > >> vectype, > >> + vec_num * j + i, 1); > >> + else > >> + final_len > >> + = build_int_cst (sizetype, > >> + TYPE_VECTOR_SUBPARTS > >> (vectype)); > >> + signed char biasval > >> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS > >> (loop_vinfo); > >> + bias = build_int_cst (intQI_type_node, biasval); > >> + if (!final_mask) > >> { > >> - tree boff = size_binop (MULT_EXPR, > >> - TYPE_SIZE (idx_type), > >> - bitsize_int > >> - (k + elt_offset)); > >> - tree idx = gimple_build (&stmts, BIT_FIELD_REF, > >> - idx_type, vec_offset, > >> - TYPE_SIZE (idx_type), > >> - boff); > >> - idx = gimple_convert (&stmts, sizetype, idx); > >> - idx = gimple_build (&stmts, MULT_EXPR, > >> - sizetype, idx, scale); > >> - tree ptr = gimple_build (&stmts, PLUS_EXPR, > >> - TREE_TYPE > >> (dataref_ptr), > >> - dataref_ptr, idx); > >> - ptr = gimple_convert (&stmts, ptr_type_node, > >> ptr); > >> - tree elt = make_ssa_name (TREE_TYPE (vectype)); > >> - tree ref = build2 (MEM_REF, ltype, ptr, > >> - build_int_cst (ref_type, > >> 0)); > >> - new_stmt = gimple_build_assign (elt, ref); > >> - gimple_set_vuse (new_stmt, > >> - gimple_vuse (gsi_stmt > >> (*gsi))); > >> - gimple_seq_add_stmt (&stmts, new_stmt); > >> - CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, > >> elt); > >> + mask_vectype = truth_type_for (vectype); > >> + final_mask = build_minus_one_cst > >> (mask_vectype); > >> } > >> - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); > >> - new_stmt = gimple_build_assign (NULL_TREE, > >> - build_constructor > >> - (vectype, > >> ctor_elts)); > >> - data_ref = NULL_TREE; > >> - break; > >> } > >> > >> - if (costing_p) > >> - break; > >> - > >> - align = > >> - known_alignment (DR_TARGET_ALIGNMENT > >> (first_dr_info)); > >> - if (alignment_support_scheme == dr_aligned) > >> - misalign = 0; > >> - else if (misalignment == DR_MISALIGNMENT_UNKNOWN) > >> - { > >> - align 
= dr_alignment > >> - (vect_dr_behavior (vinfo, first_dr_info)); > >> - misalign = 0; > >> - } > >> + gcall *call; > >> + if (final_len && final_mask) > >> + call = gimple_build_call_internal ( > >> + IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, > >> vec_offset, > >> + scale, zero, final_mask, final_len, bias); > >> + else if (final_mask) > >> + call > >> + = gimple_build_call_internal > >> (IFN_MASK_GATHER_LOAD, 5, > >> + dataref_ptr, > >> vec_offset, > >> + scale, zero, > >> final_mask); > >> else > >> - misalign = misalignment; > >> - if (dataref_offset == NULL_TREE > >> - && TREE_CODE (dataref_ptr) == SSA_NAME) > >> - set_ptr_info_alignment (get_ptr_info (dataref_ptr), > >> - align, misalign); > >> - align = least_bit_hwi (misalign | align); > >> - > >> - /* Compute IFN when LOOP_LENS or final_mask valid. */ > >> - machine_mode vmode = TYPE_MODE (vectype); > >> - machine_mode new_vmode = vmode; > >> - internal_fn partial_ifn = IFN_LAST; > >> - if (loop_lens) > >> + call > >> + = gimple_build_call_internal (IFN_GATHER_LOAD, 4, > >> + dataref_ptr, > >> vec_offset, > >> + scale, zero); > >> + gimple_call_set_nothrow (call, true); > >> + new_stmt = call; > >> + data_ref = NULL_TREE; > >> + break; > >> + } > >> + else if (memory_access_type == VMAT_GATHER_SCATTER) > >> + { > >> + /* Emulated gather-scatter. */ > >> + gcc_assert (!final_mask); > >> + unsigned HOST_WIDE_INT const_nunits = > >> nunits.to_constant (); > >> + if (costing_p) > >> { > >> - opt_machine_mode new_ovmode > >> - = get_len_load_store_mode (vmode, true, > >> - &partial_ifn); > >> - new_vmode = new_ovmode.require (); > >> - unsigned factor = (new_ovmode == vmode) > >> - ? 1 > >> - : GET_MODE_UNIT_SIZE (vmode); > >> - final_len > >> - = vect_get_loop_len (loop_vinfo, gsi, loop_lens, > >> - vec_num * ncopies, vectype, > >> - vec_num * j + i, factor); > >> + /* For emulated gathers N offset vector element > >> + offset add is consumed by the load). */ > >> + inside_cost > >> + = record_stmt_cost (cost_vec, const_nunits, > >> + vec_to_scalar, stmt_info, 0, > >> + vect_body); > >> + /* N scalar loads plus gathering them into a > >> + vector. */ > >> + inside_cost = record_stmt_cost (cost_vec, > >> const_nunits, > >> + scalar_load, > >> stmt_info, > >> + 0, vect_body); > >> + inside_cost > >> + = record_stmt_cost (cost_vec, 1, vec_construct, > >> + stmt_info, 0, vect_body); > >> + break; > >> } > >> - else if (final_mask) > >> + unsigned HOST_WIDE_INT const_offset_nunits > >> + = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype) > >> + .to_constant (); > >> + vec<constructor_elt, va_gc> *ctor_elts; > >> + vec_alloc (ctor_elts, const_nunits); > >> + gimple_seq stmts = NULL; > >> + /* We support offset vectors with more elements > >> + than the data vector for now. 
*/ > >> + unsigned HOST_WIDE_INT factor > >> + = const_offset_nunits / const_nunits; > >> + vec_offset = vec_offsets[j / factor]; > >> + unsigned elt_offset = (j % factor) * const_nunits; > >> + tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset)); > >> + tree scale = size_int (gs_info.scale); > >> + align = get_object_alignment (DR_REF > >> (first_dr_info->dr)); > >> + tree ltype > >> + = build_aligned_type (TREE_TYPE (vectype), align); > >> + for (unsigned k = 0; k < const_nunits; ++k) > >> { > >> - if (!can_vec_mask_load_store_p ( > >> - vmode, TYPE_MODE (TREE_TYPE (final_mask)), > >> true, > >> - &partial_ifn)) > >> - gcc_unreachable (); > >> + tree boff = size_binop (MULT_EXPR, TYPE_SIZE > >> (idx_type), > >> + bitsize_int (k + > >> elt_offset)); > >> + tree idx = gimple_build (&stmts, BIT_FIELD_REF, > >> + idx_type, vec_offset, > >> + TYPE_SIZE (idx_type), > >> boff); > >> + idx = gimple_convert (&stmts, sizetype, idx); > >> + idx = gimple_build (&stmts, MULT_EXPR, sizetype, > >> idx, > >> + scale); > >> + tree ptr = gimple_build (&stmts, PLUS_EXPR, > >> + TREE_TYPE (dataref_ptr), > >> + dataref_ptr, idx); > >> + ptr = gimple_convert (&stmts, ptr_type_node, ptr); > >> + tree elt = make_ssa_name (TREE_TYPE (vectype)); > >> + tree ref = build2 (MEM_REF, ltype, ptr, > >> + build_int_cst (ref_type, 0)); > >> + new_stmt = gimple_build_assign (elt, ref); > >> + gimple_set_vuse (new_stmt, > >> + gimple_vuse (gsi_stmt (*gsi))); > >> + gimple_seq_add_stmt (&stmts, new_stmt); > >> + CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt); > >> } > >> + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); > >> + new_stmt = gimple_build_assign ( > >> + NULL_TREE, build_constructor (vectype, ctor_elts)); > >> + data_ref = NULL_TREE; > >> + break; > >> + } > >> > >> - if (partial_ifn == IFN_MASK_LEN_LOAD) > >> + if (costing_p) > >> + break; > >> + > >> + align = known_alignment (DR_TARGET_ALIGNMENT > >> (first_dr_info)); > >> + if (alignment_support_scheme == dr_aligned) > >> + misalign = 0; > >> + else if (misalignment == DR_MISALIGNMENT_UNKNOWN) > >> + { > >> + align > >> + = dr_alignment (vect_dr_behavior (vinfo, > >> first_dr_info)); > >> + misalign = 0; > >> + } > >> + else > >> + misalign = misalignment; > >> + if (dataref_offset == NULL_TREE > >> + && TREE_CODE (dataref_ptr) == SSA_NAME) > >> + set_ptr_info_alignment (get_ptr_info (dataref_ptr), > >> align, > >> + misalign); > >> + align = least_bit_hwi (misalign | align); > >> + > >> + /* Compute IFN when LOOP_LENS or final_mask valid. */ > >> + machine_mode vmode = TYPE_MODE (vectype); > >> + machine_mode new_vmode = vmode; > >> + internal_fn partial_ifn = IFN_LAST; > >> + if (loop_lens) > >> + { > >> + opt_machine_mode new_ovmode > >> + = get_len_load_store_mode (vmode, true, > >> &partial_ifn); > >> + new_vmode = new_ovmode.require (); > >> + unsigned factor > >> + = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE > >> (vmode); > >> + final_len = vect_get_loop_len (loop_vinfo, gsi, > >> loop_lens, > >> + vec_num * ncopies, > >> vectype, > >> + vec_num * j + i, > >> factor); > >> + } > >> + else if (final_mask) > >> + { > >> + if (!can_vec_mask_load_store_p ( > >> + vmode, TYPE_MODE (TREE_TYPE (final_mask)), true, > >> + &partial_ifn)) > >> + gcc_unreachable (); > >> + } > >> + > >> + if (partial_ifn == IFN_MASK_LEN_LOAD) > >> + { > >> + if (!final_len) > >> { > >> - if (!final_len) > >> - { > >> - /* Pass VF value to 'len' argument of > >> - MASK_LEN_LOAD if LOOP_LENS is invalid. 
*/ > >> - final_len > >> - = size_int (TYPE_VECTOR_SUBPARTS (vectype)); > >> - } > >> - if (!final_mask) > >> - { > >> - /* Pass all ones value to 'mask' argument of > >> - MASK_LEN_LOAD if final_mask is invalid. */ > >> - mask_vectype = truth_type_for (vectype); > >> - final_mask = build_minus_one_cst > >> (mask_vectype); > >> - } > >> + /* Pass VF value to 'len' argument of > >> + MASK_LEN_LOAD if LOOP_LENS is invalid. */ > >> + final_len = size_int (TYPE_VECTOR_SUBPARTS > >> (vectype)); > >> } > >> - if (final_len) > >> + if (!final_mask) > >> { > >> - signed char biasval > >> - = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS > >> (loop_vinfo); > >> - > >> - bias = build_int_cst (intQI_type_node, biasval); > >> + /* Pass all ones value to 'mask' argument of > >> + MASK_LEN_LOAD if final_mask is invalid. */ > >> + mask_vectype = truth_type_for (vectype); > >> + final_mask = build_minus_one_cst (mask_vectype); > >> } > >> + } > >> + if (final_len) > >> + { > >> + signed char biasval > >> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > >> > >> - if (final_len) > >> + bias = build_int_cst (intQI_type_node, biasval); > >> + } > >> + > >> + if (final_len) > >> + { > >> + tree ptr = build_int_cst (ref_type, align * > >> BITS_PER_UNIT); > >> + gcall *call; > >> + if (partial_ifn == IFN_MASK_LEN_LOAD) > >> + call = gimple_build_call_internal > >> (IFN_MASK_LEN_LOAD, 5, > >> + dataref_ptr, ptr, > >> + final_mask, > >> final_len, > >> + bias); > >> + else > >> + call = gimple_build_call_internal (IFN_LEN_LOAD, 4, > >> + dataref_ptr, ptr, > >> + final_len, bias); > >> + gimple_call_set_nothrow (call, true); > >> + new_stmt = call; > >> + data_ref = NULL_TREE; > >> + > >> + /* Need conversion if it's wrapped with VnQI. */ > >> + if (vmode != new_vmode) > >> { > >> - tree ptr > >> - = build_int_cst (ref_type, align * > >> BITS_PER_UNIT); > >> - gcall *call; > >> - if (partial_ifn == IFN_MASK_LEN_LOAD) > >> - call = gimple_build_call_internal > >> (IFN_MASK_LEN_LOAD, > >> - 5, > >> dataref_ptr, > >> - ptr, > >> final_mask, > >> - final_len, > >> bias); > >> - else > >> - call = gimple_build_call_internal (IFN_LEN_LOAD, > >> 4, > >> - dataref_ptr, > >> ptr, > >> - final_len, > >> bias); > >> - gimple_call_set_nothrow (call, true); > >> - new_stmt = call; > >> - data_ref = NULL_TREE; > >> - > >> - /* Need conversion if it's wrapped with VnQI. 
*/ > >> - if (vmode != new_vmode) > >> - { > >> - tree new_vtype = build_vector_type_for_mode ( > >> - unsigned_intQI_type_node, new_vmode); > >> - tree var = vect_get_new_ssa_name (new_vtype, > >> - > >> vect_simple_var); > >> - gimple_set_lhs (call, var); > >> - vect_finish_stmt_generation (vinfo, stmt_info, > >> call, > >> - gsi); > >> - tree op = build1 (VIEW_CONVERT_EXPR, vectype, > >> var); > >> - new_stmt > >> - = gimple_build_assign (vec_dest, > >> - VIEW_CONVERT_EXPR, > >> op); > >> - } > >> + tree new_vtype = build_vector_type_for_mode ( > >> + unsigned_intQI_type_node, new_vmode); > >> + tree var > >> + = vect_get_new_ssa_name (new_vtype, > >> vect_simple_var); > >> + gimple_set_lhs (call, var); > >> + vect_finish_stmt_generation (vinfo, stmt_info, > >> call, > >> + gsi); > >> + tree op = build1 (VIEW_CONVERT_EXPR, vectype, var); > >> + new_stmt = gimple_build_assign (vec_dest, > >> + VIEW_CONVERT_EXPR, > >> op); > >> } > >> - else if (final_mask) > >> + } > >> + else if (final_mask) > >> + { > >> + tree ptr = build_int_cst (ref_type, align * > >> BITS_PER_UNIT); > >> + gcall *call = gimple_build_call_internal > >> (IFN_MASK_LOAD, 3, > >> + dataref_ptr, > >> ptr, > >> + final_mask); > >> + gimple_call_set_nothrow (call, true); > >> + new_stmt = call; > >> + data_ref = NULL_TREE; > >> + } > >> + else > >> + { > >> + tree ltype = vectype; > >> + tree new_vtype = NULL_TREE; > >> + unsigned HOST_WIDE_INT gap = DR_GROUP_GAP > >> (first_stmt_info); > >> + unsigned int vect_align > >> + = vect_known_alignment_in_bytes (first_dr_info, > >> vectype); > >> + unsigned int scalar_dr_size > >> + = vect_get_scalar_dr_size (first_dr_info); > >> + /* If there's no peeling for gaps but we have a gap > >> + with slp loads then load the lower half of the > >> + vector only. See get_group_load_store_type for > >> + when we apply this optimization. */ > >> + if (slp > >> + && loop_vinfo > >> + && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && > >> gap != 0 > >> + && known_eq (nunits, (group_size - gap) * 2) > >> + && known_eq (nunits, group_size) > >> + && gap >= (vect_align / scalar_dr_size)) > >> { > >> - tree ptr = build_int_cst (ref_type, > >> - align * BITS_PER_UNIT); > >> - gcall *call > >> - = gimple_build_call_internal (IFN_MASK_LOAD, 3, > >> - dataref_ptr, ptr, > >> - final_mask); > >> - gimple_call_set_nothrow (call, true); > >> - new_stmt = call; > >> - data_ref = NULL_TREE; > >> + tree half_vtype; > >> + new_vtype > >> + = vector_vector_composition_type (vectype, 2, > >> + &half_vtype); > >> + if (new_vtype != NULL_TREE) > >> + ltype = half_vtype; > >> } > >> + tree offset > >> + = (dataref_offset ? 
dataref_offset > >> + : build_int_cst (ref_type, 0)); > >> + if (ltype != vectype > >> + && memory_access_type == VMAT_CONTIGUOUS_REVERSE) > >> + { > >> + unsigned HOST_WIDE_INT gap_offset > >> + = gap * tree_to_uhwi (TYPE_SIZE_UNIT > >> (elem_type)); > >> + tree gapcst = build_int_cst (ref_type, gap_offset); > >> + offset = size_binop (PLUS_EXPR, offset, gapcst); > >> + } > >> + data_ref > >> + = fold_build2 (MEM_REF, ltype, dataref_ptr, offset); > >> + if (alignment_support_scheme == dr_aligned) > >> + ; > >> else > >> + TREE_TYPE (data_ref) > >> + = build_aligned_type (TREE_TYPE (data_ref), > >> + align * BITS_PER_UNIT); > >> + if (ltype != vectype) > >> { > >> - tree ltype = vectype; > >> - tree new_vtype = NULL_TREE; > >> - unsigned HOST_WIDE_INT gap > >> - = DR_GROUP_GAP (first_stmt_info); > >> - unsigned int vect_align > >> - = vect_known_alignment_in_bytes (first_dr_info, > >> - vectype); > >> - unsigned int scalar_dr_size > >> - = vect_get_scalar_dr_size (first_dr_info); > >> - /* If there's no peeling for gaps but we have a gap > >> - with slp loads then load the lower half of the > >> - vector only. See get_group_load_store_type for > >> - when we apply this optimization. */ > >> - if (slp > >> - && loop_vinfo > >> - && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) > >> - && gap != 0 > >> - && known_eq (nunits, (group_size - gap) * 2) > >> - && known_eq (nunits, group_size) > >> - && gap >= (vect_align / scalar_dr_size)) > >> + vect_copy_ref_info (data_ref, > >> + DR_REF (first_dr_info->dr)); > >> + tree tem = make_ssa_name (ltype); > >> + new_stmt = gimple_build_assign (tem, data_ref); > >> + vect_finish_stmt_generation (vinfo, stmt_info, > >> new_stmt, > >> + gsi); > >> + data_ref = NULL; > >> + vec<constructor_elt, va_gc> *v; > >> + vec_alloc (v, 2); > >> + if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) > >> { > >> - tree half_vtype; > >> - new_vtype > >> - = vector_vector_composition_type (vectype, 2, > >> - > >> &half_vtype); > >> - if (new_vtype != NULL_TREE) > >> - ltype = half_vtype; > >> + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, > >> + build_zero_cst > >> (ltype)); > >> + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); > >> } > >> - tree offset > >> - = (dataref_offset ? 
dataref_offset > >> - : build_int_cst (ref_type, 0)); > >> - if (ltype != vectype > >> - && memory_access_type == > >> VMAT_CONTIGUOUS_REVERSE) > >> + else > >> { > >> - unsigned HOST_WIDE_INT gap_offset > >> - = gap * tree_to_uhwi (TYPE_SIZE_UNIT > >> (elem_type)); > >> - tree gapcst = build_int_cst (ref_type, > >> gap_offset); > >> - offset = size_binop (PLUS_EXPR, offset, > >> gapcst); > >> + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); > >> + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, > >> + build_zero_cst > >> (ltype)); > >> } > >> - data_ref > >> - = fold_build2 (MEM_REF, ltype, dataref_ptr, > >> offset); > >> - if (alignment_support_scheme == dr_aligned) > >> - ; > >> + gcc_assert (new_vtype != NULL_TREE); > >> + if (new_vtype == vectype) > >> + new_stmt = gimple_build_assign ( > >> + vec_dest, build_constructor (vectype, v)); > >> else > >> - TREE_TYPE (data_ref) > >> - = build_aligned_type (TREE_TYPE (data_ref), > >> - align * BITS_PER_UNIT); > >> - if (ltype != vectype) > >> { > >> - vect_copy_ref_info (data_ref, > >> - DR_REF > >> (first_dr_info->dr)); > >> - tree tem = make_ssa_name (ltype); > >> - new_stmt = gimple_build_assign (tem, data_ref); > >> + tree new_vname = make_ssa_name (new_vtype); > >> + new_stmt = gimple_build_assign ( > >> + new_vname, build_constructor (new_vtype, v)); > >> vect_finish_stmt_generation (vinfo, stmt_info, > >> new_stmt, gsi); > >> - data_ref = NULL; > >> - vec<constructor_elt, va_gc> *v; > >> - vec_alloc (v, 2); > >> - if (memory_access_type == > >> VMAT_CONTIGUOUS_REVERSE) > >> - { > >> - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, > >> - build_zero_cst > >> (ltype)); > >> - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); > >> - } > >> - else > >> - { > >> - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); > >> - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, > >> - build_zero_cst > >> (ltype)); > >> - } > >> - gcc_assert (new_vtype != NULL_TREE); > >> - if (new_vtype == vectype) > >> - new_stmt = gimple_build_assign ( > >> - vec_dest, build_constructor (vectype, v)); > >> - else > >> - { > >> - tree new_vname = make_ssa_name (new_vtype); > >> - new_stmt = gimple_build_assign ( > >> - new_vname, build_constructor (new_vtype, > >> v)); > >> - vect_finish_stmt_generation (vinfo, > >> stmt_info, > >> - new_stmt, > >> gsi); > >> - new_stmt = gimple_build_assign ( > >> - vec_dest, build1 (VIEW_CONVERT_EXPR, > >> vectype, > >> - new_vname)); > >> - } > >> + new_stmt = gimple_build_assign ( > >> + vec_dest, > >> + build1 (VIEW_CONVERT_EXPR, vectype, > >> new_vname)); > >> } > >> } > >> - break; > >> } > >> - case dr_explicit_realign: > >> - { > >> - if (costing_p) > >> - break; > >> - tree ptr, bump; > >> - > >> - tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype)); > >> + break; > >> + } > >> + case dr_explicit_realign: > >> + { > >> + if (costing_p) > >> + break; > >> + tree ptr, bump; > >> > >> - if (compute_in_loop) > >> - msq = vect_setup_realignment (vinfo, > >> first_stmt_info, gsi, > >> - &realignment_token, > >> - dr_explicit_realign, > >> - dataref_ptr, NULL); > >> + tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype)); > >> > >> - if (TREE_CODE (dataref_ptr) == SSA_NAME) > >> - ptr = copy_ssa_name (dataref_ptr); > >> - else > >> - ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); > >> - // For explicit realign the target alignment should be > >> - // known at compile time. 
> >> - unsigned HOST_WIDE_INT align = > >> - DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); > >> - new_stmt = gimple_build_assign > >> - (ptr, BIT_AND_EXPR, dataref_ptr, > >> - build_int_cst > >> - (TREE_TYPE (dataref_ptr), > >> - -(HOST_WIDE_INT) align)); > >> - vect_finish_stmt_generation (vinfo, stmt_info, > >> - new_stmt, gsi); > >> - data_ref > >> - = build2 (MEM_REF, vectype, ptr, > >> - build_int_cst (ref_type, 0)); > >> - vect_copy_ref_info (data_ref, DR_REF > >> (first_dr_info->dr)); > >> - vec_dest = vect_create_destination_var (scalar_dest, > >> - vectype); > >> - new_stmt = gimple_build_assign (vec_dest, data_ref); > >> - new_temp = make_ssa_name (vec_dest, new_stmt); > >> - gimple_assign_set_lhs (new_stmt, new_temp); > >> - gimple_move_vops (new_stmt, stmt_info->stmt); > >> - vect_finish_stmt_generation (vinfo, stmt_info, > >> - new_stmt, gsi); > >> - msq = new_temp; > >> - > >> - bump = size_binop (MULT_EXPR, vs, > >> - TYPE_SIZE_UNIT (elem_type)); > >> - bump = size_binop (MINUS_EXPR, bump, size_one_node); > >> - ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, > >> - stmt_info, bump); > >> - new_stmt = gimple_build_assign > >> - (NULL_TREE, BIT_AND_EXPR, ptr, > >> - build_int_cst > >> - (TREE_TYPE (ptr), -(HOST_WIDE_INT) > >> align)); > >> - if (TREE_CODE (ptr) == SSA_NAME) > >> - ptr = copy_ssa_name (ptr, new_stmt); > >> - else > >> - ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt); > >> - gimple_assign_set_lhs (new_stmt, ptr); > >> - vect_finish_stmt_generation (vinfo, stmt_info, > >> - new_stmt, gsi); > >> - data_ref > >> - = build2 (MEM_REF, vectype, ptr, > >> - build_int_cst (ref_type, 0)); > >> - break; > >> - } > >> - case dr_explicit_realign_optimized: > >> - { > >> - if (costing_p) > >> - break; > >> - if (TREE_CODE (dataref_ptr) == SSA_NAME) > >> - new_temp = copy_ssa_name (dataref_ptr); > >> - else > >> - new_temp = make_ssa_name (TREE_TYPE (dataref_ptr)); > >> - // We should only be doing this if we know the target > >> - // alignment at compile time. > >> - unsigned HOST_WIDE_INT align = > >> - DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); > >> - new_stmt = gimple_build_assign > >> - (new_temp, BIT_AND_EXPR, dataref_ptr, > >> - build_int_cst (TREE_TYPE (dataref_ptr), > >> - -(HOST_WIDE_INT) align)); > >> - vect_finish_stmt_generation (vinfo, stmt_info, > >> - new_stmt, gsi); > >> - data_ref > >> - = build2 (MEM_REF, vectype, new_temp, > >> - build_int_cst (ref_type, 0)); > >> - break; > >> - } > >> - default: > >> - gcc_unreachable (); > >> - } > >> + if (compute_in_loop) > >> + msq = vect_setup_realignment (vinfo, first_stmt_info, > >> gsi, > >> + &realignment_token, > >> + dr_explicit_realign, > >> + dataref_ptr, NULL); > >> + > >> + if (TREE_CODE (dataref_ptr) == SSA_NAME) > >> + ptr = copy_ssa_name (dataref_ptr); > >> + else > >> + ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); > >> + // For explicit realign the target alignment should be > >> + // known at compile time. 
> >> + unsigned HOST_WIDE_INT align > >> + = DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); > >> + new_stmt = gimple_build_assign ( > >> + ptr, BIT_AND_EXPR, dataref_ptr, > >> + build_int_cst (TREE_TYPE (dataref_ptr), > >> + -(HOST_WIDE_INT) align)); > >> + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, > >> gsi); > >> + data_ref > >> + = build2 (MEM_REF, vectype, ptr, build_int_cst > >> (ref_type, 0)); > >> + vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); > >> + vec_dest = vect_create_destination_var (scalar_dest, > >> vectype); > >> + new_stmt = gimple_build_assign (vec_dest, data_ref); > >> + new_temp = make_ssa_name (vec_dest, new_stmt); > >> + gimple_assign_set_lhs (new_stmt, new_temp); > >> + gimple_move_vops (new_stmt, stmt_info->stmt); > >> + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, > >> gsi); > >> + msq = new_temp; > >> + > >> + bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT > >> (elem_type)); > >> + bump = size_binop (MINUS_EXPR, bump, size_one_node); > >> + ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, > >> stmt_info, > >> + bump); > >> + new_stmt = gimple_build_assign ( > >> + NULL_TREE, BIT_AND_EXPR, ptr, > >> + build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align)); > >> + if (TREE_CODE (ptr) == SSA_NAME) > >> + ptr = copy_ssa_name (ptr, new_stmt); > >> + else > >> + ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt); > >> + gimple_assign_set_lhs (new_stmt, ptr); > >> + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, > >> gsi); > >> + data_ref > >> + = build2 (MEM_REF, vectype, ptr, build_int_cst > >> (ref_type, 0)); > >> + break; > >> + } > >> + case dr_explicit_realign_optimized: > >> + { > >> + if (costing_p) > >> + break; > >> + if (TREE_CODE (dataref_ptr) == SSA_NAME) > >> + new_temp = copy_ssa_name (dataref_ptr); > >> + else > >> + new_temp = make_ssa_name (TREE_TYPE (dataref_ptr)); > >> + // We should only be doing this if we know the target > >> + // alignment at compile time. > >> + unsigned HOST_WIDE_INT align > >> + = DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); > >> + new_stmt = gimple_build_assign ( > >> + new_temp, BIT_AND_EXPR, dataref_ptr, > >> + build_int_cst (TREE_TYPE (dataref_ptr), > >> + -(HOST_WIDE_INT) align)); > >> + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, > >> gsi); > >> + data_ref = build2 (MEM_REF, vectype, new_temp, > >> + build_int_cst (ref_type, 0)); > >> + break; > >> + } > >> + default: > >> + gcc_unreachable (); > >> + } > >> > >> - /* One common place to cost the above vect load for different > >> - alignment support schemes. */ > >> - if (costing_p) > >> - { > >> - /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we > >> - only need to take care of the first stmt, whose > >> - stmt_info is first_stmt_info, vec_num iterating on it > >> - will cover the cost for the remaining, it's consistent > >> - with transforming. For the prologue cost for realign, > >> - we only need to count it once for the whole group. 
*/ > >> - bool first_stmt_info_p = first_stmt_info == stmt_info; > >> - bool add_realign_cost = first_stmt_info_p && i == 0; > >> - if (memory_access_type == VMAT_CONTIGUOUS > >> - || memory_access_type == VMAT_CONTIGUOUS_REVERSE > >> - || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE > >> - && (!grouped_load || first_stmt_info_p))) > >> - vect_get_load_cost (vinfo, stmt_info, 1, > >> - alignment_support_scheme, > >> misalignment, > >> - add_realign_cost, &inside_cost, > >> - &prologue_cost, cost_vec, cost_vec, > >> - true); > >> - } > >> - else > >> + /* One common place to cost the above vect load for different > >> + alignment support schemes. */ > >> + if (costing_p) > >> + { > >> + /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we > >> + only need to take care of the first stmt, whose > >> + stmt_info is first_stmt_info, vec_num iterating on it > >> + will cover the cost for the remaining, it's consistent > >> + with transforming. For the prologue cost for realign, > >> + we only need to count it once for the whole group. */ > >> + bool first_stmt_info_p = first_stmt_info == stmt_info; > >> + bool add_realign_cost = first_stmt_info_p && i == 0; > >> + if (memory_access_type == VMAT_CONTIGUOUS > >> + || memory_access_type == VMAT_CONTIGUOUS_REVERSE > >> + || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE > >> + && (!grouped_load || first_stmt_info_p))) > >> + vect_get_load_cost (vinfo, stmt_info, 1, > >> + alignment_support_scheme, misalignment, > >> + add_realign_cost, &inside_cost, > >> + &prologue_cost, cost_vec, cost_vec, > >> true); > >> + } > >> + else > >> + { > >> + vec_dest = vect_create_destination_var (scalar_dest, > >> vectype); > >> + /* DATA_REF is null if we've already built the statement. */ > >> + if (data_ref) > >> { > >> - vec_dest = vect_create_destination_var (scalar_dest, > >> vectype); > >> - /* DATA_REF is null if we've already built the > >> statement. */ > >> - if (data_ref) > >> - { > >> - vect_copy_ref_info (data_ref, DR_REF > >> (first_dr_info->dr)); > >> - new_stmt = gimple_build_assign (vec_dest, data_ref); > >> - } > >> - new_temp = make_ssa_name (vec_dest, new_stmt); > >> - gimple_set_lhs (new_stmt, new_temp); > >> - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, > >> gsi); > >> + vect_copy_ref_info (data_ref, DR_REF > >> (first_dr_info->dr)); > >> + new_stmt = gimple_build_assign (vec_dest, data_ref); > >> } > >> + new_temp = make_ssa_name (vec_dest, new_stmt); > >> + gimple_set_lhs (new_stmt, new_temp); > >> + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, > >> gsi); > >> + } > >> > >> - /* 3. Handle explicit realignment if necessary/supported. > >> - Create in loop: > >> - vec_dest = realign_load (msq, lsq, realignment_token) > >> */ > >> - if (!costing_p > >> - && (alignment_support_scheme == > >> dr_explicit_realign_optimized > >> - || alignment_support_scheme == dr_explicit_realign)) > >> - { > >> - lsq = gimple_assign_lhs (new_stmt); > >> - if (!realignment_token) > >> - realignment_token = dataref_ptr; > >> - vec_dest = vect_create_destination_var (scalar_dest, > >> vectype); > >> - new_stmt = gimple_build_assign (vec_dest, > >> REALIGN_LOAD_EXPR, > >> - msq, lsq, > >> realignment_token); > >> - new_temp = make_ssa_name (vec_dest, new_stmt); > >> - gimple_assign_set_lhs (new_stmt, new_temp); > >> - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, > >> gsi); > >> + /* 3. Handle explicit realignment if necessary/supported. 
> >> + Create in loop: > >> + vec_dest = realign_load (msq, lsq, realignment_token) */ > >> + if (!costing_p > >> + && (alignment_support_scheme == dr_explicit_realign_optimized > >> + || alignment_support_scheme == dr_explicit_realign)) > >> + { > >> + lsq = gimple_assign_lhs (new_stmt); > >> + if (!realignment_token) > >> + realignment_token = dataref_ptr; > >> + vec_dest = vect_create_destination_var (scalar_dest, > >> vectype); > >> + new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, > >> msq, > >> + lsq, realignment_token); > >> + new_temp = make_ssa_name (vec_dest, new_stmt); > >> + gimple_assign_set_lhs (new_stmt, new_temp); > >> + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, > >> gsi); > >> > >> - if (alignment_support_scheme == > >> dr_explicit_realign_optimized) > >> - { > >> - gcc_assert (phi); > >> - if (i == vec_num - 1 && j == ncopies - 1) > >> - add_phi_arg (phi, lsq, > >> - loop_latch_edge (containing_loop), > >> - UNKNOWN_LOCATION); > >> - msq = lsq; > >> - } > >> + if (alignment_support_scheme == > >> dr_explicit_realign_optimized) > >> + { > >> + gcc_assert (phi); > >> + if (i == vec_num - 1 && j == ncopies - 1) > >> + add_phi_arg (phi, lsq, loop_latch_edge > >> (containing_loop), > >> + UNKNOWN_LOCATION); > >> + msq = lsq; > >> } > >> + } > >> > >> - if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) > >> + if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) > >> + { > >> + if (costing_p) > >> + inside_cost = record_stmt_cost (cost_vec, 1, vec_perm, > >> + stmt_info, 0, vect_body); > >> + else > >> { > >> - if (costing_p) > >> - inside_cost = record_stmt_cost (cost_vec, 1, vec_perm, > >> - stmt_info, 0, > >> vect_body); > >> - else > >> - { > >> - tree perm_mask = perm_mask_for_reverse (vectype); > >> - new_temp > >> - = permute_vec_elements (vinfo, new_temp, new_temp, > >> - perm_mask, stmt_info, gsi); > >> - new_stmt = SSA_NAME_DEF_STMT (new_temp); > >> - } > >> + tree perm_mask = perm_mask_for_reverse (vectype); > >> + new_temp = permute_vec_elements (vinfo, new_temp, > >> new_temp, > >> + perm_mask, stmt_info, > >> gsi); > >> + new_stmt = SSA_NAME_DEF_STMT (new_temp); > >> } > >> + } > >> > >> - /* Collect vector loads and later create their permutation in > >> - vect_transform_grouped_load (). */ > >> - if (!costing_p && (grouped_load || slp_perm)) > >> - dr_chain.quick_push (new_temp); > >> + /* Collect vector loads and later create their permutation in > >> + vect_transform_grouped_load (). */ > >> + if (!costing_p && (grouped_load || slp_perm)) > >> + dr_chain.quick_push (new_temp); > >> > >> - /* Store vector loads in the corresponding SLP_NODE. */ > >> - if (!costing_p && slp && !slp_perm) > >> - slp_node->push_vec_def (new_stmt); > >> + /* Store vector loads in the corresponding SLP_NODE. */ > >> + if (!costing_p && slp && !slp_perm) > >> + slp_node->push_vec_def (new_stmt); > >> > >> - /* With SLP permutation we load the gaps as well, without > >> - we need to skip the gaps after we manage to fully load > >> - all elements. group_gap_adj is DR_GROUP_SIZE here. 
*/ > >> - group_elt += nunits; > >> - if (!costing_p > >> - && maybe_ne (group_gap_adj, 0U) > >> - && !slp_perm > >> - && known_eq (group_elt, group_size - group_gap_adj)) > >> - { > >> - poly_wide_int bump_val > >> - = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) > >> - * group_gap_adj); > >> - if (tree_int_cst_sgn > >> - (vect_dr_behavior (vinfo, dr_info)->step) == -1) > >> - bump_val = -bump_val; > >> - tree bump = wide_int_to_tree (sizetype, bump_val); > >> - dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, > >> ptr_incr, > >> - gsi, stmt_info, bump); > >> - group_elt = 0; > >> - } > >> - } > >> - /* Bump the vector pointer to account for a gap or for excess > >> - elements loaded for a permuted SLP load. */ > >> + /* With SLP permutation we load the gaps as well, without > >> + we need to skip the gaps after we manage to fully load > >> + all elements. group_gap_adj is DR_GROUP_SIZE here. */ > >> + group_elt += nunits; > >> if (!costing_p > >> && maybe_ne (group_gap_adj, 0U) > >> - && slp_perm) > >> + && !slp_perm > >> + && known_eq (group_elt, group_size - group_gap_adj)) > >> { > >> poly_wide_int bump_val > >> - = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) > >> - * group_gap_adj); > >> - if (tree_int_cst_sgn > >> - (vect_dr_behavior (vinfo, dr_info)->step) == -1) > >> + = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * > >> group_gap_adj); > >> + if (tree_int_cst_sgn (vect_dr_behavior (vinfo, > >> dr_info)->step) > >> + == -1) > >> bump_val = -bump_val; > >> tree bump = wide_int_to_tree (sizetype, bump_val); > >> dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, > >> gsi, > >> stmt_info, bump); > >> + group_elt = 0; > >> } > >> } > >> + /* Bump the vector pointer to account for a gap or for excess > >> + elements loaded for a permuted SLP load. */ > >> + if (!costing_p > >> + && maybe_ne (group_gap_adj, 0U) > >> + && slp_perm) > >> + { > >> + poly_wide_int bump_val > >> + = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj); > >> + if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) > >> == -1) > >> + bump_val = -bump_val; > >> + tree bump = wide_int_to_tree (sizetype, bump_val); > >> + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, > >> + stmt_info, bump); > >> + } > >> > >> if (slp && !slp_perm) > >> continue; > >> @@ -11120,39 +11117,36 @@ vectorizable_load (vec_info *vinfo, > >> } > >> } > >> else > >> - { > >> - if (grouped_load) > >> - { > >> - if (memory_access_type != VMAT_LOAD_STORE_LANES) > >> + { > >> + if (grouped_load) > >> + { > >> + gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE); > >> + /* We assume that the cost of a single load-lanes instruction > >> + is equivalent to the cost of DR_GROUP_SIZE separate loads. > >> + If a grouped access is instead being provided by a > >> + load-and-permute operation, include the cost of the > >> + permutes. */ > >> + if (costing_p && first_stmt_info == stmt_info) > >> { > >> - gcc_assert (memory_access_type == > >> VMAT_CONTIGUOUS_PERMUTE); > >> - /* We assume that the cost of a single load-lanes > >> instruction > >> - is equivalent to the cost of DR_GROUP_SIZE separate > >> loads. > >> - If a grouped access is instead being provided by a > >> - load-and-permute operation, include the cost of the > >> - permutes. */ > >> - if (costing_p && first_stmt_info == stmt_info) > >> - { > >> - /* Uses an even and odd extract operations or shuffle > >> - operations for each needed permute. 
*/ > >> - int group_size = DR_GROUP_SIZE (first_stmt_info); > >> - int nstmts = ceil_log2 (group_size) * group_size; > >> - inside_cost > >> - += record_stmt_cost (cost_vec, nstmts, vec_perm, > >> - stmt_info, 0, vect_body); > >> + /* Uses an even and odd extract operations or shuffle > >> + operations for each needed permute. */ > >> + int group_size = DR_GROUP_SIZE (first_stmt_info); > >> + int nstmts = ceil_log2 (group_size) * group_size; > >> + inside_cost += record_stmt_cost (cost_vec, nstmts, > >> vec_perm, > >> + stmt_info, 0, > >> vect_body); > >> > >> - if (dump_enabled_p ()) > >> - dump_printf_loc ( > >> - MSG_NOTE, vect_location, > >> - "vect_model_load_cost: strided group_size = %d > >> .\n", > >> - group_size); > >> - } > >> - else if (!costing_p) > >> - vect_transform_grouped_load (vinfo, stmt_info, > >> dr_chain, > >> - group_size, gsi); > >> + if (dump_enabled_p ()) > >> + dump_printf_loc (MSG_NOTE, vect_location, > >> + "vect_model_load_cost:" > >> + "strided group_size = %d .\n", > >> + group_size); > >> + } > >> + else if (!costing_p) > >> + { > >> + vect_transform_grouped_load (vinfo, stmt_info, dr_chain, > >> + group_size, gsi); > >> + *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; > >> } > >> - if (!costing_p) > >> - *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; > >> } > >> else if (!costing_p) > >> STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); > >> @@ -11166,7 +11160,8 @@ vectorizable_load (vec_info *vinfo, > >> { > >> gcc_assert (memory_access_type != VMAT_INVARIANT > >> && memory_access_type != VMAT_ELEMENTWISE > >> - && memory_access_type != VMAT_STRIDED_SLP); > >> + && memory_access_type != VMAT_STRIDED_SLP > >> + && memory_access_type != VMAT_LOAD_STORE_LANES); > >> if (dump_enabled_p ()) > >> dump_printf_loc (MSG_NOTE, vect_location, > >> "vect_model_load_cost: inside_cost = %u, " > >> -- > >> 2.31.1 > > >
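
For reference, the overall control-flow shape of vectorizable_load after this patch is roughly the following (a heavily simplified sketch of the structure visible in the diff above, not the actual code; the real loops carry the costing, masking and dataref-pointer bookkeeping shown in the patch):

  if (memory_access_type == VMAT_LOAD_STORE_LANES)
    {
      /* Dedicated loop: per copy emit one IFN_MASK_LOAD_LANES or
	 IFN_LOAD_LANES call, then extract each of the vec_num vectors
	 from the returned array and record them.  */
      for (j = 0; j < ncopies; j++)
	{
	  ...
	}
      return true;
    }

  /* General final loop nest, now without the LOAD_STORE_LANES branch.  */
  for (j = 0; j < ncopies; j++)
    {
      for (i = 0; i < vec_num; i++)
	{
	  ...
	}
    }

So the VMAT_LOAD_STORE_LANES path returns early from its own loop, and the remaining (re-indented) loop nest only has to handle the other access types.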