On Wed, Jun 15, 2016 at 10:52 AM, Richard Sandiford
<[email protected]> wrote:
> This is the main patch in the series. It adds a new enum and routines
> for classifying a vector load or store implementation.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?
Why's the setting and checking of the memory access type conditional on !slp?
I'd rather avoid doing this :/
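
For reference, these are the hunks I mean, from vectorizable_store (the same
pattern appears in vectorizable_load, while vectorizable_mask_load_store
records and asserts the field unconditionally):

     if (!vec_stmt) /* transformation not required.  */
       {
         if (!slp)
           STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
         ...
       }
     if (!slp)
       gcc_assert (memory_access_type
                   == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
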
Otherwise it looks like a step in the right direction of splitting the
vectorizable_* functions into an analysis part that records all decisions
made and a transform part that just applies them.
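
Very roughly the shape I mean -- just a sketch, vect_analyze_store and
vect_transform_store are made-up names and the bodies only illustrate the
split, with the new memory_access_type field standing in for a recorded
decision:

  static bool
  vect_analyze_store (gimple *stmt, tree vectype, bool slp)
  {
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
    vect_memory_access_type memory_access_type;
    gather_scatter_info gs_info;

    /* All checking and decision making happens here, on the SLP and
       non-SLP paths alike.  */
    if (!get_load_store_type (stmt, vectype, slp, VLS_STORE,
                              &memory_access_type, &gs_info))
      return false;

    /* Every decision is recorded unconditionally, not just for !slp.  */
    STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
    STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
    return true;
  }

  static void
  vect_transform_store (gimple *stmt, gimple_stmt_iterator *gsi)
  {
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

    /* The transform phase only applies what analysis recorded; it never
       re-derives the access type or repeats the checks.  */
    switch (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info))
      {
      case VMAT_LOAD_STORE_LANES:
        /* Emit IFN_STORE_LANES.  */
        break;
      case VMAT_CONTIGUOUS_PERMUTE:
        /* Permute the operands, then emit contiguous stores.  */
        break;
      default:
        /* The other recorded strategies.  */
        break;
      }
  }
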
Thanks,
Richard.
> Thanks,
> Richard
>
>
> gcc/
> * tree-vectorizer.h (vect_memory_access_type): New enum.
> (_stmt_vec_info): Add a memory_access_type field.
> (STMT_VINFO_MEMORY_ACCESS_TYPE): New macro.
> (vect_model_store_cost): Take an access type instead of a boolean.
> (vect_model_load_cost): Likewise.
> * tree-vect-slp.c (vect_analyze_slp_cost_1): Update calls to
> vect_model_store_cost and vect_model_load_cost.
> * tree-vect-stmts.c (vec_load_store_type): New enum.
> (vect_model_store_cost): Take an access type instead of a
> store_lanes_p boolean. Simplify tests.
> (vect_model_load_cost): Likewise, but for load_lanes_p.
> (get_group_load_store_type, get_load_store_type): New functions.
> (vectorizable_store): Use get_load_store_type. Record the access
> type in STMT_VINFO_MEMORY_ACCESS_TYPE.
> (vectorizable_load): Likewise.
> (vectorizable_mask_load_store): Likewise. Replace is_store
> variable with vls_type.
>
> Index: gcc/tree-vectorizer.h
> ===================================================================
> --- gcc/tree-vectorizer.h
> +++ gcc/tree-vectorizer.h
> @@ -485,6 +485,33 @@ enum slp_vect_type {
> hybrid
> };
>
> +/* Describes how we're going to vectorize an individual load or store,
> + or a group of loads or stores. */
> +enum vect_memory_access_type {
> + /* A simple contiguous access. */
> + VMAT_CONTIGUOUS,
> +
> + /* A simple contiguous access in which the elements need to be permuted
> + after loading or before storing. Only used for loop vectorization;
> + SLP uses separate permutes. */
> + VMAT_CONTIGUOUS_PERMUTE,
> +
> + /* An access that uses IFN_LOAD_LANES or IFN_STORE_LANES. */
> + VMAT_LOAD_STORE_LANES,
> +
> + /* An access in which each scalar element is loaded or stored
> + individually. */
> + VMAT_ELEMENTWISE,
> +
> + /* A hybrid of VMAT_CONTIGUOUS and VMAT_ELEMENTWISE, used for grouped
> + SLP accesses. Each unrolled iteration uses a contiguous load
> + or store for the whole group, but the groups from separate iterations
> + are combined in the same way as for VMAT_ELEMENTWISE. */
> + VMAT_STRIDED_SLP,
> +
> + /* The access uses gather loads or scatter stores. */
> + VMAT_GATHER_SCATTER
> +};
>
> typedef struct data_reference *dr_p;
>
> @@ -602,6 +629,10 @@ typedef struct _stmt_vec_info {
> /* True if this is an access with loop-invariant stride. */
> bool strided_p;
>
> + /* Classifies how the load or store is going to be implemented
> + for loop vectorization. */
> + vect_memory_access_type memory_access_type;
> +
> /* For both loads and stores. */
> bool simd_lane_access_p;
>
> @@ -659,6 +690,7 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
> #define STMT_VINFO_DATA_REF(S) (S)->data_ref_info
> #define STMT_VINFO_GATHER_SCATTER_P(S) (S)->gather_scatter_p
> #define STMT_VINFO_STRIDED_P(S) (S)->strided_p
> +#define STMT_VINFO_MEMORY_ACCESS_TYPE(S) (S)->memory_access_type
> #define STMT_VINFO_SIMD_LANE_ACCESS_P(S) (S)->simd_lane_access_p
> #define STMT_VINFO_VEC_REDUCTION_TYPE(S) (S)->v_reduc_type
>
> @@ -1006,12 +1038,12 @@ extern void free_stmt_vec_info (gimple *stmt);
> extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *,
> stmt_vector_for_cost *,
> stmt_vector_for_cost *);
> -extern void vect_model_store_cost (stmt_vec_info, int, bool,
> +extern void vect_model_store_cost (stmt_vec_info, int, vect_memory_access_type,
> enum vect_def_type, slp_tree,
> stmt_vector_for_cost *,
> stmt_vector_for_cost *);
> -extern void vect_model_load_cost (stmt_vec_info, int, bool, slp_tree,
> - stmt_vector_for_cost *,
> +extern void vect_model_load_cost (stmt_vec_info, int, vect_memory_access_type,
> + slp_tree, stmt_vector_for_cost *,
> stmt_vector_for_cost *);
> extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
> enum vect_cost_for_stmt, stmt_vec_info,
> Index: gcc/tree-vect-slp.c
> ===================================================================
> --- gcc/tree-vect-slp.c
> +++ gcc/tree-vect-slp.c
> @@ -1490,9 +1490,13 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
> stmt_info = vinfo_for_stmt (stmt);
> if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> {
> + vect_memory_access_type memory_access_type
> + = (STMT_VINFO_STRIDED_P (stmt_info)
> + ? VMAT_STRIDED_SLP
> + : VMAT_CONTIGUOUS);
> if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
> - vect_model_store_cost (stmt_info, ncopies_for_cost, false,
> - vect_uninitialized_def,
> + vect_model_store_cost (stmt_info, ncopies_for_cost,
> + memory_access_type, vect_uninitialized_def,
> node, prologue_cost_vec, body_cost_vec);
> else
> {
> @@ -1515,8 +1519,9 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
> ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance);
> }
> /* Record the cost for the vector loads. */
> - vect_model_load_cost (stmt_info, ncopies_for_cost, false,
> - node, prologue_cost_vec, body_cost_vec);
> + vect_model_load_cost (stmt_info, ncopies_for_cost,
> + memory_access_type, node, prologue_cost_vec,
> + body_cost_vec);
> return;
> }
> }
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c
> +++ gcc/tree-vect-stmts.c
> @@ -52,6 +52,14 @@ along with GCC; see the file COPYING3. If not see
> /* For lang_hooks.types.type_for_mode. */
> #include "langhooks.h"
>
> +/* Says whether a statement is a load, a store of a vectorized statement
> + result, or a store of an invariant value. */
> +enum vec_load_store_type {
> + VLS_LOAD,
> + VLS_STORE,
> + VLS_STORE_INVARIANT
> +};
> +
> /* Return the vectorized type for the given statement. */
>
> tree
> @@ -873,8 +881,8 @@ vect_model_promotion_demotion_cost (stmt_vec_info stmt_info,
>
> void
> vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
> - bool store_lanes_p, enum vect_def_type dt,
> - slp_tree slp_node,
> + vect_memory_access_type memory_access_type,
> + enum vect_def_type dt, slp_tree slp_node,
> stmt_vector_for_cost *prologue_cost_vec,
> stmt_vector_for_cost *body_cost_vec)
> {
> @@ -903,14 +911,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
> /* We assume that the cost of a single store-lanes instruction is
> equivalent to the cost of GROUP_SIZE separate stores. If a grouped
> access is instead being provided by a permute-and-store operation,
> - include the cost of the permutes.
> -
> - For SLP, the caller has already counted the permutation, if any. */
> - if (grouped_access_p
> - && first_stmt_p
> - && !store_lanes_p
> - && !STMT_VINFO_STRIDED_P (stmt_info)
> - && !slp_node)
> + include the cost of the permutes. */
> + if (first_stmt_p
> + && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
> {
> /* Uses a high and low interleave or shuffle operations for each
> needed permute. */
> @@ -927,17 +930,16 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
>
> tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> /* Costs of the stores. */
> - if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p))
> - {
> - /* N scalar stores plus extracting the elements. */
> - inside_cost += record_stmt_cost (body_cost_vec,
> - ncopies * TYPE_VECTOR_SUBPARTS (vectype),
> - scalar_store, stmt_info, 0, vect_body);
> - }
> + if (memory_access_type == VMAT_ELEMENTWISE)
> + /* N scalar stores plus extracting the elements. */
> + inside_cost += record_stmt_cost (body_cost_vec,
> + ncopies * TYPE_VECTOR_SUBPARTS (vectype),
> + scalar_store, stmt_info, 0, vect_body);
> else
> vect_get_store_cost (dr, ncopies, &inside_cost, body_cost_vec);
>
> - if (STMT_VINFO_STRIDED_P (stmt_info))
> + if (memory_access_type == VMAT_ELEMENTWISE
> + || memory_access_type == VMAT_STRIDED_SLP)
> inside_cost += record_stmt_cost (body_cost_vec,
> ncopies * TYPE_VECTOR_SUBPARTS (vectype),
> vec_to_scalar, stmt_info, 0, vect_body);
> @@ -1011,7 +1013,8 @@ vect_get_store_cost (struct data_reference *dr, int ncopies,
>
> void
> vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
> - bool load_lanes_p, slp_tree slp_node,
> + vect_memory_access_type memory_access_type,
> + slp_tree slp_node,
> stmt_vector_for_cost *prologue_cost_vec,
> stmt_vector_for_cost *body_cost_vec)
> {
> @@ -1036,14 +1039,9 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
> /* We assume that the cost of a single load-lanes instruction is
> equivalent to the cost of GROUP_SIZE separate loads. If a grouped
> access is instead being provided by a load-and-permute operation,
> - include the cost of the permutes.
> -
> - For SLP, the caller has already counted the permutation, if any. */
> - if (grouped_access_p
> - && first_stmt_p
> - && !load_lanes_p
> - && !STMT_VINFO_STRIDED_P (stmt_info)
> - && !slp_node)
> + include the cost of the permutes. */
> + if (first_stmt_p
> + && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
> {
> /* Uses an even and odd extract operations or shuffle operations
> for each needed permute. */
> @@ -1059,7 +1057,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
> }
>
> /* The loads themselves. */
> - if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p))
> + if (memory_access_type == VMAT_ELEMENTWISE)
> {
> /* N scalar loads plus gathering them into a vector. */
> tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> @@ -1071,7 +1069,8 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
> vect_get_load_cost (dr, ncopies, first_stmt_p,
> &inside_cost, &prologue_cost,
> prologue_cost_vec, body_cost_vec, true);
> - if (STMT_VINFO_STRIDED_P (stmt_info))
> + if (memory_access_type == VMAT_ELEMENTWISE
> + || memory_access_type == VMAT_STRIDED_SLP)
> inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_construct,
> stmt_info, 0, vect_body);
>
> @@ -1674,6 +1673,209 @@ static tree permute_vec_elements (tree, tree, tree, gimple *,
> gimple_stmt_iterator *);
>
>
> +/* A subroutine of get_load_store_type, with a subset of the same
> + arguments. Handle the case where STMT is part of a grouped load
> + or store.
> +
> + For stores, the statements in the group are all consecutive
> + and there is no gap at the end. For loads, the statements in the
> + group might not be consecutive; there can be gaps between statements
> + as well as at the end. */
> +
> +static bool
> +get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
> + vec_load_store_type vls_type,
> + vect_memory_access_type *memory_access_type)
> +{
> + stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> + vec_info *vinfo = stmt_info->vinfo;
> + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
> + struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
> + gimple *first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
> + unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
> + bool single_element_p = (stmt == first_stmt
> + && !GROUP_NEXT_ELEMENT (stmt_info));
> + unsigned HOST_WIDE_INT gap = GROUP_GAP (vinfo_for_stmt (first_stmt));
> + int nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +
> + /* True if the vectorized statements would access beyond the last
> + statement in the group. */
> + bool overrun_p = false;
> +
> + /* True if we can cope with such overrun by peeling for gaps, so that
> + there is at least one final scalar iteration after the vector loop. */
> + bool can_overrun_p = (vls_type == VLS_LOAD && loop_vinfo && !loop->inner);
> +
> + /* There can only be a gap at the end of the group if the stride is
> + known at compile time. */
> + gcc_assert (!STMT_VINFO_STRIDED_P (stmt_info) || gap == 0);
> +
> + /* Stores can't yet have gaps. */
> + gcc_assert (slp || vls_type == VLS_LOAD || gap == 0);
> +
> + if (slp)
> + {
> + if (STMT_VINFO_STRIDED_P (stmt_info))
> + {
> + /* Try to use consecutive accesses of GROUP_SIZE elements,
> + separated by the stride, until we have a complete vector.
> + Fall back to scalar accesses if that isn't possible. */
> + if (nunits % group_size == 0)
> + *memory_access_type = VMAT_STRIDED_SLP;
> + else
> + *memory_access_type = VMAT_ELEMENTWISE;
> + }
> + else
> + {
> + overrun_p = loop_vinfo && gap != 0;
> + if (overrun_p && vls_type != VLS_LOAD)
> + {
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "Grouped store with gaps requires"
> + " non-consecutive accesses\n");
> + return false;
> + }
> + if (overrun_p && !can_overrun_p)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "Peeling for outer loop is not supported\n");
> + return false;
> + }
> + *memory_access_type = VMAT_CONTIGUOUS;
> + }
> + }
> + else
> + {
> + /* We can always handle this case using elementwise accesses,
> + but see if something more efficient is available. */
> + *memory_access_type = VMAT_ELEMENTWISE;
> +
> + /* If there is a gap at the end of the group then these optimizations
> + would access excess elements in the last iteration. */
> + bool would_overrun_p = (gap != 0);
> + if (!STMT_VINFO_STRIDED_P (stmt_info)
> + && (can_overrun_p || !would_overrun_p))
> + {
> + /* First try using LOAD/STORE_LANES. */
> + if (vls_type == VLS_LOAD
> + ? vect_load_lanes_supported (vectype, group_size)
> + : vect_store_lanes_supported (vectype, group_size))
> + {
> + *memory_access_type = VMAT_LOAD_STORE_LANES;
> + overrun_p = would_overrun_p;
> + }
> +
> + /* If that fails, try using permuting loads. */
> + if (*memory_access_type == VMAT_ELEMENTWISE
> + && (vls_type == VLS_LOAD
> + ? vect_grouped_load_supported (vectype, single_element_p,
> + group_size)
> + : vect_grouped_store_supported (vectype, group_size)))
> + {
> + *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
> + overrun_p = would_overrun_p;
> + }
> + }
> + }
> +
> + if (vls_type != VLS_LOAD && first_stmt == stmt)
> + {
> + /* STMT is the leader of the group. Check the operands of all the
> + stmts of the group. */
> + gimple *next_stmt = GROUP_NEXT_ELEMENT (stmt_info);
> + while (next_stmt)
> + {
> + gcc_assert (gimple_assign_single_p (next_stmt));
> + tree op = gimple_assign_rhs1 (next_stmt);
> + gimple *def_stmt;
> + enum vect_def_type dt;
> + if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "use not simple.\n");
> + return false;
> + }
> + next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
> + }
> + }
> +
> + if (overrun_p)
> + {
> + gcc_assert (can_overrun_p);
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "Data access with gaps requires scalar "
> + "epilogue loop\n");
> + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
> + }
> +
> + return true;
> +}
> +
> +/* Analyze load or store statement STMT of type VLS_TYPE. Return true
> + if there is a memory access type that the vectorized form can use,
> + storing it in *MEMORY_ACCESS_TYPE if so. If we decide to use gathers
> + or scatters, fill in GS_INFO accordingly.
> +
> + SLP says whether we're performing SLP rather than loop vectorization.
> + VECTYPE is the vector type that the vectorized statements will use. */
> +
> +static bool
> +get_load_store_type (gimple *stmt, tree vectype, bool slp,
> + vec_load_store_type vls_type,
> + vect_memory_access_type *memory_access_type,
> + gather_scatter_info *gs_info)
> +{
> + stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> + vec_info *vinfo = stmt_info->vinfo;
> + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
> + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + {
> + *memory_access_type = VMAT_GATHER_SCATTER;
> + gimple *def_stmt;
> + if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info))
> + gcc_unreachable ();
> + else if (!vect_is_simple_use (gs_info->offset, vinfo, &def_stmt,
> + &gs_info->offset_dt,
> + &gs_info->offset_vectype))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "%s index use not simple.\n",
> + vls_type == VLS_LOAD ? "gather" : "scatter");
> + return false;
> + }
> + }
> + else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> + {
> + if (!get_group_load_store_type (stmt, vectype, slp, vls_type,
> + memory_access_type))
> + return false;
> + }
> + else if (STMT_VINFO_STRIDED_P (stmt_info))
> + {
> + gcc_assert (!slp);
> + *memory_access_type = VMAT_ELEMENTWISE;
> + }
> + else
> + *memory_access_type = VMAT_CONTIGUOUS;
> +
> + /* FIXME: At the moment the cost model seems to underestimate the
> + cost of using elementwise accesses. This check preserves the
> + traditional behavior until that can be fixed. */
> + if (*memory_access_type == VMAT_ELEMENTWISE
> + && !STMT_VINFO_STRIDED_P (stmt_info))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "not falling back to elementwise accesses\n");
> + return false;
> + }
> + return true;
> +}
> +
> /* Function vectorizable_mask_load_store.
>
> Check if STMT performs a conditional load or store that can be vectorized.
> @@ -1705,7 +1907,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
> int i, j;
> bool inv_p;
> gather_scatter_info gs_info;
> - bool is_store;
> + vec_load_store_type vls_type;
> tree mask;
> gimple *def_stmt;
> enum vect_def_type dt;
> @@ -1716,7 +1918,6 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
> ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
> gcc_assert (ncopies >= 1);
>
> - is_store = gimple_call_internal_fn (stmt) == IFN_MASK_STORE;
> mask = gimple_call_arg (stmt, 2);
>
> if (TREE_CODE (TREE_TYPE (mask)) != BOOLEAN_TYPE)
> @@ -1743,12 +1944,6 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>
> elem_type = TREE_TYPE (vectype);
>
> - if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> - return false;
> -
> - if (STMT_VINFO_STRIDED_P (stmt_info))
> - return false;
> -
> if (TREE_CODE (mask) != SSA_NAME)
> return false;
>
> @@ -1762,27 +1957,26 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
> || TYPE_VECTOR_SUBPARTS (mask_vectype) != TYPE_VECTOR_SUBPARTS (vectype))
> return false;
>
> - if (is_store)
> + if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
> {
> tree rhs = gimple_call_arg (stmt, 3);
> if (!vect_is_simple_use (rhs, loop_vinfo, &def_stmt, &dt,
> &rhs_vectype))
> return false;
> + if (dt == vect_constant_def || dt == vect_external_def)
> + vls_type = VLS_STORE_INVARIANT;
> + else
> + vls_type = VLS_STORE;
> }
> + else
> + vls_type = VLS_LOAD;
>
> - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> - {
> - gimple *def_stmt;
> - if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
> - gcc_unreachable ();
> - if (!vect_is_simple_use (gs_info.offset, loop_vinfo, &def_stmt,
> - &gs_info.offset_dt, &gs_info.offset_vectype))
> - {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "gather index use not simple.");
> - return false;
> - }
> + vect_memory_access_type memory_access_type;
> + if (!get_load_store_type (stmt, vectype, false, vls_type,
> + &memory_access_type, &gs_info))
> + return false;
>
> + if (memory_access_type == VMAT_GATHER_SCATTER)
> + {
> tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
> tree masktype
> = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
> @@ -1794,6 +1988,14 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
> return false;
> }
> }
> + else if (memory_access_type != VMAT_CONTIGUOUS)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "unsupported access type for masked %s\n",
> + vls_type == VLS_LOAD ? "load" : "store");
> + return false;
> + }
> else if (tree_int_cst_compare (nested_in_vect_loop
> ? STMT_VINFO_DR_STEP (stmt_info)
> : DR_STEP (dr), size_zero_node) <= 0)
> @@ -1801,25 +2003,28 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
> else if (!VECTOR_MODE_P (TYPE_MODE (vectype))
> || !can_vec_mask_load_store_p (TYPE_MODE (vectype),
> TYPE_MODE (mask_vectype),
> - !is_store)
> + vls_type == VLS_LOAD)
> || (rhs_vectype
> && !useless_type_conversion_p (vectype, rhs_vectype)))
> return false;
>
> if (!vec_stmt) /* transformation not required. */
> {
> + STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
> STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
> - if (is_store)
> - vect_model_store_cost (stmt_info, ncopies, false, dt,
> - NULL, NULL, NULL);
> + if (vls_type == VLS_LOAD)
> + vect_model_load_cost (stmt_info, ncopies, memory_access_type,
> + NULL, NULL, NULL);
> else
> - vect_model_load_cost (stmt_info, ncopies, false, NULL, NULL, NULL);
> + vect_model_store_cost (stmt_info, ncopies, memory_access_type,
> + dt, NULL, NULL, NULL);
> return true;
> }
> + gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
>
> /** Transform. **/
>
> - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + if (memory_access_type == VMAT_GATHER_SCATTER)
> {
> tree vec_oprnd0 = NULL_TREE, op;
> tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
> @@ -1993,7 +2198,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
> gsi_replace (gsi, new_stmt, true);
> return true;
> }
> - else if (is_store)
> + else if (vls_type != VLS_LOAD)
> {
> tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
> prev_stmt_info = NULL;
> @@ -2102,7 +2307,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
> }
> }
>
> - if (!is_store)
> + if (vls_type == VLS_LOAD)
> {
> /* Ensure that even with -fno-tree-dce the scalar MASK_LOAD is removed
> from the IL. */
> @@ -5188,9 +5393,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> gimple *ptr_incr = NULL;
> int ncopies;
> int j;
> - gimple *next_stmt, *first_stmt = NULL;
> - bool grouped_store = false;
> - bool store_lanes_p = false;
> + gimple *next_stmt, *first_stmt;
> + bool grouped_store;
> unsigned int group_size, i;
> vec<tree> dr_chain = vNULL;
> vec<tree> oprnds = vNULL;
> @@ -5207,6 +5411,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> gather_scatter_info gs_info;
> enum vect_def_type scatter_src_dt = vect_unknown_def_type;
> gimple *new_stmt;
> + vec_load_store_type vls_type;
>
> if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
> return false;
> @@ -5274,6 +5479,11 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> return false;
> }
>
> + if (dt == vect_constant_def || dt == vect_external_def)
> + vls_type = VLS_STORE_INVARIANT;
> + else
> + vls_type = VLS_STORE;
> +
> if (rhs_vectype && !useless_type_conversion_p (vectype, rhs_vectype))
> return false;
>
> @@ -5303,7 +5513,6 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> }
> if (negative)
> {
> - gcc_assert (!grouped_store);
> alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
> if (alignment_support_scheme != dr_aligned
> && alignment_support_scheme != dr_unaligned_supported)
> @@ -5325,80 +5534,31 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> }
> }
>
> - if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> - {
> - grouped_store = true;
> - first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
> - group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
> - if (!slp && !STMT_VINFO_STRIDED_P (stmt_info))
> - {
> - if (vect_store_lanes_supported (vectype, group_size))
> - store_lanes_p = true;
> - else if (!vect_grouped_store_supported (vectype, group_size))
> - return false;
> - }
> -
> - if (STMT_VINFO_STRIDED_P (stmt_info)
> - && slp
> - && (group_size > nunits
> - || nunits % group_size != 0))
> - {
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "unhandled strided group store\n");
> - return false;
> - }
> -
> - if (first_stmt == stmt)
> - {
> - /* STMT is the leader of the group. Check the operands of all the
> - stmts of the group. */
> - next_stmt = GROUP_NEXT_ELEMENT (stmt_info);
> - while (next_stmt)
> - {
> - gcc_assert (gimple_assign_single_p (next_stmt));
> - op = gimple_assign_rhs1 (next_stmt);
> - if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt))
> - {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "use not simple.\n");
> - return false;
> - }
> - next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
> - }
> - }
> - }
> -
> - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> - {
> - gimple *def_stmt;
> - if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
> - gcc_unreachable ();
> - if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt,
> - &gs_info.offset_dt, &gs_info.offset_vectype))
> - {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "scatter index use not simple.");
> - return false;
> - }
> - }
> + vect_memory_access_type memory_access_type;
> + if (!get_load_store_type (stmt, vectype, slp, vls_type,
> + &memory_access_type, &gs_info))
> + return false;
>
> if (!vec_stmt) /* transformation not required. */
> {
> + if (!slp)
> + STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
> STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
> /* The SLP costs are calculated during SLP analysis. */
> if (!PURE_SLP_STMT (stmt_info))
> - vect_model_store_cost (stmt_info, ncopies, store_lanes_p, dt,
> + vect_model_store_cost (stmt_info, ncopies, memory_access_type, dt,
> NULL, NULL, NULL);
> return true;
> }
> + if (!slp)
> + gcc_assert (memory_access_type
> + == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
>
> /** Transform. **/
>
> ensure_base_align (stmt_info, dr);
>
> - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + if (memory_access_type == VMAT_GATHER_SCATTER)
> {
> tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, op, src;
> tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
> @@ -5538,8 +5698,10 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> return true;
> }
>
> + grouped_store = STMT_VINFO_GROUPED_ACCESS (stmt_info);
> if (grouped_store)
> {
> + first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
> first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
> group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
>
> @@ -5585,7 +5747,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> dump_printf_loc (MSG_NOTE, vect_location,
> "transform store. ncopies = %d\n", ncopies);
>
> - if (STMT_VINFO_STRIDED_P (stmt_info))
> + if (memory_access_type == VMAT_ELEMENTWISE
> + || memory_access_type == VMAT_STRIDED_SLP)
> {
> gimple_stmt_iterator incr_gsi;
> bool insert_after;
> @@ -5756,14 +5919,14 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> gcc_assert (alignment_support_scheme);
> /* Targets with store-lane instructions must not require explicit
> realignment. */
> - gcc_assert (!store_lanes_p
> + gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES
> || alignment_support_scheme == dr_aligned
> || alignment_support_scheme == dr_unaligned_supported);
>
> if (negative)
> offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
>
> - if (store_lanes_p)
> + if (memory_access_type == VMAT_LOAD_STORE_LANES)
> aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
> else
> aggr_type = vectype;
> @@ -5901,7 +6064,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> TYPE_SIZE_UNIT (aggr_type));
> }
>
> - if (store_lanes_p)
> + if (memory_access_type == VMAT_LOAD_STORE_LANES)
> {
> tree vec_array;
>
> @@ -6185,7 +6348,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> gphi *phi = NULL;
> vec<tree> dr_chain = vNULL;
> bool grouped_load = false;
> - bool load_lanes_p = false;
> gimple *first_stmt;
> gimple *first_stmt_for_drptr = NULL;
> bool inv_p;
> @@ -6294,48 +6456,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> {
> grouped_load = true;
> /* FORNOW */
> - gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
> + gcc_assert (!nested_in_vect_loop);
> + gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
>
> first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
> group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
> - bool single_element_p = (first_stmt == stmt
> - && !GROUP_NEXT_ELEMENT (stmt_info));
> -
> - if (!slp && !STMT_VINFO_STRIDED_P (stmt_info))
> - {
> - if (vect_load_lanes_supported (vectype, group_size))
> - load_lanes_p = true;
> - else if (!vect_grouped_load_supported (vectype, single_element_p,
> - group_size))
> - return false;
> - }
> -
> - if (single_element_p)
> - {
> - /* Single-element interleaving requires peeling for gaps. */
> - gcc_assert (GROUP_GAP (stmt_info));
> - }
> -
> - /* If there is a gap in the end of the group then we access excess
> - elements in the last iteration and thus need to peel that off. */
> - if (loop_vinfo
> - && ! STMT_VINFO_STRIDED_P (stmt_info)
> - && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0)
> - {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "Data access with gaps requires scalar "
> - "epilogue loop\n");
> - if (loop->inner)
> - {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "Peeling for outer loop is not supported\n");
> - return false;
> - }
> -
> - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
> - }
>
> if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> slp_perm = true;
> @@ -6381,24 +6506,13 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> }
> }
>
> + vect_memory_access_type memory_access_type;
> + if (!get_load_store_type (stmt, vectype, slp, VLS_LOAD,
> + &memory_access_type, &gs_info))
> + return false;
>
> - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> - {
> - gimple *def_stmt;
> - if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
> - gcc_unreachable ();
> - if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt,
> - &gs_info.offset_dt, &gs_info.offset_vectype))
> - {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "gather index use not simple.\n");
> - return false;
> - }
> - }
> - else if (STMT_VINFO_STRIDED_P (stmt_info))
> - ;
> - else
> + if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info)
> + && !STMT_VINFO_STRIDED_P (stmt_info))
> {
> negative = tree_int_cst_compare (nested_in_vect_loop
> ? STMT_VINFO_DR_STEP (stmt_info)
> @@ -6444,14 +6558,20 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>
> if (!vec_stmt) /* transformation not required. */
> {
> + if (!slp)
> + STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
> STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
> /* The SLP costs are calculated during SLP analysis. */
> if (!PURE_SLP_STMT (stmt_info))
> - vect_model_load_cost (stmt_info, ncopies, load_lanes_p,
> + vect_model_load_cost (stmt_info, ncopies, memory_access_type,
> NULL, NULL, NULL);
> return true;
> }
>
> + if (!slp)
> + gcc_assert (memory_access_type
> + == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
> +
> if (dump_enabled_p ())
> dump_printf_loc (MSG_NOTE, vect_location,
> "transform load. ncopies = %d\n", ncopies);
> @@ -6460,7 +6580,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>
> ensure_base_align (stmt_info, dr);
>
> - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + if (memory_access_type == VMAT_GATHER_SCATTER)
> {
> tree vec_oprnd0 = NULL_TREE, op;
> tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
> @@ -6627,7 +6747,9 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> }
> return true;
> }
> - else if (STMT_VINFO_STRIDED_P (stmt_info))
> +
> + if (memory_access_type == VMAT_ELEMENTWISE
> + || memory_access_type == VMAT_STRIDED_SLP)
> {
> gimple_stmt_iterator incr_gsi;
> bool insert_after;
> @@ -6694,26 +6816,23 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> int lnel = 1;
> tree ltype = TREE_TYPE (vectype);
> auto_vec<tree> dr_chain;
> - if (slp)
> + if (memory_access_type == VMAT_STRIDED_SLP)
> {
> - if (group_size < nunits
> - && nunits % group_size == 0)
> + nloads = nunits / group_size;
> + if (group_size < nunits)
> {
> - nloads = nunits / group_size;
> lnel = group_size;
> ltype = build_vector_type (TREE_TYPE (vectype), group_size);
> - ltype = build_aligned_type (ltype,
> - TYPE_ALIGN (TREE_TYPE (vectype)));
> }
> - else if (group_size >= nunits
> - && group_size % nunits == 0)
> + else
> {
> - nloads = 1;
> lnel = nunits;
> ltype = vectype;
> - ltype = build_aligned_type (ltype,
> - TYPE_ALIGN (TREE_TYPE (vectype)));
> }
> + ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
> + }
> + if (slp)
> + {
> /* For SLP permutation support we need to load the whole group,
> not only the number of vector stmts the permutation result
> fits in. */
> @@ -6845,7 +6964,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> gcc_assert (alignment_support_scheme);
> /* Targets with load-lane instructions must not require explicit
> realignment. */
> - gcc_assert (!load_lanes_p
> + gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES
> || alignment_support_scheme == dr_aligned
> || alignment_support_scheme == dr_unaligned_supported);
>
> @@ -6980,7 +7099,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> if (negative)
> offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
>
> - if (load_lanes_p)
> + if (memory_access_type == VMAT_LOAD_STORE_LANES)
> aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
> else
> aggr_type = vectype;
> @@ -7043,7 +7162,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> if (grouped_load || slp_perm)
> dr_chain.create (vec_num);
>
> - if (load_lanes_p)
> + if (memory_access_type == VMAT_LOAD_STORE_LANES)
> {
> tree vec_array;
>
> @@ -7313,7 +7432,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
> {
> if (grouped_load)
> {
> - if (!load_lanes_p)
> + if (memory_access_type != VMAT_LOAD_STORE_LANES)
> vect_transform_grouped_load (stmt, dr_chain, group_size, gsi);
> *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
> }