Hi,
Currently, for RVV gathers/scatters we accept any scale and extension in
the optabs and "just" extend the offset before scaling it properly.
This causes two major problems:
- These operations are hidden from the vectorizer and thus not costed
appropriately.
- When the vectorizer chooses a full vector for the offsets (e.g. 16 shorts
for a 128-bit vector) and we're required to extend them to Pmode, we
need a much larger vector (e.g. 16 unsigned longs = 1024 bits) to hold
them.  This necessitates LMUL = 4, further slowing the gather down (a
sketch of such a loop follows below).
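To make the second point concrete, here is a minimal example loop (not
part of the patch; all names are made up) that vectorizes into a gather
whose offsets are narrower than Pmode:

  #include <stddef.h>

  /* Hypothetical example: idx[] provides 16-bit offsets.  If they must
     be widened to Pmode before the gather, the offset vector becomes
     much wider than the data vector.  */
  void
  gather_example (int *restrict dst, int *restrict src,
                  unsigned short *restrict idx, size_t n)
  {
    for (size_t i = 0; i < n; i++)
      dst[i] = src[idx[i]];
  }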
This patch moves scale handling from vect_check_gather_scatter to
vectorizable_load/_store.  In the former this patch accepts any scale
without checking for target support but still saves it in the gs info
struct.  In vectorizable_load/_store we check whether the requested
scale is supported by the target and emit an extension as well as a
scaling operation only if it isn't.
This way the vectorizer can cost those operations accurately and at the
same time support a variety of gather/scatter offsets.
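As a rough scalar model of the fallback path (illustrative only; none of
the identifiers below come from the patch): when the IFN only supports
scale 1 and the requested scale is a power of two, the offset is widened
and shifted so the call can still use scale 1:

  #include <stdint.h>

  /* Per-element address the fallback computes: widen the offset so the
     shift cannot overflow, fold the power-of-two scale into a left
     shift, then use scale 1 in the gather/scatter IFN.  */
  static inline char *
  scaled_address (char *base, uint32_t off, int scale_log2)
  {
    uint64_t off_w = (uint64_t) off;  /* extend to pointer width */
    off_w <<= scale_log2;             /* multiply by the requested scale */
    return base + off_w;              /* equivalent to base + off * scale */
  }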
I tested this on riscv in its current state (i.e. exposing all scale
factors we can support to the middle end) and with a local patch that
only exposes what we actually have.  Both are clean from a vectorizer
standpoint, but the latter exposes at least two other issues, one in our
target code and one in vect-loop-manip.  I'll take care of those
separately.
As my usual aarch64 and x86 cfarm machines are currently down, I tested
on our internal machines, x86 natively and aarch64 with qemu and
--target_board=unix/-march=armv8.8-a+sve.
I guess the naming and the placement of the new helpers are not ideal;
I'm open to suggestions here.  Originally I wanted to use them for
gather discovery as well but now they're just for codegen.
Regards
Robin
gcc/ChangeLog:
* tree-vect-data-refs.cc (vect_prune_runtime_alias_test_list):
Adjust comment.
(vect_gather_scatter_fn_p): Check if we can support scale > 1 by
using a larger offset type.
(vect_check_gather_scatter): Also try offset type with swapped
signedness.
(gs_off_need_scale): New function.
(gs_off_need_sign_change): Ditto.
* tree-vect-stmts.cc (check_load_store_for_partial_vectors):
Use scale 1.
(get_load_store_type): Ditto.
(vectorizable_store): Check if a scale or sign swap is necessary
and perform it.
(vectorizable_load): Ditto.
* tree-vectorizer.h (gs_off_need_sign_change): Declare.
(gs_off_need_scale): Ditto.
---
gcc/tree-vect-data-refs.cc | 99 +++++++++++++++++++++++-
gcc/tree-vect-stmts.cc | 154 +++++++++++++++++++++++++++++++++++--
gcc/tree-vectorizer.h | 5 ++
3 files changed, 247 insertions(+), 11 deletions(-)
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index f9bf6a21697..00ff8e24b80 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -4434,10 +4434,16 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
appropriate vector type for it. SCALE is the amount by which the
offset should be multiplied *after* it has been converted to address width.
+ If SCALE > 1 and the target does not support it we try to increase
+ the offset type size and see if SCALE = 1 is supported with the larger
+ type. If so, we can extend and shift the offset vector appropriately
+ before emitting the gather/scatter.
+
Return true if the function is supported, storing the function id in
*IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.
- If we can use gather and store the possible else values in ELSVALS. */
+ If we can use gather/scatter and ELSVALS is nonzero store the possible
+ else values in ELSVALS. */
bool
vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
@@ -4473,6 +4479,13 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
}
+ /* Instead of asking the backend for an offset-scaling gather/scatter insn we
+ can also explicitly scale the offset vector by multiplying it with SCALE.
+ For now only do so when SCALE is a power of two. */
+ bool scale_explicitly = false;
+ if (pow2p_hwi (scale))
+ scale_explicitly = true;
+
for (;;)
{
tree offset_vectype;
@@ -4485,6 +4498,13 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
return false;
}
+ /* If the offset is already of pointer size and a scale > 1 is requested
+ we can always use a multiplication later even if the actual scale
+ is not supported by an IFN. */
+ if (TYPE_PRECISION (TREE_TYPE (offset_vectype)) == POINTER_SIZE
+ && scale_explicitly)
+ scale = 1;
+
/* Test whether the target supports this combination. */
if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
offset_vectype, scale,
@@ -4525,6 +4545,13 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
/* Try a larger offset vector type. */
offset_type = build_nonstandard_integer_type
(TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
+
+ /* In the next larger type we don't need target support for scale > 1
+ but can multiply the offset vector by the desired scale.
+ As we use a larger type than the original offset there is no overflow
+ while doing so. */
+ if (scale_explicitly)
+ scale = 1;
}
}
@@ -4778,10 +4805,10 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
if (use_ifn_p
&& TREE_CODE (off) == SSA_NAME
&& !POINTER_TYPE_P (TREE_TYPE (off))
- && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
+ && (vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
masked_p, vectype, memory_type,
TREE_TYPE (off), scale, &ifn,
- &offset_vectype, elsvals))
+ &offset_vectype, elsvals)))
break;
if (TYPE_PRECISION (TREE_TYPE (op0))
@@ -4836,7 +4863,18 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
vectype, memory_type, offtype, scale,
&ifn, &offset_vectype, elsvals))
- ifn = IFN_LAST;
+ {
+ /* Alternatively, the target may support an offset with swapped
+ signedness. In that case we need to extend it to pointer size
+ later. */
+ tree offtype_swapped = build_nonstandard_integer_type
+ (POINTER_SIZE, !TYPE_UNSIGNED (offtype));
+ if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
+ vectype, memory_type, offtype_swapped,
+ scale, &ifn, &offset_vectype,
+ elsvals))
+ ifn = IFN_LAST;
+ }
decl = NULL_TREE;
}
else
@@ -4872,9 +4910,62 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
info->scale = scale;
info->element_type = TREE_TYPE (vectype);
info->memory_type = memory_type;
+
return true;
}
+/* Return true if the target supports a scale factor of 1 but not SCALE for
+ the gather/scatter ifn IFN and the provided types, and false otherwise. */
+
+bool
+gs_off_need_scale (internal_fn ifn, tree vectype,
+ tree datatype, tree offset_vectype, int scale)
+{
+ if (scale == 1)
+ return false;
+
+ if (internal_gather_scatter_fn_supported_p
+ (ifn, vectype, datatype,
+ offset_vectype, scale, nullptr))
+ return false;
+
+ if (internal_gather_scatter_fn_supported_p (ifn, vectype, datatype,
+ offset_vectype, 1,
+ nullptr))
+ return true;
+
+ return false;
+}
+
+/* Return true if the target does not support a gather/scatter ifn IFN
+ with the provided types and SCALE but instead supports an IFN with
+ a pointer-sized offset and swapped signedness, false otherwise. */
+
+bool
+gs_off_need_sign_change (vec_info *vinfo, internal_fn ifn, tree vectype,
+ tree datatype, tree offset_vectype, int scale,
+ tree *offset_vectype_out)
+{
+ if (internal_gather_scatter_fn_supported_p
+ (ifn, vectype, datatype,
+ offset_vectype, scale, nullptr))
+ return false;
+
+ tree offtype = build_nonstandard_integer_type
+ (POINTER_SIZE, !TYPE_UNSIGNED (offset_vectype));
+ *offset_vectype_out
+ = get_vectype_for_scalar_type (vinfo, offtype, TYPE_SIGN (offtype));
+
+ if (*offset_vectype_out
+ && internal_gather_scatter_fn_supported_p (ifn, vectype, datatype,
+ *offset_vectype_out, 1,
+ nullptr))
+ return true;
+
+ return false;
+}
+
+
/* Find the data references in STMT, analyze them with respect to LOOP and
append them to DATAREFS. Return false if datarefs in this stmt cannot
be handled. */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 653c5e38e27..1b4378abacf 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1499,15 +1499,18 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
? SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0])
: ls->strided_offset_vectype);
tree memory_type = TREE_TYPE (DR_REF (STMT_VINFO_DR_INFO (repr)->dr));
- int scale = SLP_TREE_GS_SCALE (slp_node);
+ /* In vect_check_gather_scatter we have determined that we support
+ the required offset type as well as the scale factor.
+ As we are only interested in the IFN type here, just use SCALE = 1
+ and decide how to scale before actually emitting the IFN. */
if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
memory_type,
- off_vectype, scale,
+ off_vectype, 1,
elsvals))
vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
memory_type,
- off_vectype, scale,
+ off_vectype, 1,
elsvals)
|| memory_access_type == VMAT_GATHER_SCATTER_LEGACY)
vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
@@ -2035,10 +2038,19 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
int scale = SLP_TREE_GS_SCALE (slp_node);
tree memory_type = TREE_TYPE (DR_REF (first_dr_info->dr));
tree tem;
+ tree offtype_swapped = build_nonstandard_integer_type
+ (POINTER_SIZE, !TYPE_UNSIGNED (TREE_TYPE (offset_vectype)));
+
if (vect_gather_scatter_fn_p (loop_vinfo, vls_type == VLS_LOAD,
masked_p, vectype,
memory_type,
- offset_vectype, scale,
+ offset_vectype, 1,
+ &ls->gs.ifn, &tem,
+ elsvals)
+ || vect_gather_scatter_fn_p (loop_vinfo, vls_type == VLS_LOAD,
+ masked_p, vectype,
+ memory_type,
+ offtype_swapped, 1,
&ls->gs.ifn, &tem,
elsvals))
*memory_access_type = VMAT_GATHER_SCATTER_IFN;
@@ -8617,19 +8629,83 @@ vectorizable_store (vec_info *vinfo,
tree alias_align_ptr = build_int_cst (ref_type, align);
if (memory_access_type == VMAT_GATHER_SCATTER_IFN)
{
+ /* If we need to scale the offsets either the target supports a
+ scale factor directly or we need to extend and multiply them
+ here. When there is a sign-change involved and we didn't
+ go through a gather/scatter pattern we need to extend as
+ well. We made sure the necessary vector type exists
+ in vect_check_gather_scatter. */
+ tree offset_vectype
+ = (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
+ ? SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0])
+ : ls.strided_offset_vectype);
+ tree offset_vectype_tmp;
+ int gs_scale = SLP_TREE_GS_SCALE (slp_node);
+ bool sign_change_p = false;
+ if (gs_off_need_sign_change
+ (vinfo, ls.gs.ifn, vectype, TREE_TYPE (vectype),
+ offset_vectype,
+ gs_scale, &offset_vectype_tmp))
+ {
+ sign_change_p = true;
+ offset_vectype = offset_vectype_tmp;
+ }
+
+ bool scale_p = false;
+ if (gs_off_need_scale
+ (ls.gs.ifn, vectype, TREE_TYPE (vectype),
+ offset_vectype, gs_scale))
+ scale_p = true;
+
if (costing_p)
{
unsigned int cnunits = vect_nunits_for_cost (vectype);
inside_cost
+= record_stmt_cost (cost_vec, cnunits, scalar_store,
slp_node, 0, vect_body);
+
+ if (sign_change_p)
+ {
+ /* Zero/sign extend. */
+ inside_cost
+ += record_stmt_cost (cost_vec, 1, vector_stmt,
+ slp_node, 0, vect_body);
+ }
+
+ if (scale_p)
+ {
+ /* Shift. */
+ inside_cost
+ += record_stmt_cost (cost_vec, 1, vector_stmt,
+ slp_node, 0, vect_body);
+ }
continue;
}
if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
vec_offset = vec_offsets[j];
- tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
+ bool strided = !VECTOR_TYPE_P (TREE_TYPE (vec_offset));
+ tree scale = size_int (gs_scale);
+
+ if (!strided)
+ {
+ gimple_seq stmts = NULL;
+ if (sign_change_p)
+ vec_offset = gimple_convert (&stmts, offset_vectype,
+ vec_offset);
+ if (scale_p)
+ {
+ vec_offset = gimple_build
+ (&stmts, LSHIFT_EXPR, offset_vectype,
+ vec_offset, size_int (exact_log2 (gs_scale)));
+
+ scale = size_int (1);
+ }
+
+ if (stmts)
+ gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+ }
if (ls.gs.ifn == IFN_MASK_LEN_SCATTER_STORE)
{
@@ -8653,7 +8729,7 @@ vectorizable_store (vec_info *vinfo,
gcall *call;
if (final_len && final_mask)
{
- if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
+ if (!strided)
call = gimple_build_call_internal (
IFN_MASK_LEN_SCATTER_STORE, 8, dataref_ptr,
alias_align_ptr,
@@ -10464,18 +10540,82 @@ vectorizable_load (vec_info *vinfo,
tree alias_align_ptr = build_int_cst (ref_type, align);
if (memory_access_type == VMAT_GATHER_SCATTER_IFN)
{
+ /* If we need to scale the offsets either the target supports a
+ scale factor directly or we need to extend and multiply them
+ here. When there is a sign-change involved and we didn't
+ go through a gather/scatter pattern we need to extend as
+ well. We made sure the necessary vector type exists
+ in vect_check_gather_scatter. */
+ tree offset_vectype
+ = (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
+ ? SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0])
+ : ls.strided_offset_vectype);
+ tree offset_vectype_tmp;
+ int gs_scale = SLP_TREE_GS_SCALE (slp_node);
+ bool sign_change_p = false;
+ if (gs_off_need_sign_change
+ (vinfo, ls.gs.ifn, vectype, TREE_TYPE (vectype),
+ offset_vectype, gs_scale,
+ &offset_vectype_tmp))
+ {
+ sign_change_p = true;
+ offset_vectype = offset_vectype_tmp;
+ }
+
+ bool scale_p = false;
+ if (gs_off_need_scale
+ (ls.gs.ifn, vectype, TREE_TYPE (vectype),
+ offset_vectype, gs_scale))
+ scale_p = true;
+
if (costing_p)
{
unsigned int cnunits = vect_nunits_for_cost (vectype);
inside_cost
= record_stmt_cost (cost_vec, cnunits, scalar_load,
slp_node, 0, vect_body);
+
+ if (sign_change_p)
+ {
+ /* Zero/sign extend. */
+ inside_cost
+ += record_stmt_cost (cost_vec, 1, vector_stmt,
+ slp_node, 0, vect_body);
+ }
+
+ if (scale_p)
+ {
+ /* Shift. */
+ inside_cost
+ += record_stmt_cost (cost_vec, 1, vector_stmt,
+ slp_node, 0, vect_body);
+ }
continue;
}
if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
vec_offset = vec_offsets[i];
tree zero = build_zero_cst (vectype);
- tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
+
+ bool strided = !VECTOR_TYPE_P (TREE_TYPE (vec_offset));
+ tree scale = size_int (gs_scale);
+
+ if (!strided)
+ {
+ gimple_seq stmts = NULL;
+ if (sign_change_p)
+ vec_offset = gimple_convert (&stmts, offset_vectype,
+ vec_offset);
+ if (scale_p)
+ {
+ vec_offset = gimple_build
+ (&stmts, LSHIFT_EXPR, offset_vectype,
+ vec_offset, size_int (exact_log2 (gs_scale)));
+ scale = size_int (1);
+ }
+
+ if (stmts)
+ gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+ }
if (ls.gs.ifn == IFN_MASK_LEN_GATHER_LOAD)
{
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 729fa822775..7dff4824984 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2596,6 +2596,11 @@ extern bool vect_check_gather_scatter (stmt_vec_info, loop_vec_info,
vec<int> * = nullptr);
extern void vect_describe_gather_scatter_call (stmt_vec_info,
gather_scatter_info *);
+
+bool gs_off_need_sign_change (vec_info *, internal_fn, tree,
+ tree, tree, int, tree*);
+bool gs_off_need_scale (internal_fn, tree,
+ tree, tree, int);
extern opt_result vect_find_stmt_data_reference (loop_p, gimple *,
vec<data_reference_p> *,
vec<int> *, int);
--
2.50.0