Hi,
Similar to the signed/unsigned patch before, this one relaxes the
gather/scatter restrictions on scale factors. The basic idea is that a
natively unsupported scale factor can still be reached by emitting a
multiplication before the actual gather/scatter operation. As before,
we need to make sure that there is no overflow when multiplying.
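To illustrate (a hypothetical target setup, not taken from the patch
itself): the loop below wants a gather of doubles, i.e. scale 8, but
suppose the target's gather only accepts byte offsets, i.e. scale 1.
We can then multiply the offset vector by 8 up front and gather with
scale 1 instead:

  /* Conceptually: gather (a, idx, scale=8) becomes
     gather (a, idx * 8, scale=1), which is only valid if idx * 8
     cannot overflow the offset type.  */
  double
  sum (double *restrict a, int *restrict idx, int n)
  {
    double s = 0.0;
    for (int i = 0; i < n; i++)
      s += a[idx[i]];
    return s;
  }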
The approach is similar to before, except that we have two more "passes"
that check the supported scale/offset-type combinations.
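Below is a rough standalone model of the resulting search order (the
names are made up for illustration; the real code additionally checks
offset precision, conversion support, and which IFN the target
provides):

  #include <stdio.h>

  struct config { int scale; int is_signed; };

  /* Pick the first config in priority order: exact scale + same sign,
     exact scale + swapped sign, smaller scale + same sign, smaller
     scale + swapped sign.  A smaller scale must divide the requested
     one so the missing factor can be multiplied into the offset.  */
  static int
  pick_config (const struct config *c, int n, int scale, int is_signed)
  {
    for (int pass = 0; pass < 4; pass++)
      for (int i = 0; i < n; i++)
        {
          int scale_ok = pass < 2
            ? c[i].scale == scale
            : c[i].scale < scale && scale % c[i].scale == 0;
          int sign_ok = (pass % 2 == 0) == (c[i].is_signed == is_signed);
          if (scale_ok && sign_ok)
            return i;
        }
    return -1;
  }

  int
  main (void)
  {
    /* A target that only supports scale 1 with unsigned offsets still
       satisfies a request for scale 8 with signed offsets (pass 4).  */
    struct config cfgs[] = { { 1, 0 } };
    printf ("picked config %d\n", pick_config (cfgs, 1, 8, 1));
    return 0;
  }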
Bootstrapped and regtested on x86 and power10. Regtested on aarch64 and
rv64gcv_zvl512b (with and without target support for scaling).
Regards
Robin
gcc/ChangeLog:
* tree-vect-data-refs.cc (struct gather_scatter_config):
Add scale.
(vect_gather_scatter_get_configs): Try various scales.
(vect_gather_scatter_fn_p): Add scale handling.
(vect_check_gather_scatter): Add scale parameter.
* tree-vect-stmts.cc (check_load_store_for_partial_vectors):
Ditto.
(vect_truncate_gather_scatter_offset): Ditto.
(vect_use_grouped_gather): Ditto.
(get_load_store_type): Ditto.
(vectorizable_store): Scale offset if necessary.
(vectorizable_load): Ditto.
* tree-vectorizer.h (struct vect_load_store_data): Add
supported_scale.
(vect_gather_scatter_fn_p): Add argument.
---
gcc/tree-vect-data-refs.cc | 181 +++++++++++++++++++++++++++++--------
gcc/tree-vect-stmts.cc | 71 ++++++++++++---
gcc/tree-vectorizer.h | 11 ++-
3 files changed, 210 insertions(+), 53 deletions(-)
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index fb2450a30c4..9c87cc1de6d 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -4431,6 +4431,7 @@ struct gather_scatter_config
{
internal_fn ifn;
tree offset_vectype;
+ int scale;
vec<int> elsvals;
};
@@ -4523,38 +4524,62 @@ vect_gather_scatter_get_configs (vec_info *vinfo, bool read_p, bool masked_p,
if (!offset_vectype)
continue;
- vec<int> elsvals = vNULL;
+ /* Try multiple scale values. Start with exact match, then try
+ smaller common scales that a target might support.  */
+ int scales_to_try[] = {scale, 1, 2, 4, 8};
- /* If we haven't determined which IFN is supported yet, try all three
- to find which one the target supports. */
- if (ifn == IFN_LAST)
+ for (unsigned int j = 0;
+ j < sizeof (scales_to_try) / sizeof (*scales_to_try);
+ j++)
{
- ifn = vect_gather_scatter_which_ifn (read_p, masked_p,
- vectype, memory_type,
- offset_vectype, scale, &elsvals);
- if (ifn != IFN_LAST)
+ int try_scale = scales_to_try[j];
+
+ /* Skip scales >= requested scale (except for exact match). */
+ if (j > 0 && try_scale >= scale)
+ continue;
+
+ /* Skip if requested scale is not a multiple of this scale. */
+ if (j > 0 && scale % try_scale != 0)
+ continue;
+
+ vec<int> elsvals = vNULL;
+
+ /* If we haven't determined which IFN is supported yet, try all three
+ to find which one the target supports. */
+ if (ifn == IFN_LAST)
{
- /* Found which IFN is supported. Save this configuration. */
- gather_scatter_config config;
- config.ifn = ifn;
- config.offset_vectype = offset_vectype;
- config.elsvals = elsvals;
- configs.safe_push (config);
+ ifn = vect_gather_scatter_which_ifn (read_p, masked_p,
+ vectype, memory_type,
+ offset_vectype, try_scale,
+ &elsvals);
+ if (ifn != IFN_LAST)
+ {
+ /* Found which IFN is supported. Save this configuration. */
+ gather_scatter_config config;
+ config.ifn = ifn;
+ config.offset_vectype = offset_vectype;
+ config.scale = try_scale;
+ config.elsvals = elsvals;
+ configs.safe_push (config);
+ }
}
- }
- else
- {
- /* We already know which IFN is supported, just check if this
- offset type works with it. */
- if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
- offset_vectype, scale,
- &elsvals))
+ else
{
- gather_scatter_config config;
- config.ifn = ifn;
- config.offset_vectype = offset_vectype;
- config.elsvals = elsvals;
- configs.safe_push (config);
+ /* We already know which IFN is supported, just check if this
+ offset type and scale work with it. */
+ if (internal_gather_scatter_fn_supported_p (ifn, vectype,
+ memory_type,
+ offset_vectype,
+ try_scale,
+ &elsvals))
+ {
+ gather_scatter_config config;
+ config.ifn = ifn;
+ config.offset_vectype = offset_vectype;
+ config.scale = try_scale;
+ config.elsvals = elsvals;
+ configs.safe_push (config);
+ }
}
}
}
@@ -4570,6 +4595,11 @@ vect_gather_scatter_get_configs (vec_info *vinfo, bool read_p, bool masked_p,
base address. If OFFSET_TYPE is scalar the function chooses an
appropriate vector type for it. SCALE is the amount by which the
offset should be multiplied *after* it has been converted to address width.
+ SCALE is the requested scale.  If the target does not support the
+ requested SCALE, SUPPORTED_SCALE will contain the scale that is
+ actually supported, which may be smaller, requiring an additional
+ multiplication of the offset before the gather/scatter.
+ Otherwise SUPPORTED_SCALE is 0.
Return true if the function is supported, storing the function id in
*IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.
@@ -4582,12 +4612,14 @@ vect_gather_scatter_get_configs (vec_info *vinfo, bool read_p, bool masked_p,
bool
vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
tree vectype, tree memory_type, tree offset_type,
- int scale, internal_fn *ifn_out,
+ int scale, int *supported_scale,
+ internal_fn *ifn_out,
tree *offset_vectype_out,
tree *supported_offset_vectype,
vec<int> *elsvals)
{
*supported_offset_vectype = NULL_TREE;
+ *supported_scale = 0;
unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
unsigned int element_bits = vector_element_bits (vectype);
if (element_bits != memory_bits)
@@ -4609,11 +4641,19 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
if (configs.is_empty ())
return false;
- /* First, try to find a configuration that matches our offset type
- (no conversion needed). */
+ /* Selection priority:
+ 1 - Exact scale match + offset type match
+ 2 - Exact scale match + sign-swapped offset
+ 3 - Smaller scale + offset type match
+ 4 - Smaller scale + sign-swapped offset
+ Within each category, prefer smaller offset types. */
+
+ /* First pass: exact scale match with no conversion. */
for (unsigned int i = 0; i < configs.length (); i++)
{
- if (TYPE_SIGN (configs[i].offset_vectype) == TYPE_SIGN (offset_vectype))
+ if (configs[i].scale == scale
+ && TYPE_SIGN (configs[i].offset_vectype)
+ == TYPE_SIGN (offset_vectype))
{
*ifn_out = configs[i].ifn;
*offset_vectype_out = configs[i].offset_vectype;
@@ -4623,19 +4663,77 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
}
}
- /* No direct match. This means we try to find a sign-swapped offset
- vectype. */
+ /* No direct match. This means we try to find either
+ - a sign-swapped offset vectype,
+ - a smaller scale and a 2x larger offset type, or
+ - a smaller scale and a larger sign-swapped offset vectype. */
unsigned int offset_precision = TYPE_PRECISION (TREE_TYPE (offset_vectype));
unsigned int needed_precision
= TYPE_UNSIGNED (offset_vectype) ? offset_precision * 2 : POINTER_SIZE;
needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
+ /* Second pass: exact scale match but with a sign-swapped offset
+ vectype. */
enum tree_code tmp;
for (unsigned int i = 0; i < configs.length (); i++)
{
unsigned int precision
= TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
- if (precision >= needed_precision
+ if (configs[i].scale == scale
+ && precision >= needed_precision
+ && (supportable_convert_operation (CONVERT_EXPR,
+ configs[i].offset_vectype,
+ offset_vectype, &tmp)
+ || (needed_precision == offset_precision
+ && tree_nop_conversion_p (configs[i].offset_vectype,
+ offset_vectype))))
+ {
+ *ifn_out = configs[i].ifn;
+ *offset_vectype_out = offset_vectype;
+ *supported_offset_vectype = configs[i].offset_vectype;
+ if (elsvals)
+ *elsvals = configs[i].elsvals;
+ return true;
+ }
+ }
+
+ /* Third pass: Try a smaller scale with the same signedness. */
+ needed_precision = offset_precision * 2;
+ needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
+
+ for (unsigned int i = 0; i < configs.length (); i++)
+ {
+ unsigned int precision
+ = TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
+ if (configs[i].scale < scale
+ && precision >= needed_precision
+ && (supportable_convert_operation (CONVERT_EXPR,
+ configs[i].offset_vectype,
+ offset_vectype, &tmp)
+ || (needed_precision == offset_precision
+ && tree_nop_conversion_p (configs[i].offset_vectype,
+ offset_vectype))))
+ {
+ *ifn_out = configs[i].ifn;
+ *offset_vectype_out = configs[i].offset_vectype;
+ *supported_scale = configs[i].scale;
+ if (elsvals)
+ *elsvals = configs[i].elsvals;
+ return true;
+ }
+ }
+
+ /* Fourth pass: Try a smaller scale and sign-swapped offset vectype. */
+ needed_precision
+ = TYPE_UNSIGNED (offset_vectype) ? offset_precision * 2 : POINTER_SIZE;
+ needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
+
+ for (unsigned int i = 0; i < configs.length (); i++)
+ {
+ unsigned int precision
+ = TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
+ if (configs[i].scale < scale
+ && precision >= needed_precision
&& (supportable_convert_operation (CONVERT_EXPR,
configs[i].offset_vectype,
offset_vectype, &tmp)
@@ -4646,6 +4744,7 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
*ifn_out = configs[i].ifn;
*offset_vectype_out = offset_vectype;
*supported_offset_vectype = configs[i].offset_vectype;
+ *supported_scale = configs[i].scale;
if (elsvals)
*elsvals = configs[i].elsvals;
return true;
@@ -4805,6 +4904,7 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
base = fold_convert (sizetype, base);
base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
+ int tmp_scale;
tree tmp_offset_vectype;
/* OFF at this point may be either a SSA_NAME or some tree expression
@@ -4878,14 +4978,16 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
&& !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
masked_p, vectype, memory_type,
signed_char_type_node,
- new_scale, &ifn,
+ new_scale, &tmp_scale,
+ &ifn,
&offset_vectype,
&tmp_offset_vectype,
elsvals)
&& !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
masked_p, vectype, memory_type,
unsigned_char_type_node,
- new_scale, &ifn,
+ new_scale, &tmp_scale,
+ &ifn,
&offset_vectype,
&tmp_offset_vectype,
elsvals))
@@ -4910,7 +5012,9 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
&& !POINTER_TYPE_P (TREE_TYPE (off))
&& vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
masked_p, vectype, memory_type,
- TREE_TYPE (off), scale, &ifn,
+ TREE_TYPE (off),
+ scale, &tmp_scale,
+ &ifn,
&offset_vectype,
&tmp_offset_vectype,
elsvals))
@@ -4966,7 +5070,8 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
if (use_ifn_p)
{
if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
- vectype, memory_type, offtype, scale,
+ vectype, memory_type, offtype,
+ scale, &tmp_scale,
&ifn, &offset_vectype,
&tmp_offset_vectype,
elsvals))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 84ba756a042..d153544640a 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1512,6 +1512,9 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
we chose a different one use this instead. */
if (ls->supported_offset_vectype)
off_vectype = ls->supported_offset_vectype;
+ /* Same for scale. */
+ if (ls->supported_scale)
+ scale = ls->supported_scale;
if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
memory_type,
@@ -1706,8 +1709,10 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree vectype,
no narrower than OFFSET_TYPE. */
tree memory_type = TREE_TYPE (DR_REF (dr));
tree tmp_offset_vectype;
+ int tmp_scale;
if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
- vectype, memory_type, offset_type, scale,
+ vectype, memory_type, offset_type,
+ scale, &tmp_scale,
&gs_info->ifn, &gs_info->offset_vectype,
&tmp_offset_vectype, elsvals)
|| gs_info->ifn == IFN_LAST)
@@ -1789,9 +1794,10 @@ vect_use_grouped_gather (dr_vec_info *dr_info, tree vectype,
not available we still have a strided load/store. */
bool ok = false;
tree tmp_vectype;
+ int tmp_scale;
if (vect_gather_scatter_fn_p
(loop_vinfo, DR_IS_READ (dr), masked_p, *pun_vectype,
- TREE_TYPE (*pun_vectype), *pun_vectype, 1, &ifn,
+ TREE_TYPE (*pun_vectype), *pun_vectype, 1, &tmp_scale, &ifn,
&offset_vectype, &tmp_vectype, elsvals))
ok = true;
else if (internal_strided_fn_supported_p (strided_ifn, *pun_vectype,
@@ -2091,6 +2097,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
bool *slp_perm = &ls->slp_perm;
unsigned *n_perms = &ls->n_perms;
tree *supported_offset_vectype = &ls->supported_offset_vectype;
+ int *supported_scale = &ls->supported_scale;
loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
@@ -2164,7 +2171,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
tree tem;
if (vect_gather_scatter_fn_p (loop_vinfo, vls_type == VLS_LOAD,
masked_p, vectype, memory_type,
- offset_vectype, scale,
+ offset_vectype, scale, supported_scale,
&ls->gs.ifn, &tem,
supported_offset_vectype, elsvals))
{
@@ -2179,6 +2186,10 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
dump_printf_loc (MSG_NOTE, vect_location,
" target supports offset type %T.\n",
*supported_offset_vectype);
+ if (*supported_scale)
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " target supports offset scale %d.\n",
+ *supported_scale);
}
*memory_access_type = VMAT_GATHER_SCATTER_IFN;
}
@@ -2455,7 +2466,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
gcc_assert (vect_gather_scatter_fn_p
(loop_vinfo, vls_type == VLS_LOAD, masked_p, vectype,
gs_info.memory_type, TREE_TYPE (gs_info.offset),
- gs_info.scale, &gs_info.ifn,
+ gs_info.scale, supported_scale, &gs_info.ifn,
&tmp, supported_offset_vectype, elsvals));
SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
@@ -8853,6 +8864,10 @@ vectorizable_store (vec_info *vinfo,
inside_cost
+= record_stmt_cost (cost_vec, 1, vector_stmt,
slp_node, 0, vect_body);
+ if (ls.supported_scale)
+ inside_cost
+ += record_stmt_cost (cost_vec, 1, vector_stmt,
+ slp_node, 0, vect_body);
unsigned int cnunits = vect_nunits_for_cost (vectype);
inside_cost
@@ -8867,12 +8882,26 @@ vectorizable_store (vec_info *vinfo,
tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
bool strided = !VECTOR_TYPE_P (TREE_TYPE (vec_offset));
- /* Perform the offset conversion if necessary. */
- if (!strided && ls.supported_offset_vectype)
+ /* Perform the offset conversion and scaling if necessary. */
+ if (!strided
+ && (ls.supported_offset_vectype || ls.supported_scale))
{
gimple_seq stmts = NULL;
- vec_offset = gimple_convert
- (&stmts, ls.supported_offset_vectype, vec_offset);
+ if (ls.supported_offset_vectype)
+ vec_offset = gimple_convert
+ (&stmts, ls.supported_offset_vectype, vec_offset);
+ if (ls.supported_scale)
+ {
+ tree mult_cst = build_int_cst
+ (TREE_TYPE (TREE_TYPE (vec_offset)),
+ SLP_TREE_GS_SCALE (slp_node) / ls.supported_scale);
+ tree mult = build_vector_from_val
+ (TREE_TYPE (vec_offset), mult_cst);
+ vec_offset = gimple_build
+ (&stmts, MULT_EXPR, TREE_TYPE (vec_offset),
+ vec_offset, mult);
+ scale = size_int (ls.supported_scale);
+ }
gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
}
@@ -10694,6 +10723,10 @@ vectorizable_load (vec_info *vinfo,
inside_cost
+= record_stmt_cost (cost_vec, 1, vector_stmt,
slp_node, 0, vect_body);
+ if (ls.supported_scale)
+ inside_cost
+ += record_stmt_cost (cost_vec, 1, vector_stmt,
+ slp_node, 0, vect_body);
unsigned int cnunits = vect_nunits_for_cost (vectype);
inside_cost
@@ -10707,12 +10740,26 @@ vectorizable_load (vec_info *vinfo,
tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
bool strided = !VECTOR_TYPE_P (TREE_TYPE (vec_offset));
- /* Perform the offset conversion if necessary. */
- if (!strided && ls.supported_offset_vectype)
+ /* Perform the offset conversion and scaling if necessary. */
+ if (!strided
+ && (ls.supported_offset_vectype || ls.supported_scale))
{
gimple_seq stmts = NULL;
- vec_offset = gimple_convert
- (&stmts, ls.supported_offset_vectype, vec_offset);
+ if (ls.supported_offset_vectype)
+ vec_offset = gimple_convert
+ (&stmts, ls.supported_offset_vectype, vec_offset);
+ if (ls.supported_scale)
+ {
+ tree mult_cst = build_int_cst
+ (TREE_TYPE (TREE_TYPE (vec_offset)),
+ SLP_TREE_GS_SCALE (slp_node) / ls.supported_scale);
+ tree mult = build_vector_from_val
+ (TREE_TYPE (vec_offset), mult_cst);
+ vec_offset = gimple_build
+ (&stmts, MULT_EXPR, TREE_TYPE (vec_offset),
+ vec_offset, mult);
+ scale = size_int (ls.supported_scale);
+ }
gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index a49fb9cb1ad..70dea5a6ad6 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -290,9 +290,14 @@ struct vect_load_store_data : vect_data {
tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
tree ls_type; // VMAT_GATHER_SCATTER_IFN
/* This is set to a supported offset vector type if we don't support the
- originally requested offset type. In that case there will be an
- additional offset conversion before the gather/scatter. */
+ originally requested offset type, otherwise NULL.
+ If non-NULL there will be an additional offset conversion before
+ the gather/scatter. */
tree supported_offset_vectype; // VMAT_GATHER_SCATTER_IFN
+ /* Similar for scale. Only nonzero if we don't support the requested
+ scale. Then we need to multiply the offset vector before the
+ gather/scatter. */
+ int supported_scale; // VMAT_GATHER_SCATTER_IFN
auto_vec<int> elsvals;
/* True if the load requires a load permutation. */
bool slp_perm; // SLP_TREE_LOAD_PERMUTATION
@@ -2596,7 +2601,7 @@ extern bool vect_slp_analyze_instance_alignment (vec_info *, slp_instance);
extern opt_result vect_analyze_data_ref_accesses (vec_info *, vec<int> *);
extern opt_result vect_prune_runtime_alias_test_list (loop_vec_info);
extern bool vect_gather_scatter_fn_p (vec_info *, bool, bool, tree, tree,
- tree, int, internal_fn *, tree *,
+ tree, int, int *, internal_fn *, tree *,
tree *, vec<int> * = nullptr);
extern bool vect_check_gather_scatter (stmt_vec_info, tree,
loop_vec_info, gather_scatter_info *,
--
2.51.0