Hi,
This patch adds gather/scatter handling for grouped accesses. The idea is
to replace, for example, an access pattern (for uint8_t elements) like
arr[0]
arr[1]
arr[2]
arr[3]
arr[0 + step]
arr[1 + step]
...
with a uint32_t gather load from
arr[0]
arr[0 + step * 1]
arr[0 + step * 2]
...
where the offset vector is a simple series {0, step, 2 * step, ...}.
If the target supports it, such a gather can be implemented as a strided load.
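For illustration, a loop of the following shape exhibits the pattern
(just a sketch with made-up names, not code from the patch; the included
pr118019-2.c test exercises the same idea and checks for vlse32 strided
loads on riscv):

  #include <stdint.h>

  void
  copy_groups (uint8_t *restrict dst, uint8_t *restrict src,
               int step, int n)
  {
    for (int i = 0; i < n; i++, src += step, dst += 4)
      {
        /* Four consecutive uint8_t lanes per iteration, with the groups
           "step" bytes apart.  Instead of four byte loads, each group
           can be loaded as one uint32_t via a gather or strided load.  */
        dst[0] = src[0];
        dst[1] = src[1];
        dst[2] = src[2];
        dst[3] = src[3];
      }
  }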
Bootstrapped on x86 and power10. Regtested on aarch64 and riscv.
Regards
Robin
PR target/118019
gcc/ChangeLog:
* internal-fn.cc (get_supported_else_vals): Exit at invalid
index.
(internal_strided_fn_supported_p): New function.
* internal-fn.h (internal_strided_fn_supported_p): Declare.
* tree-vect-data-refs.cc (vect_supportable_dr_alignment):
Assume a packed access if the data reference's type is smaller
than the vectype's element type.
* tree-vect-stmts.cc (vect_get_punning_vectype): New function.
(vect_use_grouped_gather): New function.
(get_load_store_type): Call new function.
(vectorizable_store): Use punned vectype.
(vectorizable_load): Ditto.
* tree-vectorizer.h (struct vect_load_store_data): Add punned
vectype.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/pr118019-2.c: New test.
---
gcc/internal-fn.cc | 22 +-
gcc/internal-fn.h | 2 +
.../gcc.target/riscv/rvv/autovec/pr118019-2.c | 50 +++++
gcc/tree-vect-data-refs.cc | 4 +-
gcc/tree-vect-stmts.cc | 207 ++++++++++++++++--
gcc/tree-vectorizer.h | 1 +
6 files changed, 270 insertions(+), 16 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index bf2fac81807..db396c69ec5 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -5234,7 +5234,7 @@ get_supported_else_vals (enum insn_code icode, unsigned else_index,
vec<int> &else_vals)
{
const struct insn_data_d *data = &insn_data[icode];
- if ((char)else_index >= data->n_operands)
+ if ((int)else_index >= data->n_operands || (int)else_index == -1)
return;
machine_mode else_mode = data->operand[else_index].mode;
@@ -5309,6 +5309,26 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
return ok;
}
+/* Return true if the target supports a strided load/store function IFN
+ with VECTOR_TYPE. If supported and ELSVALS is nonzero the supported else
+ values will be added to the vector ELSVALS points to. */
+
+bool
+internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
+ vec<int> *elsvals)
+{
+ machine_mode mode = TYPE_MODE (vector_type);
+ optab optab = direct_internal_fn_optab (ifn);
+ insn_code icode = direct_optab_handler (optab, mode);
+
+ bool ok = icode != CODE_FOR_nothing;
+
+ if (ok && elsvals)
+ get_supported_else_vals (icode, internal_fn_else_index (ifn), *elsvals);
+
+ return ok;
+}
+
/* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
for pointers of type TYPE when the accesses have LENGTH bytes and their
common byte alignment is ALIGN. */
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index fd21694dfeb..dcb707251f8 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -246,6 +246,8 @@ extern int internal_fn_alias_ptr_index (internal_fn fn);
extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
tree, tree, int,
vec<int> * = nullptr);
+extern bool internal_strided_fn_supported_p (internal_fn, tree,
+ vec<int> * = nullptr);
extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
poly_uint64, unsigned int);
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
new file mode 100644
index 00000000000..d3436b78377
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv_zvl512b -mabi=lp64d -mno-vector-strict-align" } */
+
+/* Ensure we use strided loads. */
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
+ int t0 = s0 + s1;\
+ int t1 = s0 - s1;\
+ int t2 = s2 + s3;\
+ int t3 = s2 - s3;\
+ d0 = t0 + t2;\
+ d2 = t0 - t2;\
+ d1 = t1 + t3;\
+ d3 = t1 - t3;\
+}
+
+uint32_t
+abs2 (uint32_t a)
+{
+ uint32_t s = ((a >> 15) & 0x10001) * 0xffff;
+ return (a + s) ^ s;
+}
+
+int
+x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
+{
+ uint32_t tmp[4][4];
+ uint32_t a0, a1, a2, a3;
+ int sum = 0;
+ for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+ a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+ a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+ a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+ HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
+ }
+ for (int i = 0; i < 4; i++)
+ {
+ HADAMARD4 (a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+ sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
+ }
+ return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
+}
+
+/* { dg-final { scan-assembler-times "vlse32" 4 } } */
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index e451b72e07e..be03495c6b9 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -6640,7 +6640,9 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
bool is_packed = false;
tree type = TREE_TYPE (DR_REF (dr));
if (misalignment == DR_MISALIGNMENT_UNKNOWN)
- is_packed = not_size_aligned (DR_REF (dr));
+ is_packed = not_size_aligned (DR_REF (dr))
+ || tree_to_uhwi (TYPE_SIZE (type))
+ < tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
is_packed,
is_gather_scatter))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 5b1f291fa8d..619b99083a3 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1723,6 +1723,124 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree vectype,
return false;
}
+/* Check if there is an integer vector type with the same size as VECTYPE but
+ NELTS units of size (TYPE_SIZE (VECTYPE) / NELTS). If so, return the
+ appropriate type. */
+
+static tree
+vect_get_punning_vectype (tree vectype, int nelts)
+{
+ gcc_assert (VECTOR_TYPE_P (vectype));
+
+ machine_mode vmode = TYPE_MODE (vectype);
+ if (!VECTOR_MODE_P (vmode))
+ return NULL_TREE;
+
+ poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
+ unsigned int pbsize;
+ scalar_int_mode elmode;
+ if (constant_multiple_p (vbsize, nelts, &pbsize)
+ && (int_mode_for_mode (SCALAR_TYPE_MODE
+ (TREE_TYPE (vectype))).exists (&elmode)))
+ {
+ machine_mode rmode;
+ if (int_mode_for_size (pbsize, 0).exists (&elmode)
+ && tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype))) * nelts
+ == GET_MODE_SIZE (elmode)
+ && related_vector_mode (vmode, elmode, nelts).exists (&rmode))
+ {
+ tree ptype = build_nonstandard_integer_type (pbsize, 1);
+ return build_vector_type (ptype, nelts);
+ }
+ }
+
+ return NULL_TREE;
+}
+
+/* Return true if we can use gather/scatter or strided internal functions
+ to vectorize STMT_INFO, which is a grouped or strided load or store
+ with multiple lanes and will be implemented by a type-punned access
+ of a vector with element size that matches the number of lanes.
+
+ MASKED_P is true if load or store is conditional.
+ When returning true, fill in GS_INFO with the information required to
+ perform the operation. Also, store the punning type in PUNNED_VECTYPE.
+
+ If successful and ELSVALS is nonzero the supported
+ else values will be stored in the vector ELSVALS points to. */
+
+static bool
+vect_use_grouped_gather (stmt_vec_info stmt_info, tree vectype,
+ loop_vec_info loop_vinfo, bool masked_p,
+ unsigned int nelts,
+ gather_scatter_info *info, vec<int> *elsvals,
+ tree *pun_vectype)
+{
+ dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
+ data_reference *dr = dr_info->dr;
+
+ /* TODO: We could support nelts > BITS_PER_UNIT or a non-power-of-two
+ NELTS by emitting multiple gathers/scatters. */
+ if (nelts > BITS_PER_UNIT || !pow2p_hwi (nelts))
+ return false;
+
+ /* Pun the vectype with one of the same size but an element spanning
+ NELTS elements of VECTYPE.
+ The punned type of a V16QI with NELTS = 4 would be V4SI.
+ */
+ *pun_vectype = vect_get_punning_vectype (vectype, nelts);
+
+ if (!*pun_vectype)
+ return false;
+
+ internal_fn ifn;
+ tree offset_vectype = *pun_vectype;
+
+ internal_fn strided_ifn = DR_IS_READ (dr)
+ ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
+
+ /* Check if we have a gather/scatter with the new type, trying the type
+ itself as the offset type for now. If not, check if we have a strided
+ load/store. These have fewer constraints (for example, no offset type
+ needs to exist), so a strided load/store may be available even when a
+ gather/scatter is not. */
+ bool ok = false;
+ if (vect_gather_scatter_fn_p
+ (loop_vinfo, DR_IS_READ (dr), masked_p, *pun_vectype,
+ TREE_TYPE (*pun_vectype), *pun_vectype, 1, &ifn,
+ &offset_vectype, elsvals))
+ ok = true;
+ else if (internal_strided_fn_supported_p (strided_ifn, *pun_vectype,
+ elsvals))
+ {
+ /* Use gather/scatter IFNs, vect_get_strided_load_store_ops
+ will switch back to the strided variants. */
+ ifn = DR_IS_READ (dr) ? IFN_MASK_LEN_GATHER_LOAD :
+ IFN_MASK_LEN_SCATTER_STORE;
+ ok = true;
+ }
+
+ if (ok)
+ {
+ info->ifn = ifn;
+ info->decl = NULL_TREE;
+ info->base = dr->ref;
+ info->alias_ptr = build_int_cst
+ (reference_alias_ptr_type (DR_REF (dr)),
+ get_object_alignment (DR_REF (dr)));
+ info->element_type = TREE_TYPE (vectype);
+ info->offset_vectype = offset_vectype;
+ /* No need to set the offset, vect_get_strided_load_store_ops
+ will do that. */
+ info->scale = 1;
+ info->memory_type = TREE_TYPE (DR_REF (dr));
+ return true;
+ }
+
+ return false;
+}
+
+
/* Return true if we can use gather/scatter internal functions to
vectorize STMT_INFO, which is a grouped or strided load or store.
MASKED_P is true if load or store is conditional. When returning
@@ -1978,6 +2096,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
int *misalignment = &ls->misalignment;
internal_fn *lanes_ifn = &ls->lanes_ifn;
vec<int> *elsvals = &ls->elsvals;
+ tree *pun_vectype = &ls->pun_vectype;
loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
@@ -1989,6 +2108,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
*misalignment = DR_MISALIGNMENT_UNKNOWN;
*poffset = 0;
+ *pun_vectype = NULL_TREE;
if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
{
@@ -2321,13 +2441,13 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
if ((*memory_access_type == VMAT_ELEMENTWISE
|| *memory_access_type == VMAT_STRIDED_SLP)
&& !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
- && SLP_TREE_LANES (slp_node) == 1
&& loop_vinfo)
{
gather_scatter_info gs_info;
- if (vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
- masked_p, &gs_info, elsvals,
- group_size, single_element_p))
+ if (SLP_TREE_LANES (slp_node) == 1
+ && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
+ masked_p, &gs_info, elsvals,
+ group_size, single_element_p))
{
SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
SLP_TREE_GS_BASE (slp_node) = error_mark_node;
@@ -2335,6 +2455,28 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
ls->strided_offset_vectype = gs_info.offset_vectype;
*memory_access_type = VMAT_GATHER_SCATTER_IFN;
}
+ else if (SLP_TREE_LANES (slp_node) > 1
+ && vect_use_grouped_gather (stmt_info, vectype, loop_vinfo,
+ masked_p, SLP_TREE_LANES (slp_node),
+ &gs_info, elsvals, pun_vectype))
+ {
+ int puntype_misalignment = dr_misalignment
+ (first_dr_info, *pun_vectype, *poffset);
+ dr_alignment_support puntype_alignment_scheme
+ = vect_supportable_dr_alignment
+ (vinfo, first_dr_info, *pun_vectype, puntype_misalignment,
+ true);
+
+ if (puntype_alignment_scheme == dr_aligned
+ || puntype_alignment_scheme == dr_unaligned_supported)
+ {
+ SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
+ SLP_TREE_GS_BASE (slp_node) = error_mark_node;
+ ls->gs.ifn = gs_info.ifn;
+ ls->strided_offset_vectype = gs_info.offset_vectype;
+ *memory_access_type = VMAT_GATHER_SCATTER_IFN;
+ }
+ }
}
if (*memory_access_type == VMAT_CONTIGUOUS_DOWN
@@ -2351,14 +2493,15 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
}
else
{
+ tree vtype = ls->pun_vectype ? ls->pun_vectype : vectype;
if (mat_gather_scatter_p (*memory_access_type)
&& !first_dr_info)
*misalignment = DR_MISALIGNMENT_UNKNOWN;
else
- *misalignment = dr_misalignment (first_dr_info, vectype, *poffset);
+ *misalignment = dr_misalignment (first_dr_info, vtype, *poffset);
*alignment_support_scheme
= vect_supportable_dr_alignment
- (vinfo, first_dr_info, vectype, *misalignment,
+ (vinfo, first_dr_info, vtype, *misalignment,
mat_gather_scatter_p (*memory_access_type));
}
@@ -8364,10 +8507,13 @@ vectorizable_store (vec_info *vinfo,
{
aggr_type = elem_type;
if (!costing_p)
- vect_get_strided_load_store_ops (stmt_info, slp_node, vectype,
- ls.strided_offset_vectype,
- loop_vinfo, gsi,
- &bump, &vec_offset, loop_lens);
+ {
+ tree vtype = ls.pun_vectype ? ls.pun_vectype : vectype;
+ vect_get_strided_load_store_ops (stmt_info, slp_node, vtype,
+ ls.strided_offset_vectype,
+ loop_vinfo, gsi,
+ &bump, &vec_offset, loop_lens);
+ }
}
else
{
@@ -8553,7 +8699,9 @@ vectorizable_store (vec_info *vinfo,
if (mat_gather_scatter_p (memory_access_type))
{
- gcc_assert (!grouped_store);
+ gcc_assert (!grouped_store || ls.pun_vectype);
+ if (ls.pun_vectype)
+ vectype = ls.pun_vectype;
auto_vec<tree> vec_offsets;
unsigned int inside_cost = 0, prologue_cost = 0;
int num_stmts = vec_num;
@@ -8600,8 +8748,9 @@ vectorizable_store (vec_info *vinfo,
if (mask_node)
vec_mask = vec_masks[j];
/* We should have catched mismatched types earlier. */
- gcc_assert (useless_type_conversion_p (vectype,
- TREE_TYPE (vec_oprnd)));
+ gcc_assert (ls.pun_vectype
+ || useless_type_conversion_p
+ (vectype, TREE_TYPE (vec_oprnd)));
}
tree final_mask = NULL_TREE;
tree final_len = NULL_TREE;
@@ -8654,6 +8803,18 @@ vectorizable_store (vec_info *vinfo,
}
}
+ if (ls.pun_vectype)
+ {
+ gimple *conv_stmt
+ = gimple_build_assign (make_ssa_name (vectype),
+ VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR, vectype,
+ vec_oprnd));
+ vect_finish_stmt_generation (vinfo, stmt_info, conv_stmt,
+ gsi);
+ vec_oprnd = gimple_get_lhs (conv_stmt);
+ }
+
gcall *call;
if (final_len && final_mask)
{
@@ -10413,7 +10574,14 @@ vectorizable_load (vec_info *vinfo,
if (mat_gather_scatter_p (memory_access_type))
{
- gcc_assert (!grouped_load && !slp_perm);
+ gcc_assert ((!grouped_load && !slp_perm) || ls.pun_vectype);
+
+ /* If we pun the original vectype, the loads as well as costing, length
+ handling, etc. are performed with the new type. After loading we
+ VIEW_CONVERT the data to the original vectype. */
+ tree original_vectype = vectype;
+ if (ls.pun_vectype)
+ vectype = ls.pun_vectype;
/* 1. Create the vector or array pointer update chain. */
if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
@@ -10754,6 +10922,17 @@ vectorizable_load (vec_info *vinfo,
new_temp = new_temp2;
}
+ if (ls.pun_vectype)
+ {
+ new_stmt = gimple_build_assign (make_ssa_name
+ (original_vectype),
+ VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ original_vectype,
+ new_temp));
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+ }
+
/* Store vector loads in the corresponding SLP_NODE. */
slp_node->push_vec_def (new_stmt);
}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index df805c6ade9..c7533d7a35b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -287,6 +287,7 @@ struct vect_load_store_data : vect_data {
tree decl; // VMAT_GATHER_SCATTER_DECL
} gs;
tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
+ tree pun_vectype; // VMAT_GATHER_SCATTER_IFN
auto_vec<int> elsvals;
};
--
2.51.0