On Tue, May 27, 2025 at 6:04 PM Robin Dapp <[email protected]> wrote:
>
> From: Robin Dapp <[email protected]>
>
> This patch enables strided loads for VMAT_STRIDED_SLP. Instead of
> building vectors from scalars or other vectors we can use strided loads
> directly when applicable.
>
> The current implementation limits strided loads to cases where we can
> load entire groups and not subsets of them. A future improvement would
> be to e.g. load a group of three uint8_t
>
> g0 g1 g2, g0 + stride g1 + stride g2 + stride, ...
>
> by
>
> vlse16 vlse8
>
> and permute those into place (after re-interpreting them as a vector of
> uint8_t).
>
> For satd_8x4 in particular we can do even better by eliding the strided
> SLP load permutations, essentially turning
>
> vlse64 v0, (a0)
> vlse64 v1, (a1)
> VEC_PERM_EXPR <v0, v1, { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25,
> 26, 27 }>;
> VEC_PERM_EXPR <v0, v1, { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29,
> 30, 31 }>;
>
> into
>
> vlse32 v0, (a0)
> vlse32 v1, (a1)
> vlse32 v0, 4(a0)
> vlse32 v1, 4(a1)
>
> but that is going to be a follow up.
>
> Bootstrapped and regtested on x86, aarch64, and power10.
> Regtested on rv64gcv_zvl512b. I'm seeing one additional failure in
> gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
> where we use a larger LMUL than we should but IMHO this can wait.
>
> PR target/118109
>
> gcc/ChangeLog:
>
> * internal-fn.cc (internal_strided_fn_supported_p): New
> function.
> * internal-fn.h (internal_strided_fn_supported_p): Declare.
> * tree-vect-stmts.cc (vect_supportable_strided_type): New
> function.
> (vectorizable_load): Add strided-load support for strided
> groups.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/pr118019-2.c: New test.
> ---
> gcc/internal-fn.cc | 21 ++
> gcc/internal-fn.h | 2 +
> .../gcc.target/riscv/rvv/autovec/pr118019-2.c | 51 +++++
> gcc/tree-vect-stmts.cc | 196 +++++++++++++++---
> 4 files changed, 243 insertions(+), 27 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 6b04443f7cd..aec90ef87cc 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -5203,6 +5203,27 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
> return ok;
> }
>
> +/* Return true if the target supports a strided load/store function IFN
> + with VECTOR_TYPE. If supported and ELSVALS is nonzero the supported else
> + values will be added to the vector ELSVALS points to. */
> +
> +bool
> +internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
> + vec<int> *elsvals)
> +{
Can we add an assert here for the 'ifn' values we expect?
IFN_MASK_LEN_STRIDED_LOAD and IFN_MASK_LEN_STRIDED_STORE, I guess?
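Something along these lines, perhaps (just a sketch, assuming those two
IFNs are the only expected callers):

  gcc_assert (ifn == IFN_MASK_LEN_STRIDED_LOAD
              || ifn == IFN_MASK_LEN_STRIDED_STORE);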
> + machine_mode mode = TYPE_MODE (vector_type);
> + optab optab = direct_internal_fn_optab (ifn);
> + insn_code icode = direct_optab_handler (optab, mode);
> +
> + bool ok = icode != CODE_FOR_nothing;
> +
> + if (ok && elsvals)
> + get_supported_else_vals
> + (icode, internal_fn_else_index (IFN_MASK_LEN_STRIDED_LOAD), *elsvals);
shouldn't you use 'ifn' here instead of IFN_MASK_LEN_STRIDED_LOAD?
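I.e. something like

    get_supported_else_vals
      (icode, internal_fn_else_index (ifn), *elsvals);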
> +
> + return ok;
> +}
> +
> /* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
> for pointers of type TYPE when the accesses have LENGTH bytes and their
> common byte alignment is ALIGN. */
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index afd4f8e64c7..7d386246a42 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -242,6 +242,8 @@ extern int internal_fn_stored_value_index (internal_fn);
> extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
> tree, tree, int,
> vec<int> * = nullptr);
> +extern bool internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
> + vec<int> *elsvals);
> extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
> poly_uint64, unsigned int);
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> new file mode 100644
> index 00000000000..9918d4d7f52
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=rv64gcv_zvl512b -mabi=lp64d -mno-vector-strict-align" } */
> +
> +/* Ensure we use strided loads. */
> +
> +typedef unsigned char uint8_t;
> +typedef unsigned short uint16_t;
> +typedef unsigned int uint32_t;
> +
> +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3)                          \
> +  {                                                                        \
> +    int t0 = s0 + s1;                                                      \
> +    int t1 = s0 - s1;                                                      \
> +    int t2 = s2 + s3;                                                      \
> +    int t3 = s2 - s3;                                                      \
> +    d0 = t0 + t2;                                                          \
> +    d2 = t0 - t2;                                                          \
> +    d1 = t1 + t3;                                                          \
> +    d3 = t1 - t3;                                                          \
> +  }
> +
> +uint32_t
> +abs2 (uint32_t a)
> +{
> + uint32_t s = ((a >> 15) & 0x10001) * 0xffff;
> + return (a + s) ^ s;
> +}
> +
> +int
> +x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
> +{
> + uint32_t tmp[4][4];
> + uint32_t a0, a1, a2, a3;
> + int sum = 0;
> + for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
> + {
> + a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
> + a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
> + a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
> + a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
> + HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
> + }
> + for (int i = 0; i < 4; i++)
> + {
> + HADAMARD4 (a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
> + sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
> + }
> + return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
> +}
> +
> +/* { dg-final { scan-assembler-times "vlse64" 8 } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 3710694ac75..090169d9cb5 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -10165,6 +10165,46 @@ hoist_defs_of_uses (gimple *stmt, class loop *loop, bool hoist_p)
> return true;
> }
>
> +/* Check if there is a vector type with the same number of elements as
> + VECTYPE but NELTS units.
> + If so, set VTYPE to the appropriate type and
> + return true if the target supports a strided load/store for the respective
> + mode. Return false otherwise. */
It looks like the function looks for a vector type with the same element
type as VECTYPE but a _different_ number of elements, NELTS, no?
So it combines two things.  I think

  get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
                                       TREE_TYPE (vectype), nelts)

does exactly this first part?  Alternatively you can use
get_vectype_for_scalar_type (vinfo, TREE_TYPE (vectype), nelts).
I'd hate to add a third variant.
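For the type lookup that would be roughly (an untested sketch, keeping the
patch's pbsize computation and out-parameter name):

  tree ptype = build_nonstandard_integer_type (pbsize, 1);
  *vtype = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
                                                ptype, nelts);
  if (*vtype && internal_strided_fn_supported_p (ifn, *vtype, elsvals))
    return true;

which would also do away with the manual related_vector_mode /
build_vector_type dance.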
> +static bool
> +vect_supportable_strided_type (vec_load_store_type vls_type,
> + tree vectype, int nelts,
> + tree *vtype,
> + vec<int> *elsvals)
> +{
> + gcc_assert (VECTOR_TYPE_P (vectype));
> + gcc_assert (TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
Why this assert?
> +
> + machine_mode vmode = TYPE_MODE (vectype);
> + if (!VECTOR_MODE_P (vmode))
> + return NULL_TREE;
> +
> + internal_fn ifn = vls_type == VLS_LOAD
> + ? IFN_MASK_LEN_STRIDED_LOAD
> + : IFN_MASK_LEN_STRIDED_STORE;
> +
> + poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
> + unsigned int pbsize;
> + if (constant_multiple_p (vbsize, nelts, &pbsize))
> + {
> + scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
> + machine_mode rmode;
> + if (int_mode_for_size (pbsize, 0).exists (&elmode)
> + && related_vector_mode (vmode, elmode, nelts).exists (&rmode))
> + {
> + tree ptype = build_nonstandard_integer_type (pbsize, 1);
> + *vtype = build_vector_type (ptype, nelts);
> + if (internal_strided_fn_supported_p (ifn, *vtype, elsvals))
> + return true;
> + }
> + }
> +
> + return false;
> +}
> +
> /* vectorizable_load.
>
> Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure)
> @@ -10669,6 +10709,8 @@ vectorizable_load (vec_info *vinfo,
> tree running_off;
> vec<constructor_elt, va_gc> *v = NULL;
> tree stride_base, stride_step, alias_off;
> + bool strided_load_ok_p = false;
> + tree stride_step_signed = NULL_TREE;
> /* Checked by get_load_store_type. */
> unsigned int const_nunits = nunits.to_constant ();
> unsigned HOST_WIDE_INT cst_offset = 0;
> @@ -10744,15 +10786,34 @@ vectorizable_load (vec_info *vinfo,
> stride_step = cse_and_gimplify_to_preheader (loop_vinfo,
> stride_step);
> }
>
> + tree stride_step_full = NULL_TREE;
> + auto_vec<tree> dr_chain;
> +
> + /* For SLP permutation support we need to load the whole group,
> + not only the number of vector stmts the permutation result
> + fits in. */
> + if (slp_perm)
> + {
> + /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
> + variable VF. */
> + unsigned int const_vf = vf.to_constant ();
> + ncopies = CEIL (group_size * const_vf, const_nunits);
> + dr_chain.create (ncopies);
> + }
> + else
> + ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> +
> + /* Initialize for VMAT_ELEMENTWISE i.e. when we must load each element
> + separately. */
> running_off = offvar;
> alias_off = build_int_cst (ref_type, 0);
> int nloads = const_nunits;
> int lnel = 1;
> tree ltype = TREE_TYPE (vectype);
> tree lvectype = vectype;
> - auto_vec<tree> dr_chain;
> if (memory_access_type == VMAT_STRIDED_SLP)
> {
> + tree strided_vtype;
> HOST_WIDE_INT n = gcd (group_size, const_nunits);
> /* Use the target vector type if the group size is a multiple
> of it. */
> @@ -10781,9 +10842,42 @@ vectorizable_load (vec_info *vinfo,
> misalignment = mis_align;
> }
> }
> + /* Instead of loading individual vector elements and
> + constructing a larger vector from them we can use
> + a strided load directly.
> + ??? For non-power-of-two groups we could build the
> + group from smaller element sizes and permute them
> + into place afterwards instead of relying on a more
> + rigid vec_init. */
> + else if (n > 1 && n == group_size
> + && vect_supportable_strided_type
> + (VLS_LOAD, vectype, const_nunits / n,
> + &strided_vtype, &elsvals))
So I do wonder how this interacts with vector_vector_composition_type.
In fact the difference is that for a strided load we know the composition
happens as part of the load itself.  So how about extending that function
instead: pass it VLS_LOAD/STORE and also consider strided loads as a
composition kind there?  This would avoid duplication, and I think at
least some cases of non-power-of-two groups would already be handled
that way (permuting out gaps).
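Something like the following (sketch only; the extra parameters and the
candidate type name are invented here):

  static tree
  vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
                                  vec_load_store_type vls_type,
                                  vec<int> *elsvals)
  {
    ...
    /* Besides the existing vector-vector and VnQI compositions also
       accept a type the target can load/store in a strided fashion.  */
    if (internal_strided_fn_supported_p (vls_type == VLS_LOAD
                                         ? IFN_MASK_LEN_STRIDED_LOAD
                                         : IFN_MASK_LEN_STRIDED_STORE,
                                         candidate_vtype, elsvals))
      return candidate_vtype;
    ...
  }

vectorizable_load would then just use whatever type comes back instead
of growing a separate strided special case here.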
> + {
> + dr_alignment_support dr_align = dr_aligned;
> + int mis_align = 0;
> + mis_align = dr_misalignment (first_dr_info,
> + strided_vtype);
> + dr_align
> + = vect_supportable_dr_alignment (vinfo, dr_info,
> + strided_vtype,
> + mis_align);
> + if (dr_align == dr_aligned
> + || dr_align == dr_unaligned_supported)
> + {
> + nloads = 1;
> + lnel = const_nunits;
> + lvectype = strided_vtype;
> + ltype = TREE_TYPE (strided_vtype);
> + alignment_support_scheme = dr_align;
> + misalignment = mis_align;
> +
> + strided_load_ok_p = true;
> + }
> + }
> /* Else use the biggest vector we can load the group without
> accessing excess elements. */
> - else if (n > 1)
> + else if (!strided_load_ok_p && n > 1)
> {
> tree ptype;
> tree vtype
> @@ -10829,19 +10923,35 @@ vectorizable_load (vec_info *vinfo,
> ltype = build_aligned_type (ltype, align * BITS_PER_UNIT);
> }
>
> - /* For SLP permutation support we need to load the whole group,
> - not only the number of vector stmts the permutation result
> - fits in. */
> - if (slp_perm)
> + /* We don't use masking here so just use any else value and don't
> + perform any zeroing. */
> + tree vec_els = NULL_TREE;
> + if (strided_load_ok_p && !costing_p)
> {
> - /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
> - variable VF. */
> - unsigned int const_vf = vf.to_constant ();
> - ncopies = CEIL (group_size * const_vf, const_nunits);
> - dr_chain.create (ncopies);
> + gcc_assert (elsvals.length ());
> + maskload_elsval = *elsvals.begin ();
> + vec_els = vect_get_mask_load_else (maskload_elsval, lvectype);
> +
> + stride_step_full
> + = fold_build2 (MULT_EXPR, TREE_TYPE (stride_step),
> + stride_step,
> + build_int_cst (TREE_TYPE (stride_step),
> + TYPE_VECTOR_SUBPARTS (lvectype)));
> + stride_step_full
> + = cse_and_gimplify_to_preheader (loop_vinfo, stride_step_full);
> +
> + tree cst_off = build_int_cst (ref_type, cst_offset);
> + dataref_ptr
> + = vect_create_data_ref_ptr (vinfo, first_stmt_info, lvectype,
> + loop, cst_off, &dummy, gsi, &ptr_incr,
> + false, stride_step_full);
> +
> + stride_step_signed
> +       = fold_build1 (NOP_EXPR, signed_type_for (TREE_TYPE (stride_step)),
> + stride_step);
> + stride_step_signed
> + = cse_and_gimplify_to_preheader (loop_vinfo, stride_step_signed);
> }
> - else
> - ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
>
> unsigned int group_el = 0;
> unsigned HOST_WIDE_INT
> @@ -10852,37 +10962,65 @@ vectorizable_load (vec_info *vinfo,
> unsigned int n_adjacent_loads = 0;
> for (j = 0; j < ncopies; j++)
> {
> - if (nloads > 1 && !costing_p)
> + if (nloads > 1 && !strided_load_ok_p && !costing_p)
> vec_alloc (v, nloads);
> gimple *new_stmt = NULL;
> for (i = 0; i < nloads; i++)
> {
> if (costing_p)
> {
> + /* TODO: The vectype in stmt_info/slp_node is potentially
> + wrong as we could be using a much smaller vectype
> + as determined by vector_vector_composition_type. */
> + if (strided_load_ok_p)
> + inside_cost += record_stmt_cost (cost_vec, 1,
> + vector_gather_load,
> + slp_node, 0,
> + vect_body);
> /* For VMAT_ELEMENTWISE, just cost it as scalar_load to
> avoid ICE, see PR110776. */
> - if (VECTOR_TYPE_P (ltype)
> - && memory_access_type != VMAT_ELEMENTWISE)
> + else if (VECTOR_TYPE_P (ltype)
> + && memory_access_type != VMAT_ELEMENTWISE)
> n_adjacent_loads++;
> else
> inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
> slp_node, 0, vect_body);
> continue;
> }
> - tree this_off = build_int_cst (TREE_TYPE (alias_off),
> - group_el * elsz + cst_offset);
> - tree data_ref = build2 (MEM_REF, ltype, running_off, this_off);
> - vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> - new_temp = make_ssa_name (ltype);
> - new_stmt = gimple_build_assign (new_temp, data_ref);
> +
> + if (!strided_load_ok_p)
> + {
> +
> + tree this_off = build_int_cst (TREE_TYPE (alias_off),
> +                                            group_el * elsz + cst_offset);
> + tree data_ref = build2 (MEM_REF, ltype, running_off,
> + this_off);
> + vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> + new_temp = make_ssa_name (ltype);
> + new_stmt = gimple_build_assign (new_temp, data_ref);
> + if (nloads > 1)
> + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_temp);
> + }
> + else
> + {
> + mask_vectype = truth_type_for (lvectype);
> + tree final_mask = build_minus_one_cst (mask_vectype);
> + tree bias = build_int_cst (intQI_type_node, 0);
> + tree len = size_int (TYPE_VECTOR_SUBPARTS (lvectype));
> + tree zero = build_zero_cst (lvectype);
> + new_stmt
> + = gimple_build_call_internal
> + (IFN_MASK_LEN_STRIDED_LOAD, 7, dataref_ptr,
> +                        stride_step_signed, zero, final_mask, vec_els, len, bias);
> + new_temp = make_ssa_name (lvectype);
> + gimple_set_lhs (new_stmt, new_temp);
> + }
> vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> - if (nloads > 1)
> - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_temp);
>
> group_el += lnel;
> - if (group_el == group_size)
> + if (group_el >= group_size)
> {
> - n_groups++;
> + n_groups += (group_el / group_size);
> /* When doing SLP make sure to not load elements from
> the next vector iteration, those will not be accessed
> so just use the last element again. See PR107451. */
> @@ -10894,6 +11032,10 @@ vectorizable_load (vec_info *vinfo,
> running_off, stride_step);
> vect_finish_stmt_generation (vinfo, stmt_info, incr,
> gsi);
> running_off = newoff;
> + if (strided_load_ok_p && !costing_p)
> + dataref_ptr
> +                 = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
> + stmt_info, stride_step_full);
> }
> group_el = 0;
> }
> @@ -10935,7 +11077,7 @@ vectorizable_load (vec_info *vinfo,
> if (!costing_p)
> {
> if (slp_perm)
> - dr_chain.quick_push (gimple_assign_lhs (new_stmt));
> + dr_chain.quick_push (gimple_get_lhs (new_stmt));
> else
> slp_node->push_vec_def (new_stmt);
> }
> --
> 2.49.0
>