Hi!
On 2023-10-19T11:47:14+0000, Richard Biener <[email protected]> wrote:
> The following implements SLP vectorization support for gathers
> without relying on IFNs being pattern detected (and supported by
> the target). That includes support for emulated gathers but also
> the legacy x86 builtin path.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, will push.
For GCN (tested '-march=gfx90a'), I see:
PASS: gcc.dg/vect/vect-gather-2.c (test for excess errors)
+FAIL: gcc.dg/vect/vect-gather-2.c scan-tree-dump vect "different gather base"
+FAIL: gcc.dg/vect/vect-gather-2.c scan-tree-dump vect "different gather scale"
+PASS: gcc.dg/vect/vect-gather-2.c scan-tree-dump-not vect "Loop contains only SLP stmts"
Grüße
Thomas
> PR tree-optimization/111131
> * tree-vect-loop.cc (update_epilogue_loop_vinfo): Make
> sure to update all gather/scatter stmt DRs, not only those
> that eventually got VMAT_GATHER_SCATTER set.
> * tree-vect-slp.cc (_slp_oprnd_info::first_gs_info): Add.
> (vect_get_and_check_slp_defs): Handle gathers/scatters,
> adding the offset as SLP operand and comparing base and scale.
> (vect_build_slp_tree_1): Handle gathers.
> (vect_build_slp_tree_2): Likewise.
>
> * gcc.dg/vect/vect-gather-1.c: Now expected to vectorize
> everywhere.
> * gcc.dg/vect/vect-gather-2.c: Expected to not SLP anywhere.
> Massage the scale case to more reliably produce a different
> one. Scan for the specific messages.
> * gcc.dg/vect/vect-gather-3.c: Masked gather is also supported
> for AVX2, but not emulated.
> * gcc.dg/vect/vect-gather-4.c: Expected to not SLP anywhere.
> Massage to more properly ensure this.
> * gcc.dg/vect/tsvc/vect-tsvc-s353.c: Expect to vectorize
> everywhere.
> ---
> .../gcc.dg/vect/tsvc/vect-tsvc-s353.c | 2 +-
> gcc/testsuite/gcc.dg/vect/vect-gather-1.c | 2 +-
> gcc/testsuite/gcc.dg/vect/vect-gather-2.c | 13 ++++--
> gcc/testsuite/gcc.dg/vect/vect-gather-3.c | 2 +-
> gcc/testsuite/gcc.dg/vect/vect-gather-4.c | 6 +--
> gcc/tree-vect-loop.cc | 6 ++-
> gcc/tree-vect-slp.cc | 45 +++++++++++++++++--
> 7 files changed, 61 insertions(+), 15 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s353.c
> b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s353.c
> index 98ba7522471..2c4fa3f5991 100644
> --- a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s353.c
> +++ b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s353.c
> @@ -44,4 +44,4 @@ int main (int argc, char **argv)
> return 0;
> }
>
> -/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail { !
> riscv_v } } } } */
> +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-1.c
> b/gcc/testsuite/gcc.dg/vect/vect-gather-1.c
> index e3bbf5c0bf8..5f6640d9ab6 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-gather-1.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-1.c
> @@ -58,4 +58,4 @@ main (void)
> return 0;
> }
>
> -/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target
> vect_gather_load_ifn } } } */
> +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-2.c
> b/gcc/testsuite/gcc.dg/vect/vect-gather-2.c
> index a1f6ba458a9..4c23b808333 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-gather-2.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-2.c
> @@ -8,6 +8,7 @@ f1 (int *restrict y, int *restrict x1, int *restrict x2,
> {
> for (int i = 0; i < N; ++i)
> {
> + /* Different base. */
> y[i * 2] = x1[indices[i * 2]] + 1;
> y[i * 2 + 1] = x2[indices[i * 2 + 1]] + 2;
> }
> @@ -18,8 +19,9 @@ f2 (int *restrict y, int *restrict x, int *restrict indices)
> {
> for (int i = 0; i < N; ++i)
> {
> - y[i * 2] = x[indices[i * 2]] + 1;
> - y[i * 2 + 1] = x[indices[i * 2 + 1] * 2] + 2;
> + /* Different scale. */
> + y[i * 2] = *(int *)((char *)x + (__UINTPTR_TYPE__)indices[i * 2] * 4)
> + 1;
> + y[i * 2 + 1] = *(int *)((char *)x + (__UINTPTR_TYPE__)indices[i * 2 +
> 1] * 2) + 2;
> }
> }
>
> @@ -28,9 +30,12 @@ f3 (int *restrict y, int *restrict x, int *restrict
> indices)
> {
> for (int i = 0; i < N; ++i)
> {
> + /* Different type. */
> y[i * 2] = x[indices[i * 2]] + 1;
> - y[i * 2 + 1] = x[(unsigned int) indices[i * 2 + 1]] + 2;
> + y[i * 2 + 1] = x[((unsigned int *) indices)[i * 2 + 1]] + 2;
> }
> }
>
> -/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect {
> target vect_gather_load_ifn } } } */
> +/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect } } */
> +/* { dg-final { scan-tree-dump "different gather base" vect { target { ! vect_gather_load_ifn } } } } */
> +/* { dg-final { scan-tree-dump "different gather scale" vect { target { ! vect_gather_load_ifn } } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-3.c
> b/gcc/testsuite/gcc.dg/vect/vect-gather-3.c
> index adfef3bf407..30ba6789e03 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-gather-3.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-3.c
> @@ -62,4 +62,4 @@ main (void)
> return 0;
> }
>
> -/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target
> { vect_gather_load_ifn && vect_masked_load } } } } */
> +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target
> { { vect_gather_load_ifn || avx2 } && vect_masked_load } } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
> b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
> index ee2e4e4999a..1ce63e69199 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
> @@ -39,10 +39,10 @@ f3 (int *restrict y, int *restrict x, int *restrict
> indices)
> y[i * 2] = (indices[i * 2] < N * 2
> ? x[indices[i * 2]] + 1
> : 1);
> - y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2
> - ? x[(unsigned int) indices[i * 2 + 1]] + 2
> + y[i * 2 + 1] = (((unsigned int *)indices)[i * 2 + 1] < N * 2
> + ? x[((unsigned int *) indices)[i * 2 + 1]] + 2
> : 2);
> }
> }
>
> -/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect {
> target vect_gather_load_ifn } } } */
> +/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect } } */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index ebab1953b9c..8877ebde246 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -11362,8 +11362,7 @@ update_epilogue_loop_vinfo (class loop *epilogue,
> tree advance)
> updated offset we set using ADVANCE. Instead we have to make sure the
> reference in the data references point to the corresponding copy of
> the original in the epilogue. */
> - if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
> - == VMAT_GATHER_SCATTER)
> + if (STMT_VINFO_GATHER_SCATTER_P (vect_stmt_to_vectorize (stmt_vinfo)))
> {
> DR_REF (dr)
> = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
> @@ -11372,6 +11371,9 @@ update_epilogue_loop_vinfo (class loop *epilogue,
> tree advance)
> = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
> &find_in_mapping, &mapping);
> }
> + else
> + gcc_assert (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize
> (stmt_vinfo))
> + != VMAT_GATHER_SCATTER);
> DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
> stmt_vinfo->dr_aux.stmt = stmt_vinfo;
> /* The vector size of the epilogue is smaller than that of the main
> loop
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index d081999a763..8efff2e912d 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -283,10 +283,11 @@ typedef struct _slp_oprnd_info
> vec<tree> ops;
> /* Information about the first statement, its vector def-type, type, the
> operand itself in case it's constant, and an indication if it's a
> pattern
> - stmt. */
> + stmt and gather/scatter info. */
> tree first_op_type;
> enum vect_def_type first_dt;
> bool any_pattern;
> + gather_scatter_info first_gs_info;
> } *slp_oprnd_info;
>
>
> @@ -609,6 +610,7 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned
> char swap,
> unsigned int i, number_of_oprnds;
> enum vect_def_type dt = vect_uninitialized_def;
> slp_oprnd_info oprnd_info;
> + gather_scatter_info gs_info;
> unsigned int commutative_op = -1U;
> bool first = stmt_num == 0;
>
> @@ -660,6 +662,19 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned
> char swap,
>
> oprnd_info = (*oprnds_info)[i];
>
> + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + {
> + gcc_assert (number_of_oprnds == 1);
> + if (!is_a <loop_vec_info> (vinfo)
> + || !vect_check_gather_scatter (stmt_info,
> + as_a <loop_vec_info> (vinfo),
> + first ? &oprnd_info->first_gs_info
> + : &gs_info))
> + return -1;
> +
> + oprnd = first ? oprnd_info->first_gs_info.offset : gs_info.offset;
> + }
> +
> stmt_vec_info def_stmt_info;
> if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
> {
> @@ -792,6 +807,25 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned
> char swap,
> return 1;
> }
>
> + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + {
> + if (!operand_equal_p (oprnd_info->first_gs_info.base,
> + gs_info.base))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "Build SLP failed: different gather base\n");
> + return 1;
> + }
> + if (oprnd_info->first_gs_info.scale != gs_info.scale)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "Build SLP failed: different gather scale\n");
> + return 1;
> + }
> + }
> +
> /* Not first stmt of the group, check that the def-stmt/s match
> the def-stmt/s of the first stmt. Allow different definition
> types for reduction chains: the first stmt must be a
> @@ -1235,6 +1269,9 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char
> *swap,
> || rhs_code == INDIRECT_REF
> || rhs_code == COMPONENT_REF
> || rhs_code == MEM_REF)))
> + || (ldst_p
> + && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
> + != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
> || first_stmt_ldst_p != ldst_p
> || first_stmt_phi_p != phi_p)
> {
> @@ -1357,12 +1394,12 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char
> *swap,
> if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
> && rhs_code != CFN_GATHER_LOAD
> && rhs_code != CFN_MASK_GATHER_LOAD
> + && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
> /* Not grouped loads are handled as externals for BB
> vectorization. For loop vectorization we can handle
> splats the same we handle single element interleaving. */
> && (is_a <bb_vec_info> (vinfo)
> - || stmt_info != first_stmt_info
> - || STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
> + || stmt_info != first_stmt_info))
> {
> /* Not grouped load. */
> if (dump_enabled_p ())
> @@ -1858,6 +1895,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
> gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
> || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
> || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
> + else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
> else
> {
> *max_nunits = this_max_nunits;
> --
> 2.35.3
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht
München, HRB 106955