Jennifer Schmitz <[email protected]> writes:
> A common idiom in intrinsics code is an unrolled loop that uses
> accumulator intrinsics, with the accumulator initialized to zero
> before the first iteration.  Propagating the initial zero
> accumulator into the first iteration of the loop and simplifying
> the first accumulate instruction is a desirable transformation
> that we should teach GCC.
> Therefore, this patch folds svsra to svlsr/svasr if op1 is all
> zeros, producing the lower-latency instructions LSR/ASR instead of
> USRA/SSRA.
> We implemented this optimization in svsra_impl::fold.
> Because svlsr/svasr are predicated intrinsics, we added a ptrue
> predicate; to create it, we introduced a new helper function,
> build_ptrue, and refactored gimple_folder::fold_to_ptrue to use it.
> Additionally, the width of the shift amount (imm3) was adjusted to
> fit the function type.
>
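To make the motivation concrete for the archives, the kind of loop
being targeted looks roughly like this; a made-up sketch, not code
from the patch (the function name, predication, and shift amount are
all illustrative):

  #include <arm_sve.h>

  int64_t
  accumulate_shifted (int32_t *src, int n)
  {
    svint32_t acc = svdup_s32 (0);  /* Zero-initialized accumulator.  */
    for (int i = 0; i < n; i += (int) svcntw ())
      {
        svbool_t pg = svwhilelt_b32 (i, n);
        /* Inactive lanes load as zero, so they leave ACC unchanged.  */
        svint32_t x = svld1 (pg, src + i);
        /* acc += (x >> 2).  In the first iteration ACC is all zeros,
           so once the zero is propagated, the first SSRA can be
           simplified to a plain ASR.  */
        acc = svsra (acc, x, 2);
      }
    return svaddv (svptrue_b32 (), acc);
  }

After unrolling, the first svsra call has an all-zeros op1, which is
exactly the case the fold below handles.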
> Tests were added to check the produced assembly for use of LSR/ASR.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu with no
> regressions.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <[email protected]>
>
> gcc/
> * config/aarch64/aarch64-sve-builtins-sve2.cc
> (svsra_impl::fold): Fold svsra to svlsr/svasr if op1 is all zeros.
> * config/aarch64/aarch64-sve-builtins.cc (build_ptrue): New
> function that returns a ptrue tree.
> (gimple_folder::fold_to_ptrue): Refactor to use build_ptrue.
> * config/aarch64/aarch64-sve-builtins.h: Declare build_ptrue.
>
> gcc/testsuite/
> * gcc.target/aarch64/sve2/acle/asm/sra_s32.c: New test.
> * gcc.target/aarch64/sve2/acle/asm/sra_s64.c: Likewise.
> * gcc.target/aarch64/sve2/acle/asm/sra_u32.c: Likewise.
> * gcc.target/aarch64/sve2/acle/asm/sra_u64.c: Likewise.
> ---
>  .../aarch64/aarch64-sve-builtins-sve2.cc   | 29 +++++++++++++++++++
>  gcc/config/aarch64/aarch64-sve-builtins.cc | 28 +++++++++++-------
>  gcc/config/aarch64/aarch64-sve-builtins.h  |  1 +
>  .../aarch64/sve2/acle/asm/sra_s32.c        |  9 ++++++
>  .../aarch64/sve2/acle/asm/sra_s64.c        |  9 ++++++
>  .../aarch64/sve2/acle/asm/sra_u32.c        |  9 ++++++
>  .../aarch64/sve2/acle/asm/sra_u64.c        |  9 ++++++
>  7 files changed, 83 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> index 6a20a613f83..0990918cc45 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> @@ -417,6 +417,35 @@ public:
>  
>  class svsra_impl : public function_base
>  {
> +public:
> +  gimple *
> +  fold (gimple_folder &f) const override
> +  {
> +    /* Fold to svlsr/svasr if op1 is all zeros.  */
> +    tree op1 = gimple_call_arg (f.call, 0);
> +    if (!integer_zerop (op1))
> +      return NULL;
> +    function_instance instance ("svlsr", functions::svlsr,
> +                                shapes::binary_uint_opt_n, MODE_n,
> +                                f.type_suffix_ids, GROUP_none, PRED_x);
> +    if (!f.type_suffix (0).unsigned_p)
> +      {
> +        instance.base_name = "svasr";
> +        instance.base = functions::svasr;
> +      }
> +    gcall *call = f.redirect_call (instance);
> +    unsigned int element_bytes = f.type_suffix (0).element_bytes;
> +    /* Add a ptrue as predicate, because unlike svsra, svlsr/svasr are
> +       predicated intrinsics.  */
> +    gimple_call_set_arg (call, 0, build_ptrue (element_bytes));
Maybe it would be simpler to use build_all_ones_cst (f.gp_type ()).
Unlike for fold_to_ptrue (which produces output predicates),
we don't need the upper bits of each predicate element to be zero.
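That is, something like this (untested):

  /* Unlike for fold_to_ptrue, the upper bits of each predicate element
     don't need to be zero, so an all-ones constant is enough.  */
  gimple_call_set_arg (call, 0, build_all_ones_cst (f.gp_type ()));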
> +    /* For svsra, the shift amount (imm3) is uint64_t for all function types,
> +       but for svlsr/svasr, imm3 has the same width as the function type.  */
> +    tree imm3 = gimple_call_arg (f.call, 2);
> +    tree imm3_prec = wide_int_to_tree (scalar_types[f.type_suffix (0).vector_type],
Nit: long line. The easiest way of avoiding it would be to use
f.scalar_type (0) instead.
> +                                       wi::to_wide (imm3, element_bytes));
This works correctly, but FWIW, it's a little simpler to use
wi::to_widest (imm3) instead. No need to change though.
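Putting the two nits together, the shift-amount handling could end up
looking like this (untested):

  tree imm3 = gimple_call_arg (f.call, 2);
  tree imm3_prec = wide_int_to_tree (f.scalar_type (0),
                                     wi::to_widest (imm3));
  gimple_call_set_arg (call, 2, imm3_prec);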
Thanks,
Richard
> +    gimple_call_set_arg (call, 2, imm3_prec);
> +    return call;
> +  }
>  public:
>    rtx
>    expand (function_expander &e) const override
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
> index e7c703c987e..945e9818f4e 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
> @@ -3456,6 +3456,21 @@ is_ptrue (tree v, unsigned int step)
>            && vector_cst_all_same (v, step));
>  }
>  
> +/* Return a ptrue tree (type svbool_t) where the element width
> +   is given by ELEMENT_BYTES.
> +   For example, for ELEMENT_BYTES = 2, we get { 1, 0, 1, 0, ... }.  */
> +tree
> +build_ptrue (unsigned int element_bytes)
> +{
> +  tree bool_type = scalar_types[VECTOR_TYPE_svbool_t];
> +  tree svbool_type = acle_vector_types[0][VECTOR_TYPE_svbool_t];
> +  tree_vector_builder builder (svbool_type, element_bytes, 1);
> +  builder.quick_push (build_all_ones_cst (bool_type));
> +  for (unsigned int i = 1; i < element_bytes; ++i)
> +    builder.quick_push (build_zero_cst (bool_type));
> +  return builder.build ();
> +}
> +
>  gimple_folder::gimple_folder (const function_instance &instance, tree fndecl,
>                                gimple_stmt_iterator *gsi_in, gcall *call_in)
>    : function_call_info (gimple_location (call_in), instance, fndecl),
> @@ -3572,17 +3587,8 @@ gimple_folder::fold_to_cstu (poly_uint64 val)
>  gimple *
>  gimple_folder::fold_to_ptrue ()
>  {
> -  tree svbool_type = TREE_TYPE (lhs);
> -  tree bool_type = TREE_TYPE (svbool_type);
> -  unsigned int element_bytes = type_suffix (0).element_bytes;
> -
> -  /* The return type is svbool_t for all type suffixes, thus for b8 we
> -     want { 1, 1, 1, 1, ... }, for b16 we want { 1, 0, 1, 0, ... }, etc.  */
> -  tree_vector_builder builder (svbool_type, element_bytes, 1);
> -  builder.quick_push (build_all_ones_cst (bool_type));
> -  for (unsigned int i = 1; i < element_bytes; ++i)
> -    builder.quick_push (build_zero_cst (bool_type));
> -  return gimple_build_assign (lhs, builder.build ());
> +  return gimple_build_assign (lhs,
> +                              build_ptrue (type_suffix (0).element_bytes));
>  }
>
> /* Fold the call to a PFALSE. */
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h
> index 645e56badbe..c5524b9664f 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.h
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.h
> @@ -829,6 +829,7 @@ extern tree acle_svprfop;
>  bool vector_cst_all_same (tree, unsigned int);
>  bool is_ptrue (tree, unsigned int);
>  const function_instance *lookup_fndecl (tree);
> +tree build_ptrue (unsigned int);
>  
>  /* Try to find a mode with the given mode_suffix_info fields.  Return the
>     mode on success or MODE_none on failure.  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
> index ac992dc7b1c..86cf4bd8137 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
> @@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_32_s32_tied2, svint32_t,
>  TEST_UNIFORM_Z (sra_32_s32_untied, svint32_t,
>                  z0 = svsra_n_s32 (z1, z2, 32),
>                  z0 = svsra (z1, z2, 32))
> +
> +/*
> +** sra_2_s32_zeroop1:
> +**	asr	z0\.s, z1\.s, #2
> +**	ret
> +*/
> +TEST_UNIFORM_Z (sra_2_s32_zeroop1, svint32_t,
> +                z0 = svsra_n_s32 (svdup_s32 (0), z1, 2),
> +                z0 = svsra (svdup_s32 (0), z1, 2))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
> index 9ea5657ab88..7b39798ba1d 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
> @@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_64_s64_tied2, svint64_t,
>  TEST_UNIFORM_Z (sra_64_s64_untied, svint64_t,
>                  z0 = svsra_n_s64 (z1, z2, 64),
>                  z0 = svsra (z1, z2, 64))
> +
> +/*
> +** sra_2_s64_zeroop1:
> +**	asr	z0\.d, z1\.d, #2
> +**	ret
> +*/
> +TEST_UNIFORM_Z (sra_2_s64_zeroop1, svint64_t,
> +                z0 = svsra_n_s64 (svdup_s64 (0), z1, 2),
> +                z0 = svsra (svdup_s64 (0), z1, 2))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
> index 090245153f7..001e09ca78d 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
> @@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_32_u32_tied2, svuint32_t,
>  TEST_UNIFORM_Z (sra_32_u32_untied, svuint32_t,
>                  z0 = svsra_n_u32 (z1, z2, 32),
>                  z0 = svsra (z1, z2, 32))
> +
> +/*
> +** sra_2_u32_zeroop1:
> +**	lsr	z0\.s, z1\.s, #2
> +**	ret
> +*/
> +TEST_UNIFORM_Z (sra_2_u32_zeroop1, svuint32_t,
> +                z0 = svsra_n_u32 (svdup_u32 (0), z1, 2),
> +                z0 = svsra (svdup_u32 (0), z1, 2))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
> index ff21c368b72..780cf7a7ff6 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
> @@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_64_u64_tied2, svuint64_t,
>  TEST_UNIFORM_Z (sra_64_u64_untied, svuint64_t,
>                  z0 = svsra_n_u64 (z1, z2, 64),
>                  z0 = svsra (z1, z2, 64))
> +
> +/*
> +** sra_2_u64_zeroop1:
> +**	lsr	z0\.d, z1\.d, #2
> +**	ret
> +*/
> +TEST_UNIFORM_Z (sra_2_u64_zeroop1, svuint64_t,
> +                z0 = svsra_n_u64 (svdup_u64 (0), z1, 2),
> +                z0 = svsra (svdup_u64 (0), z1, 2))