Jennifer Schmitz <jschm...@nvidia.com> writes:
> A common idiom in intrinsics loops is to have accumulator intrinsics
> in an unrolled loop with an accumulator initialized to zero at the beginning.
> Propagating the initial zero accumulator into the first iteration
> of the loop and simplifying the first accumulate instruction is a
> desirable transformation that we should teach GCC.
> Therefore, this patch folds svsra to svlsr/svasr if op1 is all zeros,
> producing the lower latency instructions LSR/ASR instead of USRA/SSRA.
> We implemented this optimization in svsra_impl::fold.
> Because svlsr/svasr are predicated intrinsics, we added a ptrue
> predicate.  Additionally, the width of the shift amount (imm3) was
> adjusted to fit the function type.
> In order to create the ptrue predicate, a new helper function
> build_ptrue was added.  We also refactored gimple_folder::fold_to_ptrue
> to use the new helper function.
>
> Tests were added to check the produced assembly for use of LSR/ASR.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
>         * config/aarch64/aarch64-sve-builtins-sve2.cc
>         (svsra_impl::fold): Fold svsra to svlsr/svasr if op1 is all zeros.
>         * config/aarch64/aarch64-sve-builtins.cc (build_ptrue): New
>         function that returns a ptrue tree.
>         (gimple_folder::fold_to_ptrue): Refactor to use build_ptrue.
>         * config/aarch64/aarch64-sve-builtins.h: Declare build_ptrue.
>
> gcc/testsuite/
>         * gcc.target/aarch64/sve2/acle/asm/sra_s32.c: New test.
>         * gcc.target/aarch64/sve2/acle/asm/sra_s64.c: Likewise.
>         * gcc.target/aarch64/sve2/acle/asm/sra_u32.c: Likewise.
>         * gcc.target/aarch64/sve2/acle/asm/sra_u64.c: Likewise.
> ---
>  .../aarch64/aarch64-sve-builtins-sve2.cc      | 29 +++++++++++++++++++
>  gcc/config/aarch64/aarch64-sve-builtins.cc    | 28 +++++++++++-------
>  gcc/config/aarch64/aarch64-sve-builtins.h     |  1 +
>  .../aarch64/sve2/acle/asm/sra_s32.c           |  9 ++++++
>  .../aarch64/sve2/acle/asm/sra_s64.c           |  9 ++++++
>  .../aarch64/sve2/acle/asm/sra_u32.c           |  9 ++++++
>  .../aarch64/sve2/acle/asm/sra_u64.c           |  9 ++++++
>  7 files changed, 83 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> index 6a20a613f83..0990918cc45 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> @@ -417,6 +417,35 @@ public:
>
>  class svsra_impl : public function_base
>  {
> +public:
> +  gimple *
> +  fold (gimple_folder &f) const override
> +  {
> +    /* Fold to svlsr/svasr if op1 is all zeros.  */
> +    tree op1 = gimple_call_arg (f.call, 0);
> +    if (!integer_zerop (op1))
> +      return NULL;
> +    function_instance instance ("svlsr", functions::svlsr,
> +                                shapes::binary_uint_opt_n, MODE_n,
> +                                f.type_suffix_ids, GROUP_none, PRED_x);
> +    if (!f.type_suffix (0).unsigned_p)
> +      {
> +        instance.base_name = "svasr";
> +        instance.base = functions::svasr;
> +      }
> +    gcall *call = f.redirect_call (instance);
> +    unsigned int element_bytes = f.type_suffix (0).element_bytes;
> +    /* Add a ptrue as predicate, because unlike svsra, svlsr/svasr are
> +       predicated intrinsics.  */
> +    gimple_call_set_arg (call, 0, build_ptrue (element_bytes));

Maybe it would be simpler to use build_all_ones_cst (f.gp_type ()).
Unlike for fold_to_ptrue (which produces output predicates), we don't
need the upper bits of each predicate element to be zero.
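For illustration only (not tested), that suggestion would amount to
something like:

  /* Any all-ones constant of the governing predicate type is enough here,
     since for an input predicate the upper bits of each element are
     ignored.  */
  gimple_call_set_arg (call, 0, build_all_ones_cst (f.gp_type ()));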

> +    /* For svsra, the shift amount (imm3) is uint64_t for all function types,
> +       but for svlsr/svasr, imm3 has the same width as the function type.  */
> +    tree imm3 = gimple_call_arg (f.call, 2);
> +    tree imm3_prec = wide_int_to_tree (scalar_types[f.type_suffix
> (0).vector_type],

Nit: long line.  The easiest way of avoiding it would be to use
f.scalar_type (0) instead.

> +                                       wi::to_wide (imm3, element_bytes));

This works correctly, but FWIW, it's a little simpler to use
wi::to_widest (imm3) instead.  No need to change though.
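For illustration only (again untested), combining the two suggestions
would give roughly:

  tree imm3_prec = wide_int_to_tree (f.scalar_type (0),
                                     wi::to_widest (imm3));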

Thanks,
Richard

> +    gimple_call_set_arg (call, 2, imm3_prec);
> +    return call;
> +  }
>  public:
>    rtx
>    expand (function_expander &e) const override
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc
> b/gcc/config/aarch64/aarch64-sve-builtins.cc
> index e7c703c987e..945e9818f4e 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
> @@ -3456,6 +3456,21 @@ is_ptrue (tree v, unsigned int step)
>            && vector_cst_all_same (v, step));
>  }
>
> +/* Return a ptrue tree (type svbool_t) where the element width
> +   is given by ELEMENT_BYTES.
> +   For example, for ELEMENT_BYTES = 2, we get { 1, 0, 1, 0, ... }.  */
> +tree
> +build_ptrue (unsigned int element_bytes)
> +{
> +  tree bool_type = scalar_types[VECTOR_TYPE_svbool_t];
> +  tree svbool_type = acle_vector_types[0][VECTOR_TYPE_svbool_t];
> +  tree_vector_builder builder (svbool_type, element_bytes, 1);
> +  builder.quick_push (build_all_ones_cst (bool_type));
> +  for (unsigned int i = 1; i < element_bytes; ++i)
> +    builder.quick_push (build_zero_cst (bool_type));
> +  return builder.build ();
> +}
> +
>  gimple_folder::gimple_folder (const function_instance &instance, tree fndecl,
>                                gimple_stmt_iterator *gsi_in, gcall *call_in)
>    : function_call_info (gimple_location (call_in), instance, fndecl),
> @@ -3572,17 +3587,8 @@ gimple_folder::fold_to_cstu (poly_uint64 val)
>  gimple *
>  gimple_folder::fold_to_ptrue ()
>  {
> -  tree svbool_type = TREE_TYPE (lhs);
> -  tree bool_type = TREE_TYPE (svbool_type);
> -  unsigned int element_bytes = type_suffix (0).element_bytes;
> -
> -  /* The return type is svbool_t for all type suffixes, thus for b8 we
> -     want { 1, 1, 1, 1, ... }, for b16 we want { 1, 0, 1, 0, ... }, etc.  */
> -  tree_vector_builder builder (svbool_type, element_bytes, 1);
> -  builder.quick_push (build_all_ones_cst (bool_type));
> -  for (unsigned int i = 1; i < element_bytes; ++i)
> -    builder.quick_push (build_zero_cst (bool_type));
> -  return gimple_build_assign (lhs, builder.build ());
> +  return gimple_build_assign (lhs,
> +                              build_ptrue (type_suffix (0).element_bytes));
>  }
>
>  /* Fold the call to a PFALSE.  */
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h
> b/gcc/config/aarch64/aarch64-sve-builtins.h
> index 645e56badbe..c5524b9664f 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.h
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.h
> @@ -829,6 +829,7 @@ extern tree acle_svprfop;
>  bool vector_cst_all_same (tree, unsigned int);
>  bool is_ptrue (tree, unsigned int);
>  const function_instance *lookup_fndecl (tree);
> +tree build_ptrue (unsigned int);
>
>  /* Try to find a mode with the given mode_suffix_info fields.  Return the
>     mode on success or MODE_none on failure.  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
> index ac992dc7b1c..86cf4bd8137 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
> @@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_32_s32_tied2, svint32_t,
>  TEST_UNIFORM_Z (sra_32_s32_untied, svint32_t,
>                  z0 = svsra_n_s32 (z1, z2, 32),
>                  z0 = svsra (z1, z2, 32))
> +
> +/*
> +** sra_2_s32_zeroop1:
> +**      asr     z0\.s, z1\.s, #2
> +**      ret
> +*/
> +TEST_UNIFORM_Z (sra_2_s32_zeroop1, svint32_t,
> +                z0 = svsra_n_s32 (svdup_s32 (0), z1, 2),
> +                z0 = svsra (svdup_s32 (0), z1, 2))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
> index 9ea5657ab88..7b39798ba1d 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
> @@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_64_s64_tied2, svint64_t,
>  TEST_UNIFORM_Z (sra_64_s64_untied, svint64_t,
>                  z0 = svsra_n_s64 (z1, z2, 64),
>                  z0 = svsra (z1, z2, 64))
> +
> +/*
> +** sra_2_s64_zeroop1:
> +**      asr     z0\.d, z1\.d, #2
> +**      ret
> +*/
> +TEST_UNIFORM_Z (sra_2_s64_zeroop1, svint64_t,
> +                z0 = svsra_n_s64 (svdup_s64 (0), z1, 2),
> +                z0 = svsra (svdup_s64 (0), z1, 2))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
> index 090245153f7..001e09ca78d 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
> @@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_32_u32_tied2, svuint32_t,
>  TEST_UNIFORM_Z (sra_32_u32_untied, svuint32_t,
>                  z0 = svsra_n_u32 (z1, z2, 32),
>                  z0 = svsra (z1, z2, 32))
> +
> +/*
> +** sra_2_u32_zeroop1:
> +**      lsr     z0\.s, z1\.s, #2
> +**      ret
> +*/
> +TEST_UNIFORM_Z (sra_2_u32_zeroop1, svuint32_t,
> +                z0 = svsra_n_u32 (svdup_u32 (0), z1, 2),
> +                z0 = svsra (svdup_u32 (0), z1, 2))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
> index ff21c368b72..780cf7a7ff6 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
> @@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_64_u64_tied2, svuint64_t,
>  TEST_UNIFORM_Z (sra_64_u64_untied, svuint64_t,
>                  z0 = svsra_n_u64 (z1, z2, 64),
>                  z0 = svsra (z1, z2, 64))
> +
> +/*
> +** sra_2_u64_zeroop1:
> +**      lsr     z0\.d, z1\.d, #2
> +**      ret
> +*/
> +TEST_UNIFORM_Z (sra_2_u64_zeroop1, svuint64_t,
> +                z0 = svsra_n_u64 (svdup_u64 (0), z1, 2),
> +                z0 = svsra (svdup_u64 (0), z1, 2))
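
(For readers without the earlier context: the "accumulator initialized to
zero" idiom that the patch description refers to looks roughly like the
sketch below.  The function and variable names are invented for
illustration, and it needs an -march setting that includes SVE2.  With
this fold, the first svsra, whose accumulator is still the all-zeros
vector, becomes a plain LSR, while the remaining ones stay as USRA.)

  #include <arm_sve.h>

  svuint32_t
  sum_shifted (svuint32_t a, svuint32_t b, svuint32_t c)
  {
    svuint32_t acc = svdup_u32 (0);
    acc = svsra (acc, a, 2);  /* zero accumulator propagated: folds to LSR  */
    acc = svsra (acc, b, 2);  /* stays USRA  */
    acc = svsra (acc, c, 2);  /* stays USRA  */
    return acc;
  }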