Jennifer Schmitz <[email protected]> writes:
> As a follow-up to
> https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665472.html,
> this patch implements folding of svmul by -1 to svneg for
> unsigned SVE vector types. The key idea is to reuse the existing code that
> does this fold for signed types and feed it as callback to a helper function
> that adds the necessary type conversions.
>
> For example, for the test case
> svuint64_t foo (svuint64_t x, svbool_t pg)
> {
> return svmul_n_u64_x (pg, x, -1);
> }
>
> the following gimple sequence is emitted (-O2 -mcpu=grace):
> svuint64_t foo (svuint64_t x, svbool_t pg)
> {
> svint64_t D.12921;
> svint64_t D.12920;
> svuint64_t D.12919;
>
> D.12920 = VIEW_CONVERT_EXPR<svint64_t>(x);
> D.12921 = svneg_s64_x (pg, D.12920);
> D.12919 = VIEW_CONVERT_EXPR<svuint64_t>(D.12921);
> goto <D.12922>;
> <D.12922>:
> return D.12919;
> }
>
> In general, the new helper gimple_folder::convert_and_fold
> - takes a target type and a function pointer,
> - converts the lhs and all non-boolean vector types to the target type,
> - passes the converted lhs and arguments to the callback,
> - receives the new gimple statement from the callback function,
> - adds the necessary view converts to the gimple sequence,
> - and returns the new call.
>
> Because all arguments are converted to the same target type, the helper
> function is only suitable for folding calls whose arguments all have
> the same type. If necessary, it could be extended to convert individual
> arguments to different target types.
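>
> For illustration, here is roughly how the svmul fold uses the helper
> (a simplified sketch of the _x case where op2 is the -1 constant and F
> is the gimple_folder for the svmul call; the actual patch additionally
> handles _m predication, the case where op1 is the -1 operand, and the
> call's fntype):
>
>   auto fold_to_neg = [](gimple_folder &f, tree lhs_conv,
>                         vec<tree> &args_conv) -> gimple *
>     {
>       /* ARGS_CONV holds the predicate and the operands, the latter
>          already view-converted to the signed type.  */
>       type_suffix_pair signed_tsp
>         = {signed_type_suffix_index (f.type_suffix (0).element_bits),
>            f.type_suffix_ids[1]};
>       function_instance instance ("svneg", functions::svneg,
>                                   shapes::unary, MODE_none,
>                                   signed_tsp, GROUP_none, f.pred);
>       gcall *call = f.redirect_call (instance);
>       gimple_call_set_lhs (call, lhs_conv);
>       gimple_set_num_ops (call, 5);
>       gimple_call_set_arg (call, 0, args_conv[0]);
>       gimple_call_set_arg (call, 1, args_conv[1]);
>       return call;
>     };
>   tree signed_ty = f.signed_type (f.type_suffix (0).element_bits);
>   return f.convert_and_fold (signed_ty, fold_to_neg);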
>
> The patch was bootstrapped and tested on aarch64-linux-gnu, with no regressions.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <[email protected]>
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-sve-builtins-base.cc
> (svmul_impl::fold): Wrap code for folding to svneg in lambda
> function and pass to gimple_folder::convert_and_fold to enable
> the transform for unsigned types.
> * config/aarch64/aarch64-sve-builtins.cc
> (gimple_folder::convert_and_fold): New function that converts
> operands to target type before calling callback function, adding the
> necessary conversion statements.
> * config/aarch64/aarch64-sve-builtins.h
> (gimple_folder::convert_and_fold): Declare function.
> (signed_type_suffix_index): Return type_suffix_index of signed
> vector type for given width.
> (function_instance::signed_type): Return signed vector type for
> given width.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/sve/acle/asm/mul_u8.c: Adjust expected outcome.
> * gcc.target/aarch64/sve/acle/asm/mul_u16.c: Likewise.
> * gcc.target/aarch64/sve/acle/asm/mul_u32.c: Likewise.
> * gcc.target/aarch64/sve/acle/asm/mul_u64.c: Add new tests and adjust
> expected outcome.
> ---
> .../aarch64/aarch64-sve-builtins-base.cc | 70 +++++++++++++------
> gcc/config/aarch64/aarch64-sve-builtins.cc | 43 ++++++++++++
> gcc/config/aarch64/aarch64-sve-builtins.h | 31 ++++++++
> .../gcc.target/aarch64/sve/acle/asm/mul_u16.c | 5 +-
> .../gcc.target/aarch64/sve/acle/asm/mul_u32.c | 5 +-
> .../gcc.target/aarch64/sve/acle/asm/mul_u64.c | 26 ++++++-
> .../gcc.target/aarch64/sve/acle/asm/mul_u8.c | 7 +-
> 7 files changed, 153 insertions(+), 34 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index 87e9909b55a..52401a8c57a 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -2092,33 +2092,61 @@ public:
> return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs)));
>
> /* If one of the operands is all integer -1, fold to svneg. */
> - tree pg = gimple_call_arg (f.call, 0);
> - tree negated_op = NULL;
> - if (integer_minus_onep (op2))
> - negated_op = op1;
> - else if (integer_minus_onep (op1))
> - negated_op = op2;
> - if (!f.type_suffix (0).unsigned_p && negated_op)
> + if (integer_minus_onep (op1) || integer_minus_onep (op2))
> {
> - function_instance instance ("svneg", functions::svneg,
> - shapes::unary, MODE_none,
> - f.type_suffix_ids, GROUP_none, f.pred);
> - gcall *call = f.redirect_call (instance);
> - unsigned offset_index = 0;
> - if (f.pred == PRED_m)
> + auto mul_by_m1 = [](gimple_folder &f, tree lhs_conv,
> + vec<tree> &args_conv) -> gimple *
> {
> - offset_index = 1;
> - gimple_call_set_arg (call, 0, op1);
> - }
> - else
> - gimple_set_num_ops (call, 5);
> - gimple_call_set_arg (call, offset_index, pg);
> - gimple_call_set_arg (call, offset_index + 1, negated_op);
> - return call;
> + gcc_assert (lhs_conv && args_conv.length () == 3);
> + tree pg = args_conv[0];
> + tree op1 = args_conv[1];
> + tree op2 = args_conv[2];
> + tree negated_op = op1;
> + bool negate_op1 = true;
> + if (integer_minus_onep (op1))
> + {
> + negated_op = op2;
> + negate_op1 = false;
> + }
> + type_suffix_pair signed_tsp =
> + {signed_type_suffix_index (f.type_suffix (0).element_bits),
> + f.type_suffix_ids[1]};
> + function_instance instance ("svneg", functions::svneg,
> + shapes::unary, MODE_none,
> + signed_tsp, GROUP_none, f.pred);
> + gcall *call = f.redirect_call (instance);
> + gimple_call_set_lhs (call, lhs_conv);
> + unsigned offset = 0;
> + tree fntype, op1_type = TREE_TYPE (op1);
> + if (f.pred == PRED_m)
> + {
> + offset = 1;
> + tree arg_types[3] = {op1_type, TREE_TYPE (pg), op1_type};
> + fntype = build_function_type_array (TREE_TYPE (lhs_conv),
> + 3, arg_types);
> + tree ty = f.signed_type (f.type_suffix (0).element_bits);
> + tree inactive = negate_op1 ? op1 : build_minus_one_cst (ty);
I think my question from the previous review still stands:
Isn't the old approach of using op1 as the inactive argument still
correct?
> + gimple_call_set_arg (call, 0, inactive);
> + }
> + else
> + {
> + gimple_set_num_ops (call, 5);
> + tree arg_types[2] = {TREE_TYPE (pg), op1_type};
> + fntype = build_function_type_array (TREE_TYPE (lhs_conv),
> + 2, arg_types);
> + }
> + gimple_call_set_fntype (call, fntype);
I think the fntype should be set by redirect_call, which can get the
type from TREE_TYPE (rfn->decl). That's better because it includes any
relevant attributes.
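That is, something like the following at the point where redirect_call
installs the new fndecl (untested sketch; RFN is the registered_function
that redirect_call looks up for the new instance):
  gimple_call_set_fndecl (call, rfn->decl);
  gimple_call_set_fntype (call, TREE_TYPE (rfn->decl));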
> + gimple_call_set_arg (call, offset, pg);
> + gimple_call_set_arg (call, offset + 1, negated_op);
> + return call;
> + };
> + tree ty = f.signed_type (f.type_suffix (0).element_bits);
> + return f.convert_and_fold (ty, mul_by_m1);
> }
>
> /* If one of the operands is a uniform power of 2, fold to a left shift
> by immediate. */
> + tree pg = gimple_call_arg (f.call, 0);
> tree op1_cst = uniform_integer_cst_p (op1);
> tree op2_cst = uniform_integer_cst_p (op2);
> tree shift_op1, shift_op2 = NULL;
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
> index 0fec1cd439e..01b0da22c9b 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
> @@ -3646,6 +3646,49 @@ gimple_folder::redirect_pred_x ()
> return redirect_call (instance);
> }
>
> +/* Convert the lhs and all non-boolean vector-type operands to TYPE.
> + Pass the converted variables to the callback FP, and finally convert the
> + result back to the original type. Add the necessary conversion statements.
> + Return the new call. */
> +gimple *
> +gimple_folder::convert_and_fold (tree type,
> + gimple *(*fp) (gimple_folder &,
> + tree, vec<tree> &))
> +{
> + gcc_assert (VECTOR_TYPE_P (type)
> + && TYPE_MODE (type) != VNx16BImode);
> + tree old_ty = TREE_TYPE (lhs);
> + gimple_seq stmts = NULL;
> + tree lhs_conv, op, op_ty, t;
> + gimple *g, *new_stmt;
> + bool convert_lhs_p = !useless_type_conversion_p (type, old_ty);
> + lhs_conv = convert_lhs_p ? create_tmp_var (type) : lhs;
> + unsigned int num_args = gimple_call_num_args (call);
> + vec<tree> args_conv = vNULL;
It'd be slightly more efficient to use something like:
auto_vec<tree, 16> args_conv;
here. This also removes the need to free the vector after use
(looks like the current version would leak).
> + args_conv.safe_grow (num_args);
> + for (unsigned int i = 0; i < num_args; ++i)
> + {
> + op = gimple_call_arg (call, i);
> + op_ty = TREE_TYPE (op);
> + args_conv[i] =
> + (VECTOR_TYPE_P (op_ty)
> + && TREE_CODE (op) != VECTOR_CST
> + && TYPE_MODE (op_ty) != VNx16BImode
> + && !useless_type_conversion_p (op_ty, type))
I think we should convert VECTOR_CSTs too. gimple_build should
fold it for us (but let me know if it doesn't).
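I.e. roughly (untested):
  args_conv[i]
    = (VECTOR_TYPE_P (op_ty)
       && TYPE_MODE (op_ty) != VNx16BImode
       && !useless_type_conversion_p (op_ty, type))
      ? gimple_build (&stmts, VIEW_CONVERT_EXPR, type, op) : op;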
Thanks, and sorry for the slow review.
Richard
> + ? gimple_build (&stmts, VIEW_CONVERT_EXPR, type, op) : op;
> + }
> +
> + new_stmt = fp (*this, lhs_conv, args_conv);
> + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> + if (convert_lhs_p)
> + {
> + t = build1 (VIEW_CONVERT_EXPR, old_ty, lhs_conv);
> + g = gimple_build_assign (lhs, VIEW_CONVERT_EXPR, t);
> + gsi_insert_after (gsi, g, GSI_SAME_STMT);
> + }
> + return new_stmt;
> +}
> +
> /* Fold the call to constant VAL. */
> gimple *
> gimple_folder::fold_to_cstu (poly_uint64 val)
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h
> index 4094f8207f9..3e919e52e2c 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.h
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.h
> @@ -406,6 +406,7 @@ public:
> tree scalar_type (unsigned int) const;
> tree vector_type (unsigned int) const;
> tree tuple_type (unsigned int) const;
> + tree signed_type (unsigned int) const;
> unsigned int elements_per_vq (unsigned int) const;
> machine_mode vector_mode (unsigned int) const;
> machine_mode tuple_mode (unsigned int) const;
> @@ -632,6 +633,8 @@ public:
>
> gcall *redirect_call (const function_instance &);
> gimple *redirect_pred_x ();
> + gimple *convert_and_fold (tree, gimple *(*) (gimple_folder &,
> + tree, vec<tree> &));
>
> gimple *fold_to_cstu (poly_uint64);
> gimple *fold_to_pfalse ();
> @@ -864,6 +867,20 @@ find_type_suffix (type_class_index tclass, unsigned int element_bits)
> gcc_unreachable ();
> }
>
> +/* Return the type suffix of the signed type of width ELEMENT_BITS. */
> +inline type_suffix_index
> +signed_type_suffix_index (unsigned int element_bits)
> +{
> + switch (element_bits)
> + {
> + case 8: return TYPE_SUFFIX_s8;
> + case 16: return TYPE_SUFFIX_s16;
> + case 32: return TYPE_SUFFIX_s32;
> + case 64: return TYPE_SUFFIX_s64;
> + }
> + gcc_unreachable ();
> +}
> +
> /* Return the single field in tuple type TYPE. */
> inline tree
> tuple_type_field (tree type)
> @@ -1029,6 +1046,20 @@ function_instance::tuple_type (unsigned int i) const
> return acle_vector_types[num_vectors - 1][type_suffix (i).vector_type];
> }
>
> +/* Return the signed vector type of width ELEMENT_BITS. */
> +inline tree
> +function_instance::signed_type (unsigned int element_bits) const
> +{
> + switch (element_bits)
> + {
> + case 8: return acle_vector_types[0][VECTOR_TYPE_svint8_t];
> + case 16: return acle_vector_types[0][VECTOR_TYPE_svint16_t];
> + case 32: return acle_vector_types[0][VECTOR_TYPE_svint32_t];
> + case 64: return acle_vector_types[0][VECTOR_TYPE_svint64_t];
> + }
> + gcc_unreachable ();
> +}
> +
> /* Return the number of elements of type suffix I that fit within a
> 128-bit block. */
> inline unsigned int
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c
> index bdf6fcb98d6..e228dc5995d 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c
> @@ -174,8 +174,7 @@ TEST_UNIFORM_Z (mul_3_u16_m_untied, svuint16_t,
>
> /*
> ** mul_m1_u16_m:
> -** mov (z[0-9]+)\.b, #-1
> -** mul z0\.h, p0/m, z0\.h, \1\.h
> +** neg z0\.h, p0/m, z0\.h
> ** ret
> */
> TEST_UNIFORM_Z (mul_m1_u16_m, svuint16_t,
> @@ -569,7 +568,7 @@ TEST_UNIFORM_Z (mul_255_u16_x, svuint16_t,
>
> /*
> ** mul_m1_u16_x:
> -** mul z0\.h, z0\.h, #-1
> +** neg z0\.h, p0/m, z0\.h
> ** ret
> */
> TEST_UNIFORM_Z (mul_m1_u16_x, svuint16_t,
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c
> index a61e85fa12d..e8f52c9d785 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c
> @@ -174,8 +174,7 @@ TEST_UNIFORM_Z (mul_3_u32_m_untied, svuint32_t,
>
> /*
> ** mul_m1_u32_m:
> -** mov (z[0-9]+)\.b, #-1
> -** mul z0\.s, p0/m, z0\.s, \1\.s
> +** neg z0\.s, p0/m, z0\.s
> ** ret
> */
> TEST_UNIFORM_Z (mul_m1_u32_m, svuint32_t,
> @@ -569,7 +568,7 @@ TEST_UNIFORM_Z (mul_255_u32_x, svuint32_t,
>
> /*
> ** mul_m1_u32_x:
> -** mul z0\.s, z0\.s, #-1
> +** neg z0\.s, p0/m, z0\.s
> ** ret
> */
> TEST_UNIFORM_Z (mul_m1_u32_x, svuint32_t,
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c
> index eee1f8a0c99..2ccdc3642c5 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c
> @@ -183,14 +183,25 @@ TEST_UNIFORM_Z (mul_3_u64_m_untied, svuint64_t,
>
> /*
> ** mul_m1_u64_m:
> -** mov (z[0-9]+)\.b, #-1
> -** mul z0\.d, p0/m, z0\.d, \1\.d
> +** neg z0\.d, p0/m, z0\.d
> ** ret
> */
> TEST_UNIFORM_Z (mul_m1_u64_m, svuint64_t,
> z0 = svmul_n_u64_m (p0, z0, -1),
> z0 = svmul_m (p0, z0, -1))
>
> +/*
> +** mul_m1r_u64_m:
> +** mov (z[0-9]+)\.b, #-1
> +** mov (z[0-9]+\.d), z0\.d
> +** movprfx z0, \1
> +** neg z0\.d, p0/m, \2
> +** ret
> +*/
> +TEST_UNIFORM_Z (mul_m1r_u64_m, svuint64_t,
> + z0 = svmul_u64_m (p0, svdup_u64 (-1), z0),
> + z0 = svmul_m (p0, svdup_u64 (-1), z0))
> +
> /*
> ** mul_u64_z_tied1:
> ** movprfx z0\.d, p0/z, z0\.d
> @@ -597,13 +608,22 @@ TEST_UNIFORM_Z (mul_255_u64_x, svuint64_t,
>
> /*
> ** mul_m1_u64_x:
> -** mul z0\.d, z0\.d, #-1
> +** neg z0\.d, p0/m, z0\.d
> ** ret
> */
> TEST_UNIFORM_Z (mul_m1_u64_x, svuint64_t,
> z0 = svmul_n_u64_x (p0, z0, -1),
> z0 = svmul_x (p0, z0, -1))
>
> +/*
> +** mul_m1r_u64_x:
> +** neg z0\.d, p0/m, z0\.d
> +** ret
> +*/
> +TEST_UNIFORM_Z (mul_m1r_u64_x, svuint64_t,
> + z0 = svmul_u64_x (p0, svdup_u64 (-1), z0),
> + z0 = svmul_x (p0, svdup_u64 (-1), z0))
> +
> /*
> ** mul_m127_u64_x:
> ** mul z0\.d, z0\.d, #-127
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c
> index 06ee1b3e7c8..8e53a4821f0 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c
> @@ -174,8 +174,7 @@ TEST_UNIFORM_Z (mul_3_u8_m_untied, svuint8_t,
>
> /*
> ** mul_m1_u8_m:
> -** mov (z[0-9]+)\.b, #-1
> -** mul z0\.b, p0/m, z0\.b, \1\.b
> +** neg z0\.b, p0/m, z0\.b
> ** ret
> */
> TEST_UNIFORM_Z (mul_m1_u8_m, svuint8_t,
> @@ -559,7 +558,7 @@ TEST_UNIFORM_Z (mul_128_u8_x, svuint8_t,
>
> /*
> ** mul_255_u8_x:
> -** mul z0\.b, z0\.b, #-1
> +** neg z0\.b, p0/m, z0\.b
> ** ret
> */
> TEST_UNIFORM_Z (mul_255_u8_x, svuint8_t,
> @@ -568,7 +567,7 @@ TEST_UNIFORM_Z (mul_255_u8_x, svuint8_t,
>
> /*
> ** mul_m1_u8_x:
> -** mul z0\.b, z0\.b, #-1
> +** neg z0\.b, p0/m, z0\.b
> ** ret
> */
> TEST_UNIFORM_Z (mul_m1_u8_x, svuint8_t,