Ping. Thanks, Jennifer > On 15 Oct 2024, at 09:40, Jennifer Schmitz <jschm...@nvidia.com> wrote: > > Because a neg instruction has lower latency and higher throughput than > sdiv and mul, svdiv and svmul by -1 can be folded to svneg. For svdiv, > this is already implemented on the RTL level; for svmul, the > optimization was still missing. > This patch implements folding to svneg for both operations using the > gimple_folder. For svdiv, the transform is applied if the divisor is -1. > Svmul is folded if either of the operands is -1. A case distinction of > the predication is made to account for the fact that svneg_m has 3 arguments > (argument 0 holds the values for the inactive lanes), while svneg_x and > svneg_z have only 2 arguments. > Tests were added or adjusted to check the produced assembly and runtime > tests were added to check correctness. > > The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression. > OK for mainline? > > Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com> > > gcc/ > * config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold): > Fold division by -1 to svneg. > (svmul_impl::fold): Fold multiplication by -1 to svneg. > > gcc/testsuite/ > * gcc.target/aarch64/sve/acle/asm/div_s32.c: New test. > * gcc.target/aarch64/sve/acle/asm/mul_s16.c: Adjust expected outcome. > * gcc.target/aarch64/sve/acle/asm/mul_s32.c: New test. > * gcc.target/aarch64/sve/acle/asm/mul_s64.c: Adjust expected outcome. > * gcc.target/aarch64/sve/acle/asm/mul_s8.c: Likewise. > * gcc.target/aarch64/sve/div_const_run.c: New test. > * gcc.target/aarch64/sve/mul_const_run.c: Likewise. > --- > .../aarch64/aarch64-sve-builtins-base.cc | 73 ++++++++++++++++--- > .../gcc.target/aarch64/sve/acle/asm/div_s32.c | 59 +++++++++++++++ > .../gcc.target/aarch64/sve/acle/asm/mul_s16.c | 5 +- > .../gcc.target/aarch64/sve/acle/asm/mul_s32.c | 48 +++++++++++- > .../gcc.target/aarch64/sve/acle/asm/mul_s64.c | 5 +- > .../gcc.target/aarch64/sve/acle/asm/mul_s8.c | 7 +- > .../gcc.target/aarch64/sve/div_const_run.c | 10 ++- > .../gcc.target/aarch64/sve/mul_const_run.c | 10 ++- > 8 files changed, 189 insertions(+), 28 deletions(-) > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > index e7eba20f07a..2312b124c29 100644 > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > @@ -768,6 +768,27 @@ public: > if (integer_zerop (op1) || integer_zerop (op2)) > return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs))); > > + /* If the divisor is all integer -1, fold to svneg. */ > + tree pg = gimple_call_arg (f.call, 0); > + if (!f.type_suffix (0).unsigned_p && integer_minus_onep (op2)) > + { > + function_instance instance ("svneg", functions::svneg, > + shapes::unary, MODE_none, > + f.type_suffix_ids, GROUP_none, f.pred); > + gcall *call = f.redirect_call (instance); > + unsigned offset_index = 0; > + if (f.pred == PRED_m) > + { > + offset_index = 1; > + gimple_call_set_arg (call, 0, op1); > + } > + else > + gimple_set_num_ops (call, 5); > + gimple_call_set_arg (call, offset_index, pg); > + gimple_call_set_arg (call, offset_index + 1, op1); > + return call; > + } > + > /* If the divisor is a uniform power of 2, fold to a shift > instruction. 
*/ > tree op2_cst = uniform_integer_cst_p (op2); > @@ -2033,12 +2054,37 @@ public: > if (integer_zerop (op1) || integer_zerop (op2)) > return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs))); > > + /* If one of the operands is all integer -1, fold to svneg. */ > + tree pg = gimple_call_arg (f.call, 0); > + tree negated_op = NULL; > + if (integer_minus_onep (op2)) > + negated_op = op1; > + else if (integer_minus_onep (op1)) > + negated_op = op2; > + if (!f.type_suffix (0).unsigned_p && negated_op) > + { > + function_instance instance ("svneg", functions::svneg, > + shapes::unary, MODE_none, > + f.type_suffix_ids, GROUP_none, f.pred); > + gcall *call = f.redirect_call (instance); > + unsigned offset_index = 0; > + if (f.pred == PRED_m) > + { > + offset_index = 1; > + gimple_call_set_arg (call, 0, op1); > + } > + else > + gimple_set_num_ops (call, 5); > + gimple_call_set_arg (call, offset_index, pg); > + gimple_call_set_arg (call, offset_index + 1, negated_op); > + return call; > + } > + > /* If one of the operands is a uniform power of 2, fold to a left shift > by immediate. */ > - tree pg = gimple_call_arg (f.call, 0); > tree op1_cst = uniform_integer_cst_p (op1); > tree op2_cst = uniform_integer_cst_p (op2); > - tree shift_op1, shift_op2; > + tree shift_op1, shift_op2 = NULL; > if (op1_cst && integer_pow2p (op1_cst) > && (f.pred != PRED_m > || is_ptrue (pg, f.type_suffix (0).element_bytes))) > @@ -2054,15 +2100,20 @@ public: > else > return NULL; > > - shift_op2 = wide_int_to_tree (unsigned_type_for (TREE_TYPE (shift_op2)), > - tree_log2 (shift_op2)); > - function_instance instance ("svlsl", functions::svlsl, > - shapes::binary_uint_opt_n, MODE_n, > - f.type_suffix_ids, GROUP_none, f.pred); > - gcall *call = f.redirect_call (instance); > - gimple_call_set_arg (call, 1, shift_op1); > - gimple_call_set_arg (call, 2, shift_op2); > - return call; > + if (shift_op2) > + { > + shift_op2 = wide_int_to_tree (unsigned_type_for (TREE_TYPE (shift_op2)), > + tree_log2 (shift_op2)); > + function_instance instance ("svlsl", functions::svlsl, > + shapes::binary_uint_opt_n, MODE_n, > + f.type_suffix_ids, GROUP_none, f.pred); > + gcall *call = f.redirect_call (instance); > + gimple_call_set_arg (call, 1, shift_op1); > + gimple_call_set_arg (call, 2, shift_op2); > + return call; > + } > + > + return NULL; > } > }; > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c > b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c > index 521f8bb4758..2c836db777e 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c > @@ -55,6 +55,15 @@ TEST_UNIFORM_ZX (div_w0_s32_m_untied, svint32_t, int32_t, > z0 = svdiv_n_s32_m (p0, z1, x0), > z0 = svdiv_m (p0, z1, x0)) > > +/* > +** div_m1_s32_m_tied1: > +** neg z0\.s, p0/m, z0\.s > +** ret > +*/ > +TEST_UNIFORM_Z (div_m1_s32_m_tied1, svint32_t, > + z0 = svdiv_n_s32_m (p0, z0, -1), > + z0 = svdiv_m (p0, z0, -1)) > + > /* > ** div_1_s32_m_tied1: > ** ret > @@ -63,6 +72,16 @@ TEST_UNIFORM_Z (div_1_s32_m_tied1, svint32_t, > z0 = svdiv_n_s32_m (p0, z0, 1), > z0 = svdiv_m (p0, z0, 1)) > > +/* > +** div_m1_s32_m_untied: > +** movprfx z0, z1 > +** neg z0\.s, p0/m, z1\.s > +** ret > +*/ > +TEST_UNIFORM_Z (div_m1_s32_m_untied, svint32_t, > + z0 = svdiv_n_s32_m (p0, z1, -1), > + z0 = svdiv_m (p0, z1, -1)) > + > /* > ** div_1_s32_m_untied: > ** mov z0\.d, z1\.d > @@ -214,6 +233,17 @@ TEST_UNIFORM_ZX (div_w0_s32_z_untied, svint32_t, int32_t, > z0 = svdiv_n_s32_z (p0, z1, x0), > 
z0 = svdiv_z (p0, z1, x0)) > > +/* > +** div_m1_s32_z_tied1: > +** mov (z[0-9]+)\.d, z0\.d > +** movprfx z0\.s, p0/z, \1\.s > +** neg z0\.s, p0/m, \1\.s > +** ret > +*/ > +TEST_UNIFORM_Z (div_m1_s32_z_tied1, svint32_t, > + z0 = svdiv_n_s32_z (p0, z0, -1), > + z0 = svdiv_z (p0, z0, -1)) > + > /* > ** div_1_s32_z_tied1: > ** mov (z[0-9]+)\.b, #0 > @@ -224,6 +254,16 @@ TEST_UNIFORM_Z (div_1_s32_z_tied1, svint32_t, > z0 = svdiv_n_s32_z (p0, z0, 1), > z0 = svdiv_z (p0, z0, 1)) > > +/* > +** div_m1_s32_z_untied: > +** movprfx z0\.s, p0/z, z1\.s > +** neg z0\.s, p0/m, z1\.s > +** ret > +*/ > +TEST_UNIFORM_Z (div_m1_s32_z_untied, svint32_t, > + z0 = svdiv_n_s32_z (p0, z1, -1), > + z0 = svdiv_z (p0, z1, -1)) > + > /* > ** div_1_s32_z_untied: > ** mov (z[0-9]+)\.b, #0 > @@ -381,6 +421,15 @@ TEST_UNIFORM_ZX (div_w0_s32_x_untied, svint32_t, int32_t, > z0 = svdiv_n_s32_x (p0, z1, x0), > z0 = svdiv_x (p0, z1, x0)) > > +/* > +** div_m1_s32_x_tied1: > +** neg z0\.s, p0/m, z0\.s > +** ret > +*/ > +TEST_UNIFORM_Z (div_m1_s32_x_tied1, svint32_t, > + z0 = svdiv_n_s32_x (p0, z0, -1), > + z0 = svdiv_x (p0, z0, -1)) > + > /* > ** div_1_s32_x_tied1: > ** ret > @@ -389,6 +438,16 @@ TEST_UNIFORM_Z (div_1_s32_x_tied1, svint32_t, > z0 = svdiv_n_s32_x (p0, z0, 1), > z0 = svdiv_x (p0, z0, 1)) > > +/* > +** div_m1_s32_x_untied: > +** movprfx z0, z1 > +** neg z0\.s, p0/m, z1\.s > +** ret > +*/ > +TEST_UNIFORM_Z (div_m1_s32_x_untied, svint32_t, > + z0 = svdiv_n_s32_x (p0, z1, -1), > + z0 = svdiv_x (p0, z1, -1)) > + > /* > ** div_1_s32_x_untied: > ** mov z0\.d, z1\.d > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c > b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c > index 381f3356025..2d780966e78 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c > @@ -183,8 +183,7 @@ TEST_UNIFORM_Z (mul_3_s16_m_untied, svint16_t, > > /* > ** mul_m1_s16_m: > -** mov (z[0-9]+)\.b, #-1 > -** mul z0\.h, p0/m, z0\.h, \1\.h > +** neg z0\.h, p0/m, z0\.h > ** ret > */ > TEST_UNIFORM_Z (mul_m1_s16_m, svint16_t, > @@ -597,7 +596,7 @@ TEST_UNIFORM_Z (mul_255_s16_x, svint16_t, > > /* > ** mul_m1_s16_x: > -** mul z0\.h, z0\.h, #-1 > +** neg z0\.h, p0/m, z0\.h > ** ret > */ > TEST_UNIFORM_Z (mul_m1_s16_x, svint16_t, > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c > b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c > index 13009d88619..1d605dbdd8d 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c > @@ -183,14 +183,25 @@ TEST_UNIFORM_Z (mul_3_s32_m_untied, svint32_t, > > /* > ** mul_m1_s32_m: > -** mov (z[0-9]+)\.b, #-1 > -** mul z0\.s, p0/m, z0\.s, \1\.s > +** neg z0\.s, p0/m, z0\.s > ** ret > */ > TEST_UNIFORM_Z (mul_m1_s32_m, svint32_t, > z0 = svmul_n_s32_m (p0, z0, -1), > z0 = svmul_m (p0, z0, -1)) > > +/* > +** mul_m1r_s32_m: > +** mov (z[0-9]+)\.b, #-1 > +** mov (z[0-9]+)\.d, z0\.d > +** movprfx z0, \1 > +** neg z0\.s, p0/m, \2\.s > +** ret > +*/ > +TEST_UNIFORM_Z (mul_m1r_s32_m, svint32_t, > + z0 = svmul_s32_m (p0, svdup_s32 (-1), z0), > + z0 = svmul_m (p0, svdup_s32 (-1), z0)) > + > /* > ** mul_s32_z_tied1: > ** movprfx z0\.s, p0/z, z0\.s > @@ -597,13 +608,44 @@ TEST_UNIFORM_Z (mul_255_s32_x, svint32_t, > > /* > ** mul_m1_s32_x: > -** mul z0\.s, z0\.s, #-1 > +** neg z0\.s, p0/m, z0\.s > ** ret > */ > TEST_UNIFORM_Z (mul_m1_s32_x, svint32_t, > z0 = svmul_n_s32_x (p0, z0, -1), > z0 = svmul_x (p0, z0, -1)) > > +/* > +** 
mul_m1r_s32_x: > +** neg z0\.s, p0/m, z0\.s > +** ret > +*/ > +TEST_UNIFORM_Z (mul_m1r_s32_x, svint32_t, > + z0 = svmul_s32_x (p0, svdup_s32 (-1), z0), > + z0 = svmul_x (p0, svdup_s32 (-1), z0)) > + > +/* > +** mul_m1_s32_z: > +** mov (z[0-9]+)\.d, z0\.d > +** movprfx z0\.s, p0/z, \1\.s > +** neg z0\.s, p0/m, \1\.s > +** ret > +*/ > +TEST_UNIFORM_Z (mul_m1_s32_z, svint32_t, > + z0 = svmul_n_s32_z (p0, z0, -1), > + z0 = svmul_z (p0, z0, -1)) > + > +/* > +** mul_m1r_s32_z: > +** mov (z[0-9]+)\.d, z0\.d > +** movprfx z0\.s, p0/z, \1\.s > +** neg z0\.s, p0/m, \1\.s > +** ret > +*/ > +TEST_UNIFORM_Z (mul_m1r_s32_z, svint32_t, > + z0 = svmul_s32_z (p0, svdup_s32 (-1), z0), > + z0 = svmul_z (p0, svdup_s32 (-1), z0)) > + > /* > ** mul_m127_s32_x: > ** mul z0\.s, z0\.s, #-127 > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c > b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c > index 530d9fc84a5..c05d184f2fe 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c > @@ -192,8 +192,7 @@ TEST_UNIFORM_Z (mul_3_s64_m_untied, svint64_t, > > /* > ** mul_m1_s64_m: > -** mov (z[0-9]+)\.b, #-1 > -** mul z0\.d, p0/m, z0\.d, \1\.d > +** neg z0\.d, p0/m, z0\.d > ** ret > */ > TEST_UNIFORM_Z (mul_m1_s64_m, svint64_t, > @@ -625,7 +624,7 @@ TEST_UNIFORM_Z (mul_255_s64_x, svint64_t, > > /* > ** mul_m1_s64_x: > -** mul z0\.d, z0\.d, #-1 > +** neg z0\.d, p0/m, z0\.d > ** ret > */ > TEST_UNIFORM_Z (mul_m1_s64_x, svint64_t, > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c > b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c > index 0c90a8bb832..efc952e3a52 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c > @@ -183,8 +183,7 @@ TEST_UNIFORM_Z (mul_3_s8_m_untied, svint8_t, > > /* > ** mul_m1_s8_m: > -** mov (z[0-9]+)\.b, #-1 > -** mul z0\.b, p0/m, z0\.b, \1\.b > +** neg z0\.b, p0/m, z0\.b > ** ret > */ > TEST_UNIFORM_Z (mul_m1_s8_m, svint8_t, > @@ -587,7 +586,7 @@ TEST_UNIFORM_Z (mul_128_s8_x, svint8_t, > > /* > ** mul_255_s8_x: > -** mul z0\.b, z0\.b, #-1 > +** neg z0\.b, p0/m, z0\.b > ** ret > */ > TEST_UNIFORM_Z (mul_255_s8_x, svint8_t, > @@ -596,7 +595,7 @@ TEST_UNIFORM_Z (mul_255_s8_x, svint8_t, > > /* > ** mul_m1_s8_x: > -** mul z0\.b, z0\.b, #-1 > +** neg z0\.b, p0/m, z0\.b > ** ret > */ > TEST_UNIFORM_Z (mul_m1_s8_x, svint8_t, > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c > index c96bb2763dc..60cf8345d6a 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c > @@ -42,7 +42,9 @@ typedef svuint64_t svuint64_ > __attribute__((arm_sve_vector_bits(128))); > TEST_TYPES_1 (uint64, u64) > > #define TEST_VALUES_S_1(B, OP1, OP2) \ > - F (int##B, s##B, x, OP1, OP2) > + F (int##B, s##B, x, OP1, OP2) \ > + F (int##B, s##B, z, OP1, OP2) \ > + F (int##B, s##B, m, OP1, OP2) > > #define TEST_VALUES_S \ > TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN) \ > @@ -60,7 +62,11 @@ typedef svuint64_t svuint64_ > __attribute__((arm_sve_vector_bits(128))); > TEST_VALUES_S_1 (32, INT32_MAX, -5) \ > TEST_VALUES_S_1 (64, INT64_MAX, -5) \ > TEST_VALUES_S_1 (32, INT32_MIN, -4) \ > - TEST_VALUES_S_1 (64, INT64_MIN, -4) > + TEST_VALUES_S_1 (64, INT64_MIN, -4) \ > + TEST_VALUES_S_1 (32, INT32_MAX, -1) \ > + TEST_VALUES_S_1 (32, -7, -1) \ > + TEST_VALUES_S_1 (64, INT64_MIN, -1) \ > + 
TEST_VALUES_S_1 (64, 16, -1) > > #define TEST_VALUES_U_1(B, OP1, OP2) \ > F (uint##B, u##B, x, OP1, OP2) > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c > index c369d5be167..eb897d622fc 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c > @@ -44,7 +44,9 @@ typedef svuint64_t svuint64_ > __attribute__((arm_sve_vector_bits(128))); > TEST_TYPES_1 (uint64, u64) > > #define TEST_VALUES_S_1(B, OP1, OP2) \ > - F (int##B, s##B, x, OP1, OP2) > + F (int##B, s##B, x, OP1, OP2) \ > + F (int##B, s##B, m, OP1, OP2) \ > + F (int##B, s##B, z, OP1, OP2) > > #define TEST_VALUES_S \ > TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN) \ > @@ -70,7 +72,11 @@ typedef svuint64_t svuint64_ > __attribute__((arm_sve_vector_bits(128))); > TEST_VALUES_S_1 (32, INT32_MAX, -5) \ > TEST_VALUES_S_1 (64, INT64_MAX, -5) \ > TEST_VALUES_S_1 (32, INT32_MIN, -4) \ > - TEST_VALUES_S_1 (64, INT64_MIN, -4) > + TEST_VALUES_S_1 (64, INT64_MIN, -4) \ > + TEST_VALUES_S_1 (32, INT32_MAX, -1) \ > + TEST_VALUES_S_1 (32, -7, -1) \ > + TEST_VALUES_S_1 (64, INT64_MIN, -1) \ > + TEST_VALUES_S_1 (64, 16, -1) > > #define TEST_VALUES_U_1(B, OP1, OP2) \ > F (uint##B, u##B, x, OP1, OP2) > -- > 2.44.0
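For reference (not part of the quoted patch), a minimal standalone C illustration of the
user-visible effect; it simply restates what the new div_s32.c and mul_s32.c _x-predication
assembly tests above check, and the function names here are made up for the example:

/* Compile with SVE enabled, e.g. -O2 -march=armv8.2-a+sve.
   With the fold in place, each call below is expected to compile to a
   single predicated negate (e.g. "neg z0.s, p0/m, z0.s") instead of an
   sdiv/mul by -1.  */
#include <arm_sve.h>

svint32_t
negate_via_div (svbool_t pg, svint32_t x)
{
  return svdiv_n_s32_x (pg, x, -1);
}

svint32_t
negate_via_mul (svbool_t pg, svint32_t x)
{
  return svmul_n_s32_x (pg, x, -1);
}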