Jennifer Schmitz <jschm...@nvidia.com> writes:
> Because a neg instruction has lower latency and higher throughput than
> sdiv and mul, svdiv and svmul by -1 can be folded to svneg. For svdiv,
> this is already implemented on the RTL level; for svmul, the
> optimization was still missing.
> This patch implements folding to svneg for both operations using the
> gimple_folder. For svdiv, the transform is applied if the divisor is -1.
> Svmul is folded if either of the operands is -1. A case distinction of
> the predication is made to account for the fact that svneg_m has 3 arguments
> (argument 0 holds the values for the inactive lanes), while svneg_x and
> svneg_z have only 2 arguments.
> Tests were added or adjusted to check the produced assembly and runtime
> tests were added to check correctness.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
Sorry for the slow review.

> [...]
> @@ -2033,12 +2054,37 @@ public:
>      if (integer_zerop (op1) || integer_zerop (op2))
>        return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs)));
>  
> +    /* If one of the operands is all integer -1, fold to svneg.  */
> +    tree pg = gimple_call_arg (f.call, 0);
> +    tree negated_op = NULL;
> +    if (integer_minus_onep (op2))
> +      negated_op = op1;
> +    else if (integer_minus_onep (op1))
> +      negated_op = op2;
> +    if (!f.type_suffix (0).unsigned_p && negated_op)

This is definitely ok, but it would be nice to handle the unsigned_p
case too at some point.  This would mean bit-casting to the equivalent
signed type, doing the negation, and casting back.  It would be good
to have a helper for doing that (maybe with a lambda callback that
emits the actual call) since I can imagine it will be needed elsewhere
too.

> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
> index 13009d88619..1d605dbdd8d 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
> @@ -183,14 +183,25 @@ TEST_UNIFORM_Z (mul_3_s32_m_untied, svint32_t,
>  
>  /*
>  ** mul_m1_s32_m:
> -**	mov	(z[0-9]+)\.b, #-1
> -**	mul	z0\.s, p0/m, z0\.s, \1\.s
> +**	neg	z0\.s, p0/m, z0\.s
>  **	ret
>  */
>  TEST_UNIFORM_Z (mul_m1_s32_m, svint32_t,
>  		z0 = svmul_n_s32_m (p0, z0, -1),
>  		z0 = svmul_m (p0, z0, -1))
>  
> +/*
> +** mul_m1r_s32_m:
> +**	mov	(z[0-9]+)\.b, #-1
> +**	mov	(z[0-9]+)\.d, z0\.d
> +**	movprfx	z0, \1
> +**	neg	z0\.s, p0/m, \2\.s
> +**	ret
> +*/
> +TEST_UNIFORM_Z (mul_m1r_s32_m, svint32_t,
> +		z0 = svmul_s32_m (p0, svdup_s32 (-1), z0),
> +		z0 = svmul_m (p0, svdup_s32 (-1), z0))

Maybe it would be better to test the untied case instead, by passing z1
rather than z0 as the final argument.  Hopefully that would leave us
with just the first and last instructions.  (I think the existing tests
already cover the awkward tied2 case well enough.)

Same for the later similar tests.

OK with that change, thanks.
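To be concrete about the untied suggestion, I mean something along
these lines (the mul_m1r_s32_m_untied name is just a suggestion, and
the expected asm pattern would need to be filled in from what the
compiler actually emits):

  TEST_UNIFORM_Z (mul_m1r_s32_m_untied, svint32_t,
		  z0 = svmul_s32_m (p0, svdup_s32 (-1), z1),
		  z0 = svmul_m (p0, svdup_s32 (-1), z1))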
Richard

> +
>  /*
>  ** mul_s32_z_tied1:
>  **	movprfx	z0\.s, p0/z, z0\.s
> @@ -597,13 +608,44 @@ TEST_UNIFORM_Z (mul_255_s32_x, svint32_t,
>  
>  /*
>  ** mul_m1_s32_x:
> -**	mul	z0\.s, z0\.s, #-1
> +**	neg	z0\.s, p0/m, z0\.s
>  **	ret
>  */
>  TEST_UNIFORM_Z (mul_m1_s32_x, svint32_t,
>  		z0 = svmul_n_s32_x (p0, z0, -1),
>  		z0 = svmul_x (p0, z0, -1))
>  
> +/*
> +** mul_m1r_s32_x:
> +**	neg	z0\.s, p0/m, z0\.s
> +**	ret
> +*/
> +TEST_UNIFORM_Z (mul_m1r_s32_x, svint32_t,
> +		z0 = svmul_s32_x (p0, svdup_s32 (-1), z0),
> +		z0 = svmul_x (p0, svdup_s32 (-1), z0))
> +
> +/*
> +** mul_m1_s32_z:
> +**	mov	(z[0-9]+)\.d, z0\.d
> +**	movprfx	z0\.s, p0/z, \1\.s
> +**	neg	z0\.s, p0/m, \1\.s
> +**	ret
> +*/
> +TEST_UNIFORM_Z (mul_m1_s32_z, svint32_t,
> +		z0 = svmul_n_s32_z (p0, z0, -1),
> +		z0 = svmul_z (p0, z0, -1))
> +
> +/*
> +** mul_m1r_s32_z:
> +**	mov	(z[0-9]+)\.d, z0\.d
> +**	movprfx	z0\.s, p0/z, \1\.s
> +**	neg	z0\.s, p0/m, \1\.s
> +**	ret
> +*/
> +TEST_UNIFORM_Z (mul_m1r_s32_z, svint32_t,
> +		z0 = svmul_s32_z (p0, svdup_s32 (-1), z0),
> +		z0 = svmul_z (p0, svdup_s32 (-1), z0))
> +
>  /*
>  ** mul_m127_s32_x:
>  **	mul	z0\.s, z0\.s, #-127
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
> index 530d9fc84a5..c05d184f2fe 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
> @@ -192,8 +192,7 @@ TEST_UNIFORM_Z (mul_3_s64_m_untied, svint64_t,
>  
>  /*
>  ** mul_m1_s64_m:
> -**	mov	(z[0-9]+)\.b, #-1
> -**	mul	z0\.d, p0/m, z0\.d, \1\.d
> +**	neg	z0\.d, p0/m, z0\.d
>  **	ret
>  */
>  TEST_UNIFORM_Z (mul_m1_s64_m, svint64_t,
> @@ -625,7 +624,7 @@ TEST_UNIFORM_Z (mul_255_s64_x, svint64_t,
>  
>  /*
>  ** mul_m1_s64_x:
> -**	mul	z0\.d, z0\.d, #-1
> +**	neg	z0\.d, p0/m, z0\.d
>  **	ret
>  */
>  TEST_UNIFORM_Z (mul_m1_s64_x, svint64_t,
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
> index 0c90a8bb832..efc952e3a52 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
> @@ -183,8 +183,7 @@ TEST_UNIFORM_Z (mul_3_s8_m_untied, svint8_t,
>  
>  /*
>  ** mul_m1_s8_m:
> -**	mov	(z[0-9]+)\.b, #-1
> -**	mul	z0\.b, p0/m, z0\.b, \1\.b
> +**	neg	z0\.b, p0/m, z0\.b
>  **	ret
>  */
>  TEST_UNIFORM_Z (mul_m1_s8_m, svint8_t,
> @@ -587,7 +586,7 @@ TEST_UNIFORM_Z (mul_128_s8_x, svint8_t,
>  
>  /*
>  ** mul_255_s8_x:
> -**	mul	z0\.b, z0\.b, #-1
> +**	neg	z0\.b, p0/m, z0\.b
>  **	ret
>  */
>  TEST_UNIFORM_Z (mul_255_s8_x, svint8_t,
> @@ -596,7 +595,7 @@ TEST_UNIFORM_Z (mul_255_s8_x, svint8_t,
>  
>  /*
>  ** mul_m1_s8_x:
> -**	mul	z0\.b, z0\.b, #-1
> +**	neg	z0\.b, p0/m, z0\.b
>  **	ret
>  */
>  TEST_UNIFORM_Z (mul_m1_s8_x, svint8_t,
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
> index c96bb2763dc..60cf8345d6a 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
> @@ -42,7 +42,9 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>  TEST_TYPES_1 (uint64, u64)
>  
>  #define TEST_VALUES_S_1(B, OP1, OP2)	\
> -  F (int##B, s##B, x, OP1, OP2)
> +  F (int##B, s##B, x, OP1, OP2)	\
> +  F (int##B, s##B, z, OP1, OP2)	\
> +  F (int##B, s##B, m, OP1, OP2)
>  
>  #define TEST_VALUES_S	\
>    TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN)	\
> @@ -60,7 +62,11 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>    TEST_VALUES_S_1 (32, INT32_MAX, -5)	\
>    TEST_VALUES_S_1 (64, INT64_MAX, -5)	\
>    TEST_VALUES_S_1 (32, INT32_MIN, -4)	\
> -  TEST_VALUES_S_1 (64, INT64_MIN, -4)
> +  TEST_VALUES_S_1 (64, INT64_MIN, -4)	\
> +  TEST_VALUES_S_1 (32, INT32_MAX, -1)	\
> +  TEST_VALUES_S_1 (32, -7, -1)	\
> +  TEST_VALUES_S_1 (64, INT64_MIN, -1)	\
> +  TEST_VALUES_S_1 (64, 16, -1)
>  
>  #define TEST_VALUES_U_1(B, OP1, OP2)	\
>    F (uint##B, u##B, x, OP1, OP2)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
> index c369d5be167..eb897d622fc 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
> @@ -44,7 +44,9 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>  TEST_TYPES_1 (uint64, u64)
>  
>  #define TEST_VALUES_S_1(B, OP1, OP2)	\
> -  F (int##B, s##B, x, OP1, OP2)
> +  F (int##B, s##B, x, OP1, OP2)	\
> +  F (int##B, s##B, m, OP1, OP2)	\
> +  F (int##B, s##B, z, OP1, OP2)
>  
>  #define TEST_VALUES_S	\
>    TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN)	\
> @@ -70,7 +72,11 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
>    TEST_VALUES_S_1 (32, INT32_MAX, -5)	\
>    TEST_VALUES_S_1 (64, INT64_MAX, -5)	\
>    TEST_VALUES_S_1 (32, INT32_MIN, -4)	\
> -  TEST_VALUES_S_1 (64, INT64_MIN, -4)
> +  TEST_VALUES_S_1 (64, INT64_MIN, -4)	\
> +  TEST_VALUES_S_1 (32, INT32_MAX, -1)	\
> +  TEST_VALUES_S_1 (32, -7, -1)	\
> +  TEST_VALUES_S_1 (64, INT64_MIN, -1)	\
> +  TEST_VALUES_S_1 (64, 16, -1)
>  
>  #define TEST_VALUES_U_1(B, OP1, OP2)	\
>    F (uint##B, u##B, x, OP1, OP2)