Hi, This patch rejects any loop in which a predicated instruction comes before the vctp that generates the loop predicate. Even though this is not a requirement for the dlstp transformation, we have found potential issues where you can end up with a wrong transformation, so it is safer to reject such loops.
OK for trunk? gcc/ChangeLog: * config/arm/arm.cc (arm_mve_get_loop_vctp): Reject loops with a predicated instruction before the vctp. gcc/testsuite/ChangeLog: * gcc.target/arm/mve/dlstp-invalid-asm.c (test10): Renamed to... (test10a): ... this. (test10b): Variation of test10a with a small change to trigger an issue. --- gcc/config/arm/arm.cc | 21 ++++++++++++++----- .../gcc.target/arm/mve/dlstp-invalid-asm.c | 20 +++++++++++++++++- 2 files changed, 35 insertions(+), 6 deletions(-)
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 7292fddef80..29c0f478f36 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -34749,11 +34749,22 @@ arm_mve_get_loop_vctp (basic_block bb) instruction. We require arm_get_required_vpr_reg_param to be false to make sure we pick up a VCTP, rather than a VCTP_M. */ FOR_BB_INSNS (bb, insn) - if (NONDEBUG_INSN_P (insn)) - if (arm_get_required_vpr_reg_ret_val (insn) - && (arm_mve_get_vctp_lanes (insn) != 0) - && !arm_get_required_vpr_reg_param (insn)) - return insn; + { + if (!NONDEBUG_INSN_P (insn)) + continue; + /* If we encounter a predicated instruction before the VCTP then we can + not dlstp transform this loop because we would be imposing extra + predication on that instruction which was not present in the original + code. */ + if (arm_get_required_vpr_reg_param (insn)) + return NULL; + if (arm_get_required_vpr_reg_ret_val (insn)) + { + if (arm_mve_get_vctp_lanes (insn) != 0) + return insn; + return NULL; + } + } return NULL; } diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c index 26df2d30523..f26754cc482 100644 --- a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c @@ -128,7 +128,7 @@ void test9 (int32_t *a, int32_t *b, int32_t *c, int n) } /* Using a VPR that gets re-generated within the loop. */ -void test10 (int32_t *a, int32_t *b, int32_t *c, int n) +void test10a (int32_t *a, int32_t *b, int32_t *c, int n) { mve_pred16_t p = vctp32q (n); while (n > 0) @@ -145,6 +145,24 @@ void test10 (int32_t *a, int32_t *b, int32_t *c, int n) } } +/* Using a VPR that gets re-generated within the loop. 
*/ +void test10b (int32_t *a, int32_t *b, int32_t *c, int n) +{ + mve_pred16_t p = vctp32q (n-4); + while (n > 0) + { + int32x4_t va = vldrwq_z_s32 (a, p); + p = vctp32q (n); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + /* Using vctp32q_m instead of vctp32q. */ void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0) {