https://gcc.gnu.org/g:f42fd8e9335354f986d69b92ab66be07cc31bc7a
commit r15-5798-gf42fd8e9335354f986d69b92ab66be07cc31bc7a Author: Andre Vieira <andre.simoesdiasvie...@arm.com> Date: Fri Nov 29 10:18:57 2024 +0000 arm, mve: Detect uses of vctp_vpr_generated inside subregs Address a problem we were having where we were missing on detecting uses of vctp_vpr_generated in the analysis for 'arm_attempt_dlstp_transform' because the use was inside a SUBREG and rtx_equal_p does not catch that. Using reg_overlap_mentioned_p is much more robust. gcc/ChangeLog: PR target/117814 * config/arm/arm.cc (arm_attempt_dlstp_transform): Use reg_overlap_mentioned_p instead of rtx_equal_p to detect uses of vctp_vpr_generated inside subregs. gcc/testsuite/ChangeLog: PR target/117814 * gcc.target/arm/mve/dlstp-invalid-asm.c (test10): Renamed to... (test10a): ... this. (test10b): Variation of test10a with a small change to trigger wrong codegen. Diff: --- gcc/config/arm/arm.cc | 3 +- .../gcc.target/arm/mve/dlstp-invalid-asm.c | 37 ++++++++++++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 7292fddef809..7f82fb94a56a 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -35847,7 +35847,8 @@ arm_attempt_dlstp_transform (rtx label) df_ref insn_uses = NULL; FOR_EACH_INSN_USE (insn_uses, insn) { - if (rtx_equal_p (vctp_vpr_generated, DF_REF_REG (insn_uses))) + if (reg_overlap_mentioned_p (vctp_vpr_generated, + DF_REF_REG (insn_uses))) { end_sequence (); return 1; diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c index 26df2d30523c..eb0782ebd0de 100644 --- a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c +++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c @@ -127,8 +127,15 @@ void test9 (int32_t *a, int32_t *b, int32_t *c, int n) } } -/* Using a VPR that gets re-generated within the loop. 
*/ -void test10 (int32_t *a, int32_t *b, int32_t *c, int n) +/* Using a VPR that gets re-generated within the loop. Even though we + currently reject such loops, it would be possible to dlstp transform this + specific loop, as long as we make sure that the first vldrwq_z mask would + either: + * remain the same as its mask in the first iteration, + * become the same as the loop mask after the first iteration, + * become all ones, since the dlstp would then mask it the same as the loop + mask. */ +void test10a (int32_t *a, int32_t *b, int32_t *c, int n) { mve_pred16_t p = vctp32q (n); while (n > 0) @@ -145,6 +152,32 @@ void test10 (int32_t *a, int32_t *b, int32_t *c, int n) } } +/* Using a VPR that gets re-generated within the loop, the difference between + this test and test10a is to make sure the two vctp calls are never the same, + this leads to slightly different codegen in some cases triggering the issue + in a different way. This loop too would be OK to dlstp transform as long + as we made sure that the first vldrwq_z mask would either: + * remain the same as its mask in the first iteration, + * become the same as the loop mask after the first iteration, + * become all ones, since the dlstp would then mask it the same as the loop + mask. */ +void test10b (int32_t *a, int32_t *b, int32_t *c, int n) +{ + mve_pred16_t p = vctp32q (n-4); + while (n > 0) + { + int32x4_t va = vldrwq_z_s32 (a, p); + p = vctp32q (n); + int32x4_t vb = vldrwq_z_s32 (b, p); + int32x4_t vc = vaddq_x_s32 (va, vb, p); + vstrwq_p_s32 (c, vc, p); + c += 4; + a += 4; + b += 4; + n -= 4; + } +} + /* Using vctp32q_m instead of vctp32q. */ void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0) {