https://gcc.gnu.org/g:f42fd8e9335354f986d69b92ab66be07cc31bc7a

commit r15-5798-gf42fd8e9335354f986d69b92ab66be07cc31bc7a
Author: Andre Vieira <andre.simoesdiasvie...@arm.com>
Date:   Fri Nov 29 10:18:57 2024 +0000

    arm, mve: Detect uses of vctp_vpr_generated inside subregs
    
    Address a problem where we failed to detect uses of vctp_vpr_generated in
    the analysis for 'arm_attempt_dlstp_transform' because the use was inside a
    SUBREG, which rtx_equal_p does not catch.  Using reg_overlap_mentioned_p
    is much more robust.
    
    gcc/ChangeLog:
    
            PR target/117814
            * config/arm/arm.cc (arm_attempt_dlstp_transform): Use
            reg_overlap_mentioned_p instead of rtx_equal_p to detect uses of
            vctp_vpr_generated inside subregs.
    
    gcc/testsuite/ChangeLog:
    
            PR target/117814
            * gcc.target/arm/mve/dlstp-invalid-asm.c (test10): Renamed to...
            (test10a): ... this.
            (test10b): Variation of test10a with a small change to trigger wrong
            codegen.

Diff:
---
 gcc/config/arm/arm.cc                              |  3 +-
 .../gcc.target/arm/mve/dlstp-invalid-asm.c         | 37 ++++++++++++++++++++--
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 7292fddef809..7f82fb94a56a 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -35847,7 +35847,8 @@ arm_attempt_dlstp_transform (rtx label)
          df_ref insn_uses = NULL;
          FOR_EACH_INSN_USE (insn_uses, insn)
          {
-           if (rtx_equal_p (vctp_vpr_generated, DF_REF_REG (insn_uses)))
+           if (reg_overlap_mentioned_p (vctp_vpr_generated,
+                                        DF_REF_REG (insn_uses)))
              {
                end_sequence ();
                return 1;
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
index 26df2d30523c..eb0782ebd0de 100644
--- a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
@@ -127,8 +127,15 @@ void test9 (int32_t *a, int32_t *b, int32_t *c, int n)
     }
 }
 
-/* Using a VPR that gets re-generated within the loop.  */
-void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
+/* Using a VPR that gets re-generated within the loop.  Even though we
+   currently reject such loops, it would be possible to dlstp transform this
+   specific loop, as long as we make sure that the first vldrwq_z mask would
+   either:
+   * remain the same as its mask in the first iteration,
+   * become the same as the loop mask after the first iteration,
+   * become all ones, since the dlstp would then mask it the same as the loop
+   mask.  */
+void test10a (int32_t *a, int32_t *b, int32_t *c, int n)
 {
   mve_pred16_t p = vctp32q (n);
   while (n > 0)
@@ -145,6 +152,32 @@ void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
     }
 }
 
+/* Using a VPR that gets re-generated within the loop, the difference between
+   this test and test10a is to make sure the two vctp calls are never the same,
+   this leads to slightly different codegen in some cases triggering the issue
+   in a different way.   This loop too would be OK to dlstp transform as long
+   as we made sure that the first vldrwq_z mask would either:
+   * remain the same as its mask in the first iteration,
+   * become the same as the loop mask after the first iteration,
+   * become all ones, since the dlstp would then mask it the same as the loop
+   mask.  */
+void test10b (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+  mve_pred16_t p = vctp32q (n-4);
+  while (n > 0)
+    {
+      int32x4_t va = vldrwq_z_s32 (a, p);
+      p = vctp32q (n);
+      int32x4_t vb = vldrwq_z_s32 (b, p);
+      int32x4_t vc = vaddq_x_s32 (va, vb, p);
+      vstrwq_p_s32 (c, vc, p);
+      c += 4;
+      a += 4;
+      b += 4;
+      n -= 4;
+    }
+}
+
 /* Using vctp32q_m instead of vctp32q.  */
 void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0)
 {

Reply via email to