Hi,

This rejects any loop in which a predicated instruction comes before the vctp
that generates the loop predicate.  Although this is not strictly a requirement
for the dlstp transformation, we have found cases where accepting such loops
can result in a wrong transformation, so it is safer to reject them.

OK for trunk?

gcc/ChangeLog:

        * config/arm/arm.cc (arm_mve_get_loop_vctp): Reject loops with a
        predicated instruction before the vctp.

gcc/testsuite/ChangeLog:

        * gcc.target/arm/mve/dlstp-invalid-asm.c (test10): Renamed to...
        (test10a): ... this.
        (test10b): New test.  Variation of test10a with a predicated load
        placed before the vctp inside the loop.
---
 gcc/config/arm/arm.cc                         | 21 ++++++++++++++-----
 .../gcc.target/arm/mve/dlstp-invalid-asm.c    | 20 +++++++++++++++++-
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 7292fddef80..29c0f478f36 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -34749,11 +34749,22 @@ arm_mve_get_loop_vctp (basic_block bb)
      instruction.  We require arm_get_required_vpr_reg_param to be false
      to make sure we pick up a VCTP, rather than a VCTP_M.  */
   FOR_BB_INSNS (bb, insn)
-    if (NONDEBUG_INSN_P (insn))
-      if (arm_get_required_vpr_reg_ret_val (insn)
-	  && (arm_mve_get_vctp_lanes (insn) != 0)
-	  && !arm_get_required_vpr_reg_param (insn))
-	return insn;
+    {
+      if (!NONDEBUG_INSN_P (insn))
+	continue;
+      /* If we encounter a predicated instruction before the VCTP then we
+	 cannot dlstp transform this loop because we would be imposing extra
+	 predication on that instruction which was not present in the original
+	 code.  */
+      if (arm_get_required_vpr_reg_param (insn))
+	return NULL;
+      if (arm_get_required_vpr_reg_ret_val (insn))
+	{
+	  if (arm_mve_get_vctp_lanes (insn) != 0)
+	    return insn;
+	  return NULL;
+	}
+    }
   return NULL;
 }
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
index 26df2d30523..f26754cc482 100644
--- a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
@@ -128,7 +128,7 @@ void test9 (int32_t *a, int32_t *b, int32_t *c, int n)
 }
 
 /* Using a VPR that gets re-generated within the loop.  */
-void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
+void test10a (int32_t *a, int32_t *b, int32_t *c, int n)
 {
   mve_pred16_t p = vctp32q (n);
   while (n > 0)
@@ -145,6 +145,24 @@ void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
     }
 }
 
+/* Like test10a, but with a predicated load before the VPR is re-generated.  */
+void test10b (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+  mve_pred16_t p = vctp32q (n-4);
+  while (n > 0)
+    {
+      int32x4_t va = vldrwq_z_s32 (a, p);
+      p = vctp32q (n);
+      int32x4_t vb = vldrwq_z_s32 (b, p);
+      int32x4_t vc = vaddq_x_s32 (va, vb, p);
+      vstrwq_p_s32 (c, vc, p);
+      c += 4;
+      a += 4;
+      b += 4;
+      n -= 4;
+    }
+}
+
 /* Using vctp32q_m instead of vctp32q.  */
 void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0)
 {

Reply via email to