Hi All,

The attached testcase generates the following paradoxical subregs when creating
the predicates.

(insn 22 21 23 2 (set (reg:VNx8BI 100)
        (subreg:VNx8BI (reg:VNx2BI 103) 0))
     (expr_list:REG_EQUAL (const_vector:VNx8BI [
                (const_int 1 [0x1])
                (const_int 0 [0])
                (const_int 1 [0x1])
        (const_int 0 [0]) repeated x5
            ])
        (nil)))

and

(insn 15 14 16 2 (set (reg:VNx8BI 96)
        (subreg:VNx8BI (reg:VNx2BI 99) 0))
     (expr_list:REG_EQUAL (const_vector:VNx8BI [
                (const_int 1 [0x1])
                (const_int 0 [0]) repeated x7
            ])
        (nil)))

This causes CSE to incorrectly think that the two predicates are equal because
some of the significant bits get ignored due to the subreg.

The attached patch instead makes it so it always looks at all 16-bits of the
predicate, but in turn means we need to generate a TRN that matches the expected
result mode.  In effect in RTL we keep the mode as VNx16BI but during codegen
re-interpret them as the mode the predicate instruction wanted:

(insn 10 9 11 2 (set (reg:VNx8BI 96)
        (subreg:VNx8BI (reg:VNx16BI 99) 0))
     (expr_list:REG_EQUAL (const_vector:VNx8BI [
                (const_int 1 [0x1])
                (const_int 0 [0]) repeated x7
            ])
        (nil)))

Which needed correction to the TRN pattern.  A new TRN1_CONV unspec is
introduced which allows one to keep the arguments as VNx16BI but encode the
instruction as a type of the last operand.

(insn 9 8 10 2 (set (reg:VNx16BI 99)
        (unspec:VNx16BI [
                (reg:VNx16BI 97)
                (reg:VNx16BI 98)
                (reg:VNx2BI 100)
            ] UNSPEC_TRN1_CONV))
        (nil))

This allows us remove all the paradoxical subregs and end up with

(insn 16 15 17 2 (set (reg:VNx8BI 101)
        (subreg:VNx8BI (reg:VNx16BI 104) 0))
        (expr_list:REG_EQUAL (const_vector:VNx8BI [
                (const_int 1 [0x1])
                (const_int 0 [0])
                (const_int 1 [0x1])
                (const_int 0 [0]) repeated x5
            ])
        (nil)))

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master? and backport to GCC 10?

Thanks,
Tamar

gcc/ChangeLog:

        PR target/100048
        * config/aarch64/aarch64-sve.md (@aarch64_sve_trn1_conv<mode>): New.
        * config/aarch64/aarch64.c (aarch64_expand_sve_const_pred_trn): Use new
        TRN optab.
        * config/aarch64/iterators.md (UNSPEC_TRN1_CONV): New.

gcc/testsuite/ChangeLog:

        PR target/100048
        * gcc.target/aarch64/sve/pr100048.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 
7db2938bb84e04d066a7b07574e5cf344a3a8fb6..2cdc6338902216760622a39b14f0076994458c98
 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8657,6 +8657,22 @@ (define_insn "@aarch64_sve_<perm_insn><mode>"
   "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; Special purpose permute used by the predicate generation instructions.
+;; This version only accepts VNx16BI as input but can output as any predicate
+;; type and will reinterpet the input registers as the type in operand 3.
+(define_insn "@aarch64_sve_trn1_conv<mode>"
+  [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+       (unspec:VNx16BI [(match_operand:VNx16BI 1 "register_operand" "Upa")
+                        (match_operand:VNx16BI 2 "register_operand" "Upa")
+                        (clobber
+                         (match_operand:PRED_ALL 3 "register_operand" "=Upa"))
+                       ]
+                       UNSPEC_TRN1_CONV))]
+  "TARGET_SVE"
+  "trn1\t%0.<PRED_ALL:Vetype>, %1.<PRED_ALL:Vetype>, %2.<PRED_ALL:Vetype>"
+)
+
+
 ;; =========================================================================
 ;; == Conversions
 ;; =========================================================================
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
994fafc2dc857ca5c7f345e49b47cc7e7dcf5900..61337881bfd05dbf6e84ada6810b87fa36dc989d
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -5481,12 +5481,13 @@ aarch64_expand_sve_const_pred_trn (rtx target, 
rtx_vector_builder &builder,
        }
     }
 
-  /* Emit the TRN1 itself.  */
+  /* Emit the TRN1 itself.  We emit a TRN that will always take a
+     input registers as VNx16BI but re-interpret the results to
+     MODE.  */
   machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
-  target = aarch64_target_reg (target, mode);
-  emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
-                             gen_lowpart (mode, a),
-                             gen_lowpart (mode, b)));
+  target = aarch64_target_reg (target, GET_MODE (a));
+  rtx type_reg = gen_reg_rtx (mode);
+  emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
   return target;
 }
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 
5f5abd60525ba52fdb466e94a92ff4d011bee5cd..cac33ae812b382cd55611b0da8a6e9eac3a513c4
 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -649,6 +649,7 @@ (define_c_enum "unspec"
     UNSPEC_UZP2Q       ; Used in aarch64-sve.md.
     UNSPEC_ZIP1Q       ; Used in aarch64-sve.md.
     UNSPEC_ZIP2Q       ; Used in aarch64-sve.md.
+    UNSPEC_TRN1_CONV   ; Used in aarch64-sve.md.
     UNSPEC_COND_CMPEQ_WIDE ; Used in aarch64-sve.md.
     UNSPEC_COND_CMPGE_WIDE ; Used in aarch64-sve.md.
     UNSPEC_COND_CMPGT_WIDE ; Used in aarch64-sve.md.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c
new file mode 100644
index 
0000000000000000000000000000000000000000..525933863f7d67d76ba7afa4321346efa27ba000
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c
@@ -0,0 +1,25 @@
+/* { dg-additional-options "-O2 -fno-schedule-insns" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "arm_sve.h"
+
+/*
+** foo:
+**        ptrue   (p[0-7])\.d, all
+**        pfalse  (p[0-7])\.b
+**        ptrue   (p[0-7])\.s, all
+**        trn1    (p[0-7])\.d, \2\.d, \3\.d
+**        trn1    \2\.d, \1\.d, \3\.d
+**        faddv   (h[0-31]), \4\, (z[0-31]).h
+**        faddv   (h[0-31]), \2\, \6\.h
+**        str     \5, [x0]
+**        str     \7, [x0, 2]
+**        ret
+*/
+void foo(svfloat16_t in, float16_t *dst) {
+  const svbool_t pg_q0 = svdupq_n_b16(1, 0, 1, 0, 0, 0, 0, 0);
+  const svbool_t pg_f0 = svdupq_n_b16(1, 0, 0, 0, 0, 0, 0, 0);
+  dst[0] = svaddv_f16(pg_f0, in);
+  dst[1] = svaddv_f16(pg_q0, in);
+}
+


-- 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 7db2938bb84e04d066a7b07574e5cf344a3a8fb6..2cdc6338902216760622a39b14f0076994458c98 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8657,6 +8657,22 @@ (define_insn "@aarch64_sve_<perm_insn><mode>"
   "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; Special purpose permute used by the predicate generation instructions.
+;; This version only accepts VNx16BI as input but can output as any predicate
+;; type and will reinterpet the input registers as the type in operand 3.
+(define_insn "@aarch64_sve_trn1_conv<mode>"
+  [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+	(unspec:VNx16BI [(match_operand:VNx16BI 1 "register_operand" "Upa")
+			 (match_operand:VNx16BI 2 "register_operand" "Upa")
+			 (clobber
+			  (match_operand:PRED_ALL 3 "register_operand" "=Upa"))
+			]
+			UNSPEC_TRN1_CONV))]
+  "TARGET_SVE"
+  "trn1\t%0.<PRED_ALL:Vetype>, %1.<PRED_ALL:Vetype>, %2.<PRED_ALL:Vetype>"
+)
+
+
 ;; =========================================================================
 ;; == Conversions
 ;; =========================================================================
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 994fafc2dc857ca5c7f345e49b47cc7e7dcf5900..61337881bfd05dbf6e84ada6810b87fa36dc989d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -5481,12 +5481,13 @@ aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
 	}
     }
 
-  /* Emit the TRN1 itself.  */
+  /* Emit the TRN1 itself.  We emit a TRN that will always take a
+     input registers as VNx16BI but re-interpret the results to
+     MODE.  */
   machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
-  target = aarch64_target_reg (target, mode);
-  emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
-			      gen_lowpart (mode, a),
-			      gen_lowpart (mode, b)));
+  target = aarch64_target_reg (target, GET_MODE (a));
+  rtx type_reg = gen_reg_rtx (mode);
+  emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
   return target;
 }
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 5f5abd60525ba52fdb466e94a92ff4d011bee5cd..cac33ae812b382cd55611b0da8a6e9eac3a513c4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -649,6 +649,7 @@ (define_c_enum "unspec"
     UNSPEC_UZP2Q	; Used in aarch64-sve.md.
     UNSPEC_ZIP1Q	; Used in aarch64-sve.md.
     UNSPEC_ZIP2Q	; Used in aarch64-sve.md.
+    UNSPEC_TRN1_CONV	; Used in aarch64-sve.md.
     UNSPEC_COND_CMPEQ_WIDE ; Used in aarch64-sve.md.
     UNSPEC_COND_CMPGE_WIDE ; Used in aarch64-sve.md.
     UNSPEC_COND_CMPGT_WIDE ; Used in aarch64-sve.md.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c
new file mode 100644
index 0000000000000000000000000000000000000000..525933863f7d67d76ba7afa4321346efa27ba000
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c
@@ -0,0 +1,25 @@
+/* { dg-additional-options "-O2 -fno-schedule-insns" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "arm_sve.h"
+
+/*
+** foo:
+**        ptrue   (p[0-7])\.d, all
+**        pfalse  (p[0-7])\.b
+**        ptrue   (p[0-7])\.s, all
+**        trn1    (p[0-7])\.d, \2\.d, \3\.d
+**        trn1    \2\.d, \1\.d, \3\.d
+**        faddv   (h[0-31]), \4\, (z[0-31]).h
+**        faddv   (h[0-31]), \2\, \6\.h
+**        str     \5, [x0]
+**        str     \7, [x0, 2]
+**        ret
+*/
+void foo(svfloat16_t in, float16_t *dst) {
+  const svbool_t pg_q0 = svdupq_n_b16(1, 0, 1, 0, 0, 0, 0, 0);
+  const svbool_t pg_f0 = svdupq_n_b16(1, 0, 0, 0, 0, 0, 0, 0);
+  dst[0] = svaddv_f16(pg_f0, in);
+  dst[1] = svaddv_f16(pg_q0, in);
+}
+

Reply via email to