This patch extends our vec_cmp expander to support partial FP modes. We use a predicate mode that is narrower than the operation's VPRED to govern unpacked FP operations under flag_trapping_math, so the expansion must handle cases where the comparison's target and governing predicates have different modes.
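For example, consider a loop of the shape used by the new tests (an illustrative sketch only; the function name is arbitrary, and the mode names describe the case of 32-bit float inputs selecting 64-bit data):

  #include <stdint.h>

  /* With uint64_t outputs and float inputs, the floats are unpacked to
     one element per 64-bit container (VNx2SF), so the comparison's
     target predicate has mode VNx2BI.  Under the default
     -ftrapping-math, the governing predicate instead has the
     element-sized mode VNx4BI, with the bits for the undefined upper
     half of each container false, so that those halves cannot raise
     FP exceptions.  */
  void
  f (uint64_t *__restrict out, float *__restrict a, float *__restrict b)
  {
    for (unsigned int i = 0; i < 32; i++)
      out[i] = a[i] < b[i] ? 3 : out[i];
  }
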
While such predicates enable all of the defined part of the operation, they are not all-true. Their false bits contribute to the (trapping) behavior of the operation, so the operation itself should not have SVE_KNOWN_PTRUE. gcc/ChangeLog: * config/aarch64/aarch64-sve.md (vec_cmp<mode><vpred>): Extend to handle partial FP modes. (@aarch64_pred_fcm<cmp_op><mode>): Likewise. (@aarch64_pred_fcmuo<mode>): Likewise. * config/aarch64/aarch64-sve.md (*one_cmpl<mode>3): Rename to... (@aarch64_pred_one_cmpl<mode>_z): ... this. * config/aarch64/aarch64.cc (aarch64_emit_sve_fp_cond): Allow the target and governing predicates to have different modes. (aarch64_emit_sve_or_fp_conds): Likewise. (aarch64_emit_sve_invert_fp_cond): Likewise. (aarch64_expand_sve_vec_cmp_float): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/unpacked_fcm_1.c: New test. * gcc.target/aarch64/sve/unpacked_fcm_2.c: Likewise. --- gcc/config/aarch64/aarch64-sve.md | 18 +- gcc/config/aarch64/aarch64.cc | 56 +- .../gcc.target/aarch64/sve/unpacked_fcm_1.c | 602 ++++++++++++++++++ .../gcc.target/aarch64/sve/unpacked_fcm_2.c | 47 ++ 4 files changed, 698 insertions(+), 25 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 450975dd088..1894d623b6d 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -3966,7 +3966,7 @@ ) ;; Predicated predicate inverse. -(define_insn "*one_cmpl<mode>3" +(define_insn "@aarch64_pred_one_cmpl<mode>_z" [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") (and:PRED_ALL (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) @@ -8637,8 +8637,8 @@ (define_expand "vec_cmp<mode><vpred>" [(set (match_operand:<VPRED> 0 "register_operand") (match_operator:<VPRED> 1 "comparison_operator" - [(match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")]))] + [(match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero")]))] "TARGET_SVE" { aarch64_expand_sve_vec_cmp_float (operands[0], GET_CODE (operands[1]), @@ -8651,10 +8651,10 @@ (define_insn "@aarch64_pred_fcm<cmp_op><mode>" [(set (match_operand:<VPRED> 0 "register_operand") (unspec:<VPRED> - [(match_operand:<VPRED> 1 "register_operand") + [(match_operand:<VPRED> 1 "aarch64_predicate_operand") (match_operand:SI 2 "aarch64_sve_ptrue_flag") - (match_operand:SVE_FULL_F 3 "register_operand") - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 3 "register_operand") + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] SVE_COND_FP_CMP_I0))] "TARGET_SVE" {@ [ cons: =0 , 1 , 3 , 4 ] @@ -8667,10 +8667,10 @@ (define_insn "@aarch64_pred_fcmuo<mode>" [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") (unspec:<VPRED> - [(match_operand:<VPRED> 1 "register_operand" "Upl") + [(match_operand:<VPRED> 1 "aarch64_predicate_operand" "Upl") (match_operand:SI 2 "aarch64_sve_ptrue_flag") - (match_operand:SVE_FULL_F 3 "register_operand" "w") - (match_operand:SVE_FULL_F 4 "register_operand" "w")] + (match_operand:SVE_F 3 "register_operand" "w") + (match_operand:SVE_F 4 "register_operand" "w")] UNSPEC_COND_FCMUO))] "TARGET_SVE" "fcmuo\t%0.<Vetype>, %1/z, %3.<Vetype>, %4.<Vetype>" diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 5540946eac7..93f8292bad0 100644 --- 
a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -27229,7 +27229,7 @@ aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred, bool known_ptrue_p, rtx op0, rtx op1) { rtx flag = gen_int_mode (known_ptrue_p, SImode); - rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred), + rtx unspec = gen_rtx_UNSPEC (GET_MODE (target), gen_rtvec (4, pred, flag, op0, op1), aarch64_unspec_cond_code (code)); emit_set_insn (target, unspec); @@ -27248,10 +27248,10 @@ static void aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2, rtx pred, bool known_ptrue_p, rtx op0, rtx op1) { - machine_mode pred_mode = GET_MODE (pred); - rtx tmp1 = gen_reg_rtx (pred_mode); + machine_mode target_mode = GET_MODE (target); + rtx tmp1 = gen_reg_rtx (target_mode); aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1); - rtx tmp2 = gen_reg_rtx (pred_mode); + rtx tmp2 = gen_reg_rtx (target_mode); aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1); aarch64_emit_binop (target, ior_optab, tmp1, tmp2); } @@ -27268,8 +27268,7 @@ static void aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred, bool known_ptrue_p, rtx op0, rtx op1) { - machine_mode pred_mode = GET_MODE (pred); - rtx tmp = gen_reg_rtx (pred_mode); + rtx tmp = gen_reg_rtx (GET_MODE (target)); aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1); aarch64_emit_unop (target, one_cmpl_optab, tmp); } @@ -27281,10 +27280,25 @@ aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred, void aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1) { - machine_mode pred_mode = GET_MODE (target); machine_mode data_mode = GET_MODE (op0); + rtx pred = aarch64_sve_fp_pred (data_mode, nullptr); - rtx ptrue = aarch64_ptrue_reg (pred_mode); + /* The governing and destination modes. */ + machine_mode pred_mode = GET_MODE (pred); + machine_mode target_mode = GET_MODE (target); + + /* For partial vector modes, the choice of predicate mode depends + on whether we need to suppress exceptions for inactive elements. + If we do need to suppress exceptions, the predicate mode matches + the element size rather than the container size and the predicate + marks the upper bits in each container as inactive. The predicate + is then a ptrue wrt TARGET_MODE but not wrt PRED_MODE. It is the + latter which matters here. + + If we don't need to suppress exceptions, the predicate mode matches + the container size, PRED_MODE == TARGET_MODE, and the predicate is + thus a ptrue wrt both TARGET_MODE and PRED_MODE. */ + bool known_ptrue_p = pred_mode == target_mode; switch (code) { case UNORDERED: @@ -27298,12 +27312,13 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1) case EQ: case NE: /* There is native support for the comparison. */ - aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1); + aarch64_emit_sve_fp_cond (target, code, pred, known_ptrue_p, op0, op1); return; case LTGT: /* This is a trapping operation (LT or GT). */ - aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1); + aarch64_emit_sve_or_fp_conds (target, LT, GT, + pred, known_ptrue_p, op0, op1); return; case UNEQ: @@ -27312,7 +27327,7 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1) /* This would trap for signaling NaNs. 
*/ op1 = force_reg (data_mode, op1); aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ, - ptrue, true, op0, op1); + pred, known_ptrue_p, op0, op1); return; } /* fall through */ @@ -27322,11 +27337,19 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1) case UNGE: if (flag_trapping_math) { - /* Work out which elements are ordered. */ - rtx ordered = gen_reg_rtx (pred_mode); op1 = force_reg (data_mode, op1); - aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED, - ptrue, true, op0, op1); + + /* Work out which elements are unordered. */ + rtx uo_tmp = gen_reg_rtx (target_mode); + aarch64_emit_sve_fp_cond (uo_tmp, UNORDERED, + pred, known_ptrue_p, op0, op1); + + /* Invert the result - governed by PRED so that we only flip + the active bits. */ + rtx ordered = gen_reg_rtx (pred_mode); + uo_tmp = gen_lowpart (pred_mode, uo_tmp); + emit_insn (gen_aarch64_pred_one_cmpl_z (pred_mode, ordered, + pred, uo_tmp)); /* Test the opposite condition for the ordered elements, then invert the result. */ @@ -27351,7 +27374,8 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1) /* There is native support for the inverse comparison. */ code = reverse_condition_maybe_unordered (code); - aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1); + aarch64_emit_sve_invert_fp_cond (target, code, + pred, known_ptrue_p, op0, op1); } /* Return true if: diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c new file mode 100644 index 00000000000..bf9c1273857 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c @@ -0,0 +1,602 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 --param=aarch64-autovec-preference=sve-only -fno-schedule-insns -fno-schedule-insns2" } */ + +#include <stdint.h> + +#define UNLT(A, B) (!__builtin_isgreaterequal (A, B)) +#define UNLE(A, B) (!__builtin_isgreater (A, B)) +#define UNGT(A, B) (!__builtin_islessequal (A, B)) +#define UNGE(A, B) (!__builtin_isless (A, B)) +#define UNEQ(A, B) (!__builtin_islessgreater (A, B)) + +#define EQ(A, B) ((A) == (B)) +#define NE(A, B) ((A) != (B)) +#define LE(A, B) ((A) <= (B)) +#define LT(A, B) ((A) < (B)) +#define GE(A, B) ((A) >= (B)) +#define GT(A, B) ((A) > (B)) +#define ORDERED(A, B) (!__builtin_isunordered (A, B)) +#define UNORDERED(A, B) (__builtin_isunordered (A, B)) + +#define b_i b[i] + +#define TEST_FCM(TYPE0, TYPE1, CMP, RHS, COUNT) \ + void \ + f_##TYPE0##_##TYPE1##_##CMP##_##RHS (TYPE0 *__restrict out, \ + TYPE1 *__restrict a, \ + TYPE1 *__restrict b) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = CMP (a[i], RHS) ? 3 : out[i]; \ + } + +#define TEST_CC_REG(CMP) \ + TEST_FCM (uint64_t, float, CMP, b_i, 32) \ + TEST_FCM (uint32_t, _Float16, CMP, b_i, 64) \ + TEST_FCM (uint64_t, _Float16, CMP, b_i, 32) + +#define TEST_CC_ALL(CMP) \ + TEST_CC_REG (CMP) \ + TEST_FCM (uint64_t, float, CMP, 0, 32) \ + TEST_FCM (uint32_t, _Float16, CMP, 0, 64) \ + TEST_FCM (uint64_t, _Float16, CMP, 0, 32) + + +/* +** f_uint64_t_float_UNLT_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmge p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNLT_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ...
+** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmge p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNLT_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmge p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (UNLT) + +/* +** f_uint64_t_float_UNLE_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmgt p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNLE_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmgt p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNLE_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmgt p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (UNLE) + +/* +** f_uint64_t_float_UNGT_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmle p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNGT_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmle p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNGT_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmle p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (UNGT) + +/* +** f_uint64_t_float_UNGE_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmlt p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNGE_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmlt p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNGE_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmlt p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (UNGE) + +/* +** f_uint64_t_float_UNEQ_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo (p[0-9]+)\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmne p[0-9]+\.s, \3/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNEQ_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmne p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNEQ_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo (p[0-9]+)\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** not (p[0-9]+)\.b, \1/z, \2\.b +** fcmne p[0-9]+\.h, \3/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (UNEQ) + +/* +** f_uint64_t_float_EQ_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmeq p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_EQ_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... 
+** fcmeq p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_EQ_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmeq p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_EQ_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmeq p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_EQ_0: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmeq p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_EQ_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmeq p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (EQ) + +/* +** f_uint64_t_float_NE_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmne p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_NE_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmne p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_NE_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmne p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_NE_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmne p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_NE_0: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmne p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_NE_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmne p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (NE) + +/* +** f_uint64_t_float_LE_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmle p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_LE_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmle p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_LE_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmle p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_LE_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmle p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_LE_0: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmle p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_LE_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmle p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (LE) + +/* +** f_uint64_t_float_LT_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmlt p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_LT_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmlt p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_LT_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmlt p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_LT_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmlt p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_LT_0: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmlt p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_LT_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmlt p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (LT) + +/* +** f_uint64_t_float_GE_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmge p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_GE_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmge p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_GE_b_i: +** ... 
+** ptrue (p[0-9]+)\.d, all +** ... +** fcmge p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_GE_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmge p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_GE_0: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmge p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_GE_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmge p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (GE) + +/* +** f_uint64_t_float_GT_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmgt p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_GT_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmgt p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_GT_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmgt p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t_float_GT_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmgt p[0-9]+\.s, \1/z, z[0-9]+\.s, #0.0 +** ... +*/ + +/* +** f_uint32_t__Float16_GT_0: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmgt p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ + +/* +** f_uint64_t__Float16_GT_0: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmgt p[0-9]+\.h, \1/z, z[0-9]+\.h, #0.0 +** ... +*/ +TEST_CC_ALL (GT) + +/* +** f_uint64_t_float_ORDERED_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_ORDERED_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmuo p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_ORDERED_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ +TEST_CC_REG (ORDERED) + +/* +** f_uint64_t_float_UNORDERED_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo p[0-9]+\.s, \1/z, z[0-9]+\.s, z[0-9]+\.s +** ... +*/ + +/* +** f_uint32_t__Float16_UNORDERED_b_i: +** ... +** ptrue (p[0-9]+)\.s, all +** ... +** fcmuo p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... +*/ + +/* +** f_uint64_t__Float16_UNORDERED_b_i: +** ... +** ptrue (p[0-9]+)\.d, all +** ... +** fcmuo p[0-9]+\.h, \1/z, z[0-9]+\.h, z[0-9]+\.h +** ... 
+*/ +TEST_CC_REG (UNORDERED) + + +/* { dg-final { check-function-bodies "**" "" ""} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c new file mode 100644 index 00000000000..fd174d5738f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 --param=aarch64-autovec-preference=sve-only -fno-trapping-math" } */ + +#include "unpacked_fcm_1.c" + +/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */ +/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 57 } } */ + +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ -- 2.34.1