[PATCH/GCC16 v2 1/1] AArch64: Emit half-precision FCMP/FCMPE

2025-01-31 Thread Spencer Abson
Enable a target with FEAT_FP16 to emit the half-precision variants
of FCMP/FCMPE.

gcc/ChangeLog:

* config/aarch64/aarch64.md: Update cbranch, cstore, fcmp
and fcmpe to use the GPF_F16 iterator for floating-point
modes.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/_Float16_cmp_1.c: New test.
* gcc.target/aarch64/_Float16_cmp_2.c: New (negative) test.
---
 gcc/config/aarch64/aarch64.md | 29 +-
 .../gcc.target/aarch64/_Float16_cmp_1.c   | 54 +++
 .../gcc.target/aarch64/_Float16_cmp_2.c   |  7 +++
 3 files changed, 77 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 071058dbeb3..f63e4d79b3c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -707,11 +707,12 @@
 )
 
 (define_expand "cbranch4"
-  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
-   [(match_operand:GPF 1 "register_operand")
-    (match_operand:GPF 2 "aarch64_fp_compare_operand")])
-  (label_ref (match_operand 3 "" ""))
-  (pc)))]
+  [(set (pc) (if_then_else
+   (match_operator 0 "aarch64_comparison_operator"
+[(match_operand:GPF_F16 1 "register_operand")
+ (match_operand:GPF_F16 2 "aarch64_fp_compare_operand")])
+   (label_ref (match_operand 3 "" ""))
+   (pc)))]
   ""
   "
   operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]), operands[1],
@@ -4338,26 +4339,28 @@
 
 (define_insn "fcmp"
   [(set (reg:CCFP CC_REGNUM)
-(compare:CCFP (match_operand:GPF 0 "register_operand")
- (match_operand:GPF 1 "aarch64_fp_compare_operand")))]
+   (compare:CCFP
+ (match_operand:GPF_F16 0 "register_operand")
+ (match_operand:GPF_F16 1 "aarch64_fp_compare_operand")))]
"TARGET_FLOAT"
{@ [ cons: 0 , 1  ]
  [ w   , Y  ] fcmp\t%<s>0, #0.0
  [ w   , w  ] fcmp\t%<s>0, %<s>1
   }
-  [(set_attr "type" "fcmp")]
+  [(set_attr "type" "fcmp")]
 )
 
 (define_insn "fcmpe"
   [(set (reg:CCFPE CC_REGNUM)
-(compare:CCFPE (match_operand:GPF 0 "register_operand")
-  (match_operand:GPF 1 "aarch64_fp_compare_operand")))]
+   (compare:CCFPE
+ (match_operand:GPF_F16 0 "register_operand")
+ (match_operand:GPF_F16 1 "aarch64_fp_compare_operand")))]
"TARGET_FLOAT"
{@ [ cons: 0 , 1  ]
  [ w   , Y  ] fcmpe\t%<s>0, #0.0
  [ w   , w  ] fcmpe\t%<s>0, %<s>1
   }
-  [(set_attr "type" "fcmp")]
+  [(set_attr "type" "fcmp")]
 )
 
 (define_insn "*cmp_swp__reg"
@@ -4425,8 +4428,8 @@
 (define_expand "cstore4"
   [(set (match_operand:SI 0 "register_operand")
(match_operator:SI 1 "aarch64_comparison_operator_mode"
-[(match_operand:GPF 2 "register_operand")
- (match_operand:GPF 3 "aarch64_fp_compare_operand")]))]
+[(match_operand:GPF_F16 2 "register_operand")
+ (match_operand:GPF_F16 3 "aarch64_fp_compare_operand")]))]
   ""
   "
   operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2],
diff --git a/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
new file mode 100644
index 000..e49ace1d7dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+fp16" } */
+
+/*
+** test_fcmp_store:
+**	fcmp	h0, h1
+**	cset	w0, eq
+**	ret
+*/
+int
+test_fcmp_store(_Float16 a, _Float16 b)
+{
+  return a == b;
+}
+
+/*
+** test_fcmpe_store:
+**	fcmpe	h0, h1
+**	cset	w0, mi
+**	ret
+*/
+int
+test_fcmpe_store(_Float16 a, _Float16 b)
+{
+  return a < b;
+}
+
+/*
+** test_fcmp_branch:
+**	fcmp	h0, h1
+**	...
+*/
+_Float16
+test_fcmp_branch(_Float16 a, _Float16 b)
+{
+  if (a == b)
+    return a * b;
+  return a;
+}
+
+/*
+** test_fcmpe_branch:
+**	fcmpe	h0, h1
+**	...
+*/
+_Float16
+test_fcmpe_branch(_Float16 a, _Float16 b)
+{
+  if (a < b)
+    return a * b;
+  return a;
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c
new file mode 100644
index 000..0ff7cda8796
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+nofp16" } */
+
+#include "_Float16_cmp_1.c"
+
+/* { dg-final { scan-assembler-not {\tfcmp\th[0-9]+} } } */
+/* { dg-final { scan-assembler-not {\tfcmpe\th[0-9]+} } } */
-- 
2.34.1



[PATCH/GCC16 v2 0/1] AArch64: Emit half-precision FCMP/FCMPE

2025-01-31 Thread Spencer Abson
Applied the fixups suggested in the previous review, cheers.


This patch allows the AArch64 back end to emit the half-precision variants of
FCMP and FCMPE, given that the target supports FEAT_FP16.  Previously, such
comparisons would be unnecessarily promoted to single-precision.
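
For example, for

    int lt (_Float16 a, _Float16 b)
    {
      return a < b;
    }

we previously emitted a promoted sequence along the lines of

	fcvt	s0, h0
	fcvt	s1, h1
	fcmpe	s0, s1
	cset	w0, mi

and can now emit

	fcmpe	h0, h1
	cset	w0, mi

(illustrative sequences only; see _Float16_cmp_1.c for the exact expected
codegen).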

The latest documentation of these instructions can be found here:
https://developer.arm.com/documentation/ddi0602/2024-12

Successfully bootstrapped and regtested on aarch64-linux-gnu.

OK for stage 1?

Spencer Abson (1):
  AArch64: Emit half-precision FCMP/FCMPE

 gcc/config/aarch64/aarch64.md | 29 +-
 .../gcc.target/aarch64/_Float16_cmp_1.c   | 54 +++
 .../gcc.target/aarch64/_Float16_cmp_2.c   |  7 +++
 3 files changed, 77 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c

-- 
2.34.1



[PATCH/GCC16 0/1] AArch64: Define the spaceship optab [PR117013]

2025-01-23 Thread Spencer Abson
This patch defines spaceship{sf,df,si,di} for AArch64. This is a fix
for the poor codegen on floating-point types raised by the PR, and
an improvement to that for integers where this optab applies.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117013.

Successfully bootstrapped and regtested on aarch64-linux-gnu.

OK for stage 1?

Spencer Abson (1):
  AArch64: Define the spaceship optab [PR117013]

 gcc/config/aarch64/aarch64-protos.h   |   1 +
 gcc/config/aarch64/aarch64.cc |  73 +++
 gcc/config/aarch64/aarch64.md |  43 
 .../g++.target/aarch64/spaceship_1.C  | 192 ++
 .../g++.target/aarch64/spaceship_2.C  |  72 +++
 .../g++.target/aarch64/spaceship_3.C  |   9 +
 6 files changed, 390 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_1.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_2.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_3.C

-- 
2.34.1



[PATCH/GCC16 1/1] AArch64: Define the spaceship optab [PR117013]

2025-01-23 Thread Spencer Abson
This expansion ensures that exactly one comparison is emitted for
spaceship-like sequences on floating-point operands, including when
the results of such sequences are compared against members of
std::partial_ordering.

For both integer and floating-point types, we optimize for the case
in which the result of a spaceship-like operation is written to a GPR.
The PR highlights this issue for floating-point operands, but we also
make an improvement for integers, preferring:

cmp w0, w1
csetw1, gt
csinv   w0, w1, wzr, ge

over:

cmp w0, w1
mov w0, 1
csinv   w0, w0, wzr, ge
cselw0, w0, wzr, ne

to compute:

auto test(int a, int b) { return a <=> b;}
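
For floating-point operands, the expansion emits a single FCMPE, followed by
conditional branches when the result is compared against one of (-1, 0, 1, 2),
or by a conditional select/increment/invert sequence when the result is
stored.  The branch form looks roughly like this (an illustrative sketch,
not exact output):

	fcmpe	d0, d1
	b.vs	.Lunordered	// unordered -> 2
	b.mi	.Lless		// a < b     -> -1
	b.gt	.Lgreater	// a > b     -> 1
	mov	w0, 0		// a == b    -> 0
	...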

gcc/ChangeLog:
PR target/117013
* config/aarch64/aarch64-protos.h (aarch64_expand_fp_spaceship):
Declare optab expander function for floating-point types.
* config/aarch64/aarch64.cc (aarch64_expand_fp_spaceship):
Define optab expansion for floating-point types (new function).
* config/aarch64/aarch64.md (spaceship<mode>4):
Add define_expands for spaceship<mode>4 on integer and
floating-point types.

gcc/testsuite/ChangeLog:
PR target/117013
* g++.target/aarch64/spaceship_1.C: New test.
* g++.target/aarch64/spaceship_2.C: New test.
* g++.target/aarch64/spaceship_3.C: New test.
---
 gcc/config/aarch64/aarch64-protos.h   |   1 +
 gcc/config/aarch64/aarch64.cc |  73 +++
 gcc/config/aarch64/aarch64.md |  43 
 .../g++.target/aarch64/spaceship_1.C  | 192 ++
 .../g++.target/aarch64/spaceship_2.C  |  72 +++
 .../g++.target/aarch64/spaceship_3.C  |   9 +
 6 files changed, 390 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_1.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_2.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/spaceship_3.C

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index fa7bc8029be..39a1dae4e8b 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1240,6 +1240,7 @@ void aarch64_restore_za (rtx);
 void aarch64_expand_crc_using_pmull (scalar_mode, scalar_mode, rtx *);
 void aarch64_expand_reversed_crc_using_pmull (scalar_mode, scalar_mode, rtx *);
 
+void aarch64_expand_fp_spaceship (rtx, rtx, rtx, rtx);
 
 extern bool aarch64_gcs_enabled ();
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index dba779a8e51..ea5dd0d5047 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -31427,6 +31427,79 @@ aarch64_expand_reversed_crc_using_pmull (scalar_mode crc_mode,
 }
 }
 
+/* Expand the spaceship optab for floating-point operands.
+
+   If the result is compared against (-1, 0, 1, 2), expand into
+   fcmpe + conditional branch insns.
+
+   Otherwise (the result is just stored as an integer), expand into
+   fcmpe + a sequence of conditional select/increment/invert insns.  */
+void
+aarch64_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx hint)
+{
+  rtx cc_reg = gen_rtx_REG (CCFPEmode, CC_REGNUM);
+  emit_set_insn (cc_reg, gen_rtx_COMPARE (CCFPEmode, op0, op1));
+
+  rtx cc_gt = gen_rtx_GT (VOIDmode, cc_reg, const0_rtx);
+  rtx cc_lt = gen_rtx_LT (VOIDmode, cc_reg, const0_rtx);
+  rtx cc_un = gen_rtx_UNORDERED (VOIDmode, cc_reg, const0_rtx);
+
+  if (hint == const0_rtx)
+{
+  rtx un_label = gen_label_rtx ();
+  rtx lt_label = gen_label_rtx ();
+  rtx gt_label = gen_label_rtx ();
+  rtx end_label = gen_label_rtx ();
+
+  rtx temp = gen_rtx_IF_THEN_ELSE (VOIDmode, cc_un,
+   gen_rtx_LABEL_REF (Pmode, un_label), pc_rtx);
+  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, temp));
+
+  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, cc_lt,
+   gen_rtx_LABEL_REF (Pmode, lt_label), pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
+
+  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, cc_gt,
+   gen_rtx_LABEL_REF (Pmode, gt_label), pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
+
+  /* Equality.  */
+  emit_move_insn (dest, const0_rtx);
+  emit_jump (end_label);
+
+  emit_label (un_label);
+  emit_move_insn (dest, const2_rtx);
+  emit_jump (end_label);
+
+  emit_label (gt_label);
+  emit_move_insn (dest, const1_rtx);
+  emit_jump (end_label);
+
+  emit_label (lt_label);
+  emit_move_insn (dest, constm1_rtx);
+
+  emit_label (end_label);
+}
+  else
+{
+  rtx temp0 = gen_reg_rtx (SImode);
+  rtx temp1 = gen_reg_rtx (SImode);
+  rtx cc_ungt = gen_rtx_UNGT (VOIDmode, cc_reg, const0_rtx);
+
+  /* The value of hint is stored if the operands are unordered.  */
+  rtx temp_un = gen_int_mode (UINTVAL (hint) - 1, SImode);
+  if (!aarch64_reg_zero_or_m1_or_1 (temp_un, SImode))
+   temp_un = force_reg (SImode, temp_un);
+
+ 

[PATCH/GCC16 0/1] AArch64: Emit half-precision FCMP/FCMPE

2025-01-27 Thread Spencer Abson
This patch allows the AArch64 back end to emit the half-precision variants of
FCMP and FCMPE, given that the target supports FEAT_FP16.  Previously, such
comparisons would be unnecessarily promoted to single-precision.

The latest documentation of these instructions can be found here:
https://developer.arm.com/documentation/ddi0602/2024-12

Successfully bootstrapped and regtested on aarch64-linux-gnu.

OK for stage 1?

Spencer Abson (1):
  AArch64: Emit half-precision FCMP/FCMPE

 gcc/config/aarch64/aarch64.md | 29 +-
 .../gcc.target/aarch64/_Float16_cmp_1.c   | 54 +++
 .../gcc.target/aarch64/_Float16_cmp_2.c   |  7 +++
 3 files changed, 77 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c

-- 
2.34.1



[PATCH/GCC16 1/1] AArch64: Emit half-precision FCMP/FCMPE

2025-01-27 Thread Spencer Abson
Enable a target with FEAT_FP16 to emit the half-precision variants
of FCMP/FCMPE.

gcc/ChangeLog:

* config/aarch64/aarch64.md: Update cbranch, cstore, fcmp
and fcmpe to use the GPF_F16 iterator for floating-point
modes.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/_Float16_cmp_1.c: New test.
* gcc.target/aarch64/_Float16_cmp_2.c: New (negative) test.
---
 gcc/config/aarch64/aarch64.md | 29 +-
 .../gcc.target/aarch64/_Float16_cmp_1.c   | 54 +++
 .../gcc.target/aarch64/_Float16_cmp_2.c   |  7 +++
 3 files changed, 77 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 071058dbeb3..8721bf5d4f3 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -707,11 +707,12 @@
 )
 
 (define_expand "cbranch4"
-  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
-   [(match_operand:GPF 1 "register_operand")
-    (match_operand:GPF 2 "aarch64_fp_compare_operand")])
-  (label_ref (match_operand 3 "" ""))
-  (pc)))]
+  [(set (pc) (if_then_else
+   (match_operator 0 "aarch64_comparison_operator"
+[(match_operand:GPF_F16 1 "register_operand")
+ (match_operand:GPF_F16 2 "aarch64_fp_compare_operand")])
+   (label_ref (match_operand 3 "" ""))
+   (pc)))]
   ""
   "
   operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]), operands[1],
@@ -4338,26 +4339,28 @@
 
 (define_insn "fcmp"
   [(set (reg:CCFP CC_REGNUM)
-(compare:CCFP (match_operand:GPF 0 "register_operand")
- (match_operand:GPF 1 "aarch64_fp_compare_operand")))]
+   (compare:CCFP
+   (match_operand:GPF_F16 0 "register_operand")
+   (match_operand:GPF_F16 1 "aarch64_fp_compare_operand")))]
"TARGET_FLOAT"
{@ [ cons: 0 , 1  ]
  [ w   , Y  ] fcmp\t%<s>0, #0.0
  [ w   , w  ] fcmp\t%<s>0, %<s>1
   }
-  [(set_attr "type" "fcmp")]
+  [(set_attr "type" "fcmp")]
 )
 
 (define_insn "fcmpe"
   [(set (reg:CCFPE CC_REGNUM)
-(compare:CCFPE (match_operand:GPF 0 "register_operand")
-  (match_operand:GPF 1 "aarch64_fp_compare_operand")))]
+   (compare:CCFPE
+   (match_operand:GPF_F16 0 "register_operand")
+   (match_operand:GPF_F16 1 "aarch64_fp_compare_operand")))]
"TARGET_FLOAT"
{@ [ cons: 0 , 1  ]
  [ w   , Y  ] fcmpe\t%<s>0, #0.0
  [ w   , w  ] fcmpe\t%<s>0, %<s>1
   }
-  [(set_attr "type" "fcmp")]
+  [(set_attr "type" "fcmp")]
 )
 
 (define_insn "*cmp_swp__reg"
@@ -4425,8 +4428,8 @@
 (define_expand "cstore4"
   [(set (match_operand:SI 0 "register_operand")
(match_operator:SI 1 "aarch64_comparison_operator_mode"
-[(match_operand:GPF 2 "register_operand")
- (match_operand:GPF 3 "aarch64_fp_compare_operand")]))]
+[(match_operand:GPF_F16 2 "register_operand")
+ (match_operand:GPF_F16 3 "aarch64_fp_compare_operand")]))]
   ""
   "
   operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2],
diff --git a/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
new file mode 100644
index 000..e49ace1d7dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_1.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+fp16" } */
+
+/*
+** test_fcmp_store:
+**	fcmp	h0, h1
+**	cset	w0, eq
+**	ret
+*/
+int
+test_fcmp_store(_Float16 a, _Float16 b)
+{
+  return a == b;
+}
+
+/*
+** test_fcmpe_store:
+**	fcmpe	h0, h1
+**	cset	w0, mi
+**	ret
+*/
+int
+test_fcmpe_store(_Float16 a, _Float16 b)
+{
+  return a < b;
+}
+
+/*
+** test_fcmp_branch:
+**	fcmp	h0, h1
+**	...
+*/
+_Float16
+test_fcmp_branch(_Float16 a, _Float16 b)
+{
+  if (a == b)
+    return a * b;
+  return a;
+}
+
+/*
+** test_fcmpe_branch:
+**	fcmpe	h0, h1
+**	...
+*/
+_Float16
+test_fcmpe_branch(_Float16 a, _Float16 b)
+{
+  if (a < b)
+    return a * b;
+  return a;
+}
+
+/* { dg-final { check-function-bodies "**" "" "" } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c
new file mode 100644
index 000..e714304970b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/_Float16_cmp_2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+nofp16" } */
+
+#include "_Float16_cmp_1.c"
+
+/* { dg-final { scan-assembler-not "\tfcmp\th\[0-9\]\+" } } */
+/* { dg-final { scan-assembler-not "\tfcmpe\th\[0-9\]\+" } } */
\ No newline at end of file
--

[PATCH 0/1][RFC] middle-end: target support checks for vectorizable_induction

2025-03-20 Thread Spencer Abson
Hi all,

While tinkering with AArch64's SVE port, I noticed (by means of an ICE) that
vectorizable_induction does not accurately test target support for the
vectorized operations it emits.

This would only give an ICE for variable-length vectors (see 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103523),
so the patch I've attached here covers those only.

The question I'd like to raise is whether we should apply more scrutiny here;
a vectorized MULT_EXPR is emitted to calculate the step vector for each IV in
SLP induction vectorization, as well as whenever we need to calculate the
initial values of float inductions with variable-length vectors.
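
For instance, a float induction such as the one below needs both: the initial
vector is built as x + { 0, 1, 2, ... } * step, i.e. a FLOAT_EXPR to create
the index vector and a MULT_EXPR to scale it.  (Illustrative example; compiled
with something like -O3 -ffast-math -march=armv8-a+sve.)

void
f (float *restrict a, float x, float step, int n)
{
  for (int i = 0; i < n; i++)
    {
      a[i] = x;
      x += step;
    }
}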

Is it worth moving some code around to test for support of MULT_EXPR with the
mode of STEP_VECTYPE whenever we know that the transformation will use it?
Is there a reason that testing for target support was omitted from the
original code?

While this is an RFC, the patch itself has been bootstrapped and regtested on 
aarch64-linux-gnu.

Thank you very much for any discussion.
Spencer Abson

Spencer Abson (1):
  Induction vectorizer: prevent ICE for scalable types

 gcc/tree-vect-loop.cc | 39 ++-
 1 file changed, 30 insertions(+), 9 deletions(-)

-- 
2.34.1



[PATCH 1/1][RFC] Induction vectorizer: prevent ICE for scalable types

2025-03-20 Thread Spencer Abson
We currently check that the target supports PLUS_EXPR and MINUS_EXPR
with step_vectype (a fix for pr103523).  However, vectorizable_induction
can emit a vectorized MULT_EXPR when calculating the step of each IV for
SLP, and both MULT_EXPR/FLOAT_EXPR when calculating VEC_INIT for float
inductions.

gcc/ChangeLog:

* tree-vect-loop.cc (vectorizable_induction): Add target support
checks for vectorized MULT_EXPR and FLOAT_EXPR where necessary for
scalable types.
Prefer target_supports_op_p over directly_supports_p for these tree
codes.
(vect_update_nonlinear_iv): Fix a doc comment while I'm here.
---
 gcc/tree-vect-loop.cc | 39 ++-
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 9413dcef702..cce57978ae2 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10053,7 +10053,7 @@ vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
 
 }
 
-/* Function vectorizable_induction
+/* Function vectorizable_nonlinear_induction
 
Check if STMT_INFO performs an nonlinear induction computation that can be
vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
@@ -10402,6 +10402,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned i;
   tree expr;
+  tree index_vectype = NULL_TREE;
   gimple_stmt_iterator si;
   enum vect_induction_op_type induction_type
 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
@@ -10513,12 +10514,29 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 "supported.\n");
   return false;
 }
-  tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
+  tree stept = TREE_TYPE (step_expr);
+  tree step_vectype = get_same_sized_vectype (stept, vectype);
 
-  /* Check for backend support of PLUS/MINUS_EXPR. */
-  if (!directly_supported_p (PLUS_EXPR, step_vectype)
-  || !directly_supported_p (MINUS_EXPR, step_vectype))
-return false;
+  /* Check for target support of the vectorized arithmetic used here.  */
+  if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
+  || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
+    return false;
+  if (!nunits.is_constant ())
+{
+  if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
+   return false;
+  /* FLOAT_EXPR when computing VEC_INIT for float inductions.  */
+  if (SCALAR_FLOAT_TYPE_P (stept))
+   {
+ tree index_type = build_nonstandard_integer_type
+   (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
+
+ index_vectype = build_vector_type (index_type, nunits);
+ if (!can_float_p (TYPE_MODE (step_vectype),
+   TYPE_MODE (index_vectype), 1))
+   return false;
+   }
+}
 
   if (!vec_stmt) /* transformation not required.  */
 {
@@ -10637,7 +10655,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
  nivs = 1;
}
   gimple_seq init_stmts = NULL;
-  tree stept = TREE_TYPE (step_vectype);
   tree lupdate_mul = NULL_TREE;
   if (!nested_in_vect_loop)
{
@@ -10741,7 +10758,9 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps[0])));
  gcc_assert (flag_associative_math);
- tree index = build_index_vector (step_vectype, 0, 1);
+ gcc_assert (index_vectype != NULL_TREE);
+
+ tree index = build_index_vector (index_vectype, 0, 1);
  new_name = gimple_convert (&init_stmts, TREE_TYPE (steps[0]),
 inits[0]);
  tree base_vec = gimple_build_vector_from_val (&init_stmts,
@@ -11016,7 +11035,9 @@ vectorizable_induction (loop_vec_info loop_vinfo,
+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
  gcc_assert (flag_associative_math);
- tree index = build_index_vector (step_vectype, 0, 1);
+ gcc_assert (index_vectype != NULL_TREE);
+
+ tree index = build_index_vector (index_vectype, 0, 1);
  tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
new_name);
  tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
-- 
2.34.1



Re: [PATCH 1/1] AArch64: Fold builtins with highpart args to highpart equivalent [PR117850]

2025-02-18 Thread Spencer Abson
Hi Kyrill,

Thanks for your comments, and for answering my question RE your work. Happy to
apply those changes in the next revision.

Cheers,
Spencer


Re: [PATCH 1/1] AArch64: Fold builtins with highpart args to highpart equivalent [PR117850]

2025-02-18 Thread Spencer Abson
On Tue, Feb 18, 2025 at 10:27:46AM +0000, Richard Sandiford wrote:
> Thanks, this generally looks really good.  Some comments on top of
> Kyrill's, and Christophe's comment internally about -save-temps.
> 
> Spencer Abson  writes:
> > +/* Build and return a new VECTOR_CST that is the concatenation of
> > +   VEC_IN with itself.  */
> > +static tree
> > +aarch64_self_concat_vec_cst (tree vec_in)
> > +{
> > +  gcc_assert ((TREE_CODE (vec_in) == VECTOR_CST));
> > +  unsigned HOST_WIDE_INT nelts
> > += VECTOR_CST_NELTS (vec_in).to_constant ();
> > +
> > +  tree out_type = build_vector_type (TREE_TYPE (TREE_TYPE (vec_in)),
> > +nelts * 2);
> 
> It would be good to pass in the type that the caller wants.
> More about that below.

Yeah, I can see the advantage of that.

> 
> > +
> > +  /* Avoid decoding/encoding if the encoding won't change.  */
> > +  if (VECTOR_CST_DUPLICATE_P (vec_in))
> > +{
> > +  tree vec_out = make_vector (exact_log2
> > +(VECTOR_CST_NPATTERNS (vec_in)), 1);
> > +  unsigned int encoded_size
> > +   = vector_cst_encoded_nelts (vec_in) * sizeof (tree);
> > +
> > +  memcpy (VECTOR_CST_ENCODED_ELTS (vec_out),
> > + VECTOR_CST_ENCODED_ELTS (vec_in), encoded_size);
> > +
> > +  TREE_TYPE (vec_out) = out_type;
> > +  return vec_out;
> > +}
> 
> I'm not sure this is worth it.  The approach below shouldn't be that
> much less efficient, since all the temporaries are generally on the
> stack.  Also:
> 
> > +
> > +  tree_vector_builder vec_out (out_type, nelts, 1);
> 
> This call rightly describes a duplicated sequence of NELTS elements so...
> 
> > +  for (unsigned i = 0; i < nelts * 2; i++)
> > +vec_out.quick_push (VECTOR_CST_ELT (vec_in, i % nelts));
> 
> ...it should only be necessary to push nelts elements here.

Good point!
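
Something like this, then (untested):

  tree_vector_builder vec_out (out_type, nelts, 1);
  for (unsigned i = 0; i < nelts; i++)
    vec_out.quick_push (VECTOR_CST_ELT (vec_in, i));

  return vec_out.build ();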

> 
> > +
> > +  return vec_out.build ();
> > +}
> > +
> > +/* If the SSA_NAME_DEF_STMT of ARG is an assignement to a
> > +   BIT_FIELD_REF with SIZE and OFFSET, return the object of the
> > +   BIT_FIELD_REF.  Otherwise, return NULL_TREE.  */
> > +static tree
> > +aarch64_object_of_bfr (tree arg, unsigned HOST_WIDE_INT size,
> > +  unsigned HOST_WIDE_INT offset)
> > +{
> > +  if (TREE_CODE (arg) != SSA_NAME)
> > +return NULL_TREE;
> > +
> > +  gassign *stmt = dyn_cast <gassign *> (SSA_NAME_DEF_STMT (arg));
> > +
> > +  if (!stmt)
> > +return NULL_TREE;
> > +
> > +  if (gimple_assign_rhs_code (stmt) != BIT_FIELD_REF)
> > +return NULL_TREE;
> > +
> > +  tree bf_ref = gimple_assign_rhs1 (stmt);
> > +
> > +  if (bit_field_size (bf_ref).to_constant () != size
> > +  || bit_field_offset (bf_ref).to_constant () != offset)
> > +return NULL_TREE;
> > +
> > +  return TREE_OPERAND (bf_ref, 0);
> 
> I think this also needs to check that operand 0 of the BIT_FIELD_REF
> is a 128-bit vector.  A 64-bit reference at offset 64 could instead
> be into something else, such as a 256-bit vector.
> 
> An example is:
> 
> --
> #include <arm_neon.h>
> 
> typedef int16_t int16x16_t __attribute__((vector_size(32)));
> 
> int32x4_t
> f (int16x16_t foo)
> {
>   return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
> }
> --
> 
> which triggers an ICE.
> 
> Even if the argument is a 128-bit vector, it could be a 128-bit
> vector of a different type, such as in:
> 
> --
> #include <arm_neon.h>
> 
> int32x4_t
> f (int32x4_t foo)
> {
>   return vmovl_s16 (vget_high_s16 (vreinterpretq_s16_s32 (foo)));
> }
> --
> 
> I think we should still accept this second case, but emit a VIEW_CONVERT_EXPR
> before the call to convert the argument to the right type.
> 

Thanks for raising these, serious tunnel vision on my part...
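
For the second case, perhaps something along these lines in the fold (untested
sketch, where vq_type stands for the 128-bit vector type that the builtin
expects):

  arg = build1 (VIEW_CONVERT_EXPR, vq_type, arg);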

> > +}
> > +
> > +/*  Prefer to use the highpart builtin when:
> > +
> > +1) All lowpart arguments are references to the highparts of other
> > +vectors.
> > +
> > +2) For calls with two lowpart arguments, if either refers to a
> > +vector highpart and the other is a VECTOR_CST.  We can copy the
> > +VECTOR_CST to 128b in this case.  */
> > +static bool
> > +aarch64_fold_lo_call_

[PATCH 1/1] AArch64: Fold builtins with highpart args to highpart equivalent [PR117850]

2025-02-17 Thread Spencer Abson
Add a fold at gimple_fold_builtin to prefer the highpart variant of a builtin
if the arguments are better suited to it.  This helps us avoid copying data
between lanes before the operation.

E.g. We prefer to use UMULL2 rather than DUP+UMULL for the following:

uint16x8_t
foo(const uint8x16_t s) {
const uint8x16_t f0 = vdupq_n_u8(4);
return vmull_u8(vget_high_u8(s), vget_high_u8(f0));
}
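
With the fold applied, the result is roughly (approximate codegen; register
allocation may differ):

foo:
	movi	v1.16b, 0x4
	umull2	v0.8h, v0.16b, v1.16b
	ret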

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (LO_HI_PAIRINGS): New macro.
Covers every LO_HI_PAIR.
(aarch64_get_highpart_builtin): New function. Get the highpart builtin
paired with the input FCODE.
(LO_HI_PAIR): New macro.
(aarch64_self_concat_vec_cst): New function. Concatenate a
VECTOR_CST with itself.
(aarch64_object_of_bfr): New function. Helper to check arguments
for vector highparts.
(aarch64_fold_lo_call_to_hi): New function.
(aarch64_general_gimple_fold_builtin): Add cases for the lowpart
builtins.
* config/aarch64/aarch64-builtin-pairs.def: New file. Declare
pairings of lowpart/highpart builtins.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/vabal_combine.c: Test changed to
pass after earlier builtin fold.
* gcc.target/aarch64/simd/fold_to_highpart_1.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_2.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_3.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_4.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_5.c: New test.
---
 gcc/config/aarch64/aarch64-builtin-pairs.def  |  77 ++
 gcc/config/aarch64/aarch64-builtins.cc| 232 ++
 .../aarch64/simd/fold_to_highpart_1.c | 708 ++
 .../aarch64/simd/fold_to_highpart_2.c |  82 ++
 .../aarch64/simd/fold_to_highpart_3.c |  80 ++
 .../aarch64/simd/fold_to_highpart_4.c |  77 ++
 .../aarch64/simd/fold_to_highpart_5.c |  71 ++
 .../gcc.target/aarch64/simd/vabal_combine.c   |  12 +-
 8 files changed, 1333 insertions(+), 6 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c

diff --git a/gcc/config/aarch64/aarch64-builtin-pairs.def b/gcc/config/aarch64/aarch64-builtin-pairs.def
new file mode 100644
index 000..d3ca69a1887
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtin-pairs.def
@@ -0,0 +1,77 @@
+/* Pairings of AArch64 builtins that can be folded into each other.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* LO/HI widenable integer modes.  */
+#define LO_HI_PAIR_V_WI(T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+  LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi) \
+  LO_HI_PAIR (T##_##LO##v8qi, T##_##HI##v16qi)
+
+/* LO/HI Single/Half integer modes.  */
+#define LO_HI_PAIR_V_HSI(T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+  LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi)
+
+#define UNOP_LONG_LH_PAIRS \
+  LO_HI_PAIR (UNOP_sxtlv8hi,  UNOP_vec_unpacks_hi_v16qi) \
+  LO_HI_PAIR (UNOP_sxtlv4si,  UNOP_vec_unpacks_hi_v8hi) \
+  LO_HI_PAIR (UNOP_sxtlv2di,  UNOP_vec_unpacks_hi_v4si) \
+  LO_HI_PAIR (UNOPU_uxtlv8hi, UNOPU_vec_unpacku_hi_v16qi) \
+  LO_HI_PAIR (UNOPU_uxtlv4si, UNOPU_vec_unpacku_hi_v8hi) \
+  LO_HI_PAIR (UNOPU_uxtlv2di, UNOPU_vec_unpacku_hi_v4si)
+
+#define BINOP_LONG_LH_PAIRS \
+  LO_HI_PAIR_V_WI (BINOP,  saddl, saddl2) \
+  LO_HI_PAIR_V_WI (BINOPU, uaddl, uaddl2) \
+  LO_HI_PAIR_V_WI (BINOP,  ssubl, ssubl2) \
+  LO_HI_PAIR_V_WI (BINOPU, usubl, usubl2) \
+  LO_HI_PAIR_V_WI (BINOP,  sabdl, sabdl2) \
+  LO_HI_PAIR_V_WI (BINOPU, uabdl, uabdl2) \
+  LO_HI_PAIR_V_WI (BINOP,  intrinsic_vec_smult_lo_, vec_widen_smult_hi_) \
+  LO_HI_PAIR_V_WI (BINOPU, intrinsic_vec_umult_lo_, vec_widen_umult_hi_) \
+  LO_HI_PAIR_V_HSI (BINOP,  sqdmull, sqdmull2)
+
+#define BINOP_LONG_N_LH_PAIRS \
+  LO_HI_PAIR_V_HSI (BINOP,  smull_n, smull_hi_n) \
+  

[PATCH 0/1] AArch64: Fold builtin calls w/ highpart args to highpart equivalent [PR117850]

2025-02-17 Thread Spencer Abson
Hi all,

This patch implements the missed optimisation noted in PR117850.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117850

It covers all the AArch64 builtins that I can imagine this is sensible for,
excluding vshll/vshll_n (for now) due to a discrepancy in their declarations.

Bootstrapped and regtested on aarch64-none-linux-gnu. This work was also
tested on a cross-compiler targeting aarch64_be-none-linux-gnu.

CC'ing Kyrylo as it looks like this patch interferes with his earlier work -
I'm wondering what to do about simd/vabal_combine.c without losing coverage?

OK for stage-1?

Spencer

Spencer Abson (1):
  AArch64: Fold builtins with highpart args to highpart equivalent
[PR117850]

 gcc/config/aarch64/aarch64-builtin-pairs.def  |  77 ++
 gcc/config/aarch64/aarch64-builtins.cc| 232 ++
 .../aarch64/simd/fold_to_highpart_1.c | 708 ++
 .../aarch64/simd/fold_to_highpart_2.c |  82 ++
 .../aarch64/simd/fold_to_highpart_3.c |  80 ++
 .../aarch64/simd/fold_to_highpart_4.c |  77 ++
 .../aarch64/simd/fold_to_highpart_5.c |  71 ++
 .../gcc.target/aarch64/simd/vabal_combine.c   |  12 +-
 8 files changed, 1333 insertions(+), 6 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c

-- 
2.34.1



[PATCH v2 1/1] AArch64: Fold builtins with highpart args to highpart equivalent [PR117850]

2025-02-21 Thread Spencer Abson
Add a fold at gimple_fold_builtin to prefer the highpart variant of a builtin
if the arguments are better suited to it.  This helps us avoid copying data
between lanes before the operation.

E.g. We prefer to use UMULL2 rather than DUP+UMULL for the following:

uint16x8_t
foo(const uint8x16_t s) {
const uint8x16_t f0 = vdupq_n_u8(4);
return vmull_u8(vget_high_u8(s), vget_high_u8(f0));
}

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (LO_HI_PAIRINGS): New macro.
Cover every lo/hi pairing in builtin-pairs.def.
(aarch64_get_highpart_builtin): New function.  Get the fndecl for
the hi builtin paired with FCODE.
(LO_HI_PAIR): New macro.
(aarch64_object_of_bfr): New function.  Parse BIT_FIELD_REF expressions.
(aarch64_duplicate_vector_cst): New function.
(aarch64_nbit_vector_type_p): New function.  Check if a type describes
an n-bit vector.
(aarch64_vq_high_half): New function. Helper to identify vector
highparts.
(aarch64_fold_lo_call_to_hi): New function.  Perform the fold described
here.
(aarch64_general_gimple_fold_builtin): Add cases for lo builtins.
* config/aarch64/aarch64-builtin-pairs.def: New file.  Declare pairings
of lo/hi builtins.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/simd/vabal_combine.c: Removed.
* gcc.target/aarch64/simd/fold_to_highpart_1.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_2.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_3.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_4.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_5.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_6.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_7.c: New test.
---
 gcc/config/aarch64/aarch64-builtin-pairs.def  |  81 ++
 gcc/config/aarch64/aarch64-builtins.cc| 206 +
 .../aarch64/simd/fold_to_highpart_1.c | 733 ++
 .../aarch64/simd/fold_to_highpart_2.c |  86 ++
 .../aarch64/simd/fold_to_highpart_3.c |  81 ++
 .../aarch64/simd/fold_to_highpart_4.c |  77 ++
 .../aarch64/simd/fold_to_highpart_5.c |  38 +
 .../aarch64/simd/fold_to_highpart_6.c |  94 +++
 .../aarch64/simd/fold_to_highpart_7.c |  36 +
 .../gcc.target/aarch64/simd/vabal_combine.c   |  72 --
 10 files changed, 1432 insertions(+), 72 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c
 delete mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c

diff --git a/gcc/config/aarch64/aarch64-builtin-pairs.def b/gcc/config/aarch64/aarch64-builtin-pairs.def
new file mode 100644
index 000..e1dc0b71a1c
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtin-pairs.def
@@ -0,0 +1,81 @@
+/* Pairings of AArch64 builtins that can be folded into each other.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* LO/HI widenable integer modes.  */
+#define LO_HI_PAIR_V_WI(T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+  LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi) \
+  LO_HI_PAIR (T##_##LO##v8qi, T##_##HI##v16qi)
+
+/* LO/HI Single/Half integer modes.  */
+#define LO_HI_PAIR_V_HSI(T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+  LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi)
+
+#define UNOP_LONG_LH_PAIRS \
+  LO_HI_PAIR (UNOP_sxtlv8hi,  UNOP_vec_unpacks_hi_v16qi) \
+  LO_HI_PAIR (UNOP_sxtlv4si,  UNOP_vec_unpacks_hi_v8hi) \
+  LO_HI_PAIR (UNOP_sxtlv2di,  UNOP_vec_unpacks_hi_v4si) \
+  LO_HI_PAIR (UNOPU_uxtlv8hi, UNOPU_vec_unpacku_hi_v16qi) \
+  LO_HI_PAIR (UNOPU_uxtlv4si, UNOPU_vec_unpacku_hi_v8hi) \
+  LO_HI_PAIR (UNOPU_uxtlv2di, UNOPU_vec_unpa

[PATCH v2 0/1] AArch64: Fold builtins with highpart args to highpart equivalent [PR117850]

2025-02-21 Thread Spencer Abson
Hi all,

This patch implements the missed optimisation noted in PR117850.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117850

Changes since the last revision:
- Now processing different function signatures programmatically
- Fixed ICE RE the type referred to by a BIT_FIELD_REF
- Modified testcase regex to account for commutativity
- Removed vabal_combine.c in favour of coverage in fold_to_highpart_1.c
- Supported widening floating-point conversions
- General response to feedback from Richard S., Christophe Lyon.

Notes:
- I'm on the fence about adding more conservative assertions to
  aarch64_duplicate_vector_cst.  E.g.

    gcc_assert (types_compatible_p (TREE_TYPE (TREE_TYPE (vec_in)),
                                    TREE_TYPE (out_ty)));

- I give this test in fold_to_highpart_7.c; https://godbolt.org/z/sG6GdEdGb.
  It's interesting that the current behavior of GCC is worse than GCC14,
  trunk targeting aarch64_be, and Clang.

  Maybe it's worth a thought?

Bootstrapped and regtested on aarch64-none-linux-gnu. This work was also
tested on a cross-compiler targeting aarch64_be-none-linux-gnu.

OK for stage-1?

Thanks,
Spencer

Spencer Abson (1):
  AArch64: Fold builtins with highpart args to highpart equivalent
[PR117850]

 gcc/config/aarch64/aarch64-builtin-pairs.def  |  81 ++
 gcc/config/aarch64/aarch64-builtins.cc| 206 +
 .../aarch64/simd/fold_to_highpart_1.c | 733 ++
 .../aarch64/simd/fold_to_highpart_2.c |  86 ++
 .../aarch64/simd/fold_to_highpart_3.c |  81 ++
 .../aarch64/simd/fold_to_highpart_4.c |  77 ++
 .../aarch64/simd/fold_to_highpart_5.c |  38 +
 .../aarch64/simd/fold_to_highpart_6.c |  94 +++
 .../aarch64/simd/fold_to_highpart_7.c |  36 +
 .../gcc.target/aarch64/simd/vabal_combine.c   |  72 --
 10 files changed, 1432 insertions(+), 72 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c
 delete mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c

-- 
2.34.1



[PATCH 0/1] middle-end: Fix operation_could_trap_p for FIX_TRUNC_EXPR

2025-05-14 Thread Spencer Abson
Floating-point to integer conversions can be inexact or invalid (e.g., the
latter due to overflow or NaN).  However, since users of operation_could_trap_p
infer the bool FP_OPERATION argument from the expression's type, FIX_TRUNC_EXPR
is considered non-trapping here.  This patch handles FIX_TRUNC_EXPR explicitly.

Floating-point extensions/truncations and integer to floating-point conversions
are considered trapping via the default case, as their expressions have a
floating-point type.  I believe this is the correct behavior here, since
floating-point extensions/truncations are clearly potentially trapping, and
integer to floating-point conversions can be inexact for extreme integer values.
Perhaps it's worth being explicit here too?
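
For example, int-to-float conversion of 2^24 + 1 is inexact, since that value
is not representable in float:

  int i = 16777217;     /* 2^24 + 1 */
  float f = (float) i;  /* rounds to 16777216.0f, raising FE_INEXACT */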

I stumbled upon this behavior through ifcvt; see 
https://godbolt.org/z/h7qMc4ebx.
There might be a simpler or better way to test this change, but I'm not sure.

Bootstrapped and regtested on aarch64-linux-gnu.  This change causes
sve/pr96357.c to fail, as it can no longer be vectorized without
-fno-trapping-math.  The PR is a failure involving SVE_STRICT_GP, so I'm
hesitant to add that flag.  I suspect we may want a better test for the issue
fixed by commit 0eb5e901f6e2, if it is still relevant.

Thanks

Spencer Abson (1):
  middle-end: Fix operation_could_trap_p for FIX_TRUNC_EXPR

 .../gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c| 18 ++
 .../gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c|  6 ++
 gcc/tree-eh.cc |  5 +
 3 files changed, 29 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c

-- 
2.34.1



[PATCH 1/1] middle-end: Fix operation_could_trap_p for FIX_TRUNC_EXPR

2025-05-14 Thread Spencer Abson
Floating-point to integer conversions can be inexact or invalid (e.g., due to
overflow or NaN).  However, since users of operation_could_trap_p infer the
bool FP_OPERATION argument from the expression's type, FIX_TRUNC_EXPR is
considered non-trapping here.

This patch handles FIX_TRUNC_EXPR explicitly.
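
For example, both conversions below raise FE_INVALID for the inputs noted,
and so may trap under -ftrapping-math:

  int from_nan (float f)   { return (int) f; }  /* f = NaN */
  int from_huge (double d) { return (int) d; }  /* d = 1.0e30, out of range */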

gcc/ChangeLog:

* tree-eh.cc (operation_could_trap_helper_p):  Cover FIX_TRUNC_EXPR
explicitly.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c: New test.
* gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c: New test.
---
 .../gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c| 18 ++
 .../gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c|  6 ++
 gcc/tree-eh.cc |  5 +
 3 files changed, 29 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c
new file mode 100644
index 000..c65441ad679
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-ifcvt-stats" } */
+
+void test (int *dst, float *arr, int *pred, int n)
+{
+  for (int i = 0; i < n; i++)
+{
+  int pred_i = pred[i];
+  float arr_i = arr[i];
+
+  dst[i] = pred_i ? (int)arr_i : 5;
+}
+}
+
+/* We expect this to fail if_convertible_loop_p so long as we have no
+   conditional IFN for FIX_TRUNC_EXPR.  */
+
+/* { dg-final { scan-tree-dump-times "Applying if-conversion" 0 "ifcvt" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c
new file mode 100644
index 000..628b754e94d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math 
-fdump-tree-ifcvt-stats" } */
+
+#include "ifcvt-fix-trunc-1.c"
+
+/* { dg-final { scan-tree-dump-times "Applying if-conversion" 1 "ifcvt" } } */
diff --git a/gcc/tree-eh.cc b/gcc/tree-eh.cc
index a4d59954c05..154e8fafb12 100644
--- a/gcc/tree-eh.cc
+++ b/gcc/tree-eh.cc
@@ -2538,6 +2538,11 @@ operation_could_trap_helper_p (enum tree_code op,
   /* Constructing an object cannot trap.  */
   return false;
 
+case FIX_TRUNC_EXPR:
+  /* Don't rely on FP_OPERATION to cover FP->INT conversions,
+since the complete expression does not have a FP type.  */
+  return flag_trapping_math;
+
 case COND_EXPR:
 case VEC_COND_EXPR:
   /* Whether *COND_EXPR can trap depends on whether the
-- 
2.34.1



[pushed] MAINTAINERS: add myself to write after approval

2025-05-16 Thread Spencer Abson
ChangeLog:

* MAINTAINERS: Add myself to write after approval.
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index a3e3f25d9d1..8993d176c22 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -329,6 +329,7 @@ from other maintainers or reviewers.
 Name    BZ account  Email
 
 Soumya AR   soumyaa 
+Spencer Abson   sabson  
 Mark G. Adams   mgadams 
 Ajit Kumar Agarwal  aagarwa 
 Pedro Alves palves  
-- 
2.34.1



Re: [PATCH v2 1/1] middle-end: Fix operation_could_trap_p for FIX_TRUNC expressions

2025-06-03 Thread Spencer Abson
On Tue, Jun 03, 2025 at 03:26:40PM +0200, Richard Biener wrote:
> On Tue, Jun 3, 2025 at 3:09 PM Spencer Abson  wrote:
> >
> > Floating-point to integer conversions can be inexact or invalid (e.g., due 
> > to
> > overflow or NaN).  However, since users of operation_could_trap_p infer the
> > bool FP_OPERATION argument from the expression's type, the FIX_TRUNC family
> > are considered non-trapping here.
> >
> > This patch handles them explicitly.
> >
> > gcc/ChangeLog:
> >
> > * tree-eh.cc (operation_could_trap_helper_p): Cover FIX_TRUNC
> > expressions explicitly.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/sve/pr96357.c: Change to avoid producing
> > a conditional FIX_TRUNC_EXPR, whilst still reproducing the bug
> > in PR96357.
> > * gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c: New test.
> > * gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c: Likewise.
> > ---
> >  .../gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c   | 19 +++
> >  .../gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c   |  6 ++
> >  .../gcc.target/aarch64/sve/pr96357.c  |  8 
> >  gcc/tree-eh.cc|  7 +++
> >  4 files changed, 36 insertions(+), 4 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c
> >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c
> >
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c
> > new file mode 100644
> > index 000..801a53fa30b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c
> > @@ -0,0 +1,19 @@
> > +  /* { dg-do compile } */
> > +  /* { dg-options "-O2 -ftree-vectorize -fdump-tree-ifcvt-stats" } */
> > +
> > +void
> > +test (int *dst, float *arr, int *pred, int n)
> > +{
> > +  for (int i = 0; i < n; i++)
> > +{
> > +  int pred_i = pred[i];
> > +  float arr_i = arr[i];
> > +
> > +  dst[i] = pred_i ? (int)arr_i : 5;
> > +}
> > +}
> > +
> > +/* We expect this to fail if_convertible_loop_p so long as we have no
> > +   conditional IFN for FIX_TRUNC_EXPR.  */
> > +
> > +/* { dg-final { scan-tree-dump-times "Applying if-conversion" 0 "ifcvt" } 
> > } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c
> > new file mode 100644
> > index 000..628b754e94d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math 
> > -fdump-tree-ifcvt-stats" } */
> > +
> > +#include "ifcvt-fix-trunc-1.c"
> > +
> > +/* { dg-final { scan-tree-dump-times "Applying if-conversion" 1 "ifcvt" } 
> > } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr96357.c 
> > b/gcc/testsuite/gcc.target/aarch64/sve/pr96357.c
> > index 9a7f912e529..6dd0409f3c8 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/pr96357.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr96357.c
> > @@ -5,10 +5,10 @@ int d;
> >
> >  void
> >  f1(char f, char *g, char *h, char *l, char *n) {
> > -  double i = d, j = 1.0 - f, k = j ? d : j;
> > -  if (k == 1.0)
> > -i = 0.0;
> 
> So why does if-conversion  not work with the previous variant?

The body of the previous variant of f1 becomes (after ch_vect):

  ...

  double _70;

  ...

  _34 = d.0_22 == 1;
  _35 = j_26 != 0.0;
  _36 = _34 & _35;
  if (_36 != 0)
goto ; [17.00%]
  else
goto ; [83.00%]

   [local count: 882293657]:
  _70 = i_23 * 5.0e-1;
  _72 = (char) _70;

   [local count: 1063004408]:
  # prephitmp_73 = PHI <0(3), _72(4)>

  ...

The interesting expression being _72 = (char) _70.  if-conversion would have
previously converted this part to:

  _34 = d.0_22 == 1;
  _35 = j_26 != 0.0;
  _36 = _34 & _35;
  _77 = ~_36;
  _70 = .COND_MUL (_77, i_23, 5.0e-1, i_23);
  _72 = (char) _70;
  prephitmp_73 = _36 ? 0 : _72;

Which is invalid given FIX_TRUNC_EXPR can trap.  We have no predicated form
of this expression, so it fails if_convertible_gimple_assign_stmt_p with this
patch applied.

The change to f1 gets rid of the FIX_TRUNC_EXPR.

Thanks,
Spencer
> 
> > -  *l = *n = *g = *h = i * 0.5;
> > +  double j = 1.0 - f, k = j ? d : j;
> > +
> > +  char i = (k == 1.0) ? 10 : 50;

[PATCH 14/14] aarch64: Add support for unpacked SVE FP conditional ternary arithmetic

2025-06-02 Thread Spencer Abson
This patch extends the expander for fma, fnma, fms, and fnms to support
partial SVE FP modes.

We add the missing BF16 tests, which we can now trigger for having
implemented the conditional expander.

We also add tests for the 'merging with multiplicand' case, which this
expander canonicalizes (albeit under SVE_STRICT_GP).
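
As an illustrative example (not one of the new tests verbatim), an unpacked
conditional FMLA can arise from a loop like the one below, where the 64-bit
elements of pred leave the single-precision data in a partial VNx2SF vector
(assuming flags along the lines of -O2 -march=armv8-a+sve -ffast-math):

void
f (float *restrict a, float *restrict b, float *restrict c,
   long *restrict pred, int n)
{
  for (int i = 0; i < n; i++)
    if (pred[i])
      a[i] = a[i] + b[i] * c[i];
}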

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (@cond_<optab><mode>): Extend
to support partial FP modes.
(*cond_<optab><mode>_2_strict): Extend from SVE_FULL_F to SVE_F,
use aarch64_predicate_operand.
(*cond_<optab><mode>_4_strict): Extend from SVE_FULL_F_B16B16 to
SVE_F_B16B16, use aarch64_predicate_operand.
(*cond_<optab><mode>_any_strict): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/unpacked_cond_fmla_1.c: Add test cases
for merging with the multiplicand.
* gcc.target/aarch64/sve/unpacked_cond_fmls_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fmla_2.c: New test.
* gcc.target/aarch64/sve/unpacked_cond_fmls_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fnmla_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fnmls_2.c: Likewise.
* g++.target/aarch64/sve/unpacked_cond_ternary_bf16_1.C: Likewise.
* g++.target/aarch64/sve/unpacked_cond_ternary_bf16_2.C: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md | 61 ++-
 .../sve/unpacked_cond_ternary_bf16_1.C| 35 +++
 .../sve/unpacked_cond_ternary_bf16_2.C| 14 +
 .../aarch64/sve/unpacked_cond_fmla_1.c|  4 ++
 .../aarch64/sve/unpacked_cond_fmla_2.c| 18 ++
 .../aarch64/sve/unpacked_cond_fmls_1.c|  4 ++
 .../aarch64/sve/unpacked_cond_fmls_2.c| 18 ++
 .../aarch64/sve/unpacked_cond_fnmla_1.c   |  4 ++
 .../aarch64/sve/unpacked_cond_fnmla_2.c   | 18 ++
 .../aarch64/sve/unpacked_cond_fnmls_1.c   |  4 ++
 .../aarch64/sve/unpacked_cond_fnmls_2.c   | 18 ++
 11 files changed, 169 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_ternary_bf16_1.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_ternary_bf16_2.C
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_2.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index e5443980e8b..278f78960a6 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -7599,17 +7599,17 @@
 
 ;; Predicated floating-point ternary operations with merging.
 (define_expand "@cond_"
-  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
-   (unspec:SVE_FULL_F_B16B16
+  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+   (unspec:SVE_F_B16B16
  [(match_operand:<VPRED> 1 "register_operand")
-  (unspec:SVE_FULL_F_B16B16
+  (unspec:SVE_F_B16B16
 [(match_dup 1)
  (const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
 SVE_COND_FP_TERNARY)
-  (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")]
+  (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")]
  UNSPEC_SEL))]
   "TARGET_SVE && ( || !)"
 {
@@ -7617,6 +7617,9 @@
  second of the two.  */
   if (rtx_equal_p (operands[3], operands[5]))
 std::swap (operands[2], operands[3]);
+
+  if (rtx pred = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]))
+operands[1] = pred;
 })
 
 ;; Predicated floating-point ternary operations, merging with the
@@ -7646,15 +7649,15 @@
 )
 
 (define_insn "*cond__2_strict"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-   (unspec:SVE_FULL_F
- [(match_operand:<VPRED> 1 "register_operand")
-  (unspec:SVE_FULL_F
+  [(set (match_operand:SVE_F 0 "register_operand")
+   (unspec:SVE_F
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
+  (unspec:SVE_F
 [(match_dup 1)
  (const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ 

Re: [PATCH 09/14] aarch64: Add support for unpacked SVE FDIV

2025-06-11 Thread Spencer Abson
On Tue, Jun 10, 2025 at 07:54:31PM +0100, Richard Sandiford wrote:
> Spencer Abson  writes:
> > On Fri, Jun 06, 2025 at 12:46:32PM +0100, Richard Sandiford wrote:
> >> Spencer Abson  writes:
> >> > This patch extends the unpredicated FP division expander to support
> >> > partial FP modes.  It extends the existing patterns used to implement
> >> > UNSPEC_COND_FDIV and it's approximation as needed.
> >> >
> >> > gcc/ChangeLog:
> >> >
> >> >  * config/aarch64/aarch64-sve.md: (@aarch64_sve_<optab><mode>):
> >> >  Extend from SVE_FULL_F to SVE_F, use aarch64_predicate_operand.
> >> >  (div<mode>3): Extend from SVE_FULL_F to SVE_F.
> >> >  (@aarch64_frecpe<mode>): Likewise.
> >> >  (@aarch64_frecps<mode>): Likewise.
> >> >
> >> > gcc/testsuite/ChangeLog:
> >> >
> >> >  * gcc.target/aarch64/sve/unpacked_fdiv_1.c: New test.
> >> >  * gcc.target/aarch64/sve/unpacked_fdiv_2.c: Likewise.
> >> >  * gcc.target/aarch64/sve/unpacked_fdiv_3.c: Likewise.
> >> > ---
> >> >  gcc/config/aarch64/aarch64-sve.md | 50 +--
> >> >  .../gcc.target/aarch64/sve/unpacked_fdiv_1.c  | 34 +
> >> >  .../gcc.target/aarch64/sve/unpacked_fdiv_2.c  | 11 
> >> >  .../gcc.target/aarch64/sve/unpacked_fdiv_3.c  | 11 
> >> >  4 files changed, 81 insertions(+), 25 deletions(-)
> >> >  create mode 100644 
> >> > gcc/testsuite/gcc.target/aarch64/sve/unpacked_fdiv_1.c
> >> >  create mode 100644 
> >> > gcc/testsuite/gcc.target/aarch64/sve/unpacked_fdiv_2.c
> >> >  create mode 100644 
> >> > gcc/testsuite/gcc.target/aarch64/sve/unpacked_fdiv_3.c
> >> >
> >> > diff --git a/gcc/config/aarch64/aarch64-sve.md 
> >> > b/gcc/config/aarch64/aarch64-sve.md
> >> > index cdad900d9cf..79a087837de 100644
> >> > --- a/gcc/config/aarch64/aarch64-sve.md
> >> > +++ b/gcc/config/aarch64/aarch64-sve.md
> >> > @@ -3752,9 +3752,9 @@
> >> >  
> >> >  ;; Unpredicated floating-point unary operations.
> >> >  (define_insn "@aarch64_sve_<optab><mode>"
> >> > -  [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w")
> >> > -(unspec:SVE_FULL_F
> >> > -  [(match_operand:SVE_FULL_F 1 "register_operand" "w")]
> >> > +  [(set (match_operand:SVE_F 0 "register_operand" "=w")
> >> > +(unspec:SVE_F
> >> > +  [(match_operand:SVE_F 1 "register_operand" "w")]
> >> >SVE_FP_UNARY))]
> >> >"TARGET_SVE"
> >> >"\t%0., %1."
> >> 
> >> I agree the patch is correct for the current definitions of SVE_FP_UNARY
> >> and SVE_FP_BINARY.  Since the names are generic, though, I think it
> >> would be worth adding a comment to iterators.md above the definition
> >> of both iterators, saying something like:
> >> 
> >> ;; This iterator is currently only used for estimation instructions,
> >> ;; where lax handling of exceptions is assumed to be acceptable.
> >> ;; The iterator is therefore applied unconditionally to partial FP modes.
> >> ;; This would need to be revisited if new operations are added in future.
> >> 
> >> (feel free to reword)
> >> 
> >> The patch LGTM with that change.
> >> 
> >> That said, I suppose the documentation of the -mlow-precision-*
> >> options doesn't explicitly state that they change exception behaviour.
> >> Maybe it would be safer to leave the reciprocal stuff out for now,
> >> even though wanting low-precision results with strict exception
> >> conformance seems like an odd combination.  Does anyone else have
> >> any thoughts?
> >
> > Yeah, I agree that it's not immediately clear whether -mlow-precision-*
> > alone justifies this.  I wouldn't have made this change if the low-
> > precision expansion wasn't predicated on all of:
> >
> >   if (!flag_finite_math_only
> >   || flag_trapping_math
> >   || !flag_unsafe_math_optimizations
> >   || optimize_function_for_size_p (cfun)
> >   || !use_approx_division_p)
> > return false;
> >
> > Which, IIUC, reflects the fact that it also requires -ffast-math or
> > -funsafe-math-optimizations.
> 
> Ah, OK.
> >
> > I should have placed an assertion (or something similar) to make sure
> > that we have !flag_trapping_math when the low-precision expander is
> > handling partial vector modes.
> >
> > Perhaps something for V2?  Happy to drop it for now if not.
> 
> I think in that case we can just change the comment I suggested to:
> 
> ;; This iterator is currently only used for estimation instructions,
> ;; which are never generated automatically when -ftrapping-math is true.
> ;; The iterator is therefore applied unconditionally to partial FP modes.
> ;; This might need to be revisited if new operations are added in future.
> 
> It can be tricky for expanders to assert that the caller is sane,
> because expanders generally don't have as much context as the caller does.
> For example, the instructions could theoretically be protected by a
> test for trouble-free inputs, a bit like for -ftree-builtin-call-dce.

Thank you, sounds good.

Spencer
> 
> Thanks,
> Richard


Re: [PATCH 11/14] aarch64: Add support for unpacked SVE FP conditional binary arithmetic

2025-06-11 Thread Spencer Abson
On Tue, Jun 10, 2025 at 08:04:06PM +0100, Richard Sandiford wrote:
> Spencer Abson  writes:
> > On Fri, Jun 06, 2025 at 03:52:12PM +0100, Richard Sandiford wrote:
> >> Spencer Abson  writes:
> >> > @@ -8165,20 +8169,25 @@
> >> >  ;;
> >> >  ;; For unpacked vectors, it doesn't really matter whether SEL uses the
> >> >  ;; the container size or the element size.  If SEL used the container 
> >> > size,
> >> > -;; it would ignore undefined bits of the predicate but would copy the
> >> > -;; upper (undefined) bits of each container along with the defined bits.
> >> > -;; If SEL used the element size, it would use undefined bits of the 
> >> > predicate
> >> > -;; to select between undefined elements in each input vector.  Thus the 
> >> > only
> >> > -;; difference is whether the undefined bits in a container always come 
> >> > from
> >> > -;; the same input as the defined bits, or whether the choice can vary
> >> > +;; it would ignore bits of the predicate that can be undefined, but 
> >> > would copy
> >> > +;; the upper (undefined) bits of each container along with the defined 
> >> > bits.
> >> > +;; If SEL used the element size, it would use bits from the predicate 
> >> > that can
> >> > +;; be undefined to select between undefined elements in each input 
> >> > vector.
> >> > +;; Thus the only difference is whether the undefined bits in a 
> >> > container always
> >> > +;; come from the same input as the defined bits, or whether the choice 
> >> > can vary
> >> 
> >> It looks like the main change here is to replace "undefined bits of the
> >> predicate" with "bits of the predicate that can be undefined".  Could
> >> you go into more detail about the distinction?  It seems to be saying
> >> that the upper bits in each predicate are sometimes defined and
> >> sometimes not.
> >> 
> >> As I see it, the "level of undefinedness" is really the same for the
> >> predicates and data vectors.  Within each container element, the bits
> >> that hold the element value are defined/significant and the other bits
> >> are undefined/insignificant/have arbitrary values.  The same thing
> >> goes for the upper bits in each predicate element: the low bit is
> >> defined/significant and the other bits are undefined/insignificant/have
> >> arbitrary values.  They might by chance be zeros when zeros are
> >> convenient or ones when ones are convenient, but the semantics don't
> >> guarantee anything either way.
> >
> > Yes, I agree.  Sorry, my change was not very clear.
> >
> > What I was trying to reflect is that, for example, selecting between
> > a pair of VNx4HF using VNx8BI is now a recognised insn.  However, any
> > bits of a VNx8BI that are not significant to a VNx4BI are don't-care
> > wrt the result.
> >
> > I meant that they 'can be undefined' in that they are as good as
> > undefined, for the purpose of SEL.  Maybe a better change would
> > have been to reword this paragraph in favour of 'don't-care' rather
> > than 'undefined' when describing the upper bits of each predicate
> > element?
> 
> Ah, I see.  In that case, how about a bigger edit:
> 
> ;; For unpacked vectors, it doesn't really matter whether SEL uses the
> ;; container size or the element size.  If SEL used the container size,
> ;; it would copy the upper (undefined) bits of each container along
> ;; with the corresponding defined bits.  If SEL used the element size,
> ;; it would use separate predicate bits to select between the undefined
> ;; elements in each input vector; these separate predicate bits might
> ;; themselves be undefined, depending on the mode of the predicate.
> ;;
> ;; Thus the only difference is whether the undefined bits in a container
> ;; always come from the same input as the defined bits, or whether the
> ;; choice can vary independently of the defined bits.

Yeah, that's much clearer.

Thanks for this review, I'll apply your suggestions and commit the
patches with minor changes - with the vectorizer test change applied
across the series where applicable.

Unless I run into something unexpected, it looks like I may only need
to repost this patch (11/14) and patch 04; does that sound right to
you?

I'll keep track of the problems discussed in the cover note.

Thanks,
Spencer
> 
> Thanks,
> Richard


Re: [PATCH 11/14] aarch64: Add support for unpacked SVE FP conditional binary arithmetic

2025-06-09 Thread Spencer Abson
On Fri, Jun 06, 2025 at 03:52:12PM +0100, Richard Sandiford wrote:
> Spencer Abson  writes:
> > @@ -8165,20 +8169,25 @@
> >  ;;
> >  ;; For unpacked vectors, it doesn't really matter whether SEL uses the
> >  ;; the container size or the element size.  If SEL used the container size,
> > -;; it would ignore undefined bits of the predicate but would copy the
> > -;; upper (undefined) bits of each container along with the defined bits.
> > -;; If SEL used the element size, it would use undefined bits of the 
> > predicate
> > -;; to select between undefined elements in each input vector.  Thus the 
> > only
> > -;; difference is whether the undefined bits in a container always come from
> > -;; the same input as the defined bits, or whether the choice can vary
> > +;; it would ignore bits of the predicate that can be undefined, but would 
> > copy
> > +;; the upper (undefined) bits of each container along with the defined 
> > bits.
> > +;; If SEL used the element size, it would use bits from the predicate that 
> > can
> > +;; be undefined to select between undefined elements in each input vector.
> > +;; Thus the only difference is whether the undefined bits in a container 
> > always
> > +;; come from the same input as the defined bits, or whether the choice can 
> > vary
> 
> It looks like the main change here is to replace "undefined bits of the
> predicate" with "bits of the predicate that can be undefined".  Could
> you go into more detail about the distinction?  It seems to be saying
> that the upper bits in each predicate are sometimes defined and
> sometimes not.
> 
> As I see it, the "level of undefinedness" is really the same for the
> predicates and data vectors.  Within each container element, the bits
> that hold the element value are defined/significant and the other bits
> are undefined/insignificant/have arbitrary values.  The same thing
> goes for the upper bits in each predicate element: the low bit is
> defined/significant and the other bits are undefined/insignificant/have
> arbitrary values.  They might by chance be zeros when zeros are
> convenient or ones when ones are convenient, but the semantics don't
> guarantee anything either way.

Yes, I agree.  Sorry, my change was not very clear.

What I was trying to reflect is that, for example, selecting between
a pair of VNx4HF using VNx8BI is now a recognised insn.  However, any
bits of a VNx8BI that are not significant to a VNx4BI are don't-care
wrt the result.

I meant that they 'can be undefined' in that they are as good as
undefined, for the purpose of SEL.  Maybe a better change would
have been to reword this paragraph in favour of 'don't-care' rather
than 'undefined' when describing the upper bits of each predicate
element?

> 
> >  ;; independently of the defined bits.
> >  ;;
> >  ;; For the other instructions, using the element size is more natural,
> >  ;; so we do that for SEL as well.
> > +;;
> > +;; The use of 'aarch64_predicate_operand' here is only to support the FP 
> > arithmetic/
> > +;; UNSPEC_SEL combiner patterns.  As with those operations, any predicate 
> > bits that
> > +;; are insignificant to the data mode will have no effect on the 
> > operation's result.
> 
> Sorry for the formatting nit, but: long lines.  The comment itself looks good
> though.
> 
> > +;;
> >  (define_insn "*vcond_mask_"
> >[(set (match_operand:SVE_ALL 0 "register_operand")
> > (unspec:SVE_ALL
> > - [(match_operand: 3 "register_operand")
> > + [(match_operand: 3 "aarch64_predicate_operand")
> >(match_operand:SVE_ALL 1 "aarch64_sve_reg_or_dup_imm")
> >(match_operand:SVE_ALL 2 "aarch64_simd_reg_or_zero")]
> >   UNSPEC_SEL))]
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 287de0f5ae4..d38b108c5f4 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -3893,6 +3893,33 @@ aarch64_sve_fp_pred (machine_mode data_mode, rtx 
> > *strictness)
> > return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
> >  }
> >  
> > +/* If DATA_MODE is a partial vector mode, emit a sequence of insns to
> > +   zero-out the predicate bits of an existing natural GP, PRED, associated
> > +   with the undefined elements in each container
> 
> This makes it sound unconditional, whereas it's really conditional on
> flag_trapping_math.
> 
> Also, I'd see this more as converting a 

Re: [PATCH 04/14] aarch64: Add support for unpacked SVE FP comparisons

2025-06-09 Thread Spencer Abson
On Fri, Jun 06, 2025 at 10:02:19AM +0100, Richard Sandiford wrote:
> Spencer Abson  writes:
> > @@ -27292,10 +27291,16 @@ aarch64_emit_sve_invert_fp_cond (rtx target, 
> > rtx_code code, rtx pred,
> >  void
> >  aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx 
> > op1)
> >  {
> > -  machine_mode pred_mode = GET_MODE (target);
> >machine_mode data_mode = GET_MODE (op0);
> > +  rtx pred = aarch64_sve_fp_pred (data_mode, nullptr);
> >  
> > -  rtx ptrue = aarch64_ptrue_reg (pred_mode);
> > +  /* The governing and destination modes.  */
> > +  machine_mode pred_mode = GET_MODE (pred);
> > +  machine_mode target_mode = GET_MODE (target);
> > +
> > +  /* Also determines SVE_KNOWN_PTRUE, since an unnatural GP from
> > + sve_fp_pred would disable part of the operation.   */
> > +  bool natural_p = pred_mode == target_mode;
> 
> I'm not sure about calling this "unnatural".  The SVE predicate system
> was designed with this kind of flexibility in mind.  How about:
> 
>   /* For partial vector modes, the choice of predicate mode depends
>  on whether we need to suppress exceptions for inactive elements.
>  If we do need to suppress exceptions, the predicate mode matches
>  the element size rather than the container size and the predicate
>  marks the upper bits in each container as inactive.  The predicate
>  is then a ptrue wrt target_mode but not wrt pred_mode.  It is the
>  ptrueness wrt pred_mode that matters here.
> 
>  If we don't need to suppress exceptions, the predicate mode matches
>  the container size, pred_mode == target_mode, and the predicate is
>  thus a ptrue wrt both target_mode and pred_mode.  */
>   bool known_ptrue_p = pred_mode == target_mode;

OK, I think referring to containers and elements is a good call.  Maybe
we ought to add a comment above the definition of VPRED, along the
lines of:

;; For partial vector modes, this is the predicate mode associated
;; with the container size.

Then your suggestion for patch 06 sounds good too.

> 
> There again, maybe my comment makes no sense to anyone other than me,
> so please do push back if you have a better suggestion!

Only that perhaps the last part of the first section could be:

/*  The predicate is then a ptrue wrt target_mode but not wrt pred_mode;
it is the latter that matters here.  */

I'll be adding 'ptrueness' to my dictionary either way! :)

> 
> >switch (code)
> >  {
> >  case UNORDERED:
> > [...]
> > @@ -27333,11 +27338,21 @@ aarch64_expand_sve_vec_cmp_float (rtx target, 
> > rtx_code code, rtx op0, rtx op1)
> >  case UNGE:
> >if (flag_trapping_math)
> > {
> > - /* Work out which elements are ordered.  */
> > - rtx ordered = gen_reg_rtx (pred_mode);
> >   op1 = force_reg (data_mode, op1);
> > - aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
> > -  ptrue, true, op0, op1);
> > +
> > + /* Work out which elements are unordered.  */
> > + rtx uo_tmp = gen_reg_rtx (target_mode);
> > + aarch64_emit_sve_fp_cond (uo_tmp, UNORDERED, pred, natural_p,
> > +   op0, op1);
> > +
> > + /* Invert the result.  Use PRED again to maintain the intended
> > +trapping behavior.  */
> > + if (!natural_p)
> > +   uo_tmp = gen_lowpart (pred_mode, uo_tmp);
> 
> The !natural_p test isn't necessary here, and IMO it's slightly easier
> to follow the code without it.  The lowpart is a natural component of
> the following XOR/NOT operation and is necessary to make the operation
> well-typed.
> 
> > +
> > + rtx ordered = gen_reg_rtx (pred_mode);
> > + emit_insn (gen_aarch64_pred_z (XOR, pred_mode,
> > +ordered, pred, pred, uo_tmp));
> 
> Although the underlying instruction is an EOR, (and (xor a b) a) isn't
> canonical rtl: it should be folded to (and (not b) a) instead.
> 
> So I think we should rename:
> 
> ;; Predicated predicate inverse.
> (define_insn "*one_cmpl3"
>   [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
>   (and:PRED_ALL
> (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa"))
> (match_operand:PRED_ALL 1 "register_operand" "Upa")))]
>   "TARGET_SVE"
>   "not\t%0.b, %1/z, %2.b"
> )
> 
> to "@aarch64_pred_one_cmpl_z" and use gen_aarch64_pred_one_compl_z
> here.  (Not the most elegant instruction name, but I suppose we should
> be consistent...)
> 
> This will need updates to the testcase to match NOT rather than EOR.

Thanks for the catch & fix, sounds good.
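
For anyone reading along, the fold is the bitwise identity
a & (a ^ b) == a & ~b, which is easy to sanity-check:

#include <assert.h>

int
main (void)
{
  /* Exhaustive check over a few bits: where a bit of 'a' is 1,
     (a ^ b) equals ~b there; where it is 0, both sides are 0.  */
  for (unsigned a = 0; a < 16; a++)
    for (unsigned b = 0; b < 16; b++)
      assert ((a & (a ^ b)) == (a & ~b));
  return 0;
}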

> 
> OK with those changes, thanks.
> 
> Richard


Re: [PATCH 08/14] aarch64: Add support for unpacked SVE FP binary arithmetic

2025-06-09 Thread Spencer Abson
On Fri, Jun 06, 2025 at 12:18:15PM +0100, Richard Sandiford wrote:
> Spencer Abson  writes:
> > This patch extends the expanders for unpredicated smax, smin, add, sub,
> > mul, min, and max, so that they support partial SVE FP modes.
> >
> > The relevant insn/split patterns have also been updated.
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-sve.md (3): Extend from
> > SVE_FULL_F to SVE_F, and use aarch64_sve_fp_pred.
> > (@aarch64_pred_): Extend from SVE_FULL_F to SVE_F,
> > use aarch64_predicate_operand.  (ADD/SUB/MUL/MAX/MIN).
> > * config/aarch64/aarch64-sve2.md: Likewise, for BF16 operations.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * g++.target/aarch64/sve/unpacked_binary_bf16_1.C: New test.
> > * g++.target/aarch64/sve/unpacked_binary_bf16_2.C: Likewise.
> > * gcc.target/aarch64/sve/unpacked_builtin_fmax_1.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_builtin_fmax_2.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_builtin_fmin_1.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_builtin_fmin_2.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_fadd_1.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_fadd_2.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_fmaxnm_1.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_fmaxnm_2.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_fminnm_1.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_fminnm_2.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_fmul_1.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_fmul_2.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_fsubr_1.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_fsubr_2.c: Likewise.
> 
> OK, thanks.
> 
> > ---
> >  gcc/config/aarch64/aarch64-sve.md | 70 +--
> >  gcc/config/aarch64/aarch64-sve2.md| 10 +--
> >  .../aarch64/sve/unpacked_binary_bf16_1.C  | 35 ++
> >  .../aarch64/sve/unpacked_binary_bf16_2.C  | 15 
> >  .../aarch64/sve/unpacked_builtin_fmax_1.c | 40 +++
> >  .../aarch64/sve/unpacked_builtin_fmax_2.c | 16 +
> >  .../aarch64/sve/unpacked_builtin_fmin_1.c | 40 +++
> >  .../aarch64/sve/unpacked_builtin_fmin_2.c | 16 +
> >  .../gcc.target/aarch64/sve/unpacked_fadd_1.c  | 48 +
> >  .../gcc.target/aarch64/sve/unpacked_fadd_2.c  | 22 ++
> >  .../aarch64/sve/unpacked_fmaxnm_1.c   | 41 +++
> >  .../aarch64/sve/unpacked_fmaxnm_2.c   | 16 +
> >  .../aarch64/sve/unpacked_fminnm_1.c   | 42 +++
> >  .../aarch64/sve/unpacked_fminnm_2.c   | 16 +
> >  .../gcc.target/aarch64/sve/unpacked_fmul_1.c  | 39 +++
> >  .../gcc.target/aarch64/sve/unpacked_fmul_2.c  | 14 
> >  .../gcc.target/aarch64/sve/unpacked_fsubr_1.c | 42 +++
> >  .../gcc.target/aarch64/sve/unpacked_fsubr_2.c | 16 +
> >  18 files changed, 498 insertions(+), 40 deletions(-)
> >  create mode 100644 
> > gcc/testsuite/g++.target/aarch64/sve/unpacked_binary_bf16_1.C
> >  create mode 100644 
> > gcc/testsuite/g++.target/aarch64/sve/unpacked_binary_bf16_2.C
> >  create mode 100644 
> > gcc/testsuite/gcc.target/aarch64/sve/unpacked_builtin_fmax_1.c
> >  create mode 100644 
> > gcc/testsuite/gcc.target/aarch64/sve/unpacked_builtin_fmax_2.c
> >  create mode 100644 
> > gcc/testsuite/gcc.target/aarch64/sve/unpacked_builtin_fmin_1.c
> >  create mode 100644 
> > gcc/testsuite/gcc.target/aarch64/sve/unpacked_builtin_fmin_2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fadd_1.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fadd_2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmaxnm_1.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmaxnm_2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fminnm_1.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fminnm_2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmul_1.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmul_2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fsubr_1.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fsubr_2.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve.md 
> > b/gcc/config/aarch64/aarch64-sve.md
> > index 76de511420f..cdad900d9cf 100644
> > --- a/gcc/config/aarch64/aarch6

Re: [PATCH 09/14] aarch64: Add support for unpacked SVE FDIV

2025-06-09 Thread Spencer Abson
On Fri, Jun 06, 2025 at 12:46:32PM +0100, Richard Sandiford wrote:
> Spencer Abson  writes:
> > This patch extends the unpredicated FP division expander to support
> > partial FP modes.  It extends the existing patterns used to implement
> > UNSPEC_COND_FDIV and its approximation as needed.
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-sve.md: (@aarch64_sve_):
> > Extend from SVE_FULL_F to SVE_F, use aarch64_predicate_operand.
> > (div3): Extend from SVE_FULL_F to SVE_F.
> > (@aarch64_frecpe): Likewise.
> > (@aarch64_frecps): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/sve/unpacked_fdiv_1.c: New test.
> > * gcc.target/aarch64/sve/unpacked_fdiv_2.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_fdiv_3.c: Likewise.
> > ---
> >  gcc/config/aarch64/aarch64-sve.md | 50 +--
> >  .../gcc.target/aarch64/sve/unpacked_fdiv_1.c  | 34 +
> >  .../gcc.target/aarch64/sve/unpacked_fdiv_2.c  | 11 
> >  .../gcc.target/aarch64/sve/unpacked_fdiv_3.c  | 11 
> >  4 files changed, 81 insertions(+), 25 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fdiv_1.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fdiv_2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fdiv_3.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve.md 
> > b/gcc/config/aarch64/aarch64-sve.md
> > index cdad900d9cf..79a087837de 100644
> > --- a/gcc/config/aarch64/aarch64-sve.md
> > +++ b/gcc/config/aarch64/aarch64-sve.md
> > @@ -3752,9 +3752,9 @@
> >  
> >  ;; Unpredicated floating-point unary operations.
> >  (define_insn "@aarch64_sve_"
> > -  [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w")
> > -   (unspec:SVE_FULL_F
> > - [(match_operand:SVE_FULL_F 1 "register_operand" "w")]
> > +  [(set (match_operand:SVE_F 0 "register_operand" "=w")
> > +   (unspec:SVE_F
> > + [(match_operand:SVE_F 1 "register_operand" "w")]
> >   SVE_FP_UNARY))]
> >"TARGET_SVE"
> >"\t%0., %1."
> 
> I agree the patch is correct for the current definitions of SVE_FP_UNARY
> and SVE_FP_BINARY.  Since the names are generic, though, I think it
> would be worth adding a comment to iterators.md above the definition
> of both iterators, saying something like:
> 
> ;; This iterator is currently only used for estimation instructions,
> ;; where lax handling of exceptions is assumed to be acceptable.
> ;; The iterator is therefore applied unconditionally to partial FP modes.
> ;; This would need to be revisited if new operations are added in future.
> 
> (feel free to reword)
> 
> The patch LGTM with that change.
> 
> That said, I suppose the documentation of the -mlow-precision-*
> options doesn't explicitly state that they change exception behaviour.
> Maybe it would be safer to leave the reciprocal stuff out for now,
> even though wanting low-precision results with strict exception
> conformance seems like an odd combination.  Does anyone else have
> any thoughts?

Yeah, I agree that it's not immediately clear whether -mlow-precision-*
alone justifies this.  I wouldn't have made this change if the low-
precision expansion wasn't predicated on all of:

  if (!flag_finite_math_only
  || flag_trapping_math
  || !flag_unsafe_math_optimizations
  || optimize_function_for_size_p (cfun)
  || !use_approx_division_p)
return false;

Which, IIUC, reflects the fact that it also requires -ffast-math or
-funsafe-math-optimizations.

I should have placed an assertion (or something similar) to make sure
that we have !flag_trapping_math when the low-precision expander is
handling partial vector modes.

Perhaps something for V2?  Happy to drop it for now if not.

> 
> Richard


Re: [PATCH 13/14] aarch64: Relaxed SEL combiner patterns for unpacked SVE FP ternary arithmetic

2025-06-09 Thread Spencer Abson
On Fri, Jun 06, 2025 at 04:04:18PM +0100, Richard Sandiford wrote:
> Spencer Abson  writes:
> > Extend the ternary op/UNSPEC_SEL combiner patterns from SVE_FULL_F/
> > SVE_FULL_F_BF to SVE_F/SVE_F_BF, where the strictness value is
> > SVE_RELAXED_GP.
> >
> > We can only reliably test the 'merging with the third input' (addend)
> > and 'independent value' patterns at this stage as the canonicalisation that
> > reorders the multiplicands based on the second SEL input would be performed
> > by the conditional expander.
> >
> > Another difficulty is that we can't test these fused multiply/SEL combines
> > without using __builtin_fma and friends.  The reason for this is as follows:
> >
> > We support COND_ADD, COND_SUB, and COND_MUL optabs, so match.pd will
> > canonicalize patterns like ADD/SUB/MUL combined with a VEC_COND_EXPR into
> > these conditional forms.  Later, when widening_mul tries to fold these into
> > conditional fused multiply operations, the transformation fails - simply
> > because we haven’t implemented those conditional fused multiply optabs yet.
> >
> > Hence why this patch lacks tests for BFloat16...
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-sve.md (*cond__2_relaxed):
> > Extend from SVE_FULL_F to SVE_F.
> > (*cond__4_relaxed): Extend from SVE_FULL_F_B16B16
> > to SVE_F_B16B16.
> > (*cond__any_relaxed): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/sve/unpacked_cond_fmla_1.c: New test.
> > * gcc.target/aarch64/sve/unpacked_cond_fmls_1.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c: Likewise.
> > * gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c: Likewise.
> 
> OK, thanks.
> 
> BTW, I just realised that all my comments saying "please add a token
> test that SEL doesn't get dropped" are probably completely bogus,
> since those cases will instead be handed by the follow-on patches
> to the conditional optabs.  Is that right?  Please ignore if so. :)

Yeah, that's true where we have a conditional optab, but that isn't
the case for any of the interesting unary operations.

Maybe I should include '_2.c'-style tests for each of those that
explicitly test for SEL?
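
Something like this, say (just a sketch), where FSQRT has no
conditional optab and the SEL should survive:

void
cond_sqrt (float *restrict x, long long *restrict sel, int n)
{
  /* No conditional optab for FSQRT, so we'd expect a separate SEL
     in the output rather than a merged predicated operation.  */
  for (int i = 0; i < n; i++)
    x[i] = sel[i] ? __builtin_sqrtf (x[i]) : x[i];
}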

Thanks,
Spencer
> 
> Sorry, I've been thinking about this a patch at a time while ignoring
> the bigger picture.
> 
> Richard
> 
> > ---
> >  gcc/config/aarch64/aarch64-sve.md | 38 
> >  .../aarch64/sve/unpacked_cond_fmla_1.c| 43 +++
> >  .../aarch64/sve/unpacked_cond_fmls_1.c| 43 +++
> >  .../aarch64/sve/unpacked_cond_fnmla_1.c   | 43 +++
> >  .../aarch64/sve/unpacked_cond_fnmls_1.c   | 43 +++
> >  5 files changed, 191 insertions(+), 19 deletions(-)
> >  create mode 100644 
> > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c
> >  create mode 100644 
> > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c
> >  create mode 100644 
> > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c
> >  create mode 100644 
> > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve.md 
> > b/gcc/config/aarch64/aarch64-sve.md
> > index 8c1921ddf5c..e5443980e8b 100644
> > --- a/gcc/config/aarch64/aarch64-sve.md
> > +++ b/gcc/config/aarch64/aarch64-sve.md
> > @@ -7622,15 +7622,15 @@
> >  ;; Predicated floating-point ternary operations, merging with the
> >  ;; first input.
> >  (define_insn_and_rewrite "*cond__2_relaxed"
> > -  [(set (match_operand:SVE_FULL_F 0 "register_operand")
> > -   (unspec:SVE_FULL_F
> > +  [(set (match_operand:SVE_F 0 "register_operand")
> > +   (unspec:SVE_F
> >   [(match_operand: 1 "register_operand")
> > -  (unspec:SVE_FULL_F
> > +  (unspec:SVE_F
> >  [(match_operand 5)
> >   (const_int SVE_RELAXED_GP)
> > - (match_operand:SVE_FULL_F 2 "register_operand")
> > - (match_operand:SVE_FULL_F 3 "register_operand")
> > - (match_operand:SVE_FULL_F 4 "register_operand")]
> > + (match_operand:SVE_F 2 "register_operand")
> > + (match_operand:SVE_F 3 "register_operand")
> > + (match_operand:SVE_F 4 "register_operand")]
> >  SVE_COND_FP_TERNARY)
> >(match_dup 2)]
> >   UNSPEC_SEL))]

Re: [PATCH 02/14] aarch64: Add support for unpacked SVE FP conversions

2025-06-09 Thread Spencer Abson
On Thu, Jun 05, 2025 at 06:11:44PM +0100, Richard Sandiford wrote:
> Spencer Abson  writes:
> > @@ -9487,21 +9489,39 @@
> >  ;; - FCVTZU
> >  ;; 
> > -
> >  
> > -;; Unpredicated conversion of floats to integers of the same size (HF to 
> > HI,
> > -;; SF to SI or DF to DI).
> > -(define_expand "2"
> > -  [(set (match_operand: 0 "register_operand")
> > -   (unspec:
> > +;; Unpredicated conversion of floats to integers of the same size or wider,
> > +;; excluding conversions from DF (see below).
> > +(define_expand "2"
> > +  [(set (match_operand:SVE_HSDI 0 "register_operand")
> > +   (unspec:SVE_HSDI
> > + [(match_dup 2)
> > +  (match_dup 3)
> > +  (match_operand:SVE_HSF 1 "register_operand")]
> > + SVE_COND_FCVTI))]
> > +  "TARGET_SVE
> > +   && (~( | ) & 
> > ) == 0"
> > +  {
> > +operands[2] = aarch64_sve_fp_pred (mode, &operands[3]);
> > +  }
> > +)
> > +
> > +;; SI <- DF can't use SI <- trunc (DI <- DF) without -ffast-math, so this
> > +;; truncating variant of FCVTZ{S,U} is useful for auto-vectorization.
> > +;;
> > +;; DF is the only source mode for which the mask used above doesn't apply,
> > +;; we define a separate pattern for it here.
> > +(define_expand "2"
> > +  [(set (match_operand:SVE_2SDI 0 "register_operand")
> > +   (unspec:SVE_2SDI
> >   [(match_dup 2)
> >(const_int SVE_RELAXED_GP)
> > -  (match_operand:SVE_FULL_F 1 "register_operand")]
> > +  (match_operand:VNx2DF_ONLY 1 "register_operand")]
> >   SVE_COND_FCVTI))]
> >"TARGET_SVE"
> >{
> > -operands[2] = aarch64_ptrue_reg (mode);
> > +operands[2] = aarch64_ptrue_reg (VNx2BImode);
> >}
> >  )
> > -
> 
> Sorry for the formatting nit, but: please keep the blank line.
> 
> >  ;; Predicated float-to-integer conversion, either to the same width or 
> > wider.
> >  (define_insn 
> > "@aarch64_sve__nontrunc"
> >[(set (match_operand:SVE_FULL_HSDI 0 "register_operand")
> > @@ -9517,18 +9537,34 @@
> >}
> >  )
> >  
> > +;; As above, for pairs used by the auto-vectorizer only.
> > +(define_insn 
> > "*aarch64_sve__nontrunc"
> > +  [(set (match_operand:SVE_HSDI 0 "register_operand")
> > +   (unspec:SVE_HSDI
> > + [(match_operand: 1 "aarch64_predicate_operand")
> > +  (match_operand:SI 3 "aarch64_sve_gp_strictness")
> > +  (match_operand:SVE_PARTIAL_F 2 "register_operand")]
> > + SVE_COND_FCVTI))]
> > +   "TARGET_SVE
> > +   && (~( | ) & 
> > ) == 0"
> > +  {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
> > + [ w, Upl , 0 ; *  ] 
> > fcvtz\t%0., %1/m, %2.
> > + [ ?&w  , Upl , w ; yes] movprfx\t%0, 
> > %2\;fcvtz\t%0., %1/m, %2.
> > +  }
> > +)
> > +
> >  ;; Predicated narrowing float-to-integer conversion.
> 
> I think it would be worth extending the comment here, in case it isn't
> obvious what's going on:
> 
> ;; Predicated narrowing float-to-integer conversion.  The VNx2DF->VNx4SI
> ;; variant is provided for the ACLE, where the zeroed odd-indexed lanes are
> ;; significant.  The VNx2DF->VNx2SI variant is provided for autovectorization,
> ;; where the odd-indexed lanes are ignored.

Sounds good, thanks.

> 
> > -(define_insn 
> > "@aarch64_sve__trunc"
> > -  [(set (match_operand:VNx4SI_ONLY 0 "register_operand")
> > -   (unspec:VNx4SI_ONLY
> > +(define_insn "@aarch64_sve__trunc"
> > +  [(set (match_operand:SVE_SI 0 "register_operand")
> > +   (unspec:SVE_SI
> >   [(match_operand:VNx2BI 1 "register_operand")
> >(match_operand:SI 3 "aarch64_sve_gp_strictness")
> >(match_operand:VNx2DF_ONLY 2 "register_operand")]
> >   SVE_COND_FCVTI))]
> >"TARGET_SVE"
> >{@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
> > - [ w, Upl , 0 ; *  ] 
> > fcvtz\t%0., %1/m, %2.
> > - [ ?&w  , Upl , w ; yes] movprfx\t%0, 
> > %2\;fcvtz\t%0., %1/m, %2.
> > + [ w, Upl , 0 ; *  ] 
> > fcvtz\t%0., %1/m, %2.
> > + [ ?&w  , 

Re: [PATCH 03/14] aarch64: Relaxed SEL combiner patterns for unpacked SVE FP conversions

2025-06-09 Thread Spencer Abson
On Thu, Jun 05, 2025 at 09:24:27PM +0100, Richard Sandiford wrote:
> Spencer Abson  writes:
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c 
> > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c
> > new file mode 100644
> > index 000..8f69232f2cf
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c
> > @@ -0,0 +1,47 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 
> > -fno-trapping-math" } */
> 
> The =2048 is ok, but do you need it for these autovectorisation tests?
> If vectorisation is treated as not profitable without it, then perhaps
> we could switch to Tamar's -mmax-vectorization, once that's in.

This isn't needed to make vectorization profitable, but rather to
make partial vector modes reliably the obvious choice - and hopefully
one that isn't affected by future cost model changes.  With =2048
and COUNT, each loop should be fully unrolled into a single unpacked
operation (plus setup and return).
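
Roughly (my numbers, just to illustrate the idiom):

#define COUNT 32

/* At -msve-vector-bits=2048, one vector holds 32 64-bit containers,
   so this loop should fully unroll into a single unpacked FADD.  */
void
f (float *restrict out, float *restrict a, unsigned long long *restrict b)
{
  for (int i = 0; i < COUNT; i++)
    out[i] = a[i] + (float) b[i];
}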

For me, this was much more flexible than using builtin vector types,
and easier to reason about.  Maybe that's just me though!  I can try
something else if it would be preferred.

> 
> OK as-is, but also ok without -msve-vector-bits if that works (first
> preference) or with -mmax-vectorization (second preference).
> 
> Thanks,
> Richard


[PATCH 10/14] aarch64: Relaxed SEL combiner patterns for unpacked SVE FP binary arithmetic

2025-06-02 Thread Spencer Abson
Extend the binary op/UNSPEC_SEL combiner patterns from SVE_FULL_F/
SVE_FULL_F_B16B16 to SVE_F/SVE_F_B16B16, where the strictness value
is SVE_RELAXED_GP.
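
For illustration (a sketch of mine, not one of the new tests): with
-fno-trapping-math the vectorizer uses a relaxed governing predicate,
and these patterns let the select fold into the predicated operation.
Mixing 64-bit and 32-bit elements keeps the float data unpacked:

void
cond_fmax (float *restrict x, float *restrict y,
	   long long *restrict sel, int n)
{
  /* The long long condition forces 64-bit containers, so the FMAXNM
     operates on an unpacked float vector; the SEL merges with it.  */
  for (int i = 0; i < n; i++)
    x[i] = sel[i] ? __builtin_fmaxf (x[i], y[i]) : x[i];
}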

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (*cond__2_relaxed):
Extend from SVE_FULL_F_B16B16 to SVE_F_B16B16.
(*cond__3_relaxed): Likewise.
(*cond__any_relaxed): Likewise.
(*cond__any_const_relaxed): Extend from SVE_FULL_F
to SVE_F.
(*cond_add_2_const_relaxed): Likewise.
(*cond_add_any_const_relaxed): Likewise.
(*cond_sub_3_const_relaxed): Likewise.
(*cond_sub_const_relaxed): Likewise.

gcc/testsuite/ChangeLog:

* g++.target/aarch64/sve/unpacked_cond_binary_bf16_1.C: New test.
* gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fadd_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fdiv_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fmaxnm_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fminnm_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fmul_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fsubr_1.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md | 98 +--
 .../aarch64/sve/unpacked_cond_binary_bf16_1.C | 46 +
 .../sve/unpacked_cond_builtin_fmax_1.c| 47 +
 .../sve/unpacked_cond_builtin_fmin_1.c| 47 +
 .../aarch64/sve/unpacked_cond_fadd_1.c| 58 +++
 .../aarch64/sve/unpacked_cond_fdiv_1.c| 43 
 .../aarch64/sve/unpacked_cond_fmaxnm_1.c  | 49 ++
 .../aarch64/sve/unpacked_cond_fminnm_1.c  | 49 ++
 .../aarch64/sve/unpacked_cond_fmul_1.c| 46 +
 .../aarch64/sve/unpacked_cond_fsubr_1.c   | 53 ++
 10 files changed, 487 insertions(+), 49 deletions(-)
 create mode 100644 
gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_1.C
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_1.c

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 79a087837de..d111e0b9261 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5585,14 +5585,14 @@
 
 ;; Predicated floating-point operations, merging with the first input.
 (define_insn_and_rewrite "*cond__2_relaxed"
-  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
-   (unspec:SVE_FULL_F_B16B16
+  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+   (unspec:SVE_F_B16B16
  [(match_operand: 1 "register_operand")
-  (unspec:SVE_FULL_F_B16B16
+  (unspec:SVE_F_B16B16
 [(match_operand 4)
  (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
 SVE_COND_FP_BINARY)
   (match_dup 2)]
  UNSPEC_SEL))]
@@ -5628,14 +5628,14 @@
 
 ;; Same for operations that take a 1-bit constant.
 (define_insn_and_rewrite "*cond__2_const_relaxed"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-   (unspec:SVE_FULL_F
+  [(set (match_operand:SVE_F 0 "register_operand")
+   (unspec:SVE_F
  [(match_operand: 1 "register_operand")
-  (unspec:SVE_FULL_F
+  (unspec:SVE_F
 [(match_operand 4)
  (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "")]
 SVE_COND_FP_BINARY_I1)
   (match_dup 2)]
  UNSPEC_SEL))]
@@ -5671,14 +5671,14 @@
 
 ;; Predicated floating-point operations, merging with the second input.
 (define_insn_and_rewrite "*cond__3_relaxed"
-  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
-   (unspec:SVE_FULL_F_B16B16
+  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+   (unspec:SVE_F_B16B16
  [(match_operand: 1 "register_operand")
-  (unspec:SVE_FULL_F_B16B16
+  (unspec:

[PATCH 02/14] aarch64: Add support for unpacked SVE FP conversions

2025-06-02 Thread Spencer Abson
This patch introduces expanders for FP<-FP conversions that leverage
partial vector modes.  We also extend the INT<-FP and FP<-INT conversions
using the same approach.

The ACLE enables vectorized conversions like the following:

fcvt z0.h, p7/m, z1.s

Modelling the source vector as VNx4SF:

... | SF| SF| SF| SF|

And the destination as a VNx8HF, where this operation would yield:

... | 0 | HF| 0 | HF| 0 | HF| 0 | HF|

Hence the useful results are stored unpacked, i.e.

... | X | HF| X | HF| X | HF| X | HF|   (VNx4HF)

This patch allows the vectorizer to use this variant of fcvt as a
conversion from VNx4SF to VNx4HF.  The same idea applies to widening
conversions, and between vectors with FP and integer base types.

If the source itself had been unpacked, e.g.

... |   X   | SF|   X   | SF|   (VNx2SF)

The result would yield

... | X | X | X | HF| X | X | X | HF|   (VNx2HF)

The upper bits of each container here are undefined, it's important to
avoid interpreting them during FP operations - doing so could introduce
spurious traps.  The obvious route we've taken here is to mask undefined
lanes using the operation's predicate.

The natural predicate mode (e.g. VNx2BI here) cannot do this; to ensure
correct behavior, we need a predicate mode that can control the data as if
it were fully-packed (VNx4BI).

These unnatural predicates must be recognised as legal operands by the
corresponding FP insns.  In general, the governing predicate mode for an
insn could be any such with at least as many significant lanes as the data
mode.  For example, addvnx4hf3 could be controlled by any of VNx{4,8,16}BI.

We implement 'aarch64_predicate_operand', a new define_special_predicate, to
achieve this.
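
As a concrete sketch (mine, not taken from the testsuite), a simple
narrowing loop keeps the HF results unpacked in the SF containers, so
the vectorizer can use the FCVT variant described above:

void
narrow (_Float16 *restrict dst, float *restrict src, int n)
{
  /* VNx4SF -> VNx4HF: each _Float16 result stays in the low half of
     its 32-bit container, as in the lane diagrams above.  */
  for (int i = 0; i < n; i++)
    dst[i] = src[i];
}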

gcc/ChangeLog:

* config/aarch64/aarch64-protos.h (aarch64_sve_valid_pred_p):
Declare helper for aarch64_predicate_operand.
(aarch64_sve_packed_pred): Declare helper for new expanders.
(aarch64_sve_fp_pred): Likewise.
* config/aarch64/aarch64-sve.md (2):
Extend into...
(2): New expander for converting
vectors of HF,SF to vectors of HI,SI,DI.
(2): New expander for converting
vectors of SI,DI to vectors of DF.
(*aarch64_sve__nontrunc):
New pattern to match those we've added here.
(@aarch64_sve__trunc): Extend
into...
(@aarch64_sve__trunc): Match both
VNx2SI<-VNx2DF and VNx4SI<-VNx4DF.
(2): Extend into...
(2): New expander for converting 
vectors
of HI,SI,DI to vectors of HF,SF,DF.
(*aarch64_sve__nonextend): New
pattern to match those we've added here.
(trunc2): New expander to handle
narrowing ('truncating') FP<-FP conversions.
(*aarch64_sve__trunc): New
pattern to handle those we've added here.
(extend2): New expander to handle
widening ('extending') FP<-FP conversions.
(*aarch64_sve__nontrunc): New
pattern to handle those we've added here.
* config/aarch64/aarch64.cc (aarch64_sve_packed_pred): New function.
(aarch64_sve_fp_pred): Likewise.
(aarch64_sve_valid_pred_p): Likewise.
* config/aarch64/iterators.md (SVE_PARTIAL_HSF): New mode iterator.
(SVE_HSF): Likewise.
(SVE_SDF): Likewise.
(SVE_SI): Likewise.
(SVE_2SDI) Likewise.
(self_mask):  Extend to all integer/FP vector modes.
(narrower_mask): Likewise (excluding QI).
* config/aarch64/predicates.md (aarch64_predicate_operand): New special
predicate to handle unnatural predicate modes.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/pack_fcvt_signed_1.c: Disable the aarch64 
vector
cost model to preserve this test.
* gcc.target/aarch64/sve/pack_fcvt_unsigned_1.c: Likewise.
* gcc.target/aarch64/sve/pack_float_1.c: Likewise.
* gcc.target/aarch64/sve/unpack_float_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cvtf_1.c: New test.
* gcc.target/aarch64/sve/unpacked_cvtf_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cvtf_3.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fcvt_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fcvt_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fcvtz_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fcvtz_2.c: Likewise.
---
 gcc/config/aarch64/aarch64-protos.h   |   3 +
 gcc/config/aarch64/aarch64-sve.md | 169 ++--
 gcc/config/aarch64/aarch64.cc |  51 
 gcc/config/aarch64/iterators.md   |  47 +++-
 gcc/config/aarch64/predicates.md  |   4 +
 .../aarch64/sve/pack_fcvt_signed_1.c  |   2 +-
 .../aarch64/sve/pack_fcvt_unsigned_1.c|   2 +-
 .../gcc.target/aarch64/sve/pack_float_1.c |   2 +-
 .../gcc.target/aarch64/sve/unpack_float_1.c   |   2 +-
 .../gcc.target/aarch64/sve/unpacked_cvtf_1.c  | 217 
 .../gcc.t

[PATCH 11/14] aarch64: Add support for unpacked SVE FP conditional binary arithmetic

2025-06-02 Thread Spencer Abson
This patch extends the expander for conditional smax, smin, add, sub,
mul, min, max, and div to support partial SVE FP modes.

The natural mask supplied to the unpacked operation leaves the undefined
elements in each container unpredicated.  This expansion modifies this
mask to explicitly disable these elements.
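
A rough example of the kind of loop this covers (my sketch): the
64-bit condition keeps the float data unpacked, so with
-ftrapping-math the conditional FADD needs the masked predicate
described above:

void
cond_add (float *restrict x, long long *restrict sel, int n)
{
  /* Unpacked VNx2SF conditional add; the undefined upper parts of
     each container must be masked out of the governing predicate.  */
  for (int i = 0; i < n; i++)
    x[i] = sel[i] ? x[i] + 1.0f : x[i];
}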

gcc/ChangeLog:

* config/aarch64/aarch64-protos.h (aarch64_sve_emit_masked_fp_pred):
Declare.
* config/aarch64/aarch64-sve.md (and3):  Change this to...
(@and3): ...this, to have gen_and3.
(@cond_): Extend from SVE_FULL_F_B16B16 to SVE_F_B16B16,
use aarch64_predicate_operand.
(*cond__2_strict): Likewise.
(*cond__3_strict): Likewise.
(*cond__any_strict): Likwise.
(*cond__2_const_strict): Extend from SVE_FULL_F to SVE_F,
use aarch64_predicate_operand.
(*cond__any_const_strict): Likewise.
(*cond_sub_3_const_strict): Likwise.
(*cond_sub_const_strict): Likewise.
(*vcond_mask_): Use aarch64_predicate_operand, and update
the comment here.
* config/aarch64/aarch64.cc (aarch64_sve_emit_masked_fp_pred):
New function.  Helper to mask the predicate in conditional expanders.

gcc/testsuite/ChangeLog:

* g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C: New test.
* gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fadd_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fmul_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c: Likewise.
---
 gcc/config/aarch64/aarch64-protos.h   |   1 +
 gcc/config/aarch64/aarch64-sve.md | 153 +-
 gcc/config/aarch64/aarch64.cc |  27 
 .../aarch64/sve/unpacked_cond_binary_bf16_2.C |  18 +++
 .../sve/unpacked_cond_builtin_fmax_2.c|  20 +++
 .../sve/unpacked_cond_builtin_fmin_2.c|  20 +++
 .../aarch64/sve/unpacked_cond_fadd_2.c|  24 +++
 .../aarch64/sve/unpacked_cond_fdiv_2.c|  18 +++
 .../aarch64/sve/unpacked_cond_fmaxnm_2.c  |  20 +++
 .../aarch64/sve/unpacked_cond_fminnm_2.c  |  20 +++
 .../aarch64/sve/unpacked_cond_fmul_2.c|  18 +++
 .../aarch64/sve/unpacked_cond_fsubr_2.c   |  22 +++
 12 files changed, 289 insertions(+), 72 deletions(-)
 create mode 100644 
gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 1e3ed80e10b..3a7169dc626 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1031,6 +1031,7 @@ rtx aarch64_pfalse_reg (machine_mode);
 bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
 rtx aarch64_sve_packed_pred (machine_mode);
 rtx aarch64_sve_fp_pred (machine_mode, rtx *);
+rtx aarch64_sve_emit_masked_fp_pred (machine_mode, rtx);
 void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode);
 bool aarch64_expand_maskloadstore (rtx *, machine_mode);
 void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index d111e0b9261..1ed2d065c15 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5569,18 +5569,22 @@
 
 ;; Predicated floating-point operations with merging.
 (define_expand "@cond_"
-  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
-   (unspec:SVE_FULL_F_B16B16
+  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+   (unspec:SVE_F_B16B16
  [(match_operand: 1 "register_operand")
-  (unspec:SVE_FULL_F_B16B16
+  (unspec:SVE_F_B16B16
 [(match_dup 1)
  (const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "")
- (match_operand:SVE_FULL_F_B16B16 3 "")]
+ (match_operand:SVE_F_B16B16 2 "")
+ (match_operand:SVE_F_B16B16 3 "")]
 SVE_COND_FP_BINARY)
-  (match_operand:S

[PATCH 09/14] aarch64: Add support for unpacked SVE FDIV

2025-06-02 Thread Spencer Abson
This patch extends the unpredicated FP division expander to support
partial FP modes.  It extends the existing patterns used to implement
UNSPEC_COND_FDIV and its approximation as needed.
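
For example (a sketch of mine, not one of the new tests), mixing
double and float data keeps the float division unpacked:

void
mixed_div (float *restrict q, float *restrict a,
	   double *restrict b, int n)
{
  /* The double operands force 64-bit containers, so the division is
     performed on an unpacked float vector (VNx2SF).  */
  for (int i = 0; i < n; i++)
    q[i] = a[i] / (float) b[i];
}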

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md: (@aarch64_sve_):
Extend from SVE_FULL_F to SVE_F, use aarch64_predicate_operand.
(div3): Extend from SVE_FULL_F to SVE_F.
(@aarch64_frecpe): Likewise.
(@aarch64_frecps): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/unpacked_fdiv_1.c: New test.
* gcc.target/aarch64/sve/unpacked_fdiv_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fdiv_3.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md | 50 +--
 .../gcc.target/aarch64/sve/unpacked_fdiv_1.c  | 34 +
 .../gcc.target/aarch64/sve/unpacked_fdiv_2.c  | 11 
 .../gcc.target/aarch64/sve/unpacked_fdiv_3.c  | 11 
 4 files changed, 81 insertions(+), 25 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fdiv_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fdiv_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fdiv_3.c

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index cdad900d9cf..79a087837de 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3752,9 +3752,9 @@
 
 ;; Unpredicated floating-point unary operations.
 (define_insn "@aarch64_sve_"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w")
-   (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand" "w")]
+  [(set (match_operand:SVE_F 0 "register_operand" "=w")
+   (unspec:SVE_F
+ [(match_operand:SVE_F 1 "register_operand" "w")]
  SVE_FP_UNARY))]
   "TARGET_SVE"
   "\t%0., %1."
@@ -5525,10 +5525,10 @@
 
 ;; Unpredicated floating-point binary operations.
 (define_insn "@aarch64_sve_"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w")
-   (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand" "w")
-  (match_operand:SVE_FULL_F 2 "register_operand" "w")]
+  [(set (match_operand:SVE_F 0 "register_operand" "=w")
+   (unspec:SVE_F
+ [(match_operand:SVE_F 1 "register_operand" "w")
+  (match_operand:SVE_F 2 "register_operand" "w")]
  SVE_FP_BINARY))]
   "TARGET_SVE"
   "\t%0., %1., %2."
@@ -5552,12 +5552,12 @@
 
 ;; Predicated floating-point binary operations that have no immediate forms.
 (define_insn "@aarch64_pred_"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-   (unspec:SVE_FULL_F
- [(match_operand: 1 "register_operand")
+  [(set (match_operand:SVE_F 0 "register_operand")
+   (unspec:SVE_F
+ [(match_operand: 1 "aarch64_predicate_operand")
   (match_operand:SI 4 "aarch64_sve_gp_strictness")
-  (match_operand:SVE_FULL_F 2 "register_operand")
-  (match_operand:SVE_FULL_F 3 "register_operand")]
+  (match_operand:SVE_F 2 "register_operand")
+  (match_operand:SVE_F 3 "register_operand")]
  SVE_COND_FP_BINARY_REG))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 , 3 ; attrs: movprfx ]
@@ -6649,12 +6649,12 @@
 ;; -
 
 (define_expand "div3"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-   (unspec:SVE_FULL_F
+  [(set (match_operand:SVE_F 0 "register_operand")
+   (unspec:SVE_F
  [(match_dup 3)
-  (const_int SVE_RELAXED_GP)
-  (match_operand:SVE_FULL_F 1 "nonmemory_operand")
-  (match_operand:SVE_FULL_F 2 "register_operand")]
+  (match_dup 4)
+  (match_operand:SVE_F 1 "nonmemory_operand")
+  (match_operand:SVE_F 2 "register_operand")]
  UNSPEC_COND_FDIV))]
   "TARGET_SVE"
   {
@@ -6662,23 +6662,23 @@
   DONE;
 
 operands[1] = force_reg (mode, operands[1]);
-operands[3] = aarch64_ptrue_reg (mode);
+operands[3] = aarch64_sve_fp_pred (mode, &operands[4]);
   }
 )
 
 (define_expand "@aarch64_frecpe"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-   (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand")]
+  [(set (match_operand:SVE_F 0 "register_operand")
+   (unspec:SVE_F
+ [(match_operand:SVE_F 1 "register_operand")]
  UNSPEC_FRECPE))]
   "TARGET_SVE"
 )
 
 (define_expand "@aarch64_frecps"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-   (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand")
-  (match_operand:SVE_FULL_F 2 "register_operand")]
+  [(set (match_operand:SVE_F 0 "register_operand")
+   (unspec:SVE_F
+ [(match_operand:SVE_F 1 "register_operand")
+  (match_operand:SVE_F 2 "register_operand")]
  UNSPEC_FRECPS))]
   "TARGET_SVE"
 )
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fdiv_1.c 
b/gcc/testsuite/

[PATCH 04/14] aarch64: Add support for unpacked SVE FP comparisons

2025-06-02 Thread Spencer Abson
This patch extends our vec_cmp expander to support partial FP modes.

We use an unnatural predicate mode to govern unpacked FP operations under
flag_trapping_math, so the expansion must handle cases where the comparison's
target and governing predicates have different modes.

While such predicates enable all of the defined part of the operation, they
are not all-true.  Their false bits contribute to the (trapping) behavior of
the operation, so the operation itself should not have SVE_KNOWN_PTRUE.
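
As a hedged sketch of the scenario: the 64-bit result keeps the float
inputs unpacked, so with -ftrapping-math the comparison's governing
predicate has a wider mode than its VNx2BI target:

void
cmp_unpacked (long long *restrict out, float *restrict a,
	      float *restrict b, int n)
{
  /* FCMEQ on unpacked VNx2SF inputs, producing a VNx2BI result under
     a governing predicate that also masks the undefined lanes.  */
  for (int i = 0; i < n; i++)
    out[i] = a[i] == b[i] ? 2 : 3;
}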

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (vec_cmp): Extend
to handle partial FP modes.
(@aarch64_pred_fcm): Likewise.
(@aarch64_pred_fcmuo): Likewise.
* config/aarch64/aarch64.cc (aarch64_emit_sve_fp_cond): Handle
unnatural governing predicates.
(aarch64_emit_sve_or_fp_conds): Likewise.
(aarch64_emit_sve_invert_fp_cond): Likewise.
(aarch64_expand_sve_vec_cmp_float): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/unpacked_fcm_1.c: New test.
* gcc.target/aarch64/sve/unpacked_fcm_2.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md |  16 +-
 gcc/config/aarch64/aarch64.cc |  47 +-
 .../gcc.target/aarch64/sve/unpacked_fcm_1.c   | 545 ++
 .../gcc.target/aarch64/sve/unpacked_fcm_2.c   |  47 ++
 4 files changed, 631 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_2.c

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 7484aeeb161..6c5129bc0c6 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8600,8 +8600,8 @@
 (define_expand "vec_cmp"
   [(set (match_operand: 0 "register_operand")
(match_operator: 1 "comparison_operator"
- [(match_operand:SVE_FULL_F 2 "register_operand")
-  (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")]))]
+ [(match_operand:SVE_F 2 "register_operand")
+  (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero")]))]
   "TARGET_SVE"
   {
 aarch64_expand_sve_vec_cmp_float (operands[0], GET_CODE (operands[1]),
@@ -8614,10 +8614,10 @@
 (define_insn "@aarch64_pred_fcm"
   [(set (match_operand: 0 "register_operand")
(unspec:
- [(match_operand: 1 "register_operand")
+ [(match_operand: 1 "aarch64_predicate_operand")
   (match_operand:SI 2 "aarch64_sve_ptrue_flag")
-  (match_operand:SVE_FULL_F 3 "register_operand")
-  (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+  (match_operand:SVE_F 3 "register_operand")
+  (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
  SVE_COND_FP_CMP_I0))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 3 , 4   ]
@@ -8630,10 +8630,10 @@
 (define_insn "@aarch64_pred_fcmuo"
   [(set (match_operand: 0 "register_operand" "=Upa")
(unspec:
- [(match_operand: 1 "register_operand" "Upl")
+ [(match_operand: 1 "aarch64_predicate_operand" "Upl")
   (match_operand:SI 2 "aarch64_sve_ptrue_flag")
-  (match_operand:SVE_FULL_F 3 "register_operand" "w")
-  (match_operand:SVE_FULL_F 4 "register_operand" "w")]
+  (match_operand:SVE_F 3 "register_operand" "w")
+  (match_operand:SVE_F 4 "register_operand" "w")]
  UNSPEC_COND_FCMUO))]
   "TARGET_SVE"
   "fcmuo\t%0., %1/z, %3., %4."
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index b13fce2a859..287de0f5ae4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27240,7 +27240,7 @@ aarch64_emit_sve_fp_cond (rtx target, rtx_code code, 
rtx pred,
  bool known_ptrue_p, rtx op0, rtx op1)
 {
   rtx flag = gen_int_mode (known_ptrue_p, SImode);
-  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
+  rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
   gen_rtvec (4, pred, flag, op0, op1),
   aarch64_unspec_cond_code (code));
   emit_set_insn (target, unspec);
@@ -27259,10 +27259,10 @@ static void
 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
  rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
 {
-  machine_mode pred_mode = GET_MODE (pred);
-  rtx tmp1 = gen_reg_rtx (pred_mode);
+  machine_mode target_mode = GET_MODE (target);
+  rtx tmp1 = gen_reg_rtx (target_mode);
   aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
-  rtx tmp2 = gen_reg_rtx (pred_mode);
+  rtx tmp2 = gen_reg_rtx (target_mode);
   aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
 }
@@ -27279,8 +27279,7 @@ static void
 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
 bool known_ptrue_p, rtx op0, rtx op1)
 {
-  machine_m

[PATCH 03/14] aarch64: Relaxed SEL combiner patterns for unpacked SVE FP conversions

2025-06-02 Thread Spencer Abson
Add UNSPEC_SEL combiner patterns for unpacked FP conversions, where the
strictness value is SVE_RELAXED_GP.
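
For example (my sketch, assuming -fno-trapping-math as in the new
tests), the conversion and the select below should now combine into a
single predicated FCVT:

void
cond_narrow (_Float16 *restrict dst, float *restrict src, int n)
{
  /* The FCVT/SEL pair is matched by the new relaxed combiner
     patterns and becomes one predicated conversion.  */
  for (int i = 0; i < n; i++)
    dst[i] = src[i] > 1.0f ? (_Float16) src[i] : dst[i];
}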

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md
(*cond__nontrunc_relaxed):
New FCVT/SEL combiner pattern.
(*cond__trunc_relaxed):
New FCVTZ{S,U}/SEL combiner pattern.
(*cond__nonextend_relaxed):
New {S,U}CVTF/SEL combiner pattern.
(*cond__trunc):
New FCVT/SEL combiner pattern.
(*cond__nontrunc_relaxed):
New FCVTZ{S,U}/SEL combiner pattern.
* config/aarch64/iterators.md: New mode iterator for VNx2SI.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c: New test.
* gcc.target/aarch64/sve/unpacked_cond_fcvt_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fcvtz_1.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md | 121 ++
 gcc/config/aarch64/iterators.md   |   1 +
 .../aarch64/sve/unpacked_cond_cvtf_1.c|  47 +++
 .../aarch64/sve/unpacked_cond_fcvt_1.c|  37 ++
 .../aarch64/sve/unpacked_cond_fcvtz_1.c   |  51 
 5 files changed, 257 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fcvt_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fcvtz_1.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index f8f8d2f011a..7484aeeb161 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -9612,6 +9612,31 @@
   }
 )
 
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn_and_rewrite "*cond__nontrunc_relaxed"
+  [(set (match_operand:SVE_HSDI 0 "register_operand")
+   (unspec:SVE_HSDI
+ [(match_operand: 1 "register_operand")
+  (unspec:SVE_HSDI
+[(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_PARTIAL_F 2 "register_operand")]
+SVE_COND_FCVTI)
+  (match_operand:SVE_HSDI 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+  "TARGET_SVE
+  && (~( | ) & ) == 0"
+  {@ [ cons: =0 , 1   , 2 , 3  ; attrs: movprfx ]
+ [ &w   , Upl , w , 0  ; *  ] fcvtz\t%0., %1/m, %2.
+ [ &w   , Upl , w , Dz ; yes] movprfx\t%0., %1/z, %2.\;fcvtz\t%0., %1/m, %2.
+ [ ?&w  , Upl , w , w  ; yes] movprfx\t%0, %3\;fcvtz\t%0., %1/m, %2.
+  }
+  "&& !rtx_equal_p (operands[1], operands[4])"
+  {
+operands[4] = copy_rtx (operands[1]);
+  }
+)
+
 (define_insn "*cond__nontrunc_strict"
   [(set (match_operand:SVE_FULL_HSDI 0 "register_operand")
(unspec:SVE_FULL_HSDI
@@ -9665,6 +9690,29 @@
   }
 )
 
+(define_insn_and_rewrite "*cond__trunc_relaxed"
+  [(set (match_operand:VNx2SI_ONLY 0 "register_operand")
+   (unspec:VNx2SI_ONLY
+ [(match_operand:VNx2BI 1 "register_operand")
+  (unspec:VNx2SI_ONLY
+[(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:VNx2DF_ONLY 2 "register_operand")]
+SVE_COND_FCVTI)
+  (match_operand:VNx2SI_ONLY 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+  "TARGET_SVE"
+  {@ [ cons: =0 , 1   , 2 , 3  ; attrs: movprfx ]
+ [ &w   , Upl , w , 0  ; *  ] fcvtz\t%0., %1/m, %2.
+ [ &w   , Upl , w , Dz ; yes] movprfx\t%0., %1/z, %2.\;fcvtz\t%0., %1/m, %2.
+ [ ?&w  , Upl , w , w  ; yes] movprfx\t%0, %3\;fcvtz\t%0., %1/m, %2.
+  }
+  "&& !rtx_equal_p (operands[1], operands[4])"
+  {
+operands[4] = copy_rtx (operands[1]);
+  }
+)
+
 ;; -
 ;;  [INT<-FP] Packs
 ;; -
@@ -9816,6 +9864,31 @@
   }
 )
 
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn_and_rewrite "*cond__nonextend_relaxed"
+  [(set (match_operand:SVE_PARTIAL_F 0 "register_operand")
+   (unspec:SVE_PARTIAL_F
+ [(match_operand: 1 "register_operand")
+  (unspec:SVE_PARTIAL_F
+[(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_HSDI 2 "register_operand")]
+SVE_COND_ICVTF)
+  (match_operand:SVE_PARTIAL_F 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+  "TARGET_SVE
+   && (~( | ) & ) == 0"
+  {@ [ cons: =0 , 1   , 2 , 3  ; attrs: movprfx ]
+ [ &w   , Upl , w , 0  ; *  ] cvtf\t%0., %1/m, %2.
+ [ &w   , Upl , w , Dz ; yes] movprfx\t%0., %1/z, %2.\;cvtf\t%0., %1/m, %2.
+ [ ?&w  , Upl , w , w  ; yes] movprfx\t%0, %3\;cvtf\t%0., %1/m, %2.
+  }
+  "&& !rtx_equal_p (operands[1], operands[4])"
+  {
+operands[4] = copy_rtx (operands[1]);
+  }
+)
+
 (define_insn 

[PATCH 08/14] aarch64: Add support for unpacked SVE FP binary arithmetic

2025-06-02 Thread Spencer Abson
This patch extends the expanders for unpredicated smax, smin, add, sub,
mul, min, and max, so that they support partial SVE FP modes.

The relevant insn/split patterns have also been updated.
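
For example (a sketch of mine, not from the testsuite), a partial mode
arises naturally when a loop mixes element widths; the FADD below
operates on VNx2SF:

void
f (double *dst, float *a, float *b, int n)
{
  for (int i = 0; i < n; i++)
    /* The loop is driven by the double-width stores, so 'a' and 'b'
       are loaded as VNx2SF (one float per 64-bit container).  */
    dst[i] = (double) (a[i] + b[i]);
}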

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (3): Extend from
SVE_FULL_F to SVE_F, and use aarch64_sve_fp_pred.
(@aarch64_pred_): Extend from SVE_FULL_F to SVE_F,
use aarch64_predicate_operand (covers ADD/SUB/MUL/MAX/MIN).
* config/aarch64/aarch64-sve2.md: Likewise, for BF16 operations.

gcc/testsuite/ChangeLog:

* g++.target/aarch64/sve/unpacked_binary_bf16_1.C: New test.
* g++.target/aarch64/sve/unpacked_binary_bf16_2.C: Likewise.
* gcc.target/aarch64/sve/unpacked_builtin_fmax_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_builtin_fmax_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_builtin_fmin_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_builtin_fmin_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fadd_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fadd_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fmaxnm_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fmaxnm_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fminnm_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fminnm_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fmul_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fmul_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fsubr_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fsubr_2.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md | 70 +--
 gcc/config/aarch64/aarch64-sve2.md| 10 +--
 .../aarch64/sve/unpacked_binary_bf16_1.C  | 35 ++
 .../aarch64/sve/unpacked_binary_bf16_2.C  | 15 
 .../aarch64/sve/unpacked_builtin_fmax_1.c | 40 +++
 .../aarch64/sve/unpacked_builtin_fmax_2.c | 16 +
 .../aarch64/sve/unpacked_builtin_fmin_1.c | 40 +++
 .../aarch64/sve/unpacked_builtin_fmin_2.c | 16 +
 .../gcc.target/aarch64/sve/unpacked_fadd_1.c  | 48 +
 .../gcc.target/aarch64/sve/unpacked_fadd_2.c  | 22 ++
 .../aarch64/sve/unpacked_fmaxnm_1.c   | 41 +++
 .../aarch64/sve/unpacked_fmaxnm_2.c   | 16 +
 .../aarch64/sve/unpacked_fminnm_1.c   | 42 +++
 .../aarch64/sve/unpacked_fminnm_2.c   | 16 +
 .../gcc.target/aarch64/sve/unpacked_fmul_1.c  | 39 +++
 .../gcc.target/aarch64/sve/unpacked_fmul_2.c  | 14 
 .../gcc.target/aarch64/sve/unpacked_fsubr_1.c | 42 +++
 .../gcc.target/aarch64/sve/unpacked_fsubr_2.c | 16 +
 18 files changed, 498 insertions(+), 40 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/aarch64/sve/unpacked_binary_bf16_1.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/sve/unpacked_binary_bf16_2.C
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_builtin_fmax_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_builtin_fmax_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_builtin_fmin_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_builtin_fmin_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fadd_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fadd_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmaxnm_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmaxnm_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fminnm_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fminnm_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmul_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmul_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fsubr_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fsubr_2.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 76de511420f..cdad900d9cf 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5473,27 +5473,27 @@
 ;; Split a predicated instruction whose predicate is unused into an
 ;; unpredicated instruction.
 (define_split
-  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
-   (unspec:SVE_FULL_F_B16B16
+  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+   (unspec:SVE_F_B16B16
  [(match_operand: 1 "register_operand")
   (match_operand:SI 4 "aarch64_sve_gp_strictness")
-  (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
-  (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+  (match_operand:SVE_F_B16B16 2 "register_operand")
+  (match_operand:SVE_F_B16B16 3 "register_operand")]
  ))]
   "TARGET_SVE
&& reload_completed
&& INTVAL (operands[4]) == SVE_R

[PATCH 07/14] aarch64: Relaxed SEL combiner patterns for unpacked SVE FP unary operations

2025-06-02 Thread Spencer Abson
Extend the unary op/UNSPEC_SEL combiner patterns from SVE_FULL_F to SVE_F,
where the strictness value is SVE_RELAXED_GP.
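
A sketch of the combine target (mine; the new tests exercise the same
shape): a unary operation under a VEC_COND_EXPR that merges with its
own input matches the 'merging with the first input' pattern:

#include <stdint.h>

void
f (int64_t *pred, float *a, int n)
{
  for (int i = 0; i < n; i++)
    /* With 'a' in an unpacked VNx2SF, this folds to a single fabs
       predicated on 'pred'.  */
    a[i] = pred[i] ? __builtin_fabsf (a[i]) : a[i];
}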

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (*cond__2_relaxed):
Extend from SVE_FULL_F to SVE_F.
(*cond__any_relaxed): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/unpacked_cond_fabs_1.c: New test.
* gcc.target/aarch64/sve/unpacked_cond_fneg_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_frinta_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_frinti_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_frintm_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_frintp_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_frintx_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_frintz_1.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md | 18 +-
 .../aarch64/sve/unpacked_cond_fabs_1.c| 32 +
 .../aarch64/sve/unpacked_cond_fneg_1.c| 34 +++
 .../aarch64/sve/unpacked_cond_frinta_1.c  | 32 +
 .../aarch64/sve/unpacked_cond_frinti_1.c  | 32 +
 .../aarch64/sve/unpacked_cond_frintm_1.c  | 32 +
 .../aarch64/sve/unpacked_cond_frintp_1.c  | 32 +
 .../aarch64/sve/unpacked_cond_frintx_1.c  | 32 +
 .../aarch64/sve/unpacked_cond_frintz_1.c  | 32 +
 9 files changed, 267 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fabs_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fneg_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_frinta_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_frinti_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_frintm_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_frintp_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_frintx_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_frintz_1.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 1a705e153cb..76de511420f 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3821,13 +3821,13 @@
 
 ;; Predicated floating-point unary arithmetic, merging with the first input.
 (define_insn_and_rewrite "*cond__2_relaxed"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-   (unspec:SVE_FULL_F
+  [(set (match_operand:SVE_F 0 "register_operand")
+   (unspec:SVE_F
  [(match_operand: 1 "register_operand")
-  (unspec:SVE_FULL_F
+  (unspec:SVE_F
 [(match_operand 3)
  (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")]
 SVE_COND_FP_UNARY)
   (match_dup 2)]
  UNSPEC_SEL))]
@@ -3869,15 +3869,15 @@
 ;; as earlyclobber helps to make the instruction more regular to the
 ;; register allocator.
 (define_insn_and_rewrite "*cond__any_relaxed"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-   (unspec:SVE_FULL_F
+  [(set (match_operand:SVE_F 0 "register_operand")
+   (unspec:SVE_F
  [(match_operand: 1 "register_operand")
-  (unspec:SVE_FULL_F
+  (unspec:SVE_F
 [(match_operand 4)
  (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")]
 SVE_COND_FP_UNARY)
-  (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")]
+  (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero")]
  UNSPEC_SEL))]
   "TARGET_SVE && !rtx_equal_p (operands[2], operands[3])"
   {@ [ cons: =0 , 1   , 2 , 3  ; attrs: movprfx ]
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fabs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fabs_1.c
new file mode 100644
index 000..fea5cd1f50d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fabs_1.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 -fno-trapping-math" } */
+
+#include 
+
+#define a_i a[i]
+#define b_i b[i]
+
+#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE)\
+  void \
+  f_##FN##_##TYPE0##_##TYPE1##_##MERGE (TYPE1 *__restrict p,   \
+   TYPE0 *__restrict a,\
+   TYPE0 *__restrict b)\
+  {\
+for (unsigned int i = 0; i < COUNT; i++)   \
+  a[i] = p[i] ? FN (a[i]) : MERGE; \
+  }
+
+#de

[PATCH 05/14] aarch64: Compare/and splits for unpacked SVE FP comparisons

2025-06-02 Thread Spencer Abson
This patch extends the compare/and splitting patterns for FP comparisons
from SVE_FULL_F to SVE_F.
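
The fold itself looks like this (a hypothetical before/after sketch;
the register numbers are illustrative only):

fcmgt   p4.s, p6/z, z0.s, #0.0    // compare under a known PTRUE (p6)
and     p4.b, p5/z, p4.b, p4.b    // ... then AND with the loop predicate

        =>

fcmgt   p4.s, p5/z, z0.s, #0.0    // compare directly under p5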

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (*fcm_and_combine):
Extend to SVE_F.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/unpacked_fcm_1.c: Allow other tests
to define TEST_FCM (new ifdef guard).
* gcc.target/aarch64/sve/unpacked_fcm_and_1.c: New test.
---
 gcc/config/aarch64/aarch64-sve.md  |  8 
 .../gcc.target/aarch64/sve/unpacked_fcm_1.c|  2 ++
 .../aarch64/sve/unpacked_fcm_and_1.c   | 18 ++
 3 files changed, 24 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_and_1.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 6c5129bc0c6..399d147c9a5 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8653,8 +8653,8 @@
  (unspec:
[(match_operand: 1)
 (const_int SVE_KNOWN_PTRUE)
-(match_operand:SVE_FULL_F 2 "register_operand" "w, w")
-(match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "Dz, w")]
+(match_operand:SVE_F 2 "register_operand" "w, w")
+(match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "Dz, w")]
SVE_COND_FP_CMP_I0)
  (match_operand: 4 "register_operand" "Upl, Upl")))]
   "TARGET_SVE"
@@ -8676,8 +8676,8 @@
  (unspec:
[(match_operand: 1)
 (const_int SVE_KNOWN_PTRUE)
-(match_operand:SVE_FULL_F 2 "register_operand" "w")
-(match_operand:SVE_FULL_F 3 "register_operand" "w")]
+(match_operand:SVE_F 2 "register_operand" "w")
+(match_operand:SVE_F 3 "register_operand" "w")]
UNSPEC_COND_FCMUO)
  (match_operand: 4 "register_operand" "Upl")))]
   "TARGET_SVE"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c
index 7e39b79991b..bc02763df0b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_1.c
@@ -20,6 +20,7 @@
 
 #define b_i b[i]
 
+#ifndef TEST_FCM
 #define TEST_FCM(TYPE0, TYPE1, CMP, RHS, COUNT)  \
   void   \
   f_##TYPE0##_##TYPE1##_##CMP##_##RHS (TYPE0 *__restrict out, \
@@ -29,6 +30,7 @@
 for (unsigned int i = 0; i < COUNT; i++) \
   out[i] = CMP (a[i], RHS) ? 3 : out[i]; \
   }
+#endif
 
 #define TEST_CC_REG(CMP) \
   TEST_FCM (uint64_t, float, CMP, b_i, 32)\
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_and_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_and_1.c
new file mode 100644
index 000..2b88cc14a98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_and_1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 -fno-trapping-math" } */
+
+#define TEST_FCM(TYPE0, TYPE1, CMP, RHS, COUNT)  \
+  void   \
+  f_##TYPE0##_##TYPE1##_##CMP##_##RHS (TYPE0 *__restrict out, \
+   TYPE0 *__restrict p,   \
+  TYPE1 *__restrict a,   \
+  TYPE1 *__restrict b)   \
+  {  \
+for (unsigned int i = 0; i < COUNT; i++) \
+  if (p[i] && CMP (a[i], RHS))   \
+   out[i] = 3;   \
+  }
+
+#include "unpacked_fcm_1.c"
+
+/* { dg-final { scan-assembler-not {\tand\t} } } */
-- 
2.34.1



[PATCH 01/14] aarch64: Extend iterator support for partial SVE FP modes

2025-06-02 Thread Spencer Abson
Define new iterators for partial floating-point modes, and cover these
in some existing mode_attrs.  This patch serves as a starting point for
a series that extends support for unpacked floating-point operations.

To differentiate between BFloat mode iterators that need to test
TARGET_SSVE_B16B16, and those that don't (see LOGICALF), this patch
enforces the following naming convention:
- _BF: BF16 modes will not test TARGET_SSVE_B16B16.
- _B16B16: BF16 modes will test TARGET_SSVE_B16B16.

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md: Replace uses of SVE_FULL_F_BF
with SVE_FULL_F_B16B16.
Replace use of SVE_F with SVE_F_BF.
* config/aarch64/iterators.md (SVE_PARTIAL_F): New iterator for
partial SVE FP modes.
(SVE_FULL_F_BF): Rename to SVE_FULL_F_B16B16.
(SVE_PARTIAL_F_B16B16): New iterator (BF16 included) for partial
SVE FP modes.
(SVE_F_B16B16): New iterator for all SVE FP modes.
(SVE_BF): New iterator for all SVE BF16 modes.
(SVE_F): Redefine to exclude BF16 modes.
(SVE_F_BF): New iterator to replace the previous SVE_F.
(b): Cover partial FP modes.
(is_bf16): Likewise.
---
 gcc/config/aarch64/aarch64-sve.md | 218 +++---
 gcc/config/aarch64/iterators.md   |  35 +++--
 2 files changed, 133 insertions(+), 120 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index c5d3e8cd3b3..6baa377988a 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5456,27 +5456,27 @@
 ;; Split a predicated instruction whose predicate is unused into an
 ;; unpredicated instruction.
 (define_split
-  [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
-   (unspec:SVE_FULL_F_BF
+  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
+   (unspec:SVE_FULL_F_B16B16
  [(match_operand: 1 "register_operand")
   (match_operand:SI 4 "aarch64_sve_gp_strictness")
-  (match_operand:SVE_FULL_F_BF 2 "register_operand")
-  (match_operand:SVE_FULL_F_BF 3 "register_operand")]
+  (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
+  (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
  ))]
   "TARGET_SVE
&& reload_completed
&& INTVAL (operands[4]) == SVE_RELAXED_GP"
   [(set (match_dup 0)
-   (SVE_UNPRED_FP_BINARY:SVE_FULL_F_BF (match_dup 2) (match_dup 3)))]
+   (SVE_UNPRED_FP_BINARY:SVE_FULL_F_B16B16 (match_dup 2) (match_dup 3)))]
 )
 
 ;; Unpredicated floating-point binary operations (post-RA only).
 ;; These are generated by the split above.
 (define_insn "*post_ra_3"
-  [(set (match_operand:SVE_FULL_F_BF 0 "register_operand" "=w")
-   (SVE_UNPRED_FP_BINARY:SVE_FULL_F_BF
- (match_operand:SVE_FULL_F_BF 1 "register_operand" "w")
- (match_operand:SVE_FULL_F_BF 2 "register_operand" "w")))]
+  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand" "=w")
+   (SVE_UNPRED_FP_BINARY:SVE_FULL_F_B16B16
+ (match_operand:SVE_FULL_F_B16B16 1 "register_operand" "w")
+ (match_operand:SVE_FULL_F_B16B16 2 "register_operand" "w")))]
   "TARGET_SVE && reload_completed"
   "\t%0., %1., %2.")
 
@@ -5520,12 +5520,12 @@
 ;; Unpredicated floating-point binary operations that need to be predicated
 ;; for SVE.
 (define_expand "3"
-  [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
-   (unspec:SVE_FULL_F_BF
+  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
+   (unspec:SVE_FULL_F_B16B16
  [(match_dup 3)
   (const_int SVE_RELAXED_GP)
-  (match_operand:SVE_FULL_F_BF 1 "")
-  (match_operand:SVE_FULL_F_BF 2 "")]
+  (match_operand:SVE_FULL_F_B16B16 1 "")
+  (match_operand:SVE_FULL_F_B16B16 2 "")]
  SVE_COND_FP_BINARY_OPTAB))]
   "TARGET_SVE && ( || !)"
   {
@@ -5552,30 +5552,30 @@
 
 ;; Predicated floating-point operations with merging.
 (define_expand "@cond_"
-  [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
-   (unspec:SVE_FULL_F_BF
+  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
+   (unspec:SVE_FULL_F_B16B16
  [(match_operand: 1 "register_operand")
-  (unspec:SVE_FULL_F_BF
+  (unspec:SVE_FULL_F_B16B16
 [(match_dup 1)
  (const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F_BF 2 "")
- (match_operand:SVE_FULL_F_BF 3 "")]
+ (match_operand:SVE_FULL_F_B16B16 2 "")
+ (match_operand:SVE_FULL_F_B16B16 3 "")]
 SVE_COND_FP_BINARY)
-  (match_operand:SVE_FULL_F_BF 4 "aarch64_simd_reg_or_zero")]
+  (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
  UNSPEC_SEL))]
   "TARGET_SVE && ( || !)"
 )
 
 ;; Predicated floating-point operations, merging with the first input.
 (define_insn_and_rewrite "*cond__2_relaxed"
-  [(set (match_operand:SVE_FULL_F_BF 0 "r

[PATCH 06/14] aarch64: Add support for unpacked SVE FP unary operations

2025-06-02 Thread Spencer Abson
This patch extends the expander for unpredicated round, nearbyint, floor,
ceil, rint, and trunc, so that it can handle partial SVE FP modes.

We move fabs and fneg to a separate expander, since they are not trapping
instructions.
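
The distinction matters for the governing predicate.  For a trapping
operation on an unpacked vector, aarch64_sve_fp_pred must not enable
the padding lanes, whereas FABS/FNEG can keep their natural PTRUE.  A
sketch (mine, not from the testsuite):

void
f (double *dst, float *src, int n)
{
  for (int i = 0; i < n; i++)
    /* 'src' is an unpacked VNx2SF; under -ftrapping-math the frinta
       must be predicated at container (.d) granularity so that the
       undefined upper halves cannot raise spurious exceptions.  */
    dst[i] = (double) __builtin_roundf (src[i]);
}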

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (2): Replace use of
aarch64_ptrue_reg with aarch64_sve_fp_pred.
(@aarch64_pred_): Extend from SVE_FULL_F to SVE_F,
use aarch64_predicate_operand.
* config/aarch64/iterators.md: Split FABS/FNEG out of
SVE_COND_FP_UNARY (into SVE_COND_FP_UNARY_BITWISE).

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/unpacked_fabs_1.c: New test.
* gcc.target/aarch64/sve/unpacked_fneg_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frinta_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frinta_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frinti_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frinti_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frintm_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frintm_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frintp_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frintp_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frintx_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frintx_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frintz_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_frintz_2.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md | 31 ++-
 gcc/config/aarch64/iterators.md   | 14 -
 .../gcc.target/aarch64/sve/unpacked_fabs_1.c  | 24 ++
 .../gcc.target/aarch64/sve/unpacked_fneg_1.c  | 26 
 .../aarch64/sve/unpacked_frinta_1.c   | 27 
 .../aarch64/sve/unpacked_frinta_2.c   | 11 +++
 .../aarch64/sve/unpacked_frinti_1.c   | 27 
 .../aarch64/sve/unpacked_frinti_2.c   | 11 +++
 .../aarch64/sve/unpacked_frintm_1.c   | 27 
 .../aarch64/sve/unpacked_frintm_2.c   | 11 +++
 .../aarch64/sve/unpacked_frintp_1.c   | 27 
 .../aarch64/sve/unpacked_frintp_2.c   | 11 +++
 .../aarch64/sve/unpacked_frintx_1.c   | 27 
 .../aarch64/sve/unpacked_frintx_2.c   | 11 +++
 .../aarch64/sve/unpacked_frintz_1.c   | 27 
 .../aarch64/sve/unpacked_frintz_2.c   | 11 +++
 16 files changed, 308 insertions(+), 15 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fabs_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fneg_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frinta_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frinta_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frinti_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frinti_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frintm_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frintm_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frintp_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frintp_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frintx_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frintx_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frintz_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_frintz_2.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 399d147c9a5..1a705e153cb 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3762,13 +3762,28 @@
 
 ;; Unpredicated floating-point unary operations.
 (define_expand "2"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-   (unspec:SVE_FULL_F
+  [(set (match_operand:SVE_F 0 "register_operand")
+   (unspec:SVE_F
  [(match_dup 2)
-  (const_int SVE_RELAXED_GP)
-  (match_operand:SVE_FULL_F 1 "register_operand")]
+  (match_dup 3)
+  (match_operand:SVE_F 1 "register_operand")]
  SVE_COND_FP_UNARY_OPTAB))]
   "TARGET_SVE"
+  {
+operands[2] = aarch64_sve_fp_pred (mode, &operands[3]);
+  }
+)
+
+;; FABS and FNEG are non-trapping, we can always expand with their
+;; natural PTRUE.
+(define_expand "2"
+  [(set (match_operand:SVE_F 0 "register_operand")
+   (unspec:SVE_F
+ [(match_dup 2)
+  (const_int SVE_RELAXED_GP)
+  (match_operand:SVE_F 1 "register_operand")]
+ SVE_COND_FP_UNARY_BITWISE))]
+  "TARGET_SVE"
   {
 operands[2] = aarch64_ptrue_reg (mode);
   }
@@ -3776,11 +3791,11 @@
 
 ;; Predicated floating-point unary operations.
 (define_insn "@aarch64_pred_"
-  [(set (match_operand

[PATCH 13/14] aarch64: Relaxed SEL combiner patterns for unpacked SVE FP ternary arithmetic

2025-06-02 Thread Spencer Abson
Extend the ternary op/UNSPEC_SEL combiner patterns from SVE_FULL_F/
SVE_FULL_F_B16B16 to SVE_F/SVE_F_B16B16, where the strictness value is
SVE_RELAXED_GP.

We can only reliably test the 'merging with the third input' (addend)
and 'independent value' patterns at this stage, as the canonicalisation that
reorders the multiplicands based on the second SEL input would be performed
by the conditional expander.

Another difficulty is that we can't test these fused multiply/SEL combines
without using __builtin_fma and friends.  The reason for this is as follows:

We support COND_ADD, COND_SUB, and COND_MUL optabs, so match.pd will
canonicalize patterns like ADD/SUB/MUL combined with a VEC_COND_EXPR into
these conditional forms.  Later, when widening_mul tries to fold these into
conditional fused multiply operations, the transformation fails - simply
because we haven’t implemented those conditional fused multiply optabs yet.

Hence this patch lacks tests for BFloat16...
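
For reference, a form that does reach the new patterns today (my
sketch, using the builtin as described above):

#include <stdint.h>

void
f (int64_t *pred, float *a, float *b, float *c, int n)
{
  for (int i = 0; i < n; i++)
    /* Vectorized with an unpacked VNx2SF, this matches the 'merging
       with the third input' (addend) pattern: one predicated fmla
       rather than fmla followed by sel.  */
    c[i] = pred[i] ? __builtin_fmaf (a[i], b[i], c[i]) : c[i];
}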

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (*cond__2_relaxed):
Extend from SVE_FULL_F to SVE_F.
(*cond__4_relaxed): Extend from SVE_FULL_F_B16B16
to SVE_F_B16B16.
(*cond__any_relaxed): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/unpacked_cond_fmla_1.c: New test.
* gcc.target/aarch64/sve/unpacked_cond_fmls_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md | 38 
 .../aarch64/sve/unpacked_cond_fmla_1.c| 43 +++
 .../aarch64/sve/unpacked_cond_fmls_1.c| 43 +++
 .../aarch64/sve/unpacked_cond_fnmla_1.c   | 43 +++
 .../aarch64/sve/unpacked_cond_fnmls_1.c   | 43 +++
 5 files changed, 191 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 8c1921ddf5c..e5443980e8b 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -7622,15 +7622,15 @@
 ;; Predicated floating-point ternary operations, merging with the
 ;; first input.
 (define_insn_and_rewrite "*cond__2_relaxed"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-   (unspec:SVE_FULL_F
+  [(set (match_operand:SVE_F 0 "register_operand")
+   (unspec:SVE_F
  [(match_operand: 1 "register_operand")
-  (unspec:SVE_FULL_F
+  (unspec:SVE_F
 [(match_operand 5)
  (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_F 2 "register_operand")
+ (match_operand:SVE_F 3 "register_operand")
+ (match_operand:SVE_F 4 "register_operand")]
 SVE_COND_FP_TERNARY)
   (match_dup 2)]
  UNSPEC_SEL))]
@@ -7668,15 +7668,15 @@
 ;; Predicated floating-point ternary operations, merging with the
 ;; third input.
 (define_insn_and_rewrite "*cond__4_relaxed"
-  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
-   (unspec:SVE_FULL_F_B16B16
+  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+   (unspec:SVE_F_B16B16
  [(match_operand: 1 "register_operand")
-  (unspec:SVE_FULL_F_B16B16
+  (unspec:SVE_F_B16B16
 [(match_operand 5)
  (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
 SVE_COND_FP_TERNARY)
   (match_dup 4)]
  UNSPEC_SEL))]
@@ -7714,17 +7714,17 @@
 ;; Predicated floating-point ternary operations, merging with an
 ;; independent value.
 (define_insn_and_rewrite "*cond__any_relaxed"
-  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
-   (unspec:SVE_FULL_F_B16B16
+  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+   (unspec:SVE_F_B16B16
  [(match_operand: 1 "register_operand")
-  (unspec:SVE_FULL_F_B16B16
+  (unspec:SVE_F_B16B16
 [(match_operand 6)
  (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
-

[PATCH 00/14] aarch64: Add support for unpacked SVE FP operations

2025-06-02 Thread Spencer Abson
This series incrementally adds support for operations on unpacked vectors
of floating-point values.  By "unpacked", we're referring to the in-register
layout of partial SVE vector modes.  For example, the elements of a VNx4HF
are stored as:

... | X | HF | X | HF | X | HF | X | HF |

Where 'X' denotes the undefined upper half of the 32-bit container that each
16-bit value is stored in.  This padding must not affect the operation's
behavior, and must not be interpreted if the operation may trap.
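
For example (my sketch, not taken from the series), such a layout
arises when a loop mixes element widths:

void
f (float *dst, _Float16 *src, int n)
{
  for (int i = 0; i < n; i++)
    /* 'src' is loaded as VNx4HF: one _Float16 in the low half of each
       32-bit container, with the 'X' halves left undefined.  */
    dst[i] = (float) src[i] + 1.0f;
}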

The series is organised as follows:
* NFCs to iterators.md that lay the groundwork for the rest of the
series.
* Unpacked conversions, in which a solution to the issue described
above is given.
* Unpacked comparisons, which are slightly less trivial than...
* Unpacked unary/binary/ternary operations, each of which is broken
down into:
* Defining the unconditional expansion
* Supporting OP/UNSPEC_SEL combiner patterns under
SVE_RELAXED_GP
* Defining the conditional expander (if applicable)

This allows each change to aarch64-sve.md to be testable; once the conditional
expander for an operation is defined, the rules in match.pd canonicalize any
occurrence of that operation combined with a VEC_COND_EXPR into these
conditional forms, which would make the SVE_RELAXED_GP patterns dead on trunk.
I’ve taken this approach because I believe it’s valuable to have these
patterns to fall back on.

Notes on code generation under -ftrapping-math:

1) In the example below, we're currently unable to remove (1) in favour of
(2).

ptrue   p6.b, all   (1)
ptrue   p7.d, all   (2)
ld1wz30.d, p6/z, [x1]
ld1wz29.d, p6/z, [x3]
fsubz30.s, p7/m, z30.s, #1.0

In the expanded RTL, the predicate source of the LD1Ws is a
(subreg:VNx2BI (reg:VNx16BI 111) 0), where every bit of 111 is a 1.  The
predicate source of the FSUB is a (subreg:VNx4BI (reg:VNx16BI 112) 0),
where every 8th bit of 112 is a 1, and the rest are 0.

2) The AND emitted by the conditional expander typically follows a CMP
operation, where it is trivially redundant.

cmpne   p5.d, p7/z, z0.d, #0
ptrue   p6.d, vl32
and p6.b, p6/z, p5.b, p5.b

The fold we need here is slightly different from what the existing
*cmp_and splitting patterns achieve, in that we don’t need to
replace p7 with p6 to make the AND redundant.

The AND in this case has the structure:

(set (reg:VNx4BI 113)
(and (subreg:VNx4BI (reg:VNx16BI 111) 0)
     (subreg:VNx4BI (reg:VNx2BI 112) 0)))

This problem feels somewhat related to how we might handle
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118151.


Bootstrapped & regtested on aarch64-linux-gnu.

Thanks,
Spencer

Spencer Abson (14):
  aarch64: Extend iterator support for partial SVE FP modes
  aarch64: Add support for unpacked SVE FP conversions
  aarch64: Relaxed SEL combiner patterns for unpacked SVE FP conversions
  aarch64: Add support for unpacked SVE FP comparisons
  aarch64: Compare/and splits for unpacked SVE FP comparisons
  aarch64: Add support for unpacked SVE FP unary operations
  aarch64: Relaxed SEL combiner patterns for unpacked SVE FP unary
operations
  aarch64: Add support for unpacked SVE FP binary arithmetic
  aarch64: Add support for unpacked SVE FDIV
  aarch64: Relaxed SEL combiner patterns for unpacked SVE FP binary
arithmetic
  aarch64: Add support for unpacked SVE FP conditional binary arithmetic
  aarch64: Add support for unpacked SVE FP ternary arithmetic
  aarch64: Relaxed SEL combiner patterns for unpacked SVE FP ternary
arithmetic
  aarch64: Add support for unpacked SVE FP conditional ternary
arithmetic

 gcc/config/aarch64/aarch64-protos.h   |   4 +
 gcc/config/aarch64/aarch64-sve.md | 889 --
 gcc/config/aarch64/aarch64-sve2.md|  10 +-
 gcc/config/aarch64/aarch64.cc | 125 ++-
 gcc/config/aarch64/iterators.md   |  97 +-
 gcc/config/aarch64/predicates.md  |   4 +
 .../aarch64/sve/unpacked_binary_bf16_1.C  |  35 +
 .../aarch64/sve/unpacked_binary_bf16_2.C  |  15 +
 .../aarch64/sve/unpacked_cond_binary_bf16_1.C |  46 +
 .../aarch64/sve/unpacked_cond_binary_bf16_2.C |  18 +
 .../sve/unpacked_cond_ternary_bf16_1.C|  35 +
 .../sve/unpacked_cond_ternary_bf16_2.C|  14 +
 .../aarch64/sve/unpacked_ternary_bf16_1.C |  27 +
 .../aarch64/sve/unpacked_ternary_bf16_2.C |  11 +
 .../aarch64/sve/pack_fcvt_signed_1.c  |   2 +-
 .../aarch64/sve/pack_fcvt_unsigned_1.c|   2 +-
 .../gcc.target/aarch64/sve/pack_float_1.c |   2 +-
 .../gcc.target/aarch64/sve/unpack_float_1.c   |   2 +-
 .../aarch64/sve/unpacked_builtin_fmax_1.c |  40 +
 .../aarch64/sve/unpacked_builtin_fmax_2.c |  16 +
 .../aarch64/sve/unpacked_builtin_fmin_1.c |  40 +
 .../aarch64/sve/unpacked_builtin_fmin_2.c |  16 +
 .../sve/unpacked_cond_b

[PATCH 12/14] aarch64: Add support for unpacked SVE FP ternary arithmetic

2025-06-02 Thread Spencer Abson
This patch extends the expander for unconditional fma, fnma, fms, and
fnms, so that it supports partial SVE FP modes.
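
As a sketch (mine, in the spirit of the new unpacked_fmla tests):

void
f (double *dst, float *a, float *b, float *c, int n)
{
  for (int i = 0; i < n; i++)
    /* The fma is computed in an unpacked VNx2SF before the widening
       store, so the extended expander must supply a container-width
       governing predicate via aarch64_sve_fp_pred.  */
    dst[i] = (double) __builtin_fmaf (a[i], b[i], c[i]);
}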

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (4): Extend from
SVE_FULL_F_B16B16 to SVE_F_B16B16.  Use aarch64_sve_fp_pred instead
of aarch64_ptrue_reg.
(@aarch64_pred_): Extend from SVE_FULL_F_B16B16
to SVE_F_B16B16.  Use aarch64_predicate_operand.

gcc/testsuite/ChangeLog:

* g++.target/aarch64/sve/unpacked_ternary_bf16_1.C: New test.
* g++.target/aarch64/sve/unpacked_ternary_bf16_2.C: Likewise.
* gcc.target/aarch64/sve/unpacked_fmla_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fmla_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fmls_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fmls_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fnmla_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fnmla_2.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fnmls_1.c: Likewise.
* gcc.target/aarch64/sve/unpacked_fnmls_2.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md | 26 +++---
 .../aarch64/sve/unpacked_ternary_bf16_1.C | 27 +++
 .../aarch64/sve/unpacked_ternary_bf16_2.C | 11 ++
 .../gcc.target/aarch64/sve/unpacked_fmla_1.c  | 34 +++
 .../gcc.target/aarch64/sve/unpacked_fmla_2.c  | 11 ++
 .../gcc.target/aarch64/sve/unpacked_fmls_1.c  | 34 +++
 .../gcc.target/aarch64/sve/unpacked_fmls_2.c  | 11 ++
 .../gcc.target/aarch64/sve/unpacked_fnmla_1.c | 34 +++
 .../gcc.target/aarch64/sve/unpacked_fnmla_2.c | 11 ++
 .../gcc.target/aarch64/sve/unpacked_fnmls_1.c | 34 +++
 .../gcc.target/aarch64/sve/unpacked_fnmls_2.c | 11 ++
 11 files changed, 231 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/aarch64/sve/unpacked_ternary_bf16_1.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/sve/unpacked_ternary_bf16_2.C
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmla_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmla_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmls_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fmls_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fnmla_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fnmla_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fnmls_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/unpacked_fnmls_2.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 1ed2d065c15..8c1921ddf5c 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -7563,29 +7563,29 @@
 
 ;; Unpredicated floating-point ternary operations.
 (define_expand "4"
-  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
-   (unspec:SVE_FULL_F_B16B16
+  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+   (unspec:SVE_F_B16B16
  [(match_dup 4)
-  (const_int SVE_RELAXED_GP)
-  (match_operand:SVE_FULL_F_B16B16 1 "register_operand")
-  (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
-  (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+  (match_dup 5)
+  (match_operand:SVE_F_B16B16 1 "register_operand")
+  (match_operand:SVE_F_B16B16 2 "register_operand")
+  (match_operand:SVE_F_B16B16 3 "register_operand")]
  SVE_COND_FP_TERNARY))]
   "TARGET_SVE && ( || !)"
   {
-operands[4] = aarch64_ptrue_reg (mode);
+operands[4] = aarch64_sve_fp_pred (mode, &operands[5]);
   }
 )
 
 ;; Predicated floating-point ternary operations.
 (define_insn "@aarch64_pred_"
-  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
-   (unspec:SVE_FULL_F_B16B16
- [(match_operand: 1 "register_operand")
+  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+   (unspec:SVE_F_B16B16
+ [(match_operand: 1 "aarch64_predicate_operand")
   (match_operand:SI 5 "aarch64_sve_gp_strictness")
-  (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
-  (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
-  (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+  (match_operand:SVE_F_B16B16 2 "register_operand")
+  (match_operand:SVE_F_B16B16 3 "register_operand")
+  (match_operand:SVE_F_B16B16 4 "register_operand")]
  SVE_COND_FP_TERNARY))]
   "TARGET_SVE && ( || !)"
   {@ [ cons: =0 , 1   , %2  , 3 , 4 ; attrs: movprfx , is_rev ]
diff --git a/gcc/testsuite/g++.target/aarch64/sve/unpacked_ternary_bf16_1.C b/gcc/testsuite/g++.target/aarch64/sve/unpacked_ternary_bf16_1.C
new file mode 100644
index 000..19bfe95f298
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/unpacked_ternary_bf16_1.C
@@ -0,0 +1,27 @@
+/* { 

[PATCH v2 1/1] middle-end: Fix operation_could_trap_p for FIX_TRUNC expressions

2025-06-03 Thread Spencer Abson
Floating-point to integer conversions can raise the inexact or invalid
exceptions (e.g., due to overflow or NaN).  However, since users of
operation_could_trap_p infer the bool FP_OPERATION argument from the
expression's type (which is integral for these conversions), the
FIX_TRUNC family are considered non-trapping here.

This patch handles them explicitly.
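
A minimal example of the invalid case (mine, not from the patch):

int
f (double d)
{
  /* If 'd' is a NaN or lies outside int's range, the conversion
     raises the invalid exception (and may raise inexact for
     fractional values), so under -ftrapping-math it has to be
     treated as potentially trapping.  */
  return (int) d;
}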

gcc/ChangeLog:

* tree-eh.cc (operation_could_trap_helper_p): Cover FIX_TRUNC
expressions explicitly.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/pr96357.c: Change to avoid producing
a conditional FIX_TRUNC_EXPR, whilst still reproducing the bug
in PR96357.
* gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c: New test.
* gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c: Likewise.
---
 .../gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c   | 19 +++
 .../gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c   |  6 ++
 .../gcc.target/aarch64/sve/pr96357.c  |  8 
 gcc/tree-eh.cc|  7 +++
 4 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c
new file mode 100644
index 000..801a53fa30b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c
@@ -0,0 +1,19 @@
+  /* { dg-do compile } */
+  /* { dg-options "-O2 -ftree-vectorize -fdump-tree-ifcvt-stats" } */
+
+void
+test (int *dst, float *arr, int *pred, int n)
+{
+  for (int i = 0; i < n; i++)
+{
+  int pred_i = pred[i];
+  float arr_i = arr[i];
+
+  dst[i] = pred_i ? (int)arr_i : 5;
+}
+}
+
+/* We expect this to fail if_convertible_loop_p so long as we have no
+   conditional IFN for FIX_TRUNC_EXPR.  */
+
+/* { dg-final { scan-tree-dump-times "Applying if-conversion" 0 "ifcvt" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c
new file mode 100644
index 000..628b754e94d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math -fdump-tree-ifcvt-stats" } */
+
+#include "ifcvt-fix-trunc-1.c"
+
+/* { dg-final { scan-tree-dump-times "Applying if-conversion" 1 "ifcvt" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr96357.c b/gcc/testsuite/gcc.target/aarch64/sve/pr96357.c
index 9a7f912e529..6dd0409f3c8 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pr96357.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr96357.c
@@ -5,10 +5,10 @@ int d;
 
 void
 f1(char f, char *g, char *h, char *l, char *n) {
-  double i = d, j = 1.0 - f, k = j ? d : j;
-  if (k == 1.0)
-i = 0.0;
-  *l = *n = *g = *h = i * 0.5;
+  double j = 1.0 - f, k = j ? d : j;
+
+  char i = (k == 1.0) ? 10 : 50;
+  *l = *n = *g = *h = i;
 }
 
 void
diff --git a/gcc/tree-eh.cc b/gcc/tree-eh.cc
index a4d59954c05..8cc81ebcf5e 100644
--- a/gcc/tree-eh.cc
+++ b/gcc/tree-eh.cc
@@ -2538,6 +2538,13 @@ operation_could_trap_helper_p (enum tree_code op,
   /* Constructing an object cannot trap.  */
   return false;
 
+case FIX_TRUNC_EXPR:
+case VEC_PACK_FIX_TRUNC_EXPR:
+case VEC_UNPACK_FIX_TRUNC_HI_EXPR:
+case VEC_UNPACK_FIX_TRUNC_LO_EXPR:
+  /* The FIX_TRUNC family are always potentially trapping.  */
+  return flag_trapping_math;
+
 case COND_EXPR:
 case VEC_COND_EXPR:
   /* Whether *COND_EXPR can trap depends on whether the
-- 
2.34.1



[PATCH v2 0/1] middle-end: Fix operation_could_trap_p for FIX_TRUNC expressions

2025-06-03 Thread Spencer Abson
Hi,

This is a V2 of https://gcc.gnu.org/pipermail/gcc-patches/2025-May/683650.html.

Apologies for the delay - I was hesitant to push without preserving
sve/pr96357.c and regtesting x86.  The change to sve/pr96357.c means that it
can still be vectorized, and will still trigger the ICE without Przemek's fix
applied.

I'd like to be able to test this change for VEC_UNPACK_FIX_TRUNC_HI_EXPR,
VEC_UNPACK_FIX_TRUNC_LO_EXPR and VEC_PACK_FIX_TRUNC_EXPR - but I've struggled
to come up with a practical way to do that...

Bootstrapped & regtested on aarch64-linux-gnu, x86_64-linux-gnu.
OK for master?

Thanks,
Spencer

Spencer Abson (1):
  middle-end: Fix operation_could_trap_p for FIX_TRUNC expressions

 .../gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c   | 19 +++
 .../gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c   |  6 ++
 .../gcc.target/aarch64/sve/pr96357.c  |  8 
 gcc/tree-eh.cc|  7 +++
 4 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifcvt-fix-trunc-2.c

-- 
2.34.1



Re: [PATCH 02/14] aarch64: Add support for unpacked SVE FP conversions

2025-06-03 Thread Spencer Abson
Thanks, Alfie.  I agree that having a table with just one entry looks a
little odd, but the rest of the file follows this pattern.  For example:

;; -
;;  [FP] Absolute difference
;; -
;; Includes:
;; - FABD
;; -

Minor/Major sections of the file must satisfy the rules in check-sve-md.awk.

Cheers,
Spencer


Re: [PATCH 03/14] aarch64: Relaxed SEL combiner patterns for unpacked SVE FP conversions

2025-06-09 Thread Spencer Abson
On Mon, Jun 09, 2025 at 02:48:58PM +0100, Richard Sandiford wrote:
> Spencer Abson  writes:
> > On Thu, Jun 05, 2025 at 09:24:27PM +0100, Richard Sandiford wrote:
> >> Spencer Abson  writes:
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c 
> >> > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c
> >> > new file mode 100644
> >> > index 000..8f69232f2cf
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_cvtf_1.c
> >> > @@ -0,0 +1,47 @@
> >> > +/* { dg-do compile } */
> >> > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 
> >> > -fno-trapping-math" } */
> >> 
> >> The =2048 is ok, but do you need it for these autovectorisation tests?
> >> If vectorisation is treated as not profitable without it, then perhaps
> >> we could switch to Tamar's -mmax-vectorization, once that's in.
> >
> > This isn't needed to make vectorization profitable, but rather to
> > make partial vector modes the reliably obvious choice - and hopefully
> > one that isn't affected by future cost model changes.  With =2048
> > and COUNT, each loop should be fully unrolled into a single unpacked
> > operation (plus setup and return).
> >
> > For me, this was much more flexible than using builtin vector types,
> > and easier to reason about.  Maybe that's just me though!  I can try
> > something else if it would be preferred.
> 
> I don't really agree about the "easier to reason about" bit: IMO,
> builtin vector types are the most direct and obvious way of testing
> things with fixed-length vectors, for the cases that they can handle
> directly.  But I agree that vectorisation is more flexible, in that
> it can deal with cases that fixed-length builtin vectors can't yet
> handle directly.
> 
> My main concern was that the tests didn't seem to have much coverage
> of normal VLA codegen.  If the aim is predictable costing, it might
> be enough to use -moverride=sve_width=2048 instead of
> -msve-vector-bits=2048.

I see - yeah, -moverride=sve_width=2048 is enough.

How about we use builtin vectors wherever possible, and fall back
to the current approach (but replacing -msve-vector-bits with
-moverride=sve_width) everywhere else?

Alternatively, if we'd like to focus on VLA codegen, I could
just replace -msve-vector-bits with -moverride=sve_width throughout
the series.

Thanks,
Spencer
> 
> Thanks,
> Richard


Re: [PATCH] aarch64: Fold NOT+PTEST to NOTS [PR118150]

2025-06-13 Thread Spencer Abson
On Fri, Jun 13, 2025 at 02:12:44PM +, Kyrylo Tkachov wrote:
> Hi Spencer,
> 
> Thanks for the patch.
> 
> > On 13 Jun 2025, at 14:46, Spencer Abson  wrote:
> > 
> > Add the missing combiner patterns for folding NOT+PTEST to NOTS when
> > they share the same GP.
> > 
> 
> I guess GP here means “governing predicate”?
> GP usually means “General Purpose (register)” in aarch64 so it’d be good to 
> make the terminology explicit in the commit message.
> 
> > gcc/ChangeLog:
> > 
> > * config/aarch64/aarch64-sve.md (*one_cmpl3_cc): New
> > combiner pattern.
> > (*one_cmpl3_ptest): Likewise.
> > 
> 
> The ChangeLog entry should mention PR target/118150 so that bugzilla is 
> properly updated on commit.
> 
> > gcc/testsuite/ChangeLog:
> > 
> > * gcc.target/aarch64/sve/acle/general/not_1.c: New test.
> > 
> > ---
> > Bootstapped & regtested on aarch64-linux-gnu.  OK for master?
> > 
> > Thanks,
> > Spencer
> > ---
> > gcc/config/aarch64/aarch64-sve.md | 36 +++
> > .../aarch64/sve/acle/general/not_1.c  | 22 
> > 2 files changed, 58 insertions(+)
> > create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general/not_1.c
> > 
> > diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> > index c5d3e8cd3b3..0f85cafa16a 100644
> > --- a/gcc/config/aarch64/aarch64-sve.md
> > +++ b/gcc/config/aarch64/aarch64-sve.md
> 
> The comment in this section says:
> 
> ;; -
> ;;  [PRED] Inverse
> ;; -
> ;; Includes:
> ;; - NOT
> ;; 
> 
> It should now include NOTS as well.
> 

Gah, thanks for catching those - a bit careless of me!

Spencer


[PATCH] aarch64: Fold NOT+PTEST to NOTS [PR118150]

2025-06-13 Thread Spencer Abson
Add the missing combiner patterns for folding NOT+PTEST to NOTS when
they share the same GP.
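
The fold replaces the two-instruction sequence with its flag-setting
form (register numbers are illustrative only):

not     p4.b, p0/z, p1.b    // invert p1 under the governing predicate p0
ptest   p0, p4.b            // separately test the result under p0

        =>

nots    p4.b, p0/z, p1.b    // one instruction that also sets the flags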

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (*one_cmpl3_cc): New
combiner pattern.
(*one_cmpl3_ptest): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/acle/general/not_1.c: New test.

---
Bootstrapped & regtested on aarch64-linux-gnu.  OK for master?

Thanks,
Spencer
---
 gcc/config/aarch64/aarch64-sve.md | 36 +++
 .../aarch64/sve/acle/general/not_1.c  | 22 
 2 files changed, 58 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general/not_1.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index c5d3e8cd3b3..0f85cafa16a 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3972,6 +3972,42 @@
   "not\t%0.b, %1/z, %2.b"
 )
 
+;; Predicated predicate inverse in which the flags are set in the same
+;; way as a PTEST.
+(define_insn "*one_cmpl3_cc"
+  [(set (reg:CC_NZC CC_REGNUM)
+   (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand" "Upa")
+  (match_operand 3)
+  (match_operand:SI 4 "aarch64_sve_ptrue_flag")
+  (and:PRED_ALL
+(not:PRED_ALL
+  (match_operand:PRED_ALL 2 "register_operand" "Upa"))
+(match_dup 3))]
+ UNSPEC_PTEST))
+   (set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
+   (and:PRED_ALL (not:PRED_ALL (match_dup 2)) (match_dup 3)))]
+  "TARGET_SVE"
+  "nots\t%0.b, %1/z, %2.b"
+)
+
+;; Same, where only the flags result is interesting.
+(define_insn "*one_cmpl3_ptest"
+  [(set (reg:CC_NZC CC_REGNUM)
+   (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand" "Upa")
+  (match_operand 3)
+  (match_operand:SI 4 "aarch64_sve_ptrue_flag")
+  (and:PRED_ALL
+(not:PRED_ALL
+  (match_operand:PRED_ALL 2 "register_operand" "Upa"))
+(match_dup 3))]
+ UNSPEC_PTEST))
+   (clobber (match_scratch:PRED_ALL 0 "=Upa"))]
+  "TARGET_SVE"
+  "nots\t%0.b, %1/z, %2.b"
+)
+
 ;; =
 ;; == Binary arithmetic
 ;; =
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/not_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/not_1.c
new file mode 100644
index 000..875d78885d6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/not_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include 
+
+void
+test1 (svbool_t pg, svbool_t x, int *any, svbool_t *ptr)
+{
+  svbool_t res = svnot_z (pg, x);
+  *any = svptest_last (pg, res);
+  *ptr = res;
+}
+
+int
+test2 (svbool_t pg, svbool_t x)
+{
+  svbool_t res = svnot_z (pg, x);
+  return svptest_first (pg, res);
+}
+
+/* { dg-final { scan-assembler-times {\tnots\t} 2 } } */
+/* { dg-final { scan-assembler-not {\tnot\t} } } */
-- 
2.34.1