On 11/11/2024 16:03, Richard Sandiford wrote:
Claudio Bantaloukas <claudio.bantalou...@arm.com> writes:
[...]
@@ -231,12 +231,12 @@ CONSTEXPR const group_suffix_info group_suffixes[] = {
  #define TYPES_all_arith(S, D) \
    TYPES_all_float (S, D), TYPES_all_integer (S, D)
-/* _bf16
+/* _mf8 _bf16
        _f16 _f32 _f64
     _s8  _s16 _s32 _s64
     _u8  _u16 _u32 _u64.  */
  #define TYPES_all_data(S, D) \
-  S (bf16), TYPES_all_arith (S, D)
+  S(mf8), S (bf16), TYPES_all_arith (S, D)
Sorry for the clash, but I've since pushed the SVE2p1 patches, which
redefine all_data in terms of separate b_data, h_data, s_data, and d_data.
This would now be part of b_data, and we should now get things like
svluti2_lane_zt_mf8 for free.
Okey-doke, will post a new patch series soon.
We should probably also add mf8 to things like za_bhsd_data, but that
can be a separate follow-on patch.

Ack, makes more sense as it's part of SME.


+/*
+** caller_0:
+**     ...
+**     mov     (z[0-9]+\.b), w2
+**     ...
+**     st1b    \1, p[0-7], \[x1\]
+**     ...
+**     ret
+*/
+void __attribute__((noipa))
+caller_0 (mfloat8_t *ptr, mfloat8_t in)
+{
+  callee_0 (ptr, svdup_mf8 (in));
+}
w2 isn't a meaningful register here, since the data should be passed in via b0.
I suppose for now we should match the move into w2 as well, with a note
to say that this should be optimised away later.  (Although the hard-coded
w2 should be replaced with (w[0-9]+) for the move in and \1 for the move out.)
Same for the other callers in this file.

Ack, part of next series iteration along with FIXMEs to optimize away the umov and mov pair.



Looks good otherwise, thanks!

Richard

+
[...]
+/*
+** caller_1:
+**     ...
+**     mov     (z[0-9]+\.b), w3
+**     ...
+**     st1b    \1, p[0-7], \[x2\]
+**     ...
+**     ret
+*/
+void __attribute__((noipa))
+caller_1 (mfloat8_t *ptr, mfloat8_t in)
+{
+  callee_1 (ptr, 1, svdup_mf8 (in));
+}
+
+/*
+** callee_7:
+**     ...
+**     ld1b    (z[0-9]+\.b), (p[0-7])/z, \[x7\]
+**     ...
+**     st1b    \1, p[0-7], \[x0\]
+**     ...
+**     ret
+*/
+void __attribute__((noipa))
+callee_7 (mfloat8_t *ptr, ...)
+{
+  va_list va;
+  svmfloat8_t vec;
+
+  va_start (va, ptr);
+  va_arg (va, int);
+  va_arg (va, int);
+  va_arg (va, int);
+  va_arg (va, int);
+  va_arg (va, int);
+  va_arg (va, int);
+  vec = va_arg (va, svmfloat8_t);
+  va_end (va);
+  svst1 (svptrue_b8 (), ptr, vec);
+}
+
+/*
+** caller_7:
+**     ...
+**     mov     (z[0-9]+\.b), w8
+**     ...
+**     st1b    \1, p[0-7], \[x7\]
+**     ...
+**     ret
+*/
+void __attribute__((noipa))
+caller_7 (mfloat8_t *ptr, mfloat8_t in)
+{
+  callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_mf8 (in));
+}
+
+/* FIXME: We should be able to get rid of the va_list object.  */
+/*
+** callee_8:
+**     sub     sp, sp, #([0-9]+)
+**     ...
+**     ldr     (x[0-9]+), \[sp, \1\]
+**     ...
+**     ld1b    (z[0-9]+\.b), (p[0-7])/z, \[\2\]
+**     ...
+**     st1b    \3, \4, \[x0\]
+**     ...
+**     ret
+*/
+void __attribute__((noipa))
+callee_8 (mfloat8_t *ptr, ...)
+{
+  va_list va;
+  svmfloat8_t vec;
+
+  va_start (va, ptr);
+  va_arg (va, int);
+  va_arg (va, int);
+  va_arg (va, int);
+  va_arg (va, int);
+  va_arg (va, int);
+  va_arg (va, int);
+  va_arg (va, int);
+  vec = va_arg (va, svmfloat8_t);
+  va_end (va);
+  svst1 (svptrue_b8 (), ptr, vec);
+}
+
+/*
+** caller_8:
+**     ...
+**     mov     (z[0-9]+\.b), w1
+**     ...
+**     st1b    \1, p[0-7], \[(x[0-9]+)\]
+**     ...
+**     str     \2, \[sp\]
+**     ...
+**     ret
+*/
+void __attribute__((noipa))
+caller_8 (mfloat8_t *ptr, mfloat8_t in)
+{
+  callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_mf8 (in));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbl2_mf8.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbl2_mf8.c
new file mode 100644
index 00000000000..19cc739e7ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbl2_mf8.c
@@ -0,0 +1,31 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** tbl2_mf8_tied1:
+**     tbl     z0\.b, {z0\.b(?:, | - )z1\.b}, z4\.b
+**     ret
+*/
+TEST_TBL2 (tbl2_mf8_tied1, svmfloat8x2_t, svmfloat8_t, svuint8_t,
+          z0_res = svtbl2_mf8 (z0, z4),
+          z0_res = svtbl2 (z0, z4))
+
+/*
+** tbl2_mf8_tied2:
+**     tbl     z0\.b, {z1\.b(?:, | - )z2\.b}, z0\.b
+**     ret
+*/
+TEST_TBL2_REV (tbl2_mf8_tied2, svmfloat8x2_t, svmfloat8_t, svuint8_t,
+              z0_res = svtbl2_mf8 (z1, z0),
+              z0_res = svtbl2 (z1, z0))
+
+/*
+** tbl2_mf8_untied:
+**     tbl     z0\.b, {z2\.b(?:, | - )z3\.b}, z4\.b
+**     ret
+*/
+TEST_TBL2 (tbl2_mf8_untied, svmfloat8x2_t, svmfloat8_t, svuint8_t,
+          z0_res = svtbl2_mf8 (z2, z4),
+          z0_res = svtbl2 (z2, z4))
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbx_mf8.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbx_mf8.c
new file mode 100644
index 00000000000..ba0fef3934b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/tbx_mf8.c
@@ -0,0 +1,37 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** tbx_mf8_tied1:
+**     tbx     z0\.b, z1\.b, z4\.b
+**     ret
+*/
+TEST_DUAL_Z (tbx_mf8_tied1, svmfloat8_t, svuint8_t,
+            z0 = svtbx_mf8 (z0, z1, z4),
+            z0 = svtbx (z0, z1, z4))
+
+/* Bad RA choice: no preferred output sequence.  */
+TEST_DUAL_Z (tbx_mf8_tied2, svmfloat8_t, svuint8_t,
+            z0 = svtbx_mf8 (z1, z0, z4),
+            z0 = svtbx (z1, z0, z4))
+
+/* Bad RA choice: no preferred output sequence.  */
+TEST_DUAL_Z_REV (tbx_mf8_tied3, svmfloat8_t, svuint8_t,
+                z0_res = svtbx_mf8 (z4, z5, z0),
+                z0_res = svtbx (z4, z5, z0))
+
+/*
+** tbx_mf8_untied:
+** (
+**     mov     z0\.d, z1\.d
+**     tbx     z0\.b, z2\.b, z4\.b
+** |
+**     tbx     z1\.b, z2\.b, z4\.b
+**     mov     z0\.d, z1\.d
+** )
+**     ret
+*/
+TEST_DUAL_Z (tbx_mf8_untied, svmfloat8_t, svuint8_t,
+            z0 = svtbx_mf8 (z1, z2, z4),
+            z0 = svtbx (z1, z2, z4))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilerw_mf8.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilerw_mf8.c
new file mode 100644
index 00000000000..12cf0d2c365
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilerw_mf8.c
@@ -0,0 +1,50 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** whilerw_rr_mf8:
+**     whilerw p0\.b, x0, x1
+**     ret
+*/
+TEST_COMPARE_S (whilerw_rr_mf8, const mfloat8_t *,
+               p0 = svwhilerw_mf8 (x0, x1),
+               p0 = svwhilerw (x0, x1))
+
+/*
+** whilerw_0r_mf8:
+**     whilerw p0\.b, xzr, x1
+**     ret
+*/
+TEST_COMPARE_S (whilerw_0r_mf8, const mfloat8_t *,
+               p0 = svwhilerw_mf8 ((const mfloat8_t *) 0, x1),
+               p0 = svwhilerw ((const mfloat8_t *) 0, x1))
+
+/*
+** whilerw_cr_mf8:
+**     mov     (x[0-9]+), #?1073741824
+**     whilerw p0\.b, \1, x1
+**     ret
+*/
+TEST_COMPARE_S (whilerw_cr_mf8, const mfloat8_t *,
+               p0 = svwhilerw_mf8 ((const mfloat8_t *) 1073741824, x1),
+               p0 = svwhilerw ((const mfloat8_t *) 1073741824, x1))
+
+/*
+** whilerw_r0_mf8:
+**     whilerw p0\.b, x0, xzr
+**     ret
+*/
+TEST_COMPARE_S (whilerw_r0_mf8, const mfloat8_t *,
+               p0 = svwhilerw_mf8 (x0, (const mfloat8_t *) 0),
+               p0 = svwhilerw (x0, (const mfloat8_t *) 0))
+
+/*
+** whilerw_rc_mf8:
+**     mov     (x[0-9]+), #?1073741824
+**     whilerw p0\.b, x0, \1
+**     ret
+*/
+TEST_COMPARE_S (whilerw_rc_mf8, const mfloat8_t *,
+               p0 = svwhilerw_mf8 (x0, (const mfloat8_t *) 1073741824),
+               p0 = svwhilerw (x0, (const mfloat8_t *) 1073741824))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilewr_mf8.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilewr_mf8.c
new file mode 100644
index 00000000000..c4023a2fbff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/whilewr_mf8.c
@@ -0,0 +1,50 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** whilewr_rr_mf8:
+**     whilewr p0\.b, x0, x1
+**     ret
+*/
+TEST_COMPARE_S (whilewr_rr_mf8, const mfloat8_t *,
+               p0 = svwhilewr_mf8 (x0, x1),
+               p0 = svwhilewr (x0, x1))
+
+/*
+** whilewr_0r_mf8:
+**     whilewr p0\.b, xzr, x1
+**     ret
+*/
+TEST_COMPARE_S (whilewr_0r_mf8, const mfloat8_t *,
+               p0 = svwhilewr_mf8 ((const mfloat8_t *) 0, x1),
+               p0 = svwhilewr ((const mfloat8_t *) 0, x1))
+
+/*
+** whilewr_cr_mf8:
+**     mov     (x[0-9]+), #?1073741824
+**     whilewr p0\.b, \1, x1
+**     ret
+*/
+TEST_COMPARE_S (whilewr_cr_mf8, const mfloat8_t *,
+               p0 = svwhilewr_mf8 ((const mfloat8_t *) 1073741824, x1),
+               p0 = svwhilewr ((const mfloat8_t *) 1073741824, x1))
+
+/*
+** whilewr_r0_mf8:
+**     whilewr p0\.b, x0, xzr
+**     ret
+*/
+TEST_COMPARE_S (whilewr_r0_mf8, const mfloat8_t *,
+               p0 = svwhilewr_mf8 (x0, (const mfloat8_t *) 0),
+               p0 = svwhilewr (x0, (const mfloat8_t *) 0))
+
+/*
+** whilewr_rc_mf8:
+**     mov     (x[0-9]+), #?1073741824
+**     whilewr p0\.b, x0, \1
+**     ret
+*/
+TEST_COMPARE_S (whilewr_rc_mf8, const mfloat8_t *,
+               p0 = svwhilewr_mf8 (x0, (const mfloat8_t *) 1073741824),
+               p0 = svwhilewr (x0, (const mfloat8_t *) 1073741824))

Reply via email to