GCC 15 is the first release to support FP8 intrinsics. The underlying instructions depend on the value of a new register, FPMR. Unlike FPCR, FPMR is a normal call-clobbered/caller-save register rather than a global register. So:
- The FP8 intrinsics take a final uint64_t argument that specifies
  what value FPMR should have.

- If an FP8 operation is split across multiple functions, it is likely
  that those functions would have a similar argument.

If the object code has the structure:

    for (...)
      fp8_kernel (..., fpmr_value);

then fp8_kernel would set FPMR to fpmr_value each time it is called,
even though FPMR will already have that value for at least the second
and subsequent calls (and possibly the first).

The working assumption for the ABI has been that writes to registers
like FPMR can in general be more expensive than reads, and so it would
be better to use a conditional write like:

    mrs   tmp, fpmr
    cmp   tmp, <value>
    beq   1f
    msr   fpmr, <value>
  1:

instead of writing the same value to FPMR repeatedly.  (A standalone C
sketch of the calling pattern above is included at the end of this mail.)

This patch implements that.  It also adds a tuning flag that suppresses
the behaviour, both to make testing easier and to support any future
cores that (for example) are able to rename FPMR.

Hopefully this really is the last part of the FP8 enablement.

Tested on aarch64-linux-gnu.  I'll push in about 24 hours if there are
no comments before then.

Richard


gcc/
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE): New tuning flag.
	* config/aarch64/aarch64.h (TARGET_CHEAP_FPMR_WRITE): New macro.
	* config/aarch64/aarch64.md: Split moves into FPMR into a test
	and a branch around the write.
	(aarch64_write_fpmr): New pattern.

gcc/testsuite/
	* g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp: Add
	cheap_fpmr_write by default.
	* gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp: Likewise.
	* gcc.target/aarch64/acle/fp8.c: Add cheap_fpmr_write.
	* gcc.target/aarch64/acle/fpmr-2.c: Likewise.
	* gcc.target/aarch64/simd/vcvt_fpm.c: Likewise.
	* gcc.target/aarch64/simd/vdot2_fpm.c: Likewise.
	* gcc.target/aarch64/simd/vdot4_fpm.c: Likewise.
	* gcc.target/aarch64/simd/vmla_fpm.c: Likewise.
	* gcc.target/aarch64/acle/fpmr-6.c: New test.
---
 gcc/config/aarch64/aarch64-tuning-flags.def   | 15 +++++++
 gcc/config/aarch64/aarch64.h                  |  5 +++
 gcc/config/aarch64/aarch64.md                 | 39 +++++++++++++++++++
 .../sve2/acle/aarch64-sve2-acle-asm.exp       |  2 +-
 gcc/testsuite/gcc.target/aarch64/acle/fp8.c   |  2 +-
 .../gcc.target/aarch64/acle/fpmr-2.c          |  2 +-
 .../gcc.target/aarch64/acle/fpmr-6.c          | 36 +++++++++++++++++
 .../gcc.target/aarch64/simd/vcvt_fpm.c        |  2 +-
 .../gcc.target/aarch64/simd/vdot2_fpm.c       |  2 +-
 .../gcc.target/aarch64/simd/vdot4_fpm.c       |  2 +-
 .../gcc.target/aarch64/simd/vmla_fpm.c        |  2 +-
 .../sve2/acle/aarch64-sve2-acle-asm.exp       |  2 +-
 12 files changed, 103 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 60967aac903..7a67d6197d9 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -48,6 +48,21 @@ AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA)
    rather than re-use an input predicate register.  */
 AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW)
 
+/* Whether writes to the FPMR are cheap enough that:
+
+     msr fpmr, <value>
+
+   is better than:
+
+     mrs tmp, fpmr
+     cmp tmp, <value>
+     beq 1f
+     msr fpmr, <value>
+   1:
+
+   even when the branch is predictably taken.  */
+AARCH64_EXTRA_TUNING_OPTION ("cheap_fpmr_write", CHEAP_FPMR_WRITE)
+
 /* Baseline tuning settings suitable for all modern cores.  */
 #define AARCH64_EXTRA_TUNE_BASE (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND \
				  | AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 218868a5246..5cbf442130b 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -486,6 +486,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
 /* fp8 instructions are enabled through +fp8.  */
 #define TARGET_FP8 AARCH64_HAVE_ISA (FP8)
 
+/* See the comment above the tuning flag for details.  */
+#define TARGET_CHEAP_FPMR_WRITE \
+  (bool (aarch64_tune_params.extra_tuning_flags \
+	 & AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE))
+
 /* Combinatorial tests.  */
 
 #define TARGET_SVE2_OR_SME2 \
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 776c4c4ceee..071058dbeb3 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -356,6 +356,7 @@ (define_c_enum "unspec" [
     UNSPEC_UPDATE_FFRT
     UNSPEC_RDFFR
     UNSPEC_WRFFR
+    UNSPEC_WRITE_FPMR
     UNSPEC_SYSREG_RDI
     UNSPEC_SYSREG_RTI
     UNSPEC_SYSREG_WDI
@@ -1883,6 +1884,44 @@ (define_split
 }
 )
 
+;; The preferred way of writing to the FPMR is to test whether it already
+;; has the desired value and branch around the write if so.  This reduces
+;; the number of redundant FPMR writes caused by ABI boundaries, such as in:
+;;
+;;   for (...)
+;;     fp8_kernel (..., fpmr_value);
+;;
+;; Without this optimization, fp8_kernel would set FPMR to fpmr_value each
+;; time that it is called.
+;;
+;; We do this as a split so that hardreg_pre can optimize the moves first.
+(define_split
+  [(set (reg:DI FPM_REGNUM)
+	(match_operand:DI 0 "aarch64_reg_or_zero"))]
+  "TARGET_FP8 && !TARGET_CHEAP_FPMR_WRITE && can_create_pseudo_p ()"
+  [(const_int 0)]
+  {
+    auto label = gen_label_rtx ();
+    rtx current = copy_to_reg (gen_rtx_REG (DImode, FPM_REGNUM));
+    rtx cond = gen_rtx_EQ (VOIDmode, current, operands[0]);
+    emit_jump_insn (gen_cbranchdi4 (cond, current, operands[0], label));
+    emit_insn (gen_aarch64_write_fpmr (operands[0]));
+    emit_label (label);
+    DONE;
+  }
+)
+
+;; A write to the FPMR that is already protected by a conditional branch.
+;; Since this instruction is introduced late, it shouldn't matter too much
+;; that we're using an unspec for a move.
+(define_insn "aarch64_write_fpmr"
+  [(set (reg:DI FPM_REGNUM)
+	(unspec:DI [(match_operand:DI 0 "aarch64_reg_or_zero" "rZ")]
+		   UNSPEC_WRITE_FPMR))]
+  "TARGET_FP8"
+  "msr\tfpmr, %x0"
+)
+
 (define_expand "aarch64_cpymemdi"
   [(parallel
      [(set (match_operand 2) (const_int 0))
diff --git a/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp b/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
index 4323e5f62ae..7fc33e99b05 100644
--- a/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
+++ b/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
@@ -39,7 +39,7 @@ if { [check_effective_target_aarch64_sve2] } {
 
 # Turn off any codegen tweaks by default that may affect expected assembly.
 # Tests relying on those should turn them on explicitly.
-set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none"
+set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none -moverride=tune=cheap_fpmr_write"
 
 set gcc_subdir [string replace $subdir 0 2 gcc]
 lappend extra_flags "-fno-ipa-icf" "-I$srcdir/$gcc_subdir/../../sve/acle/asm"
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
index 63f88e24dfb..f0e7035ffc0 100644
--- a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
+++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
@@ -1,6 +1,6 @@
 /* Test the fp8 ACLE intrinsics family.  */
 /* { dg-do compile } */
-/* { dg-options "-O1 -march=armv8-a" } */
+/* { dg-options "-O1 -march=armv8-a -moverride=tune=cheap_fpmr_write" } */
 /* { dg-final { check-function-bodies "**" "" "" } } */
 
 #include <arm_acle.h>
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
index c5b255b0a9a..79a9535126e 100644
--- a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
+++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
+/* { dg-options "-O1 -march=armv8-a+fp8fma -moverride=tune=cheap_fpmr_write" } */
 
 #include <arm_neon.h>
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c
new file mode 100644
index 00000000000..6a00e017af9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4 -moverride=tune=none" } */
+/* { dg-final { check-function-bodies "**" "" "" { target *-*-* } {\.L[0-9]+} } } */
+
+#include "arm_neon.h"
+
+/*
+** f1:
+**	mrs	(x[0-9]+), fpmr
+**	cmp	\1, x0
+**	beq	([^\n]+)
+**	msr	fpmr, x0
+** ?\2:
+**	fdot	v0.2s, v1.8b, v2.8b
+**	ret
+*/
+float32x2_t
+f1 (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_f32_mf8_fpm (a, b, c, d);
+}
+
+/*
+** f2:
+**	mrs	(x[0-9]+), fpmr
+**	cbz	\1, ([^\n]+)
+**	msr	fpmr, xzr
+** ?\2:
+**	fdot	v0.2s, v1.8b, v2.8b
+**	ret
+*/
+float32x2_t
+f2 (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c)
+{
+  return vdot_f32_mf8_fpm (a, b, c, 0);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
index 39076684345..29dece61d4a 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8 -moverride=tune=cheap_fpmr_write" } */
 /* { dg-final { check-function-bodies "**" "" } } */
 
 #include "arm_neon.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
index 5fe139106c6..07decd71926 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2" } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2 -moverride=tune=cheap_fpmr_write" } */
 /* { dg-final { check-function-bodies "**" "" } } */
 
 #include "arm_neon.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
index e47a737e8b5..27c1d38434f 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4" } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4 -moverride=tune=cheap_fpmr_write" } */
 /* { dg-final { check-function-bodies "**" "" } } */
 
 #include "arm_neon.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c
index 51b47055ca2..8e5835af5a2 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-O3 -march=armv9-a+fp8fma" } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8fma -moverride=tune=cheap_fpmr_write" } */
 /* { dg-final { check-function-bodies "**" "" } } */
 
 #include "arm_neon.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp b/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
index 69a3a1786f2..e950f8613da 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
@@ -39,7 +39,7 @@ if { [check_effective_target_aarch64_sve2] } {
 
 # Turn off any codegen tweaks by default that may affect expected assembly.
 # Tests relying on those should turn them on explicitly.
-set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none"
+set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none -moverride=tune=cheap_fpmr_write"
 
 lappend extra_flags "-fno-ipa-icf"
-- 
2.25.1
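
For reference, here is a minimal, self-contained C sketch of the calling
pattern described in the covering note.  It is not part of the patch: the
kernel simply wraps vdot_f32_mf8_fpm (one of the intrinsics exercised by the
new fpmr-6.c test), and the wrapper name, the noinline attribute, the caller
and its loop/array arguments are made up for illustration.  Built with
something like -O2 -march=armv9-a+fp8dot4, every call to fp8_kernel needs
FPMR to equal fpmr_value; with the new split, the write inside fp8_kernel
becomes an mrs/cmp/beq/msr sequence, so the msr is skipped once FPMR already
holds the required value.

  #include <arm_neon.h>

  /* The final fpm_t argument specifies the value that FPMR must have for
     the FP8 dot product.  The noinline attribute models the ABI boundary
     described above: the FPMR write then happens inside this function on
     every call rather than being hoisted out of the caller's loop.  */
  __attribute__ ((noinline)) float32x2_t
  fp8_kernel (float32x2_t acc, mfloat8x8_t a, mfloat8x8_t b, fpm_t fpmr_value)
  {
    return vdot_f32_mf8_fpm (acc, a, b, fpmr_value);
  }

  /* Every iteration passes the same fpmr_value, so FPMR only needs to be
     written on (at most) the first call.  */
  float32x2_t
  accumulate (float32x2_t acc, const mfloat8x8_t *a, const mfloat8x8_t *b,
	      int n, fpm_t fpmr_value)
  {
    for (int i = 0; i < n; ++i)
      acc = fp8_kernel (acc, a[i], b[i], fpmr_value);
    return acc;
  }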