> On 31 Jul 2024, at 11:31, Claudio Bantaloukas <[email protected]>
> wrote:
>
> External email: Use caution opening links or attachments
>
>
> On 31/07/2024 08:57, Kyrylo Tkachov wrote:
>> Hi Claudio,
>>
>>> On 31 Jul 2024, at 08:29, Claudio Bantaloukas <[email protected]>
>>> wrote:
>>>
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> Unlike most system registers, fpmr can be heavily written to in code that
>>> exercises the fp8 functionality. That is because every fp8 intrinsic call
>>> can potentially change the value of fpmr.
>>> Rather than just use an unspec, we treat the fpmr system register like
>>> all other registers and use a move operation to read and write to it.
>>>
>>> We introduce a new class of moveable system registers that currently
>>> contains only fpmr, and a new constraint, Umv, that allows us to
>>> selectively use mrs and msr instructions when expanding rtl for them.
>>> Given that there is code that depends on "real" registers coming before
>>> "fake" ones, we introduce a new constant FPM_REGNUM that uses an
>>> existing value and renumber registers below that.
>>> This requires us to update the bitmaps that describe which registers
>>> belong to each register class.
>>
>> So I like the approach though I’ll let Richard review the implementation
>> details here.
>> My only slight concern here is compatibility with LLVM. I notice that LLVM
>> doesn’t accept the test case you’ve included as it doesn’t understand “fpmr”
>> in its inline assembly. It also doesn’t support the new constraint, of
>> course.
>> Do you know if there are plans to teach LLVM these inline assembly
>> constructs to avoid creating GCC-only sources for fp8?
>
> Hi Kyrill,
> I asked and got confirmation that a patch to add fpmr as a register
> should land soon.
Great, thanks for confirming.
Kyrill
>
> Cheers,
> Claudio
>
>> Thanks,
>> Kyrill
>>
>>
>>>
>>> gcc/ChangeLog:
>>>
>>> * config/aarch64/aarch64.cc (aarch64_hard_regno_nregs): Add
>>> support for MOVEABLE_SYSREGS class.
>>> (aarch64_hard_regno_mode_ok): Allow reads and writes to fpmr.
>>> (aarch64_regno_regclass): Support MOVEABLE_SYSREGS class.
>>> (aarch64_class_max_nregs): Likewise.
>>> * config/aarch64/aarch64.h (FIXED_REGISTERS): Add fpmr.
>>> (CALL_REALLY_USED_REGISTERS): Likewise.
>>> (REGISTER_NAMES): Likewise.
>>> (enum reg_class): Add MOVEABLE_SYSREGS class.
>>> (REG_CLASS_NAMES): Likewise.
>>> (REG_CLASS_CONTENTS): Update class bitmaps to deal with fpmr,
>>> the new MOVEABLE_SYSREGS class and renumbering of registers.
>>> * config/aarch64/aarch64.md (FPM_REGNUM): Add new register
>>> number, reusing old value.
>>> (FFR_REGNUM): Renumber.
>>> (FFRT_REGNUM): Likewise.
>>> (LOWERING_REGNUM): Likewise.
>>> (TPIDR2_BLOCK_REGNUM): Likewise.
>>> (SME_STATE_REGNUM): Likewise.
>>> (TPIDR2_SETUP_REGNUM): Likewise.
>>> (ZA_FREE_REGNUM): Likewise.
>>> (ZA_SAVED_REGNUM): Likewise.
>>> (ZA_REGNUM): Likewise.
>>> (ZT0_REGNUM): Likewise.
>>> (*mov<mode>_aarch64): Add support for moveable sysregs.
>>> (*movsi_aarch64): Likewise.
>>> (*movdi_aarch64): Likewise.
>>> * config/aarch64/constraints.md (Umv): New constraint for MOVEABLE_SYSREGS.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>> * gcc.target/aarch64/acle/fp8.c: New tests.
>>> ---
>>> gcc/config/aarch64/aarch64.cc | 8 ++
>>> gcc/config/aarch64/aarch64.h | 14 ++-
>>> gcc/config/aarch64/aarch64.md | 30 ++++--
>>> gcc/config/aarch64/constraints.md | 3 +
>>> gcc/testsuite/gcc.target/aarch64/acle/fp8.c | 101 ++++++++++++++++++++
>>> 5 files changed, 142 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>>> index e0cf382998c..9810f2c0390 100644
>>> --- a/gcc/config/aarch64/aarch64.cc
>>> +++ b/gcc/config/aarch64/aarch64.cc
>>> @@ -2018,6 +2018,7 @@ aarch64_hard_regno_nregs (unsigned regno,
>>> machine_mode mode)
>>> case PR_HI_REGS:
>>> return mode == VNx32BImode ? 2 : 1;
>>>
>>> + case MOVEABLE_SYSREGS:
>>> case FFR_REGS:
>>> case PR_AND_FFR_REGS:
>>> case FAKE_REGS:
>>> @@ -2045,6 +2046,9 @@ aarch64_hard_regno_mode_ok (unsigned regno,
>>> machine_mode mode)
>>> /* This must have the same size as _Unwind_Word. */
>>> return mode == DImode;
>>>
>>> + if (regno == FPM_REGNUM)
>>> + return mode == QImode || mode == HImode || mode == SImode || mode ==
>>> DImode;
>>> +
>>> unsigned int vec_flags = aarch64_classify_vector_mode (mode);
>>> if (vec_flags == VEC_SVE_PRED)
>>> return pr_or_ffr_regnum_p (regno);
>>> @@ -12680,6 +12684,9 @@ aarch64_regno_regclass (unsigned regno)
>>> if (PR_REGNUM_P (regno))
>>> return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
>>>
>>> + if (regno == FPM_REGNUM)
>>> + return MOVEABLE_SYSREGS;
>>> +
>>> if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
>>> return FFR_REGS;
>>>
>>> @@ -13068,6 +13075,7 @@ aarch64_class_max_nregs (reg_class_t regclass,
>>> machine_mode mode)
>>> case PR_HI_REGS:
>>> return mode == VNx32BImode ? 2 : 1;
>>>
>>> + case MOVEABLE_SYSREGS:
>>> case STACK_REG:
>>> case FFR_REGS:
>>> case PR_AND_FFR_REGS:
>>> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
>>> index 2e75c6b81e2..2dfb999bea5 100644
>>> --- a/gcc/config/aarch64/aarch64.h
>>> +++ b/gcc/config/aarch64/aarch64.h
>>> @@ -523,6 +523,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE
>>> ATTRIBUTE_UNUSED
>>> 1, 1, 1, 1, /* SFP, AP, CC, VG */ \
>>> 0, 0, 0, 0, 0, 0, 0, 0, /* P0 - P7 */ \
>>> 0, 0, 0, 0, 0, 0, 0, 0, /* P8 - P15 */ \
>>> + 1, /* FPMR */ \
>>> 1, 1, /* FFR and FFRT */ \
>>> 1, 1, 1, 1, 1, 1, 1, 1 /* Fake registers */ \
>>> }
>>> @@ -547,6 +548,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE
>>> ATTRIBUTE_UNUSED
>>> 1, 1, 1, 0, /* SFP, AP, CC, VG */ \
>>> 1, 1, 1, 1, 1, 1, 1, 1, /* P0 - P7 */ \
>>> 1, 1, 1, 1, 1, 1, 1, 1, /* P8 - P15 */ \
>>> + 1, /* FPMR */ \
>>> 1, 1, /* FFR and FFRT */ \
>>> 0, 0, 0, 0, 0, 0, 0, 0 /* Fake registers */ \
>>> }
>>> @@ -564,6 +566,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE
>>> ATTRIBUTE_UNUSED
>>> "sfp", "ap", "cc", "vg", \
>>> "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", \
>>> "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", \
>>> + "fpmr", \
>>> "ffr", "ffrt", \
>>> "lowering", "tpidr2_block", "sme_state", "tpidr2_setup", \
>>> "za_free", "za_saved", "za", "zt0" \
>>> @@ -775,6 +778,7 @@ enum reg_class
>>> PR_REGS,
>>> FFR_REGS,
>>> PR_AND_FFR_REGS,
>>> + MOVEABLE_SYSREGS,
>>> FAKE_REGS,
>>> ALL_REGS,
>>> LIM_REG_CLASSES /* Last */
>>> @@ -801,6 +805,7 @@ enum reg_class
>>> "PR_REGS", \
>>> "FFR_REGS", \
>>> "PR_AND_FFR_REGS", \
>>> + "MOVEABLE_SYSREGS", \
>>> "FAKE_REGS", \
>>> "ALL_REGS" \
>>> }
>>> @@ -822,10 +827,11 @@ enum reg_class
>>> { 0x00000000, 0x00000000, 0x00000ff0 }, /* PR_LO_REGS */ \
>>> { 0x00000000, 0x00000000, 0x000ff000 }, /* PR_HI_REGS */ \
>>> { 0x00000000, 0x00000000, 0x000ffff0 }, /* PR_REGS */ \
>>> - { 0x00000000, 0x00000000, 0x00300000 }, /* FFR_REGS */ \
>>> - { 0x00000000, 0x00000000, 0x003ffff0 }, /* PR_AND_FFR_REGS */ \
>>> - { 0x00000000, 0x00000000, 0x3fc00000 }, /* FAKE_REGS */ \
>>> - { 0xffffffff, 0xffffffff, 0x000fffff } /* ALL_REGS */ \
>>> + { 0x00000000, 0x00000000, 0x00600000 }, /* FFR_REGS */ \
>>> + { 0x00000000, 0x00000000, 0x006ffff0 }, /* PR_AND_FFR_REGS */ \
>>> + { 0x00000000, 0x00000000, 0x00100000 }, /* MOVEABLE_SYSREGS */ \
>>> + { 0x00000000, 0x00000000, 0x7f800000 }, /* FAKE_REGS */ \
>>> + { 0xffffffff, 0xffffffff, 0x001fffff } /* ALL_REGS */ \
>>> }
>>>
>>> #define REGNO_REG_CLASS(REGNO) aarch64_regno_regclass (REGNO)
>>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>>> index ed29127dafb..ed1bd2ede7d 100644
>>> --- a/gcc/config/aarch64/aarch64.md
>>> +++ b/gcc/config/aarch64/aarch64.md
>>> @@ -107,10 +107,14 @@ (define_constants
>>> (P14_REGNUM 82)
>>> (P15_REGNUM 83)
>>> (LAST_SAVED_REGNUM 83)
>>> - (FFR_REGNUM 84)
>>> +
>>> + ;; Floating Point Mode Register, used in FP8 insns.
>>> + (FPM_REGNUM 84)
>>> +
>>> + (FFR_REGNUM 85)
>>> ;; "FFR token": a fake register used for representing the scheduling
>>> ;; restrictions on FFR-related operations.
>>> - (FFRT_REGNUM 85)
>>> + (FFRT_REGNUM 86)
>>>
>>> ;; ----------------------------------------------------------------
>>> ;; Fake registers
>>> @@ -122,17 +126,17 @@ (define_constants
>>> ;; ABI-related lowering is needed. These placeholders read and
>>> ;; write this register. Instructions that depend on the lowering
>>> ;; read the register.
>>> - (LOWERING_REGNUM 86)
>>> + (LOWERING_REGNUM 87)
>>>
>>> ;; Represents the contents of the current function's TPIDR2 block,
>>> ;; in abstract form.
>>> - (TPIDR2_BLOCK_REGNUM 87)
>>> + (TPIDR2_BLOCK_REGNUM 88)
>>>
>>> ;; Holds the value that the current function wants PSTATE.ZA to be.
>>> ;; The actual value can sometimes vary, because it does not track
>>> ;; changes to PSTATE.ZA that happen during a lazy save and restore.
>>> ;; Those effects are instead tracked by ZA_SAVED_REGNUM.
>>> - (SME_STATE_REGNUM 88)
>>> + (SME_STATE_REGNUM 89)
>>>
>>> ;; Instructions write to this register if they set TPIDR2_EL0 to a
>>> ;; well-defined value. Instructions read from the register if they
>>> @@ -140,14 +144,14 @@ (define_constants
>>> ;;
>>> ;; The register does not model the architected TPIDR2_ELO, just the
>>> ;; current function's management of it.
>>> - (TPIDR2_SETUP_REGNUM 89)
>>> + (TPIDR2_SETUP_REGNUM 90)
>>>
>>> ;; Represents the property "has an incoming lazy save been committed?".
>>> - (ZA_FREE_REGNUM 90)
>>> + (ZA_FREE_REGNUM 91)
>>>
>>> ;; Represents the property "are the current function's ZA contents
>>> ;; stored in the lazy save buffer, rather than in ZA itself?".
>>> - (ZA_SAVED_REGNUM 91)
>>> + (ZA_SAVED_REGNUM 92)
>>>
>>> ;; Represents the contents of the current function's ZA state in
>>> ;; abstract form. At various times in the function, these contents
>>> @@ -155,10 +159,10 @@ (define_constants
>>> ;;
>>> ;; The contents persist even when the architected ZA is off. Private-ZA
>>> ;; functions have no effect on its contents.
>>> - (ZA_REGNUM 92)
>>> + (ZA_REGNUM 93)
>>>
>>> ;; Similarly represents the contents of the current function's ZT0
>>> state.
>>> - (ZT0_REGNUM 93)
>>> + (ZT0_REGNUM 94)
>>>
>>> (FIRST_FAKE_REGNUM LOWERING_REGNUM)
>>> (LAST_FAKE_REGNUM ZT0_REGNUM)
>>> @@ -1405,6 +1409,8 @@ (define_insn "*mov<mode>_aarch64"
>>> [w, r Z ; neon_from_gp<q>, nosimd ] fmov\t%s0, %w1
>>> [w, w ; neon_dup , simd ] dup\t%<Vetype>0, %1.<v>[0]
>>> [w, w ; neon_dup , nosimd ] fmov\t%s0, %s1
>>> + [Umv, r ; mrs , * ] msr\t%0, %x1
>>> + [r, Umv ; mrs , * ] mrs\t%x0, %1
>>> }
>>> )
>>>
>>> @@ -1467,6 +1473,8 @@ (define_insn_and_split "*movsi_aarch64"
>>> [r , w ; f_mrc , fp , 4] fmov\t%w0, %s1
>>> [w , w ; fmov , fp , 4] fmov\t%s0, %s1
>>> [w , Ds ; neon_move, simd, 4] <<
>>> aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
>>> + [Umv, r ; mrs , * , 4] msr\t%0, %x1
>>> + [r, Umv ; mrs , * , 4] mrs\t%x0, %1
>>> }
>>> "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]),
>>> SImode)
>>> && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
>>> @@ -1505,6 +1513,8 @@ (define_insn_and_split "*movdi_aarch64"
>>> [w, w ; fmov , fp , 4] fmov\t%d0, %d1
>>> [w, Dd ; neon_move, simd, 4] <<
>>> aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);
>>> [w, Dx ; neon_move, simd, 8] #
>>> + [Umv, r; mrs , * , 4] msr\t%0, %1
>>> + [r, Umv; mrs , * , 4] mrs\t%0, %1
>>> }
>>> "CONST_INT_P (operands[1])
>>> && REG_P (operands[0])
>>> diff --git a/gcc/config/aarch64/constraints.md
>>> b/gcc/config/aarch64/constraints.md
>>> index a2569cea510..0c81fb28f7e 100644
>>> --- a/gcc/config/aarch64/constraints.md
>>> +++ b/gcc/config/aarch64/constraints.md
>>> @@ -77,6 +77,9 @@ (define_register_constraint "Upl" "PR_LO_REGS"
>>> (define_register_constraint "Uph" "PR_HI_REGS"
>>> "SVE predicate registers p8 - p15.")
>>>
>>> +(define_register_constraint "Umv" "MOVEABLE_SYSREGS"
>>> + "@internal System Registers suitable for moving rather than requiring an
>>> unspec msr")
>>> +
>>> (define_constraint "c"
>>> "@internal The condition code register."
>>> (match_operand 0 "cc_register"))
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
>>> b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
>>> index 459442be155..afb44f83f60 100644
>>> --- a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
>>> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
>>> @@ -1,6 +1,7 @@
>>> /* Test the fp8 ACLE intrinsics family. */
>>> /* { dg-do compile } */
>>> /* { dg-options "-O1 -march=armv8-a" } */
>>> +/* { dg-final { check-function-bodies "**" "" "" } } */
>>>
>>> #include <arm_acle.h>
>>>
>>> @@ -17,4 +18,104 @@
>>> #error "__ARM_FEATURE_FP8 feature macro defined."
>>> #endif
>>>
>>> +/*
>>> +**test_write_fpmr_sysreg_asm_64:
>>> +** msr fpmr, x0
>>> +** ret
>>> +*/
>>> +void
>>> +test_write_fpmr_sysreg_asm_64 (uint64_t val)
>>> +{
>>> + register uint64_t fpmr asm ("fpmr") = val;
>>> + asm volatile ("" ::"Umv"(fpmr));
>>> +}
>>> +
>>> +/*
>>> +**test_write_fpmr_sysreg_asm_32:
>>> +** msr fpmr, x0
>>> +** ret
>>> +*/
>>> +void
>>> +test_write_fpmr_sysreg_asm_32 (uint32_t val)
>>> +{
>>> + register uint32_t fpmr asm ("fpmr") = val;
>>> + asm volatile ("" ::"Umv"(fpmr));
>>> +}
>>> +
>>> +/*
>>> +**test_write_fpmr_sysreg_asm_16:
>>> +** msr fpmr, x0
>>> +** ret
>>> +*/
>>> +void
>>> +test_write_fpmr_sysreg_asm_16 (uint16_t val)
>>> +{
>>> + register uint16_t fpmr asm ("fpmr") = val;
>>> + asm volatile ("" ::"Umv"(fpmr));
>>> +}
>>> +
>>> +/*
>>> +**test_write_fpmr_sysreg_asm_8:
>>> +** msr fpmr, x0
>>> +** ret
>>> +*/
>>> +void
>>> +test_write_fpmr_sysreg_asm_8 (uint8_t val)
>>> +{
>>> + register uint8_t fpmr asm ("fpmr") = val;
>>> + asm volatile ("" ::"Umv"(fpmr));
>>> +}
>>> +
>>> +/*
>>> +**test_read_fpmr_sysreg_asm_64:
>>> +** mrs x0, fpmr
>>> +** ret
>>> +*/
>>> +uint64_t
>>> +test_read_fpmr_sysreg_asm_64 ()
>>> +{
>>> + register uint64_t fpmr asm ("fpmr");
>>> + asm volatile ("" : "=Umv"(fpmr) :);
>>> + return fpmr;
>>> +}
>>> +
>>> +/*
>>> +**test_read_fpmr_sysreg_asm_32:
>>> +** mrs x0, fpmr
>>> +** ret
>>> +*/
>>> +uint32_t
>>> +test_read_fpmr_sysreg_asm_32 ()
>>> +{
>>> + register uint32_t fpmr asm ("fpmr");
>>> + asm volatile ("" : "=Umv"(fpmr) :);
>>> + return fpmr;
>>> +}
>>> +
>>> +/*
>>> +**test_read_fpmr_sysreg_asm_16:
>>> +** mrs x0, fpmr
>>> +** ret
>>> +*/
>>> +uint16_t
>>> +test_read_fpmr_sysreg_asm_16 ()
>>> +{
>>> + register uint16_t fpmr asm ("fpmr");
>>> + asm volatile ("" : "=Umv"(fpmr) :);
>>> + return fpmr;
>>> +}
>>> +
>>> +/*
>>> +**test_read_fpmr_sysreg_asm_8:
>>> +** mrs x0, fpmr
>>> +** ret
>>> +*/
>>> +uint8_t
>>> +test_read_fpmr_sysreg_asm_8 ()
>>> +{
>>> + register uint8_t fpmr asm ("fpmr");
>>> + asm volatile ("" : "=Umv"(fpmr) :);
>>> + return fpmr;
>>> +}
>>> +
>>> #pragma GCC pop_options
>>