Karl Meakin <karl.mea...@arm.com> writes:
> Commit the test file `cmpbr.c` before rules for generating the new
> instructions are added, so that the changes in codegen are more obvious
> in the next commit.
>
> gcc/testsuite/ChangeLog:
>
> 	* lib/target-supports.exp: Add `cmpbr` to the list of extensions.
> 	* gcc.target/aarch64/cmpbr.c: New test.
> ---
>  gcc/testsuite/gcc.target/aarch64/cmpbr.c | 1877 ++++++++++++++++++++++
>  gcc/testsuite/lib/target-supports.exp    |   14 +-
>  2 files changed, 1885 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/cmpbr.c
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpbr.c b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
> new file mode 100644
> index 00000000000..9ca376a8f33
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
> @@ -0,0 +1,1877 @@
> +// Test that the instructions added by FEAT_CMPBR are emitted
> +// { dg-do compile }
> +// { dg-do-if assemble { target aarch64_asm_cmpbr_ok } }
> +// { dg-options "-march=armv9.5-a+cmpbr -O2" }
> +// { dg-final { check-function-bodies "**" "*/" "" { target *-*-* } {\.L[0-9]+} } }
> +
> +#include <stdint.h>
> +
> +typedef uint8_t u8;
> +typedef int8_t i8;
> +
> +typedef uint16_t u16;
> +typedef int16_t i16;
> +
> +typedef uint32_t u32;
> +typedef int32_t i32;
> +
> +typedef uint64_t u64;
> +typedef int64_t i64;
> +
> +int taken();
> +int not_taken();
> +
> +#define COMPARE(ty, name, op, rhs)                                           \
> +  int ty##_x0_##name##_##rhs(ty x0, ty x1) {                                  \
> +    return (x0 op rhs) ? taken() : not_taken();                               \
> +  }
> +
> +#define COMPARE_ALL(unsigned_ty, signed_ty, rhs)                              \
> +  COMPARE(unsigned_ty, eq, ==, rhs);                                          \
> +  COMPARE(unsigned_ty, ne, !=, rhs);                                          \
> +                                                                              \
> +  COMPARE(unsigned_ty, ult, <, rhs);                                          \
> +  COMPARE(unsigned_ty, ule, <=, rhs);                                         \
> +  COMPARE(unsigned_ty, ugt, >, rhs);                                          \
> +  COMPARE(unsigned_ty, uge, >=, rhs);                                         \
> +                                                                              \
> +  COMPARE(signed_ty, slt, <, rhs);                                            \
> +  COMPARE(signed_ty, sle, <=, rhs);                                           \
> +  COMPARE(signed_ty, sgt, >, rhs);                                            \
> +  COMPARE(signed_ty, sge, >=, rhs);
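(For reference, each COMPARE invocation expands to one small wrapper
function, so COMPARE(u32, eq, ==, 42) gives roughly:

  int u32_x0_eq_42(u32 x0, u32 x1) {
    return (x0 == 42) ? taken() : not_taken();
  }

and each COMPARE_ALL therefore defines ten such functions, one per
condition.)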
> +
> +// ==== CBB<cc> (register) ====
> +COMPARE_ALL(u8, i8, x1);
> +
> +// ==== CBH<cc> (register) ====
> +COMPARE_ALL(u16, i16, x1);
> +
> +// ==== CB<cc> (register) ====
> +COMPARE_ALL(u32, i32, x1);
> +COMPARE_ALL(u64, i64, x1);
> +
> +// ==== CB<cc> (immediate) ====
> +COMPARE_ALL(u32, i32, 42);
> +COMPARE_ALL(u64, i64, 42);
> +
> +// ==== Special cases ====
> +// Comparisons against the immediate 0 can be done for all types,
> +// because we can use the wzr/xzr register as one of the operands.
> +// However, we should prefer to use CBZ/CBNZ or TBZ/TBNZ when possible,
> +// because they have a larger range.
> +COMPARE_ALL(u8, i8, 0);
> +COMPARE_ALL(u16, i16, 0);
> +COMPARE_ALL(u32, i32, 0);
> +COMPARE_ALL(u64, i64, 0);
> +
> +// CBB and CBH cannot have immediate operands.
> +// Instead we have to do a MOV+CB.
> +COMPARE_ALL(u8, i8, 42);
> +COMPARE_ALL(u16, i16, 42);
> +
> +// 64 is out of the range for immediate operands (0 to 63).
> +// * For 8/16-bit types, use a MOV+CB as above.
> +// * For 32/64-bit types, use a CMP+B<cc> instead,
> +//   because B<cc> has a longer range than CB<cc>.
> +COMPARE_ALL(u8, i8, 64);
> +COMPARE_ALL(u16, i16, 64);
> +COMPARE_ALL(u32, i32, 64);
> +COMPARE_ALL(u64, i64, 64);
> +
> +// 4098 is out of the range for CMP (0 to 4095, optionally shifted left
> +// by 12 bits), but it can be materialized in a single MOV.
> +COMPARE_ALL(u16, i16, 4098);
> +COMPARE_ALL(u32, i32, 4098);
> +COMPARE_ALL(u64, i64, 4098);
> +
> +// If the branch destination is out of range (1KiB), we have to generate an
> +// extra B instruction (which can handle larger displacements) and branch
> +// around it.
> +int far_branch(i32 x, i32 y) {
> +  volatile int z = 0;
> +  if (x == y) {

It might be worth making this:

  if (__builtin_expect (x == y, 0)) {

as a way of ensuring that nothing tries to "optimise" the code by
inverting the branch (and avoiding the far branch).
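That is, something like the following (just a sketch; the STORE_*
macros are as in the patch):

  int far_branch(i32 x, i32 y) {
    volatile int z = 0;
    if (__builtin_expect (x == y, 0)) {
      /* 256 volatile stores, to push the branch target out of the
         1KiB range of CB<cc>.  */
      STORE_256();
    }
    return taken();
  }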
> +    // clang-format off
> +    #define STORE_2() z = 0; z = 0;
> +    #define STORE_4() STORE_2(); STORE_2();
> +    #define STORE_8() STORE_4(); STORE_4();
> +    #define STORE_16() STORE_8(); STORE_8();
> +    #define STORE_32() STORE_16(); STORE_16();
> +    #define STORE_64() STORE_32(); STORE_32();
> +    #define STORE_128() STORE_64(); STORE_64();
> +    #define STORE_256() STORE_128(); STORE_128();
> +    // clang-format on
> +
> +    STORE_256();
> +  }
> +  return taken();
> +}
> +
> +/*
> +** u8_x0_eq_x1:
> +**	and	w1, w1, 255
> +**	cmp	w1, w0, uxtb

Sorry for not noticing earlier, but this should be:

	and	(w[0-9]+), w1, 255
	cmp	\1, w0, uxtb

since there is no requirement for w1 to be used for the temporary
result.  Similarly for the others.  (I realise many of them change
with patch 8.)

> +/*
> +** i8_x0_slt_x1:
> +**	sxtb	w1, w1
> +**	cmp	w1, w0, sxtb

Similarly:

	sxtb	(w[0-9]+), w1
	cmp	\1, w0, sxtb

(and for sxth)

> +** u16_x0_eq_4098:
> +**	mov	w1, 4098
> +**	cmp	w1, w0, uxth

Similarly here:

	mov	(w[0-9]+), 4098
	cmp	\1, w0, uxth

Thanks,
Richard