[PATCH v4 0/2] Add support for AdvSIMD faminmax
From: Saurabh Jha This patch series is a respin of the previous patch here: https://gcc.gnu.org/pipermail/gcc-patches/2024-August/659749.html. This new version is rebased with latest master after the merging of this patch series: https://gcc.gnu.org/pipermail/gcc-patches/2024-August/660532.html. We no longer need to refactor report_missing_extension and report_missing_extension_p as this was already refactored in https://gcc.gnu.org/pipermail/gcc-patches/2024-August/660532.html. Therefore, the refactoring parts are removed from this patch series. Nothing else is changed from previous version. Regression tested for aarch64-none-linux-gnu and found no regressions. Ok for master? I don't have commit access so can someone please commit on my behalf? Saurabh Jha (2): aarch64: Add AdvSIMD faminmax intrinsics aarch64: Add codegen support for AdvSIMD faminmax gcc/config/aarch64/aarch64-builtins.cc| 126 ++ .../aarch64/aarch64-option-extensions.def | 2 + gcc/config/aarch64/aarch64-simd.md| 21 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 12 + gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 + .../aarch64/simd/faminmax-builtins.c | 115 ++ .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 11 files changed, 712 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c -- 2.43.2
[PATCH v4 1/2] aarch64: Add AdvSIMD faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of two vectors element-wise. This patch introduces AdvSIMD faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * vamax_f16 * vamaxq_f16 * vamax_f32 * vamaxq_f32 * vamaxq_f64 * vamin_f16 * vaminq_f16 * vamin_f32 * vaminq_f32 * vaminq_f64 gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (enum aarch64_builtins): New enum values for faminmax builtins. (aarch64_init_faminmax_builtins): New function to declare new builtins. (handle_arm_neon_h): Modify to call aarch64_init_faminmax_builtins. (aarch64_general_check_builtin_call): Modify to check whether the +faminmax flag is being used and to print an error message if it is not. (aarch64_expand_builtin_faminmax): New function to emit instructions of this extension. (aarch64_general_expand_builtin): Modify to call aarch64_expand_builtin_faminmax. * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Introduce new flag for this extension. * config/aarch64/aarch64-simd.md (aarch64_<faminmax_uns_op><mode>): Instruction pattern for faminmax intrinsics. * config/aarch64/aarch64.h (TARGET_FAMINMAX): Introduce new flag for this extension. * config/aarch64/iterators.md: Introduce new iterators for faminmax intrinsics. * config/arm/types.md: Introduce neon_fp_aminmax attributes. * doc/invoke.texi: Document extension in AArch64 Options. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-builtins-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-builtins.c: New test. 
--- gcc/config/aarch64/aarch64-builtins.cc| 126 ++ .../aarch64/aarch64-option-extensions.def | 2 + gcc/config/aarch64/aarch64-simd.md| 11 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 9 ++ gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 ++ .../aarch64/simd/faminmax-builtins.c | 115 9 files changed, 285 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index eb878b933fe..95ec8b6 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -829,6 +829,17 @@ enum aarch64_builtins AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL, + /* FAMINMAX builtins. */ + AARCH64_FAMINMAX_BUILTIN_FAMAX4H, + AARCH64_FAMINMAX_BUILTIN_FAMAX8H, + AARCH64_FAMINMAX_BUILTIN_FAMAX2S, + AARCH64_FAMINMAX_BUILTIN_FAMAX4S, + AARCH64_FAMINMAX_BUILTIN_FAMAX2D, + AARCH64_FAMINMAX_BUILTIN_FAMIN4H, + AARCH64_FAMINMAX_BUILTIN_FAMIN8H, + AARCH64_FAMINMAX_BUILTIN_FAMIN2S, + AARCH64_FAMINMAX_BUILTIN_FAMIN4S, + AARCH64_FAMINMAX_BUILTIN_FAMIN2D, /* System register builtins. */ AARCH64_RSR, AARCH64_RSRP, @@ -1547,6 +1558,66 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) } } +/* Initialize the absolute maximum/minimum (FAMINMAX) builtins. */ + +typedef struct +{ + const char *name; + unsigned int code; + tree eltype; + machine_mode mode; +} faminmax_builtins_data; + +static void +aarch64_init_faminmax_builtins () +{ + faminmax_builtins_data data[] = { +/* Absolute maximum. 
*/ +{"vamax_f16", AARCH64_FAMINMAX_BUILTIN_FAMAX4H, + aarch64_simd_types[Float16x4_t].eltype, + aarch64_simd_types[Float16x4_t].mode}, +{"vamaxq_f16", AARCH64_FAMINMAX_BUILTIN_FAMAX8H, + aarch64_simd_types[Float16x8_t].eltype, + aarch64_simd_types[Float16x8_t].mode}, +{"vamax_f32", AARCH64_FAMINMAX_BUILTIN_FAMAX2S, + aarch64_simd_types[Float32x2_t].eltype, + aarch64_simd_types[Float32x2_t].mode}, +{"vamaxq_f32", AARCH64_FAMINMAX_BUILTIN_FAMAX4S, + aarch64_simd_types[Float32x4_t].eltype, + aarch64_simd_types[Float32x4_t].mode}, +{"vamaxq_f64", AARCH64_FAMINMAX_BUILTIN_FAMAX2D, + aarch64_simd_types[Float64x2_t].eltype, + aarch64_simd_types[Float64x2_t].mode}, +/* Absolute minimum. */ +{"vamin_f16", AARCH64_FAMINMAX_BUILTIN_FAMIN4H, + aarch64_simd_types[Float16x4_t].eltype, + aarch64_simd_types[Float16x4_t].mode}, +{"vaminq_f16", AARCH64_FAMINMAX_BUILTIN_FAMIN8H, + aarch64_simd_types[Float16x8_t].eltype, + aarch64_simd_types[Float16x8_t].mode}, +{"vamin_f32", AARCH64_FAMINMAX_BUILTIN_FAMIN2S, + aarch64_simd_types[Float32x2_t].eltype, + aarch64_simd_types[Float32x2_t].mode}, +{"vaminq_f32", AARCH64_FAMINMAX_BUILTIN
[PATCH v4 2/2] aarch64: Add codegen support for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation support for famax and famin in terms of existing RTL operators. famax/famin is equivalent to first taking abs of the operands and then taking smax/smin on the results of abs. famax/famin (a, b) = smax/smin (abs (a), abs (b)) This fusion of operators is only possible when -march=armv9-a+faminmax flags are passed. We also need to pass -ffast-math flag; if we don't, then a statement like c[i] = __builtin_fmaxf16 (a[i], b[i]); is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin). This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (*aarch64_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-codegen.c: New test. --- gcc/config/aarch64/aarch64-simd.md| 10 + gcc/config/aarch64/iterators.md | 3 + .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 4 files changed, 427 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 488e27c36a9..17c99775b65 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -9921,3 +9921,13 @@ "\t%0., %1., %2." 
[(set_attr "type" "neon_fp_aminmax")] ) + +(define_insn "*aarch64_faminmax_fused" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"] + "TARGET_FAMINMAX" + "\t%0., %1., %2." + [(set_attr "type" "neon_fp_aminmax")] +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 17ac5e073aa..c2fcd18306e 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -4472,3 +4472,6 @@ (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c new file mode 100644 index 000..d77f5a5d19f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c @@ -0,0 +1,217 @@ +/* { dg-do assemble} */ +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmaxnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmaxnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** fabs v1.2s, v1.2s +** fabs 
v0.2s, v0.2s +** fmaxnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmaxnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmaxnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vamaxq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf64 (a[i]); +b[i] = __builtin_fabs
[PATCH v5 0/2] aarch64: Add support for AdvSIMD faminmax
From: Saurabh Jha This patch series is a respin of the previous patch here: https://gcc.gnu.org/pipermail/gcc-patches/2024-August/660917.html. The new version addresses review comments on the previous patch series. It also introduces a new way of defining AArch64 AdvSIMD intrinsics. All of the new changes are in the first patch of the series. The second patch is unchanged. Saurabh Jha (2): aarch64: Add AdvSIMD faminmax intrinsics aarch64: Add codegen support for AdvSIMD faminmax gcc/config/aarch64/aarch64-builtins.cc| 126 ++ .../aarch64/aarch64-option-extensions.def | 2 + gcc/config/aarch64/aarch64-simd.md| 21 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 12 + gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 + .../aarch64/simd/faminmax-builtins.c | 115 ++ .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 11 files changed, 712 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c -- 2.43.2
[PATCH v5 2/2] aarch64: Add codegen support for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation support for famax and famin in terms of existing RTL operators. famax/famin is equivalent to first taking abs of the operands and then taking smax/smin on the results of abs. famax/famin (a, b) = smax/smin (abs (a), abs (b)) This fusion of operators is only possible when -march=armv9-a+faminmax flags are passed. We also need to pass -ffast-math flag; if we don't, then a statement like c[i] = __builtin_fmaxf16 (a[i], b[i]); is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin). This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (*aarch64_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-codegen.c: New test. --- gcc/config/aarch64/aarch64-simd.md| 10 + gcc/config/aarch64/iterators.md | 3 + .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 4 files changed, 427 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7542c81ed91..8973cade488 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -9921,3 +9921,13 @@ "\t%0., %1., %2." 
[(set_attr "type" "neon_fp_aminmax")] ) + +(define_insn "*aarch64_faminmax_fused" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"] + "TARGET_FAMINMAX" + "\t%0., %1., %2." + [(set_attr "type" "neon_fp_aminmax")] +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 17ac5e073aa..c2fcd18306e 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -4472,3 +4472,6 @@ (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c new file mode 100644 index 000..d77f5a5d19f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c @@ -0,0 +1,217 @@ +/* { dg-do assemble} */ +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmaxnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmaxnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** fabs v1.2s, v1.2s +** fabs 
v0.2s, v0.2s +** fmaxnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmaxnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmaxnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vamaxq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf64 (a[i]); +b[i] = __builtin_fabs
[PATCH v5 1/2] aarch64: Add AdvSIMD faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of two vectors element-wise. This patch introduces AdvSIMD faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * vamax_f16 * vamaxq_f16 * vamax_f32 * vamaxq_f32 * vamaxq_f64 * vamin_f16 * vaminq_f16 * vamin_f32 * vaminq_f32 * vaminq_f64 We are defining a new way to add AArch64 AdvSIMD intrinsics by listing all the intrinsics in a .def file and then using that .def file to initialise various data structures. This would lead to more concise code and easier addition of new AdvSIMD intrinsics in the future. The faminmax intrinsics are defined using the new approach. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Macro to parse the contents of aarch64-simd-pragma-builtins.def. (enum aarch64_builtins): New enum values for faminmax builtins via aarch64-simd-pragma-builtins.def. (aarch64_init_pragma_builtins): New function to define pragma builtins. (handle_arm_neon_h): Modify to call aarch64_init_pragma_builtins. (aarch64_general_check_builtin_call): Modify to check whether required flag is being used for pragma builtins. (aarch64_expand_pragma_builtin): New function to emit instructions of pragma builtins. (aarch64_general_expand_builtin): Modify to call aarch64_expand_pragma_builtin. * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Introduce new flag for this extension. * config/aarch64/aarch64-simd.md (@aarch64_<faminmax_uns_op><mode>): Instruction pattern for faminmax intrinsics. * config/aarch64/aarch64.h (TARGET_FAMINMAX): Introduce new flag for this extension. * config/aarch64/iterators.md: New iterators and unspecs. * config/arm/types.md: Introduce neon_fp_aminmax attributes. * doc/invoke.texi: Document extension in AArch64 Options. * config/aarch64/aarch64-simd-pragma-builtins.def: New file to list pragma builtins. 
gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-builtins-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-builtins.c: New test. --- gcc/config/aarch64/aarch64-builtins.cc| 73 +++ .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 31 + gcc/config/aarch64/aarch64-simd.md| 11 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 9 ++ gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 ++ .../aarch64/simd/faminmax-builtins.c | 115 ++ 10 files changed, 263 insertions(+) create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index eb878b933fe..e6b88a194d3 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -757,6 +757,10 @@ typedef struct #define VAR1(T, N, MAP, FLAG, A) \ AARCH64_SIMD_BUILTIN_##T##_##N##A, +#undef ENTRY +#define ENTRY(N, M, U, F) \ + AARCH64_##N, + enum aarch64_builtins { AARCH64_BUILTIN_MIN, @@ -829,6 +833,8 @@ enum aarch64_builtins AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL, + /* Pragma builtins. */ +#include "aarch64-simd-pragma-builtins.def" /* System register builtins. */ AARCH64_RSR, AARCH64_RSRP, @@ -947,6 +953,7 @@ const char *aarch64_scalar_builtin_types[] = { extern GTY(()) aarch64_simd_type_info aarch64_simd_types[]; +#undef ENTRY #define ENTRY(E, M, Q, G) \ {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q}, struct aarch64_simd_type_info aarch64_simd_types [] = { @@ -1547,6 +1554,39 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) } } +/* Initialize pragma builtins. 
*/ + +typedef struct +{ + const char *name; + machine_mode mode; + int unspec; + aarch64_feature_flags required_extensions; +} pragma_builtins_data; + +#undef ENTRY +#define ENTRY(N, M, U, F) \ + {#N, E_##M##mode, U, F}, + +pragma_builtins_data pragma_builtins[] = { +#include "aarch64-simd-pragma-builtins.def" +}; + +static void +aarch64_init_pragma_builtins () +{ + for (size_t i = 0; i < ARRAY_SIZE (pragma_builtins); ++i) +{ + pragma_builtins_data data = pragma_builtins[i]; + tree type = aarch64_simd_builtin_type (data.mode, qualifier_none); + tree fntype = build_function_type_list (type, type, type, NULL_TREE); + unsigned int code = AARCH64_vamax_f
[PATCH v6 0/2] aarch64: Add support for AdvSIMD faminmax
From: Saurabh Jha This patch series is a respin of https://gcc.gnu.org/pipermail/gcc-patches/2024-August/661672.html. The new version addresses the comment about using AARCH64_PRAGMA_BUILTIN_START and AARCH64_PRAGMA_BUILTIN_END in aarch64_builtins enum. Apart from the function expand_pragma_builtin, where we need to use faminmax specific enum values to distinguish between different unspec values, we are using the new enum values everywhere else. Nothing else is changed in the first patch and nothing is changed in the second patch. Saurabh Jha (2): aarch64: Add AdvSIMD faminmax intrinsics aarch64: Add codegen support for AdvSIMD faminmax gcc/config/aarch64/aarch64-builtins.cc| 79 +++ .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 31 +++ gcc/config/aarch64/aarch64-simd.md| 21 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 12 + gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 + .../aarch64/simd/faminmax-builtins.c | 115 ++ .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 12 files changed, 696 insertions(+) create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c -- 2.43.2
[PATCH v6 1/2] aarch64: Add AdvSIMD faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of two vectors element-wise. This patch introduces AdvSIMD faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * vamax_f16 * vamaxq_f16 * vamax_f32 * vamaxq_f32 * vamaxq_f64 * vamin_f16 * vaminq_f16 * vamin_f32 * vaminq_f32 * vaminq_f64 We are defining a new way to add AArch64 AdvSIMD intrinsics by listing all the intrinsics in a .def file and then using that .def file to initialise various data structures. This would lead to more concise code and easier addition of new AdvSIMD intrinsics in the future. The faminmax intrinsics are defined using the new approach. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Macro to parse the contents of aarch64-simd-pragma-builtins.def. (enum aarch64_builtins): New enum values for faminmax builtins via aarch64-simd-pragma-builtins.def. (aarch64_init_pragma_builtins): New function to define pragma builtins. (handle_arm_neon_h): Modify to call aarch64_init_pragma_builtins. (aarch64_general_check_builtin_call): Modify to check whether required flag is being used for pragma builtins. (aarch64_expand_pragma_builtin): New function to emit instructions of pragma builtins. (aarch64_general_expand_builtin): Modify to call aarch64_expand_pragma_builtin. * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Introduce new flag for this extension. * config/aarch64/aarch64-simd.md (@aarch64_<faminmax_uns_op><mode>): Instruction pattern for faminmax intrinsics. * config/aarch64/aarch64.h (TARGET_FAMINMAX): Introduce new flag for this extension. * config/aarch64/iterators.md: New iterators and unspecs. * config/arm/types.md: Introduce neon_fp_aminmax attributes. * doc/invoke.texi: Document extension in AArch64 Options. * config/aarch64/aarch64-simd-pragma-builtins.def: New file to list pragma builtins. 
gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-builtins-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-builtins.c: New test. --- gcc/config/aarch64/aarch64-builtins.cc| 79 .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 31 + gcc/config/aarch64/aarch64-simd.md| 11 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 9 ++ gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 ++ .../aarch64/simd/faminmax-builtins.c | 115 ++ 10 files changed, 269 insertions(+) create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index eb878b933fe..61df394b881 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -757,6 +757,10 @@ typedef struct #define VAR1(T, N, MAP, FLAG, A) \ AARCH64_SIMD_BUILTIN_##T##_##N##A, +#undef ENTRY +#define ENTRY(N, M, U, F) \ + AARCH64_##N, + enum aarch64_builtins { AARCH64_BUILTIN_MIN, @@ -829,6 +833,10 @@ enum aarch64_builtins AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL, + /* Pragma builtins. */ + AARCH64_PRAGMA_BUILTIN_START, +#include "aarch64-simd-pragma-builtins.def" + AARCH64_PRAGMA_BUILTIN_END, /* System register builtins. */ AARCH64_RSR, AARCH64_RSRP, @@ -947,6 +955,7 @@ const char *aarch64_scalar_builtin_types[] = { extern GTY(()) aarch64_simd_type_info aarch64_simd_types[]; +#undef ENTRY #define ENTRY(E, M, Q, G) \ {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q}, struct aarch64_simd_type_info aarch64_simd_types [] = { @@ -1547,6 +1556,39 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) } } +/* Initialize pragma builtins. 
*/ + +typedef struct +{ + const char *name; + machine_mode mode; + int unspec; + aarch64_feature_flags required_extensions; +} pragma_builtins_data; + +#undef ENTRY +#define ENTRY(N, M, U, F) \ + {#N, E_##M##mode, U, F}, + +pragma_builtins_data pragma_builtins[] = { +#include "aarch64-simd-pragma-builtins.def" +}; + +static void +aarch64_init_pragma_builtins () +{ + for (size_t i = 0; i < ARRAY_SIZE (pragma_builtins); ++i) +{ + pragma_builtins_data data = pragma_builtins[i]; + tree type = aarch64_simd_builtin_type (data.mode, qualifier_none); + tree fntype = build_function_type_list (type,
[PATCH v6 2/2] aarch64: Add codegen support for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation support for famax and famin in terms of existing RTL operators. famax/famin is equivalent to first taking abs of the operands and then taking smax/smin on the results of abs. famax/famin (a, b) = smax/smin (abs (a), abs (b)) This fusion of operators is only possible when -march=armv9-a+faminmax flags are passed. We also need to pass -ffast-math flag; if we don't, then a statement like c[i] = __builtin_fmaxf16 (a[i], b[i]); is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin). This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (*aarch64_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-codegen.c: New test. --- gcc/config/aarch64/aarch64-simd.md| 10 + gcc/config/aarch64/iterators.md | 3 + .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 4 files changed, 427 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7542c81ed91..8973cade488 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -9921,3 +9921,13 @@ "\t%0., %1., %2." 
[(set_attr "type" "neon_fp_aminmax")] ) + +(define_insn "*aarch64_faminmax_fused" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"] + "TARGET_FAMINMAX" + "\t%0., %1., %2." + [(set_attr "type" "neon_fp_aminmax")] +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 17ac5e073aa..c2fcd18306e 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -4472,3 +4472,6 @@ (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c new file mode 100644 index 000..d77f5a5d19f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c @@ -0,0 +1,217 @@ +/* { dg-do assemble} */ +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmaxnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmaxnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** fabs v1.2s, v1.2s +** fabs 
v0.2s, v0.2s +** fmaxnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmaxnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmaxnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vamaxq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf64 (a[i]); +b[i] = __builtin_fabs
[PATCH] aarch64: Add ACLE intrinsics for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional in Armv9.2 and mandatory in Armv9.5. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch introduces intrinsics for AdvSIMD faminmax extension in the form of the following builtin-functions: * vamax_f16 * vamaxq_f16 * vamax_f32 * vamaxq_f32 * vamaxq_f64 * vamin_f16 * vaminq_f16 * vamin_f32 * vaminq_f32 * vaminq_f64 gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (enum aarch64_builtins): New enum values for faminmax builtins. (aarch64_init_faminmax_builtins): New function to declare new builtins. (handle_arm_neon_h): Modified to call aarch64_init_faminmax_builtins. (aarch64_general_check_builtin_call): Modified to check whether +faminmax flag is being used and printing error message if not used. (aarch64_expand_builtin_faminmax): New function to emit instructions of this extension. (aarch64_general_expand_builtin): Modified to call aarch64_expand_builtin_faminmax. * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Introduce new flag for this extension. * config/aarch64/aarch64-simd.md (aarch64_): Introduce instruction pattern for this extension. * config/aarch64/aarch64.h (TARGET_FAMINMAX): Introduce new flag for this extension. * config/aarch64/iterators.md: Introduce new iterators for this extension. * config/arm/types.md: Introduce neon_fp_aminmax attributes. * doc/invoke.texi: Document extension in AArch64 Options. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax.c: New tests for this extension. --- Hi, Regression tested for aarch64-none-linux-gnu and found no regressions. This patch is dependent on the patch series "Extend aarch64_feature_flags to 128 bits" which is under review. This patch should be committed only after that patch series is committed. I am raising this patch now for early feedback. Ok for master? I don't have commit access so can someone please commit on my behalf?
Regards, Saurabh --- gcc/config/aarch64/aarch64-builtins.cc| 150 -- .../aarch64/aarch64-option-extensions.def | 2 + gcc/config/aarch64/aarch64-simd.md| 11 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 10 ++ gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../gcc.target/aarch64/simd/faminmax.c| 40 + 8 files changed, 216 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 30669f8aa18..b3d8cf22eeb 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -829,6 +829,17 @@ enum aarch64_builtins AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL, + /* FAMINMAX builtins. */ + AARCH64_FAMINMAX_BUILTIN_FAMAX4H, + AARCH64_FAMINMAX_BUILTIN_FAMAX8H, + AARCH64_FAMINMAX_BUILTIN_FAMAX2S, + AARCH64_FAMINMAX_BUILTIN_FAMAX4S, + AARCH64_FAMINMAX_BUILTIN_FAMAX2D, + AARCH64_FAMINMAX_BUILTIN_FAMIN4H, + AARCH64_FAMINMAX_BUILTIN_FAMIN8H, + AARCH64_FAMINMAX_BUILTIN_FAMIN2S, + AARCH64_FAMINMAX_BUILTIN_FAMIN4S, + AARCH64_FAMINMAX_BUILTIN_FAMIN2D, /* System register builtins. */ AARCH64_RSR, AARCH64_RSRP, @@ -1547,6 +1558,66 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) } } +/* Initialize the absolute maximum/minimum (FAMINMAX) builtins. */ + +typedef struct +{ + const char *name; + unsigned int code; + tree eltype; + machine_mode mode; +} faminmax_builtins_data; + +static void +aarch64_init_faminmax_builtins () +{ + faminmax_builtins_data data[] = { +/* Absolute maximum. 
*/ +{"vamax_f16", AARCH64_FAMINMAX_BUILTIN_FAMAX4H, + aarch64_simd_types[Float16x4_t].eltype, + aarch64_simd_types[Float16x4_t].mode}, +{"vamaxq_f16", AARCH64_FAMINMAX_BUILTIN_FAMAX8H, + aarch64_simd_types[Float16x8_t].eltype, + aarch64_simd_types[Float16x8_t].mode}, +{"vamax_f32", AARCH64_FAMINMAX_BUILTIN_FAMAX2S, + aarch64_simd_types[Float32x2_t].eltype, + aarch64_simd_types[Float32x2_t].mode}, +{"vamaxq_f32", AARCH64_FAMINMAX_BUILTIN_FAMAX4S, + aarch64_simd_types[Float32x4_t].eltype, + aarch64_simd_types[Float32x4_t].mode}, +{"vamaxq_f64", AARCH64_FAMINMAX_BUILTIN_FAMAX2D, + aarch64_simd_types[Float64x2_t].eltype, + aarch64_simd_types[Float64x2_t].mode}, +/* Absolute maximum. */ +{"vamin_f16", AARCH64_FAMINMAX_BUILTIN_FAMIN4H, + aarch64_simd_types[Float16x4_t].eltype, + aarch64_simd_types[Float16x4_t].mode}, +{"vaminq_f16", AARCH64_FAMINMAX_BUILTIN_FAMIN8H, + aarch64_simd_types[Float16x8_t].eltype, + aarch64_simd_types[Float16x8_t].mode}, +{"vami
[PATCH] aarch64: Add support for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch does three things: 1. Introduces AdvSIMD faminmax intrinsics. 2. Adds code generation support for famax and famin in terms of the existing operators. 3. Moves report_missing_extension and reported_missing_extension_p to make them more usable. The intrinsics of this extension are implemented as the following builtin functions: * vamax_f16 * vamaxq_f16 * vamax_f32 * vamaxq_f32 * vamaxq_f64 * vamin_f16 * vaminq_f16 * vamin_f32 * vaminq_f32 * vaminq_f64 For code generation, famax/famin is equivalent to first taking fabs of the operands and then taking fmax/fmin of the results of fabs. famax/famin (a, b) = fmax/fmin (fabs (a), fabs (b)) This is correct because the NaN/Inf handling of famax/famin and fmax/fmin is the same. We cannot use fmaxnm/fminnm here as NaN/Inf are handled differently in them. We moved the definition of `report_missing_extension` from gcc/config/aarch64/aarch64-sve-builtins.cc to gcc/config/aarch64/aarch64-builtins.cc and its declaration to gcc/config/aarch64/aarch64-builtins.h. We also moved the declaration of `reported_missing_extension_p` from gcc/config/aarch64/aarch64-sve-builtins.cc to gcc/config/aarch64/aarch64-builtins.cc, closer to the definition of `report_missing_extension`. In the existing code structure, this leads to `report_missing_extension` being usable from both normal builtins and sve builtins. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (enum aarch64_builtins): New enum values for faminmax builtins. (aarch64_init_faminmax_builtins): New function to declare new builtins. (handle_arm_neon_h): Modify to call aarch64_init_faminmax_builtins. (aarch64_general_check_builtin_call): Modify to check whether +faminmax flag is being used and printing error message if not being used.
(aarch64_expand_builtin_faminmax): New function to emit instructions of this extension. (aarch64_general_expand_builtin): Modify to call aarch64_expand_builtin_faminmax. (report_missing_extension): Move from config/aarch64/aarch64-sve-builtins.cc. * config/aarch64/aarch64-builtins.h (report_missing_extension): Declaration for this function so that it can be used wherever this header is included. (reported_missing_extension_p): Move from config/aarch64/aarch64-sve-builtins.cc * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Introduce new flag for this extension. * config/aarch64/aarch64-simd.md (aarch64_): Introduce instruction pattern for this extension. * config/aarch64/aarch64-sve-builtins.cc (reported_missing_extension_p): Move to config/aarch64/aarch64-builtins.cc (report_missing_extension): Move to config/aarch64/aarch64-builtins.cc. * config/aarch64/aarch64.h (TARGET_FAMINMAX): Introduce new flag for this extension. * config/aarch64/iterators.md: Introduce new iterators for this extension. * config/arm/types.md: Introduce neon_fp_aminmax attributes. * doc/invoke.texi: Document extension in AArch64 Options. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-builtins-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-builtins.c: New test. * gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-codegen.c: New test. --- Hi, Regression tested for aarch64-none-linux-gnu and found no regressions. This patch is a revised version of an earlier patch https://gcc.gnu.org/pipermail/gcc-patches/2024-July/657914.html but has more scope than that. That's why I didn't add "v2" in the subject line. Ok for master? I don't have commit access so can someone please commit on my behalf? 
Regards, Saurabh --- gcc/config/aarch64/aarch64-builtins.cc| 173 +- gcc/config/aarch64/aarch64-builtins.h | 5 +- .../aarch64/aarch64-option-extensions.def | 2 + gcc/config/aarch64/aarch64-simd.md| 12 ++ gcc/config/aarch64/aarch64-sve-builtins.cc| 22 --- gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 8 + gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 + .../aarch64/simd/faminmax-builtins.c | 75 .../aarch64/simd/faminmax-codegen-no-flag.c | 54 ++ .../aarch64/simd/faminmax-codegen.c | 104 +++ 13 files changed, 445 insertions(+), 32 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c create mo
[PATCH v2] aarch64: Add support for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch does three things: 1. Introduces AdvSIMD faminmax intrinsics. 2. Adds code generation support for famax and famin in terms of the existing operators. 3. Moves report_missing_extension and reported_missing_extension_p to make them more usable. The intrinsics of this extension are implemented as the following builtin functions: * vamax_f16 * vamaxq_f16 * vamax_f32 * vamaxq_f32 * vamaxq_f64 * vamin_f16 * vaminq_f16 * vamin_f32 * vaminq_f32 * vaminq_f64 For code generation, famax/famin is equivalent to first taking fabs of the operands and then taking fmax/fmin of the results of fabs. famax/famin (a, b) = fmax/fmin (fabs (a), fabs (b)) This is correct because the NaN/Inf handling of famax/famin and fmax/fmin is the same. We cannot use fmaxnm/fminnm here as NaN/Inf are handled differently in them. We moved the definition of `report_missing_extension` from gcc/config/aarch64/aarch64-sve-builtins.cc to gcc/config/aarch64/aarch64-builtins.cc and its declaration to gcc/config/aarch64/aarch64-builtins.h. We also moved the declaration of `reported_missing_extension_p` from gcc/config/aarch64/aarch64-sve-builtins.cc to gcc/config/aarch64/aarch64-builtins.cc, closer to the definition of `report_missing_extension`. In the existing code structure, this leads to `report_missing_extension` being usable from both normal builtins and sve builtins. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (enum aarch64_builtins): New enum values for faminmax builtins. (aarch64_init_faminmax_builtins): New function to declare new builtins. (handle_arm_neon_h): Modify to call aarch64_init_faminmax_builtins. (aarch64_general_check_builtin_call): Modify to check whether +faminmax flag is being used and printing error message if not being used.
(aarch64_expand_builtin_faminmax): New function to emit instructions of this extension. (aarch64_general_expand_builtin): Modify to call aarch64_expand_builtin_faminmax. (report_missing_extension): Move from config/aarch64/aarch64-sve-builtins.cc. * config/aarch64/aarch64-builtins.h (report_missing_extension): Declaration for this function so that it can be used wherever this header is included. (reported_missing_extension_p): Move from config/aarch64/aarch64-sve-builtins.cc * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Introduce new flag for this extension. * config/aarch64/aarch64-simd.md (aarch64_): Introduce instruction pattern for this extension. * config/aarch64/aarch64-sve-builtins.cc (reported_missing_extension_p): Move to config/aarch64/aarch64-builtins.cc (report_missing_extension): Move to config/aarch64/aarch64-builtins.cc. * config/aarch64/aarch64.h (TARGET_FAMINMAX): Introduce new flag for this extension. * config/aarch64/iterators.md: Introduce new iterators for this extension. * config/arm/types.md: Introduce neon_fp_aminmax attributes. * doc/invoke.texi: Document extension in AArch64 Options. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-builtins-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-builtins.c: New test. * gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-codegen.c: New test. --- Hi, Regression tested for aarch64-none-linux-gnu and found no regressions. This is a respin of https://gcc.gnu.org/pipermail/gcc-patches/2024-August/658968.html as the previous version failed patchwork CI due to not being able to apply. Ok for master? I don't have commit access so can someone please commit on my behalf? 
Regards, Saurabh --- gcc/config/aarch64/aarch64-builtins.cc| 173 +- gcc/config/aarch64/aarch64-builtins.h | 5 +- .../aarch64/aarch64-option-extensions.def | 2 + gcc/config/aarch64/aarch64-simd.md| 12 ++ gcc/config/aarch64/aarch64-sve-builtins.cc| 22 --- gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 8 + gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 + .../aarch64/simd/faminmax-builtins.c | 75 .../aarch64/simd/faminmax-codegen-no-flag.c | 54 ++ .../aarch64/simd/faminmax-codegen.c | 104 +++ 13 files changed, 445 insertions(+), 32 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c create mode 100644 gcc/testsuite/gcc.targe
[PATCH v3 0/2] Add support for AdvSIMD faminmax
From: Saurabh Jha This patch series is a respin of a previous patch here: https://gcc.gnu.org/pipermail/gcc-patches/2024-August/658984.html The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This new version addresses all review comments from the previous version. Additionally, we realised that the NaN/Inf behaviour of famax/famin and fmax/fmin is not the same, contrary to what we previously thought. The behaviour of famax/famin and fmaxnm/fminnm is not the same either. The new codegen strategy is to combine the rtl operators smax and abs into famax and smin and abs into famin. We are using two instruction patterns: one for intrinsics and one for codegen. Apart from codegen changes and their test cases, this new version also changes intrinsic tests to use the -O3 flag. This removes the need for testing loads and stores. The old code for intrinsics and the refactoring of report_missing_extension and reported_missing_extension_p are the same as in the previous version. Regression tested for aarch64-none-linux-gnu and found no regressions. Ok for master? I don't have commit access so can someone please commit on my behalf?
Saurabh Jha (2): aarch64: Add AdvSIMD faminmax intrinsics aarch64: Add codegen support for AdvSIMD faminmax gcc/config/aarch64/aarch64-builtins.cc| 173 +- gcc/config/aarch64/aarch64-builtins.h | 5 +- .../aarch64/aarch64-option-extensions.def | 2 + gcc/config/aarch64/aarch64-simd.md| 21 ++ gcc/config/aarch64/aarch64-sve-builtins.cc| 22 -- gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 12 + gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 + .../aarch64/simd/faminmax-builtins.c | 115 ++ .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 13 files changed, 754 insertions(+), 32 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c -- 2.43.2
[PATCH v3 2/2] aarch64: Add codegen support for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation support for famax and famin in terms of existing RTL operators. famax/famin is equivalent to first taking abs of the operands and then taking smax/smin on the results of abs. famax/famin (a, b) = smax/smin (abs (a), abs (b)) This fusion of operators is only possible when -march=armv9-a+faminmax flags are passed. We also need to pass -ffast-math flag; if we don't, then a statement like c[i] = __builtin_fmaxf16 (a[i], b[i]); is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin). This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (*aarch64_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-codegen.c: New test. --- gcc/config/aarch64/aarch64-simd.md| 10 + gcc/config/aarch64/iterators.md | 3 + .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 4 files changed, 427 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 0e1dd48dddb..37923037055 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -9901,3 +9901,13 @@ "\t%0., %1., %2." 
[(set_attr "type" "neon_fp_aminmax")] ) + +(define_insn "*aarch64_faminmax_fused" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"))))] + "TARGET_FAMINMAX" + "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" + [(set_attr "type" "neon_fp_aminmax")] +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index ce1c63e63cc..28b35a7da5c 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -4471,3 +4471,6 @@ (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c new file mode 100644 index 000..d77f5a5d19f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c @@ -0,0 +1,217 @@ +/* { dg-do assemble} */ +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmaxnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmaxnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** fabs v1.2s, v1.2s +** fabs
v0.2s, v0.2s +** fmaxnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmaxnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmaxnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vamaxq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf64 (a[i]); +b[i] = __builtin_fabs
[PATCH v3 1/2] aarch64: Add AdvSIMD faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch does two things: 1. Introduces AdvSIMD faminmax intrinsics. 2. Moves report_missing_extension and reported_missing_extension_p to make them more usable. The intrinsics of this extension are implemented as the following builtin functions: * vamax_f16 * vamaxq_f16 * vamax_f32 * vamaxq_f32 * vamaxq_f64 * vamin_f16 * vaminq_f16 * vamin_f32 * vaminq_f32 * vaminq_f64 We moved the definition of `report_missing_extension` from gcc/config/aarch64/aarch64-sve-builtins.cc to gcc/config/aarch64/aarch64-builtins.cc and its declaration to gcc/config/aarch64/aarch64-builtins.h. We also moved the declaration of `reported_missing_extension_p` from gcc/config/aarch64/aarch64-sve-builtins.cc to gcc/config/aarch64/aarch64-builtins.cc, closer to the definition of `report_missing_extension`. In the existing code structure, this leads to `report_missing_extension` being usable from both normal builtins and sve builtins. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (enum aarch64_builtins): New enum values for faminmax builtins. (aarch64_init_faminmax_builtins): New function to declare new builtins. (handle_arm_neon_h): Modify to call aarch64_init_faminmax_builtins. (aarch64_general_check_builtin_call): Modify to check whether +faminmax flag is being used and printing error message if not being used. (aarch64_expand_builtin_faminmax): New function to emit instructions of this extension. (aarch64_general_expand_builtin): Modify to call aarch64_expand_builtin_faminmax. (report_missing_extension): Move from config/aarch64/aarch64-sve-builtins.cc. * config/aarch64/aarch64-builtins.h (report_missing_extension): Declaration for this function so that it can be used wherever this header is included.
(reported_missing_extension_p): Move from config/aarch64/aarch64-sve-builtins.cc * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Introduce new flag for this extension. * config/aarch64/aarch64-simd.md (aarch64_): Instruction pattern for faminmax intrinsics. * config/aarch64/aarch64-sve-builtins.cc (reported_missing_extension_p): Move to config/aarch64/aarch64-builtins.c (report_missing_extension): Move to config/aarch64/aarch64-builtins.cc * config/aarch64/aarch64.h (TARGET_FAMINMAX): Introduce new flag for this extension. * config/aarch64/iterators.md: Introduce new iterators for faminmax intrinsics. * config/arm/types.md: Introduce neon_fp_aminmax attributes. * doc/invoke.texi: Document extension in AArch64 Options. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-builtins-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-builtins.c: New test. --- gcc/config/aarch64/aarch64-builtins.cc| 173 +- gcc/config/aarch64/aarch64-builtins.h | 5 +- .../aarch64/aarch64-option-extensions.def | 2 + gcc/config/aarch64/aarch64-simd.md| 11 ++ gcc/config/aarch64/aarch64-sve-builtins.cc| 22 --- gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 9 + gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 + .../aarch64/simd/faminmax-builtins.c | 115 11 files changed, 327 insertions(+), 32 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 30669f8aa18..cd590186f22 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -829,6 +829,17 @@ enum aarch64_builtins AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL, + /* FAMINMAX builtins. 
*/ + AARCH64_FAMINMAX_BUILTIN_FAMAX4H, + AARCH64_FAMINMAX_BUILTIN_FAMAX8H, + AARCH64_FAMINMAX_BUILTIN_FAMAX2S, + AARCH64_FAMINMAX_BUILTIN_FAMAX4S, + AARCH64_FAMINMAX_BUILTIN_FAMAX2D, + AARCH64_FAMINMAX_BUILTIN_FAMIN4H, + AARCH64_FAMINMAX_BUILTIN_FAMIN8H, + AARCH64_FAMINMAX_BUILTIN_FAMIN2S, + AARCH64_FAMINMAX_BUILTIN_FAMIN4S, + AARCH64_FAMINMAX_BUILTIN_FAMIN2D, /* System register builtins. */ AARCH64_RSR, AARCH64_RSRP, @@ -1547,6 +1558,66 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) } } +/* Initialize the absolute maximum/minimum (FAMINMAX) builtins. */ + +typedef struct +{ + const char *name; + unsigned int code; + tree eltype; + machine_mode mode; +} faminmax_builtins_data; + +static v
[PATCH v7 1/2] aarch64: Add AdvSIMD faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch introduces AdvSIMD faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * vamax_f16 * vamaxq_f16 * vamax_f32 * vamaxq_f32 * vamaxq_f64 * vamin_f16 * vaminq_f16 * vamin_f32 * vaminq_f32 * vaminq_f64 We are defining a new way to add AArch64 AdvSIMD intrinsics by listing all the intrinsics in a .def file and then using that .def file to initialise various data structures. This would lead to more concise code and easier addition of the new AdvSIMD intrinsics in future. The faminmax intrinsics are defined using the new approach. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Macro to parse the contents of aarch64-simd-pragma-builtins.def. (enum aarch64_builtins): New enum values for faminmax builtins via aarch64-simd-pragma-builtins.def. (struct aarch64_pragma_builtins_data): Struct to hold data from aarch64-simd-pragma-builtins.def. (aarch64_init_pragma_builtins): New function to define pragma builtins. (aarch64_get_pragma_builtin): New function to get a row of aarch64_pragma_builtins, given code. (handle_arm_neon_h): Modify to call aarch64_init_pragma_builtins. (aarch64_general_check_builtin_call): Modify to check whether required flag is being used for pragma builtins. (aarch64_expand_pragma_builtin): New function to emit instructions of pragma builtins. (aarch64_general_expand_builtin): Modify to call aarch64_expand_pragma_builtin. * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Introduce new flag for this extension. * config/aarch64/aarch64-simd.md (@aarch64_): Instruction pattern for faminmax intrinsics. * config/aarch64/aarch64.h (TARGET_FAMINMAX): Introduce new flag for this extension. * config/aarch64/iterators.md: New iterators and unspecs. 
* config/arm/types.md: Introduce neon_fp_aminmax attributes. * doc/invoke.texi: Document extension in AArch64 Options. * config/aarch64/aarch64-simd-pragma-builtins.def: New file to list pragma builtins. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-builtins-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-builtins.c: New test. --- gcc/config/aarch64/aarch64-builtins.cc| 84 + .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 31 + gcc/config/aarch64/aarch64-simd.md| 11 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 9 ++ gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 ++ .../aarch64/simd/faminmax-builtins.c | 115 ++ 10 files changed, 274 insertions(+) create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index eb878b933fe..a4905dd0aae 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -757,6 +757,10 @@ typedef struct #define VAR1(T, N, MAP, FLAG, A) \ AARCH64_SIMD_BUILTIN_##T##_##N##A, +#undef ENTRY +#define ENTRY(N, M, U, F) \ + AARCH64_##N, + enum aarch64_builtins { AARCH64_BUILTIN_MIN, @@ -829,6 +833,10 @@ enum aarch64_builtins AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL, + /* Pragma builtins. */ + AARCH64_PRAGMA_BUILTIN_START, +#include "aarch64-simd-pragma-builtins.def" + AARCH64_PRAGMA_BUILTIN_END, /* System register builtins. 
*/ AARCH64_RSR, AARCH64_RSRP, @@ -947,6 +955,7 @@ const char *aarch64_scalar_builtin_types[] = { extern GTY(()) aarch64_simd_type_info aarch64_simd_types[]; +#undef ENTRY #define ENTRY(E, M, Q, G) \ {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q}, struct aarch64_simd_type_info aarch64_simd_types [] = { @@ -1547,6 +1556,50 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) } } +/* Initialize pragma builtins. */ + +struct aarch64_pragma_builtins_data +{ + const char *name; + machine_mode mode; + int unspec; + aarch64_feature_flags required_extensions; +}; + +#undef ENTRY +#define ENTRY(N, M, U, F) \ + {#N, E_##M##mode, U, F}, + +static aarch64_pragma_builtins_data aarch64_pragma_builtins[] = { +#include "aarch64-simd-pragma-builtins.def" +}; + +static void +aarch64_init_pragma_builtins () +{ + for (size_t
[PATCH v7 0/2] aarch64: Add support for AdvSIMD faminmax
From: Saurabh Jha This patch series is a respin of https://gcc.gnu.org/pipermail/gcc-patches/2024-August/661757.html. The major refactorings suggested in the reviews to previous version will be done separately to keep the scope of this series small. I'll create a new series for that refactoring. This new version addresses all comments except the one about aarch64_expand_builtin_data_intrinsic. We don't need to pass aarch64_pragma_builtins as the extra argument as we already have it declared in the file elsewhere. The first patch only contains changes that address comments and the second patch contains no changes. Saurabh Jha (2): aarch64: Add AdvSIMD faminmax intrinsics aarch64: Add codegen support for AdvSIMD faminmax gcc/config/aarch64/aarch64-builtins.cc| 84 +++ .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 31 +++ gcc/config/aarch64/aarch64-simd.md| 21 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 12 + gcc/config/arm/types.md | 6 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 + .../aarch64/simd/faminmax-builtins.c | 115 ++ .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 12 files changed, 701 insertions(+) create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c -- 2.43.2
[PATCH v7 2/2] aarch64: Add codegen support for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation support for famax and famin in terms of existing RTL operators. famax/famin is equivalent to first taking abs of the operands and then taking smax/smin on the results of abs. famax/famin (a, b) = smax/smin (abs (a), abs (b)) This fusion of operators is only possible when -march=armv9-a+faminmax flags are passed. We also need to pass -ffast-math flag; if we don't, then a statement like c[i] = __builtin_fmaxf16 (a[i], b[i]); is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin). This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (*aarch64_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-codegen.c: New test. --- gcc/config/aarch64/aarch64-simd.md| 10 + gcc/config/aarch64/iterators.md | 3 + .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 4 files changed, 427 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7542c81ed91..8973cade488 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -9921,3 +9921,13 @@ "\t%0., %1., %2." 
[(set_attr "type" "neon_fp_aminmax")] ) + +(define_insn "*aarch64_faminmax_fused" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"] + "TARGET_FAMINMAX" + "\t%0., %1., %2." + [(set_attr "type" "neon_fp_aminmax")] +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 17ac5e073aa..c2fcd18306e 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -4472,3 +4472,6 @@ (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c new file mode 100644 index 000..d77f5a5d19f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c @@ -0,0 +1,217 @@ +/* { dg-do assemble} */ +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmaxnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmaxnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** fabs v1.2s, v1.2s +** fabs 
v0.2s, v0.2s +** fmaxnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmaxnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmaxnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vamaxq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf64 (a[i]); +b[i] = __builtin_fabs
[PATCH v8 0/2] aarch64: Add support for AdvSIMD faminmax.
From: Saurabh Jha This series is a revised version of: https://gcc.gnu.org/pipermail/gcc-patches/2024-August/661860.html. The first patch of the series is updated to address these comments: https://gcc.gnu.org/pipermail/gcc-patches/2024-August/661866.html All comments are addressed exactly as suggested except the one about handling signatures: there I have defined an enum for signatures and used those enum values in the pragma builtin macros. There are no changes in the second patch of the series apart from fixing the ChangeLog in the commit message. Saurabh Jha (2): aarch64: Add AdvSIMD faminmax intrinsics aarch64: Add codegen support for AdvSIMD faminmax gcc/config/aarch64/aarch64-builtins.cc| 123 ++ .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 23 ++ gcc/config/aarch64/aarch64-simd.md| 21 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 12 + gcc/config/arm/types.md | 5 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 + .../aarch64/simd/faminmax-builtins.c | 115 ++ .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 12 files changed, 731 insertions(+) create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c -- 2.43.2
[PATCH v8 1/2] aarch64: Add AdvSIMD faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch introduces AdvSIMD faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * vamax_f16 * vamaxq_f16 * vamax_f32 * vamaxq_f32 * vamaxq_f64 * vamin_f16 * vaminq_f16 * vamin_f32 * vaminq_f32 * vaminq_f64 We are defining a new way to add AArch64 AdvSIMD intrinsics by listing all the intrinsics in a .def file and then using that .def file to initialise various data structures. This would lead to more concise code and easier addition of the new AdvSIMD intrinsics in future. The faminmax intrinsics are defined using the new approach. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Macro to parse the contents of aarch64-simd-pragma-builtins.def. (enum aarch64_builtins): New enum values for faminmax builtins via aarch64-simd-pragma-builtins.def. (enum aarch64_builtin_signatures): Enum to specify the number of operands a builtin will take. (ENTRY_VHSDF): Macro to parse the contents of aarch64-simd-pragma-builtins.def. (struct aarch64_pragma_builtins_data): Struct to hold data from aarch64-simd-pragma-builtins.def. (aarch64_fntype): New function to define function types of intrinsics given an object of type aarch64_pragma_builtins_data. (aarch64_init_pragma_builtins): New function to define pragma builtins. (aarch64_get_pragma_builtin): New function to get a row of aarch64_pragma_builtins, given code. (handle_arm_neon_h): Modify to call aarch64_init_pragma_builtins. (aarch64_general_check_builtin_call): Modify to check whether required flag is being used for pragma builtins. (aarch64_expand_pragma_builtin): New function to emit instructions of pragma_builtin. (aarch64_general_expand_builtin): Modify to call aarch64_expand_pragma_builtin. 
* config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Introduce new flag for this extension. * config/aarch64/aarch64-simd.md (@aarch64_): Instruction pattern for faminmax intrinsics. * config/aarch64/aarch64.h (TARGET_FAMINMAX): Introduce new flag for this extension. * config/aarch64/iterators.md: New iterators and unspecs. * config/arm/types.md: Introduce neon_fp_aminmax attributes. * doc/invoke.texi: Document extension in AArch64 Options. * config/aarch64/aarch64-simd-pragma-builtins.def: New file to list pragma builtins. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-builtins-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-builtins.c: New test. --- gcc/config/aarch64/aarch64-builtins.cc| 123 ++ .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 23 gcc/config/aarch64/aarch64-simd.md| 11 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 9 ++ gcc/config/arm/types.md | 5 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 ++ .../aarch64/simd/faminmax-builtins.c | 115 10 files changed, 304 insertions(+) create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index eb878b933fe..6e64ae86c52 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -757,6 +757,18 @@ typedef struct #define VAR1(T, N, MAP, FLAG, A) \ AARCH64_SIMD_BUILTIN_##T##_##N##A, +#undef ENTRY +#define ENTRY(N, S, M, U, F) \ + AARCH64_##N, + +#undef ENTRY_VHSDF +#define ENTRY_VHSDF(NAME, SIGNATURE, UNSPEC, EXTENSIONS) \ + AARCH64_##NAME##_f16, \ + AARCH64_##NAME##q_f16, \ + AARCH64_##NAME##_f32, \ + AARCH64_##NAME##q_f32, \ + AARCH64_##NAME##q_f64, + enum 
aarch64_builtins { AARCH64_BUILTIN_MIN, @@ -829,6 +841,10 @@ enum aarch64_builtins AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL, + /* Pragma builtins. */ + AARCH64_PRAGMA_BUILTIN_START, +#include "aarch64-simd-pragma-builtins.def" + AARCH64_PRAGMA_BUILTIN_END, /* System register builtins. */ AARCH64_RSR, AARCH64_RSRP, @@ -947,6 +963,7 @@ const char *aarch64_scalar_builtin_types[] = { extern GTY(()) aarch64_simd_type_info aarch64_simd_types[]; +#undef ENTRY #define ENTRY(E, M, Q, G) \ {E, "__" #E, #G "__" #E, N
[PATCH v8 2/2] aarch64: Add codegen support for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation support for famax and famin in terms of existing RTL operators. famax/famin is equivalent to first taking abs of the operands and then taking smax/smin on the results of abs. famax/famin (a, b) = smax/smin (abs (a), abs (b)) This fusion of operators is only possible when -march=armv9-a+faminmax flags are passed. We also need to pass -ffast-math flag; if we don't, then a statement like c[i] = __builtin_fmaxf16 (a[i], b[i]); is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin). This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (*aarch64_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-codegen.c: New test. --- gcc/config/aarch64/aarch64-simd.md| 10 + gcc/config/aarch64/iterators.md | 3 + .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 4 files changed, 427 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7542c81ed91..8973cade488 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -9921,3 +9921,13 @@ "\t%0., %1., %2." 
[(set_attr "type" "neon_fp_aminmax")] ) + +(define_insn "*aarch64_faminmax_fused" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"] + "TARGET_FAMINMAX" + "\t%0., %1., %2." + [(set_attr "type" "neon_fp_aminmax")] +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 17ac5e073aa..c2fcd18306e 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -4472,3 +4472,6 @@ (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c new file mode 100644 index 000..d77f5a5d19f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c @@ -0,0 +1,217 @@ +/* { dg-do assemble} */ +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmaxnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmaxnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** fabs v1.2s, v1.2s +** fabs 
v0.2s, v0.2s +** fmaxnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmaxnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmaxnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vamaxq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf64 (a[i]); +b[i] = __buil
[PATCH 0/2] aarch64: Add support for SVE2 faminmax
From: Saurabh Jha This patch series adds support for SVE2 faminmax. It should be merged only after AdvSIMD faminmax patch series is merged: https://gcc.gnu.org/pipermail/gcc-patches/2024-September/662131.html The first patch adds intrinsics and the second patch adds support for combining FMAX/FMAXNM/FMIN/FMINNM and FABS into FAMAX/FAMIN. Regression tested on aarch64-none-linux-gnu and found no regressions. Ok for master? I don't have commit access so can someone please commit on my behalf? Saurabh Jha (2): aarch64: Add SVE2 faminmax intrinsics aarch64: Add codegen support for SVE2 faminmax .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64-sve.md | 29 gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 24 ++- gcc/testsuite/gcc.target/aarch64/aminmax.h| 13 ++ .../gcc.target/aarch64/sve/faminmax.c | 85 ++ .../aarch64/sve2/acle/asm/amax_f16.c | 155 ++ .../aarch64/sve2/acle/asm/amax_f32.c | 155 ++ .../aarch64/sve2/acle/asm/amax_f64.c | 155 ++ .../aarch64/sve2/acle/asm/amin_f16.c | 155 ++ .../aarch64/sve2/acle/asm/amin_f32.c | 155 ++ .../aarch64/sve2/acle/asm/amin_f64.c | 155 ++ 14 files changed, 1092 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/aminmax.h create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c -- 2.43.2
[PATCH 1/2] aarch64: Add SVE2 faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch introduces SVE2 faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * sva[max|min]_[m|x|z] * sva[max|min]_[f16|f32|f64]_[m|x|z] * sva[max|min]_n_[f16|f32|f64]_[m|x|z] gcc/ChangeLog: * config/aarch64/aarch64-sve-builtins-base.cc (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.def (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.h: Declaring function bases for the new intrinsics. * config/aarch64/aarch64.h (TARGET_SVE_FAMINMAX): New flag for SVE2 faminmax. * config/aarch64/iterators.md: New unspecs, iterators, and attrs for the new intrinsics. gcc/testsuite/ChangeLog: * gcc.target/aarch64/aminmax.h: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f64.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f64.c: New test. 
--- .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 18 +- gcc/testsuite/gcc.target/aarch64/aminmax.h| 13 ++ .../aarch64/sve2/acle/asm/amax_f16.c | 155 ++ .../aarch64/sve2/acle/asm/amax_f32.c | 155 ++ .../aarch64/sve2/acle/asm/amax_f64.c | 155 ++ .../aarch64/sve2/acle/asm/amin_f16.c | 155 ++ .../aarch64/sve2/acle/asm/amin_f32.c | 155 ++ .../aarch64/sve2/acle/asm/amin_f64.c | 155 ++ 12 files changed, 972 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/aminmax.h create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 8f781e26cc8..80c67715fd7 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -3044,6 +3044,10 @@ FUNCTION (svadrb, svadr_bhwd_impl, (0)) FUNCTION (svadrd, svadr_bhwd_impl, (3)) FUNCTION (svadrh, svadr_bhwd_impl, (1)) FUNCTION (svadrw, svadr_bhwd_impl, (2)) +FUNCTION (svamax, cond_or_uncond_unspec_function, (UNSPEC_COND_FAMAX, + UNSPEC_FAMAX)) +FUNCTION (svamin, cond_or_uncond_unspec_function, (UNSPEC_COND_FAMIN, + UNSPEC_FAMAX)) FUNCTION (svand, rtx_code_function, (AND, AND)) FUNCTION (svandv, reduction, (UNSPEC_ANDV)) FUNCTION (svasr, rtx_code_function, (ASHIFTRT, ASHIFTRT)) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.def b/gcc/config/aarch64/aarch64-sve-builtins-base.def 
index 65fcba91586..95e04e4393d 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.def +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.def @@ -379,3 +379,8 @@ DEF_SVE_FUNCTION (svzip2q, binary, all_data, none) DEF_SVE_FUNCTION (svld1ro, load_replicate, all_data, implicit) DEF_SVE_FUNCTION (svmmla, mmla, d_float, none) #undef REQUIRED_EXTENSIONS + +#define REQUIRED_EXTENSIONS AARCH64_FL_SVE | AARCH64_FL_FAMINMAX +DEF_SVE_FUNCTION (svamax, binary_opt_single_n, all_float, mxz) +DEF_SVE_FUNCTION (svamin, binary_opt_single_n, all_float, mxz) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.h b/gcc/config/aarch64/aarch64-sve-builtins-base.h index 5bbf3569c4b..978cf7013f9 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.h +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.h @@ -37,6 +37,8 @@ namespace aarch64_sve extern const function_base *const svadrd; extern const function_base *const svadrh; extern const function_base *const svadrw; +extern const function_base *const svamax; +extern const function_base *const svamin; extern const function_base *const svand; extern const function_base *const svandv
[PATCH 2/2] aarch64: Add codegen support for SVE2 faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation for famax and famin in terms of existing unspecs. With this patch: 1. famax can be expressed as taking the absolute value of each of the two operands and then taking fmax/fmaxnm of the results. 2. famin can be expressed as taking the absolute value of each of the two operands and then taking fmin/fminnm of the results. This fusion of operators is only possible when -march=armv9-a+faminmax+sve flags are passed. This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-sve.md (*aarch64_pred_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/faminmax.c: New test. --- gcc/config/aarch64/aarch64-sve.md | 29 +++ gcc/config/aarch64/iterators.md | 6 ++ .../gcc.target/aarch64/sve/faminmax.c | 85 +++ 3 files changed, 120 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax.c diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index a5cd42be9d5..feb6438efde 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -1,3 +1,32 @@ return "sel\t%0., %3, %2., %1."; } ) + +;; - +;; -- [FP] Absolute maximum and minimum +;; - +;; Includes: +;; - FAMAX +;; - FAMIN +;; - + +;; Predicated floating-point absolute maximum and minimum. 
+(define_insn "*aarch64_pred_faminmax_fused" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl") + (match_operand:SI 4 "aarch64_sve_gp_strictness" "w") + (unspec:SVE_FULL_F + [(match_operand 5) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 2 "register_operand" "w")] + UNSPEC_COND_FABS) + (unspec:SVE_FULL_F + [(match_operand 6) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 3 "register_operand" "w")] + UNSPEC_COND_FABS)] + SVE_COND_FP_MAXMIN))] + "TARGET_SVE_FAMINMAX" + "\t%0., %1/m, %0., %3." +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index b993ac9a7f6..5bdf1970f92 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -4489,5 +4489,11 @@ (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) +(define_int_attr faminmax_cond_uns_op + [(UNSPEC_COND_FMAX "famax") + (UNSPEC_COND_FMAXNM "famax") + (UNSPEC_COND_FMIN "famin") + (UNSPEC_COND_FMINNM "famin")]) + (define_code_attr faminmax_op [(smax "famax") (smin "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/faminmax.c b/gcc/testsuite/gcc.target/aarch64/sve/faminmax.c new file mode 100644 index 000..b70e19fa276 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/faminmax.c @@ -0,0 +1,85 @@ +/* { dg-do assemble} */ +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a+sve+faminmax" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_sve.h" + +#pragma GCC target "+sve" + +#define TEST_FAMAX(TYPE) \ + void fn_famax_##TYPE (TYPE * restrict a,\ + TYPE * restrict b,\ + TYPE * restrict c,\ + int n) { \ +for (int i = 0; i < n; i++) { \ + TYPE temp1 = __builtin_fabs (a[i]);\ + TYPE temp2 = __builtin_fabs (b[i]);\ + c[i] = __builtin_fmax (temp1, temp2);\ +} \ + } \ + +#define TEST_FAMIN(TYPE) \ + void fn_famin_##TYPE (TYPE * restrict a,\ + TYPE * restrict b,\ + TYPE * restrict c,\ + int n) { \ 
+for (int i = 0; i < n; i++) { \ + TYPE temp1 = __builtin_fabs (a[i]);\ + TYPE temp2 = __builtin_fabs (b[i]);\ + c[i] = __builtin_fmin (temp1, temp2);\ +} \ + } \ + +/* +** fn_famax_float16_t: +** ... +** famax z31.h, p6/m, z31.h, z30.h +** ... +** ret +*/ +TEST_FAMAX (float16_t) + +/* +** fn_famax_float32_t: +** ... +** famax z31.s, p6/m, z31.s, z30.s +** ... +** ret +*/ +TEST_FAMAX (float32_t) + +/* +** fn_famax_float64_t: +** ... +** famax z31.d, p6/m, z31.d, z30.d +** ... +** ret +*/ +TEST_FAMAX (float64_t) + +/* +** fn_famin_float16_t: +** ... +** famin z31.h, p6/m, z31.h, z30.h +** ... +** ret +*/ +TEST_FAMIN (float16_t) + +/* +** fn_famin_float32_t: +** ... +** famin z31.s, p6/m, z31.s, z30.s +** ... +** ret +*
[PATCH v5 1/2] aarch64: Add SVE2 faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch introduces SVE2 faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * sva[max|min]_[m|x|z] * sva[max|min]_[f16|f32|f64]_[m|x|z] * sva[max|min]_n_[f16|f32|f64]_[m|x|z] gcc/ChangeLog: * config/aarch64/aarch64-sve-builtins-base.cc (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.def (REQUIRED_EXTENSIONS): Add faminmax intrinsics behind a flag. (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.h: Declaring function bases for the new intrinsics. * config/aarch64/aarch64.h (TARGET_SVE_FAMINMAX): New flag for SVE2 faminmax. * config/aarch64/iterators.md: New unspecs, iterators, and attrs for the new intrinsics. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve2/acle/asm/amax_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f64.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f64.c: New test. 
--- .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 18 +- .../aarch64/sve2/acle/asm/amax_f16.c | 437 ++ .../aarch64/sve2/acle/asm/amax_f32.c | 437 ++ .../aarch64/sve2/acle/asm/amax_f64.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f16.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f32.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f64.c | 437 ++ 11 files changed, 2651 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 4b33585d981..b189818d643 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -3071,6 +3071,10 @@ FUNCTION (svadrb, svadr_bhwd_impl, (0)) FUNCTION (svadrd, svadr_bhwd_impl, (3)) FUNCTION (svadrh, svadr_bhwd_impl, (1)) FUNCTION (svadrw, svadr_bhwd_impl, (2)) +FUNCTION (svamax, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMAX, UNSPEC_FAMAX)) +FUNCTION (svamin, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMIN, UNSPEC_FAMIN)) FUNCTION (svand, rtx_code_function, (AND, AND)) FUNCTION (svandv, reduction, (UNSPEC_ANDV)) FUNCTION (svasr, rtx_code_function, (ASHIFTRT, ASHIFTRT)) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.def b/gcc/config/aarch64/aarch64-sve-builtins-base.def index 65fcba91586..95e04e4393d 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.def +++ 
b/gcc/config/aarch64/aarch64-sve-builtins-base.def @@ -379,3 +379,8 @@ DEF_SVE_FUNCTION (svzip2q, binary, all_data, none) DEF_SVE_FUNCTION (svld1ro, load_replicate, all_data, implicit) DEF_SVE_FUNCTION (svmmla, mmla, d_float, none) #undef REQUIRED_EXTENSIONS + +#define REQUIRED_EXTENSIONS AARCH64_FL_SVE | AARCH64_FL_FAMINMAX +DEF_SVE_FUNCTION (svamax, binary_opt_single_n, all_float, mxz) +DEF_SVE_FUNCTION (svamin, binary_opt_single_n, all_float, mxz) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.h b/gcc/config/aarch64/aarch64-sve-builtins-base.h index 5bbf3569c4b..978cf7013f9 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.h +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.h @@ -37,6 +37,8 @@ namespace aarch64_sve extern const function_base *const svadrd; extern const function_base *const svadrh; extern const function_base *const svadrw; +extern const function_base *const svamax; +extern const function_base *const svamin; extern const function_base *const svand; extern const function_base *const svandv; extern const function_base *const svasr; diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 030cffb1760..593319fd472 100644 --- a/gcc/c
[PATCH v5 0/2] Add support for SVE2 faminmax
From: Saurabh Jha This patch series is a revised version of: https://gcc.gnu.org/pipermail/gcc-patches/2024-October/664391.html Previous review comments are in this thread: https://gcc.gnu.org/pipermail/gcc-patches/2024-October/664329.html The second patch of this series is okay to merge. The changes are in the first patch and are as follows: 1. Fixing sve_pred_fp_rhs2_operand for the new unspecs as the new operators don't have an immediate form. 2. Adding new intrinsic test cases to make sure we handle immediate arguments correctly. Also removed the use of fmov instructions. Regression tested on aarch64-unknown-linux-gnu and found no regressions. Ok for master? Thanks, Saurabh Saurabh Jha (2): aarch64: Add SVE2 faminmax intrinsics aarch64: Add codegen support for SVE2 faminmax .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64-sve2.md| 37 ++ gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 24 +- .../gcc.target/aarch64/sve/faminmax_1.c | 44 ++ .../gcc.target/aarch64/sve/faminmax_2.c | 60 +++ .../aarch64/sve2/acle/asm/amax_f16.c | 437 ++ .../aarch64/sve2/acle/asm/amax_f32.c | 437 ++ .../aarch64/sve2/acle/asm/amax_f64.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f16.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f32.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f64.c | 437 ++ 14 files changed, 2798 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create 
mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c -- 2.34.1
[PATCH v5 2/2] aarch64: Add codegen support for SVE2 faminmax
The AArch64 FEAT_FAMINMAX extension introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation for famax and famin in terms of existing unspecs. With this patch: 1. famax can be expressed as taking the absolute value (UNSPEC_COND_FABS) of each of the two operands and then taking UNSPEC_COND_SMAX of the results. 2. famin can be expressed as taking the absolute value (UNSPEC_COND_FABS) of each of the two operands and then taking UNSPEC_COND_SMIN of the results. This fusion of operators is only possible when -march=armv9-a+faminmax+sve flags are passed. We also need to pass the -ffast-math flag; this is what enables the compiler to use UNSPEC_COND_SMAX and UNSPEC_COND_SMIN. This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-sve2.md (*aarch64_pred_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Iterator and attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/faminmax_1.c: New test. * gcc.target/aarch64/sve/faminmax_2.c: New test. --- gcc/config/aarch64/aarch64-sve2.md| 37 gcc/config/aarch64/iterators.md | 6 ++ .../gcc.target/aarch64/sve/faminmax_1.c | 44 ++ .../gcc.target/aarch64/sve/faminmax_2.c | 60 +++ 4 files changed, 147 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 725092cc95f..5f2697c3179 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2467,6 +2467,43 @@ [(set_attr "movprfx" "yes")] ) +;; - +;; -- [FP] Absolute maximum and minimum +;; - +;; Includes: +;; - FAMAX +;; - FAMIN +;; - +;; Predicated floating-point absolute maximum and minimum. 
+(define_insn_and_rewrite "*aarch64_pred_faminmax_fused" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_operand 5) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 2 "register_operand")] + UNSPEC_COND_FABS) + (unspec:SVE_FULL_F + [(match_operand 6) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 3 "register_operand")] + UNSPEC_COND_FABS)] + SVE_COND_SMAXMIN))] + "TARGET_SVE_FAMINMAX" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ w, Upl , %0 , w ; * ] \t%0., %1/m, %0., %3. + [ ?&w , Upl , w , w ; yes] movprfx\t%0, %2\;\t%0., %1/m, %0., %3. + } + "&& (!rtx_equal_p (operands[1], operands[5]) + || !rtx_equal_p (operands[1], operands[6]))" + { +operands[5] = copy_rtx (operands[1]); +operands[6] = copy_rtx (operands[1]); + } +) + ;; = ;; == Complex arithmetic ;; = diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index cbacf59c451..244a9c1b75d 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -3143,6 +3143,9 @@ UNSPEC_COND_SMAX UNSPEC_COND_SMIN]) +(define_int_iterator SVE_COND_SMAXMIN [UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) + (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA UNSPEC_COND_FMLS UNSPEC_COND_FNMLA @@ -4503,6 +4506,9 @@ (define_int_iterator FAMINMAX_UNS [UNSPEC_FAMAX UNSPEC_FAMIN]) +(define_int_attr faminmax_cond_uns_op + [(UNSPEC_COND_SMAX "famax") (UNSPEC_COND_SMIN "famin")]) + (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c new file mode 100644 index 000..3b65ccea065 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -ffast-math" } */ + +#include "arm_sve.h" + +#pragma GCC 
target "+sve+faminmax" + +#define TEST_FAMAX(TYPE) \ + void fn_famax_##TYPE (TYPE * restrict a,\ + TYPE * restrict b,\ + TYPE * restrict c,\ + int n) { \ +for (int i = 0; i < n; i++) { \ + TYPE temp1 = __builtin_fabs (a[i]);\ + TYPE temp2 = __builtin_fabs (b[i]);\ + c[i] = __builtin_fmax (temp1, temp2);\ +} \ + }
[PATCH v6 0/2] Add support for SVE2 faminmax
From: Saurabh Jha This patch series is a revised version of: https://gcc.gnu.org/pipermail/gcc-patches/2024-October/664912.html The only changes are in the first patch: in the intrinsics test cases, we removed unnecessary regular-expression captures of operands. The second patch has been reviewed already. Regression tested on aarch64-unknown-linux-gnu and found no regressions. Ok for master? Regards, Saurabh Saurabh Jha (2): aarch64: Add SVE2 faminmax intrinsics aarch64: Add codegen support for SVE2 faminmax .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64-sve2.md| 37 ++ gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 24 +- .../gcc.target/aarch64/sve/faminmax_1.c | 44 ++ .../gcc.target/aarch64/sve/faminmax_2.c | 60 +++ .../aarch64/sve2/acle/asm/amax_f16.c | 437 ++ .../aarch64/sve2/acle/asm/amax_f32.c | 437 ++ .../aarch64/sve2/acle/asm/amax_f64.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f16.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f32.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f64.c | 437 ++ 14 files changed, 2798 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c -- 2.34.1
[PATCH v6 1/2] aarch64: Add SVE2 faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch introduces SVE2 faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * sva[max|min]_[m|x|z] * sva[max|min]_[f16|f32|f64]_[m|x|z] * sva[max|min]_n_[f16|f32|f64]_[m|x|z] gcc/ChangeLog: * config/aarch64/aarch64-sve-builtins-base.cc (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.def (REQUIRED_EXTENSIONS): Add faminmax intrinsics behind a flag. (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.h: Declaring function bases for the new intrinsics. * config/aarch64/aarch64.h (TARGET_SVE_FAMINMAX): New flag for SVE2 faminmax. * config/aarch64/iterators.md: New unspecs, iterators, and attrs for the new intrinsics. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve2/acle/asm/amax_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f64.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f64.c: New test. 
--- .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 18 +- .../aarch64/sve2/acle/asm/amax_f16.c | 437 ++ .../aarch64/sve2/acle/asm/amax_f32.c | 437 ++ .../aarch64/sve2/acle/asm/amax_f64.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f16.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f32.c | 437 ++ .../aarch64/sve2/acle/asm/amin_f64.c | 437 ++ 11 files changed, 2651 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 4b33585d981..b189818d643 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -3071,6 +3071,10 @@ FUNCTION (svadrb, svadr_bhwd_impl, (0)) FUNCTION (svadrd, svadr_bhwd_impl, (3)) FUNCTION (svadrh, svadr_bhwd_impl, (1)) FUNCTION (svadrw, svadr_bhwd_impl, (2)) +FUNCTION (svamax, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMAX, UNSPEC_FAMAX)) +FUNCTION (svamin, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMIN, UNSPEC_FAMIN)) FUNCTION (svand, rtx_code_function, (AND, AND)) FUNCTION (svandv, reduction, (UNSPEC_ANDV)) FUNCTION (svasr, rtx_code_function, (ASHIFTRT, ASHIFTRT)) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.def b/gcc/config/aarch64/aarch64-sve-builtins-base.def index 65fcba91586..95e04e4393d 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.def +++ 
b/gcc/config/aarch64/aarch64-sve-builtins-base.def @@ -379,3 +379,8 @@ DEF_SVE_FUNCTION (svzip2q, binary, all_data, none) DEF_SVE_FUNCTION (svld1ro, load_replicate, all_data, implicit) DEF_SVE_FUNCTION (svmmla, mmla, d_float, none) #undef REQUIRED_EXTENSIONS + +#define REQUIRED_EXTENSIONS AARCH64_FL_SVE | AARCH64_FL_FAMINMAX +DEF_SVE_FUNCTION (svamax, binary_opt_single_n, all_float, mxz) +DEF_SVE_FUNCTION (svamin, binary_opt_single_n, all_float, mxz) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.h b/gcc/config/aarch64/aarch64-sve-builtins-base.h index 5bbf3569c4b..978cf7013f9 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.h +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.h @@ -37,6 +37,8 @@ namespace aarch64_sve extern const function_base *const svadrd; extern const function_base *const svadrh; extern const function_base *const svadrw; +extern const function_base *const svamax; +extern const function_base *const svamin; extern const function_base *const svand; extern const function_base *const svandv; extern const function_base *const svasr; diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 030cffb1760..593319fd472 100644 --- a/gcc/c
[PATCH v6 2/2] aarch64: Add codegen support for SVE2 faminmax
The AArch64 FEAT_FAMINMAX extension introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation for famax and famin in terms of existing unspecs. With this patch: 1. famax can be expressed as taking the absolute value (UNSPEC_COND_FABS) of each of the two operands and then taking UNSPEC_COND_SMAX of the results. 2. famin can be expressed as taking the absolute value (UNSPEC_COND_FABS) of each of the two operands and then taking UNSPEC_COND_SMIN of the results. This fusion of operators is only possible when -march=armv9-a+faminmax+sve flags are passed. We also need to pass the -ffast-math flag; this is what enables the compiler to use UNSPEC_COND_SMAX and UNSPEC_COND_SMIN. This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-sve2.md (*aarch64_pred_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Iterator and attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/faminmax_1.c: New test. * gcc.target/aarch64/sve/faminmax_2.c: New test. --- gcc/config/aarch64/aarch64-sve2.md| 37 gcc/config/aarch64/iterators.md | 6 ++ .../gcc.target/aarch64/sve/faminmax_1.c | 44 ++ .../gcc.target/aarch64/sve/faminmax_2.c | 60 +++ 4 files changed, 147 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 725092cc95f..5f2697c3179 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2467,6 +2467,43 @@ [(set_attr "movprfx" "yes")] ) +;; - +;; -- [FP] Absolute maximum and minimum +;; - +;; Includes: +;; - FAMAX +;; - FAMIN +;; - +;; Predicated floating-point absolute maximum and minimum. 
+(define_insn_and_rewrite "*aarch64_pred_faminmax_fused" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_operand 5) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 2 "register_operand")] + UNSPEC_COND_FABS) + (unspec:SVE_FULL_F + [(match_operand 6) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 3 "register_operand")] + UNSPEC_COND_FABS)] + SVE_COND_SMAXMIN))] + "TARGET_SVE_FAMINMAX" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ w, Upl , %0 , w ; * ] \t%0., %1/m, %0., %3. + [ ?&w , Upl , w , w ; yes] movprfx\t%0, %2\;\t%0., %1/m, %0., %3. + } + "&& (!rtx_equal_p (operands[1], operands[5]) + || !rtx_equal_p (operands[1], operands[6]))" + { +operands[5] = copy_rtx (operands[1]); +operands[6] = copy_rtx (operands[1]); + } +) + ;; = ;; == Complex arithmetic ;; = diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index cbacf59c451..244a9c1b75d 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -3143,6 +3143,9 @@ UNSPEC_COND_SMAX UNSPEC_COND_SMIN]) +(define_int_iterator SVE_COND_SMAXMIN [UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) + (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA UNSPEC_COND_FMLS UNSPEC_COND_FNMLA @@ -4503,6 +4506,9 @@ (define_int_iterator FAMINMAX_UNS [UNSPEC_FAMAX UNSPEC_FAMIN]) +(define_int_attr faminmax_cond_uns_op + [(UNSPEC_COND_SMAX "famax") (UNSPEC_COND_SMIN "famin")]) + (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c new file mode 100644 index 000..3b65ccea065 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -ffast-math" } */ + +#include "arm_sve.h" + +#pragma GCC 
target "+sve+faminmax" + +#define TEST_FAMAX(TYPE) \ + void fn_famax_##TYPE (TYPE * restrict a,\ + TYPE * restrict b,\ + TYPE * restrict c,\ + int n) { \ +for (int i = 0; i < n; i++) { \ + TYPE temp1 = __builtin_fabs (a[i]);\ + TYPE temp2 = __builtin_fabs (b[i]);\ + c[i] = __builtin_fmax (temp1, temp2);\ +} \ + }
[PATCH] aarch64: Fix bug with max/min (PR116934)
In ac4cdf5cb43c0b09e81760e2a1902ceebcf1a135, I introduced a bug where I put the new unspecs, UNSPEC_COND_SMAX and UNSPEC_COND_SMIN, into the wrong iterator. I should have put the new unspecs in SVE_COND_FP_MAXMIN but I put them in SVE_COND_FP_BINARY_REG instead. That was incorrect because SVE_COND_FP_MAXMIN, not SVE_COND_FP_BINARY_REG, is the iterator used for predicated floating-point maximum/minimum. Also added a testcase to validate the new change. Regression tested on aarch64-unknown-linux-gnu and found no regressions. There are some test cases with "libitm" in their directory names which appear in compare_tests output as changed tests but it looks like they are in the output just because of changed build directories, like from build-patched/aarch64-unknown-linux-gnu/./libitm/* to build-pristine/aarch64-unknown-linux-gnu/./libitm/*. I didn't think it was a cause for concern and have pushed this for review. gcc/ChangeLog: * config/aarch64/iterators.md: Move UNSPEC_COND_SMAX and UNSPEC_COND_SMIN to correct iterators. gcc/testsuite/ChangeLog: PR target/116934 * gcc.target/aarch64/sve2/pr116934.c: New test. 
--- gcc/config/aarch64/iterators.md | 8 gcc/testsuite/gcc.target/aarch64/sve2/pr116934.c | 13 + 2 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/pr116934.c diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 0836dee61c9..fcad236eee9 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -3125,9 +3125,7 @@ (define_int_iterator SVE_COND_FP_BINARY_REG [UNSPEC_COND_FDIV - UNSPEC_COND_FMULX - UNSPEC_COND_SMAX - UNSPEC_COND_SMIN]) + UNSPEC_COND_FMULX]) (define_int_iterator SVE_COND_FCADD [UNSPEC_COND_FCADD90 UNSPEC_COND_FCADD270]) @@ -3135,7 +3133,9 @@ (define_int_iterator SVE_COND_FP_MAXMIN [UNSPEC_COND_FMAX UNSPEC_COND_FMAXNM UNSPEC_COND_FMIN - UNSPEC_COND_FMINNM]) + UNSPEC_COND_FMINNM + UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA UNSPEC_COND_FMLS diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr116934.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr116934.c new file mode 100644 index 000..94fb96ffa7d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr116934.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-Ofast -mcpu=neoverse-v2" } */ + +int a; +float *b; + +void foo() { + for (; a; a--, b += 4) { +b[0] = b[1] = b[2] = b[2] > 0 ?: 0; +if (b[3] < 0) + b[3] = 0; + } +}
[PATCH] [MAINTAINERS] Add myself to write after approval
ChangeLog: * MAINTAINERS: Add myself to write after approval. --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e9fafaf45a7..0ea4db20f88 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -557,6 +557,7 @@ Andrew Jenner andrewjenner Haochen Jiang - Qian Jianhua- Michal Jiresmjires +Saurabh Jha - Janis Johnson janis Teresa Johnson tejohnson Kean Johnston -
[PATCH] [MAINTAINERS] Fix myself in order and add username
ChangeLog: * MAINTAINERS: Fix sort order and add username. --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0ea4db20f88..3b4cf9d20d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -554,10 +554,10 @@ Sam James sjames Surya Kumari Jangalajskumari Jakub Jelinek jakub Andrew Jenner andrewjenner +Saurabh Jha saurabhjha Haochen Jiang - Qian Jianhua- Michal Jiresmjires -Saurabh Jha - Janis Johnson janis Teresa Johnson tejohnson Kean Johnston -
[PATCH v2 3/3] aarch64: Add codegen support for SVE2 faminmax
The AArch64 FEAT_FAMINMAX extension introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation for famax and famin in terms of existing unspecs. With this patch: 1. famax can be expressed as taking the absolute value (UNSPEC_COND_FABS) of each of the two operands and then taking UNSPEC_COND_SMAX of the results. 2. famin can be expressed as taking the absolute value (UNSPEC_COND_FABS) of each of the two operands and then taking UNSPEC_COND_SMIN of the results. This fusion of operators is only possible when -march=armv9-a+faminmax+sve flags are passed. We also need to pass the -ffast-math flag; this is what enables the compiler to use UNSPEC_COND_SMAX and UNSPEC_COND_SMIN. This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-sve2.md (*aarch64_pred_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Iterator and attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/faminmax_1.c: New test. * gcc.target/aarch64/sve/faminmax_2.c: New test. --- gcc/config/aarch64/aarch64-sve2.md| 31 gcc/config/aarch64/iterators.md | 6 + .../gcc.target/aarch64/sve/faminmax_1.c | 85 ++ .../gcc.target/aarch64/sve/faminmax_2.c | 154 ++ 4 files changed, 276 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 972b03a4fef..6a8e940e16d 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2467,6 +2467,37 @@ [(set_attr "movprfx" "yes")] ) +;; - +;; -- [FP] Absolute maximum and minimum +;; - +;; Includes: +;; - FAMAX +;; - FAMIN +;; - +;; Predicated floating-point absolute maximum and minimum. 
+(define_insn "*aarch64_pred_faminmax_fused" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_operand 5) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 2 "register_operand")] + UNSPEC_COND_FABS) + (unspec:SVE_FULL_F + [(match_operand 6) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 3 "register_operand")] + UNSPEC_COND_FABS)] + SVE_COND_FP_SMAXMIN))] + "TARGET_SVE_FAMINMAX" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ w, Upl , %0 , w ; * ] \t%0., %1/m, %0., %3. + [ ?&w , Upl , w , w ; yes] movprfx\t%0, %2\;\t%0., %1/m, %0., %3. + } +) + ;; = ;; == Complex arithmetic ;; = diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index d3a457fc6d9..e9adb4209da 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -3143,6 +3143,9 @@ UNSPEC_COND_FMIN UNSPEC_COND_FMINNM]) +(define_int_iterator SVE_COND_FP_SMAXMIN [UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) + (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA UNSPEC_COND_FMLS UNSPEC_COND_FNMLA @@ -4503,6 +4506,9 @@ (define_int_iterator FAMINMAX_UNS [UNSPEC_FAMAX UNSPEC_FAMIN]) +(define_int_attr faminmax_cond_uns_op + [(UNSPEC_COND_SMAX "famax") (UNSPEC_COND_SMIN "famin")]) + (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c new file mode 100644 index 000..bdf077ab2f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c @@ -0,0 +1,85 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -ffast-math" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_sve.h" + +#pragma GCC target "+sve+faminmax" + +#define TEST_FAMAX(TYPE) \ + void fn_famax_##TYPE (TYPE * restrict a,\ + TYPE * restrict b,\ + TYPE * 
restrict c,\ + int n) { \ +for (int i = 0; i < n; i++) { \ + TYPE temp1 = __builtin_fabs (a[i]);\ + TYPE temp2 = __builtin_fabs (b[i]);\ + c[i] = __builtin_fmax (temp1, temp2);\ +} \ + } \ + +#define TEST_FAMIN(TYPE) \ + void fn_famin_##TYPE (TYPE * restrict a,\ + TYPE * restrict b,\ + TYPE * restrict c,\ + int n)
[PATCH v2 2/3] aarch64: Introduce new unspecs for smax/smin
Introduce two new unspecs, UNSPEC_COND_SMAX and UNSPEC_COND_SMIN, corresponding to rtl operators smax and smin. UNSPEC_COND_SMAX is used to generate fmaxnm instruction and UNSPEC_COND_SMIN is used to generate fminnm instruction. With these new unspecs, we can generate SVE2 max/min instructions using existing generic unpredicated and predicated instruction patterns that use optab attribute. Thus, we have removed specialised instruction patterns for max/min instructions that were using SVE_COND_FP_MAXMIN_PUBLIC iterator. No new test cases as the existing test cases should be enough to test this refactoring. gcc/ChangeLog: * config/aarch64/aarch64-sve.md (3): Remove this instruction pattern. (cond_): Remove this instruction pattern. * config/aarch64/iterators.md: New unspecs and changes to iterators and attrs to use the new unspecs --- gcc/config/aarch64/aarch64-sve.md | 33 --- gcc/config/aarch64/iterators.md | 55 --- 2 files changed, 35 insertions(+), 53 deletions(-) diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index bfa28849adf..989ba9546d7 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -6600,39 +6600,6 @@ ;; - FMINNM ;; - -;; Unpredicated fmax/fmin (the libm functions). The optabs for the -;; smax/smin rtx codes are handled in the generic section above. -(define_expand "3" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_dup 3) - (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 1 "register_operand") - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_maxmin_operand")] - SVE_COND_FP_MAXMIN_PUBLIC))] - "TARGET_SVE" - { -operands[3] = aarch64_ptrue_reg (mode); - } -) - -;; Predicated fmax/fmin (the libm functions). The optabs for the -;; smax/smin rtx codes are handled in the generic section above. 
-(define_expand "cond_" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand: 1 "register_operand") - (unspec:SVE_FULL_F - [(match_dup 1) - (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_maxmin_operand")] - SVE_COND_FP_MAXMIN_PUBLIC) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] - UNSPEC_SEL))] - "TARGET_SVE" -) - ;; Predicated floating-point maximum/minimum. (define_insn "@aarch64_pred_" [(set (match_operand:SVE_FULL_F 0 "register_operand") diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index cf9ee2639a9..d3a457fc6d9 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -884,6 +884,8 @@ UNSPEC_COND_FSUB ; Used in aarch64-sve.md. UNSPEC_COND_SCVTF ; Used in aarch64-sve.md. UNSPEC_COND_UCVTF ; Used in aarch64-sve.md. +UNSPEC_COND_SMAX ; Used in aarch64-sve.md. +UNSPEC_COND_SMIN ; Used in aarch64-sve.md. UNSPEC_LASTA ; Used in aarch64-sve.md. UNSPEC_LASTB ; Used in aarch64-sve.md. UNSPEC_ASHIFT_WIDE ; Used in aarch64-sve.md. @@ -3094,7 +3096,9 @@ UNSPEC_COND_FMINNM UNSPEC_COND_FMUL UNSPEC_COND_FMULX - UNSPEC_COND_FSUB]) + UNSPEC_COND_FSUB + UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) ;; Same as SVE_COND_FP_BINARY, but without codes that have a dedicated ;; 3 expander. 
@@ -3105,7 +3109,9 @@ UNSPEC_COND_FMINNM UNSPEC_COND_FMUL UNSPEC_COND_FMULX - UNSPEC_COND_FSUB]) + UNSPEC_COND_FSUB + UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) (define_int_iterator SVE_COND_FP_BINARY_INT [UNSPEC_COND_FSCALE]) @@ -3117,13 +3123,17 @@ UNSPEC_COND_FMAXNM UNSPEC_COND_FMIN UNSPEC_COND_FMINNM - UNSPEC_COND_FMUL]) + UNSPEC_COND_FMUL + UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) (define_int_iterator SVE_COND_FP_BINARY_REG [(UNSPEC_COND_FAMAX "TARGET_SVE_FAMINMAX") (UNSPEC_COND_FAMIN "TARGET_SVE_FAMINMAX") UNSPEC_COND_FDIV - UNSPEC_COND_FMULX]) + UNSPEC_COND_FMULX + UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) (define_int_iterator SVE_COND_FCADD [UNSPEC_COND_FCADD90 UNSPEC_COND_FCADD270]) @@ -3133,11 +3143,6 @@ UNSPEC_COND_FMIN UNSPEC_COND_FMINNM]) -;; Floating-point max/min operations that correspond to optabs, -;; as opposed to those that are internal to the port. -(define_int_iterator SVE_COND_FP_MAXMIN_PUBLIC [UNSPEC_COND_FMAXNM - UNSPEC_COND_FMINNM]) - (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA UNSPEC_COND_FMLS UNSPEC_COND_FNMLA @@ -3715,9 +3720,9 @@ (UNSPEC_COND_FCVTZU "fixuns_trunc") (UNSPEC_COND_FDIV "div") (UNSPEC_COND_FMAX "fmax_nan") - (UNSPEC_COND_FMAXNM "smax") + (UNSPEC_COND_FMAXNM "fmax")
[PATCH v2 0/3] Add support for SVE2 faminmax
From: Saurabh Jha This patch series is a revised version of an earlier patch series: https://gcc.gnu.org/pipermail/gcc-patches/2024-September/662951.html. The main change in this patch series is the introduction of two unspecs, UNSPEC_COND_SMAX and UNSPEC_COND_SMIN, and their use for existing minmax instructions and for the code generation of faminmax instructions. Other changes in this patch series are to address review comments: 1. [PATCH 1] Removing stray gcc/testsuite/gcc.target/aarch64/aminmax.h. 2. [PATCH 1] Fixing formatting in new iterators in iterators.md. 3. [PATCH 1] Using pragma "+sve+faminmax" in test cases. Remove options directive. 4. [PATCH 2] Move instruction pattern to aarch64-sve2.md. 5. [PATCH 2] Fix use of operand 2. 6. [PATCH 2] Fix use of assemble directive. 7. [PATCH 2] Using pragma "+sve+faminmax" and removing these flags from the options. 8. [PATCH 2] New test cases to make sure we do not combine operators when using intrinsics. Another minor change was in aarch64-sve-builtins-base.cc where we fixed the formatting. 
Saurabh Jha (3): aarch64: Add SVE2 faminmax intrinsics aarch64: Introduce new unspecs for smax/smin aarch64: Add codegen support for SVE2 faminmax .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64-sve.md | 33 gcc/config/aarch64/aarch64-sve2.md| 31 gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 97 +++ .../gcc.target/aarch64/sve/faminmax_1.c | 85 ++ .../gcc.target/aarch64/sve/faminmax_2.c | 154 ++ .../aarch64/sve2/acle/asm/amax_f16.c | 142 .../aarch64/sve2/acle/asm/amax_f32.c | 142 .../aarch64/sve2/acle/asm/amax_f64.c | 142 .../aarch64/sve2/acle/asm/amin_f16.c | 142 .../aarch64/sve2/acle/asm/amin_f32.c | 142 .../aarch64/sve2/acle/asm/amin_f64.c | 142 15 files changed, 1202 insertions(+), 62 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c -- 2.34.1
[PATCH v2 1/3] aarch64: Add SVE2 faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch introduces SVE2 faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * sva[max|min]_[m|x|z] * sva[max|min]_[f16|f32|f64]_[m|x|z] * sva[max|min]_n_[f16|f32|f64]_[m|x|z] gcc/ChangeLog: * config/aarch64/aarch64-sve-builtins-base.cc (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.def (REQUIRED_EXTENSIONS): Add faminmax intrinsics behind a flag. (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.h: Declaring function bases for the new intrinsics. * config/aarch64/aarch64.h (TARGET_SVE_FAMINMAX): New flag for SVE2 faminmax. * config/aarch64/iterators.md: New unspecs, iterators, and attrs for the new intrinsics. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve2/acle/asm/amax_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f64.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f64.c: New test. 
--- .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 40 +++-- .../aarch64/sve2/acle/asm/amax_f16.c | 142 ++ .../aarch64/sve2/acle/asm/amax_f32.c | 142 ++ .../aarch64/sve2/acle/asm/amax_f64.c | 142 ++ .../aarch64/sve2/acle/asm/amin_f16.c | 142 ++ .../aarch64/sve2/acle/asm/amin_f32.c | 142 ++ .../aarch64/sve2/acle/asm/amin_f64.c | 142 ++ 11 files changed, 893 insertions(+), 11 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index afce52a7e8d..dd4efdf6ca5 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -3070,6 +3070,10 @@ FUNCTION (svadrb, svadr_bhwd_impl, (0)) FUNCTION (svadrd, svadr_bhwd_impl, (3)) FUNCTION (svadrh, svadr_bhwd_impl, (1)) FUNCTION (svadrw, svadr_bhwd_impl, (2)) +FUNCTION (svamax, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMAX, UNSPEC_FAMAX)) +FUNCTION (svamin, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMIN, UNSPEC_FAMIN)) FUNCTION (svand, rtx_code_function, (AND, AND)) FUNCTION (svandv, reduction, (UNSPEC_ANDV)) FUNCTION (svasr, rtx_code_function, (ASHIFTRT, ASHIFTRT)) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.def b/gcc/config/aarch64/aarch64-sve-builtins-base.def index 65fcba91586..95e04e4393d 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.def +++ 
b/gcc/config/aarch64/aarch64-sve-builtins-base.def @@ -379,3 +379,8 @@ DEF_SVE_FUNCTION (svzip2q, binary, all_data, none) DEF_SVE_FUNCTION (svld1ro, load_replicate, all_data, implicit) DEF_SVE_FUNCTION (svmmla, mmla, d_float, none) #undef REQUIRED_EXTENSIONS + +#define REQUIRED_EXTENSIONS AARCH64_FL_SVE | AARCH64_FL_FAMINMAX +DEF_SVE_FUNCTION (svamax, binary_opt_single_n, all_float, mxz) +DEF_SVE_FUNCTION (svamin, binary_opt_single_n, all_float, mxz) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.h b/gcc/config/aarch64/aarch64-sve-builtins-base.h index 5bbf3569c4b..978cf7013f9 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.h +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.h @@ -37,6 +37,8 @@ namespace aarch64_sve extern const function_base *const svadrd; extern const function_base *const svadrh; extern const function_base *const svadrw; +extern const function_base *const svamax; +extern const function_base *const svamin; extern const function_base *const svand; extern const function_base *const svandv; extern const function_base *const svasr; diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 43819adb48c..a496235db42 100644 --- a/g
[PATCH v9 1/2] aarch64: Add AdvSIMD faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch introduces AdvSIMD faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * vamax_f16 * vamaxq_f16 * vamax_f32 * vamaxq_f32 * vamaxq_f64 * vamin_f16 * vaminq_f16 * vamin_f32 * vaminq_f32 * vaminq_f64 We are defining a new way to add AArch64 AdvSIMD intrinsics by listing all the intrinsics in a .def file and then using that .def file to initialise various data structures. This would lead to more concise code and easier addition of the new AdvSIMD intrinsics in future. The faminmax intrinsics are defined using the new approach. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Macro to parse the contents of aarch64-simd-pragma-builtins.def. (ENTRY_VHSDF): Macro to parse the contents of aarch64-simd-pragma-builtins.def. (enum aarch64_builtins): New enum values for faminmax builtins via aarch64-simd-pragma-builtins.def. (enum class aarch64_builtin_signatures): Enum class to specify the number of operands a builtin will take. (struct aarch64_pragma_builtins_data): Struct to hold data from aarch64-simd-pragma-builtins.def. (aarch64_fntype): New function to define function types of intrinsics given an object of type aarch64_pragma_builtins_data. (aarch64_init_pragma_builtins): New function to define pragma builtins. (aarch64_get_pragma_builtin): New function to get a row of aarch64_pragma_builtins, given code. (handle_arm_neon_h): Modify to call aarch64_init_pragma_builtins. (aarch64_general_check_builtin_call): Modify to check whether required flag is being used for pragma builtins. (aarch64_expand_pragma_builtin): New function to emit instructions of pragma_builtin. (aarch64_general_expand_builtin): Modify to call aarch64_expand_pragma_builtin. 
* config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Introduce new flag for this extension. * config/aarch64/aarch64-simd.md (@aarch64_): Instruction pattern for faminmax intrinsics. * config/aarch64/aarch64.h (TARGET_FAMINMAX): Introduce new flag for this extension. * config/aarch64/iterators.md: New iterators and unspecs. * doc/invoke.texi: Document extension in AArch64 Options. * config/aarch64/aarch64-simd-pragma-builtins.def: New file to list pragma builtins. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-builtins-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-builtins.c: New test. --- gcc/config/aarch64/aarch64-builtins.cc| 119 ++ .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 23 gcc/config/aarch64/aarch64-simd.md| 10 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 9 ++ gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 ++ .../aarch64/simd/faminmax-builtins.c | 115 + 9 files changed, 294 insertions(+) create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index eb878b933fe..6266bea3b39 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -757,6 +757,18 @@ typedef struct #define VAR1(T, N, MAP, FLAG, A) \ AARCH64_SIMD_BUILTIN_##T##_##N##A, +#undef ENTRY +#define ENTRY(N, S, M, U, F) \ + AARCH64_##N, + +#undef ENTRY_VHSDF +#define ENTRY_VHSDF(NAME, SIGNATURE, UNSPEC, EXTENSIONS) \ + AARCH64_##NAME##_f16, \ + AARCH64_##NAME##q_f16, \ + AARCH64_##NAME##_f32, \ + AARCH64_##NAME##q_f32, \ + AARCH64_##NAME##q_f64, + enum aarch64_builtins { AARCH64_BUILTIN_MIN, @@ -829,6 +841,10 @@ enum aarch64_builtins AARCH64_RBIT, 
AARCH64_RBITL, AARCH64_RBITLL, + /* Pragma builtins. */ + AARCH64_PRAGMA_BUILTIN_START, +#include "aarch64-simd-pragma-builtins.def" + AARCH64_PRAGMA_BUILTIN_END, /* System register builtins. */ AARCH64_RSR, AARCH64_RSRP, @@ -947,6 +963,7 @@ const char *aarch64_scalar_builtin_types[] = { extern GTY(()) aarch64_simd_type_info aarch64_simd_types[]; +#undef ENTRY #define ENTRY(E, M, Q, G) \ {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q}, struct aarch64_simd_type_info aarch64_simd_types [] = { @@ -1
[PATCH v9 0/2] Add support for AdvSIMD faminmax
From: Saurabh Jha This is a revised version of this patch series: https://gcc.gnu.org/pipermail/gcc-patches/2024-September/thread.html Thanks for the review comments. They are all addressed in this version. The changes are as follows. 1. [intrinsics patch] Using enum class for aarch64_builtin_signatures. 2. [intrinsics patch] Fixed formatting of const aarch64_pragma_builtins_data *builtin_data 3. [intrinsics patch] Removed brace for single statement in the if block if (auto builtin_data = aarch64_get_pragma_builtin (fcode)) return aarch64_expand_pragma_builtin (exp, target, builtin_data) 4. [intrinsics patch] Removed use of scheduling type in the intrinsic instruction pattern. 5. [intrinsics patch] Formatted comment for +faminmax flag in aarch64.h. 6. [codegen patch] Removed use of type in the codegen instruction pattern. 7. [codegen patch] Added a new test file called faminmax-no-codegen.c to test that we don't combine vmax/vmaxnm/vmin/vminnm with vabs. Rebased with latest master. Regression tested with aarch64-none-linux-gnu target and found no regressions. Ok for master? I don't have commit access so can someone please commit on my behalf? 
Saurabh Jha (2): aarch64: Add AdvSIMD faminmax intrinsics aarch64: Add codegen support for AdvSIMD faminmax gcc/config/aarch64/aarch64-builtins.cc| 119 .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 23 ++ gcc/config/aarch64/aarch64-simd.md| 19 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 12 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 + .../aarch64/simd/faminmax-builtins.c | 115 .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 + .../aarch64/simd/faminmax-no-codegen.c| 267 ++ 12 files changed, 987 insertions(+) create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-no-codegen.c -- 2.43.2
[PATCH v10 0/2] Add support for AdvSIMD faminmax
From: Saurabh Jha This is a revised version of this patch series: https://gcc.gnu.org/pipermail/gcc-patches/2024-September/663204.html The only change in both patches of this series is a fix to the directives in the test cases: /* { dg-do assemble} */ is replaced with /* { dg-do compile } */. We need compile here to make the tests work. Sorry for missing this review comment in my previous version. There are no changes to the code. Neither patch requires further review, as pointed out by Richard Sandiford in his replies to the two patches: * https://gcc.gnu.org/pipermail/gcc-patches/2024-September/663229.html * https://gcc.gnu.org/pipermail/gcc-patches/2024-September/663230.html I will request commit access to gcc after this patch is accepted. Because I already have commit access to binutils, I will email the overseers. Saurabh Jha (2): aarch64: Add AdvSIMD faminmax intrinsics aarch64: Add codegen support for AdvSIMD faminmax gcc/config/aarch64/aarch64-builtins.cc| 119 .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 23 ++ gcc/config/aarch64/aarch64-simd.md| 19 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 12 + gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 + .../aarch64/simd/faminmax-builtins.c | 115 .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 + .../aarch64/simd/faminmax-no-codegen.c| 267 ++ 12 files changed, 987 insertions(+) create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-no-codegen.c -- 2.43.2
[PATCH v10 1/2] aarch64: Add AdvSIMD faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch introduces AdvSIMD faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * vamax_f16 * vamaxq_f16 * vamax_f32 * vamaxq_f32 * vamaxq_f64 * vamin_f16 * vaminq_f16 * vamin_f32 * vaminq_f32 * vaminq_f64 We are defining a new way to add AArch64 AdvSIMD intrinsics by listing all the intrinsics in a .def file and then using that .def file to initialise various data structures. This would lead to more concise code and easier addition of the new AdvSIMD intrinsics in future. The faminmax intrinsics are defined using the new approach. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Macro to parse the contents of aarch64-simd-pragma-builtins.def. (ENTRY_VHSDF): Macro to parse the contents of aarch64-simd-pragma-builtins.def. (enum aarch64_builtins): New enum values for faminmax builtins via aarch64-simd-pragma-builtins.def. (enum class aarch64_builtin_signatures): Enum class to specify the number of operands a builtin will take. (struct aarch64_pragma_builtins_data): Struct to hold data from aarch64-simd-pragma-builtins.def. (aarch64_fntype): New function to define function types of intrinsics given an object of type aarch64_pragma_builtins_data. (aarch64_init_pragma_builtins): New function to define pragma builtins. (aarch64_get_pragma_builtin): New function to get a row of aarch64_pragma_builtins, given code. (handle_arm_neon_h): Modify to call aarch64_init_pragma_builtins. (aarch64_general_check_builtin_call): Modify to check whether required flag is being used for pragma builtins. (aarch64_expand_pragma_builtin): New function to emit instructions of pragma_builtin. (aarch64_general_expand_builtin): Modify to call aarch64_expand_pragma_builtin. 
* config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Introduce new flag for this extension. * config/aarch64/aarch64-simd.md (@aarch64_): Instruction pattern for faminmax intrinsics. * config/aarch64/aarch64.h (TARGET_FAMINMAX): Introduce new flag for this extension. * config/aarch64/iterators.md: New iterators and unspecs. * doc/invoke.texi: Document extension in AArch64 Options. * config/aarch64/aarch64-simd-pragma-builtins.def: New file to list pragma builtins. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-builtins-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-builtins.c: New test. --- gcc/config/aarch64/aarch64-builtins.cc| 119 ++ .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 23 gcc/config/aarch64/aarch64-simd.md| 10 ++ gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 9 ++ gcc/doc/invoke.texi | 2 + .../aarch64/simd/faminmax-builtins-no-flag.c | 10 ++ .../aarch64/simd/faminmax-builtins.c | 115 + 9 files changed, 294 insertions(+) create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index eb878b933fe..6266bea3b39 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -757,6 +757,18 @@ typedef struct #define VAR1(T, N, MAP, FLAG, A) \ AARCH64_SIMD_BUILTIN_##T##_##N##A, +#undef ENTRY +#define ENTRY(N, S, M, U, F) \ + AARCH64_##N, + +#undef ENTRY_VHSDF +#define ENTRY_VHSDF(NAME, SIGNATURE, UNSPEC, EXTENSIONS) \ + AARCH64_##NAME##_f16, \ + AARCH64_##NAME##q_f16, \ + AARCH64_##NAME##_f32, \ + AARCH64_##NAME##q_f32, \ + AARCH64_##NAME##q_f64, + enum aarch64_builtins { AARCH64_BUILTIN_MIN, @@ -829,6 +841,10 @@ enum aarch64_builtins AARCH64_RBIT, 
AARCH64_RBITL, AARCH64_RBITLL, + /* Pragma builtins. */ + AARCH64_PRAGMA_BUILTIN_START, +#include "aarch64-simd-pragma-builtins.def" + AARCH64_PRAGMA_BUILTIN_END, /* System register builtins. */ AARCH64_RSR, AARCH64_RSRP, @@ -947,6 +963,7 @@ const char *aarch64_scalar_builtin_types[] = { extern GTY(()) aarch64_simd_type_info aarch64_simd_types[]; +#undef ENTRY #define ENTRY(E, M, Q, G) \ {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q}, struct aarch64_simd_type_info aarch64_simd_types [] = { @@ -1
[PATCH v10 2/2] aarch64: Add codegen support for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation support for famax and famin in terms of existing RTL operators. famax/famin is equivalent to first taking abs of the operands and then taking smax/smin on the results of abs. famax/famin (a, b) = smax/smin (abs (a), abs (b)) This fusion of operators is only possible when -march=armv9-a+faminmax flags are passed. We also need to pass -ffast-math flag; if we don't, then a statement like c[i] = __builtin_fmaxf16 (a[i], b[i]); is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin). This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (*aarch64_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-codegen.c: New test. * gcc.target/aarch64/simd/faminmax-no-codegen.c: New test. --- gcc/config/aarch64/aarch64-simd.md| 9 + gcc/config/aarch64/iterators.md | 3 + .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 + .../aarch64/simd/faminmax-no-codegen.c| 267 ++ 5 files changed, 693 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-no-codegen.c diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 67f0fe26f93..2a44aa3fcc3 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -9920,3 +9920,12 @@ "TARGET_FAMINMAX" "\t%0., %1., %2." 
) + +(define_insn "*aarch64_faminmax_fused" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"))))] + "TARGET_FAMINMAX" + "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 17ac5e073aa..c2fcd18306e 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -4472,3 +4472,6 @@ (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c new file mode 100644 index 000..6688a7883b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c @@ -0,0 +1,217 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmaxnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmaxnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** fabs v1.2s, v1.2s +** fabs v0.2s, v0.2s +** fmaxnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t 
+test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmaxnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmaxnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vamaxq_f6
[PATCH v9 2/2] aarch64: Add codegen support for AdvSIMD faminmax
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation support for famax and famin in terms of existing RTL operators. famax/famin is equivalent to first taking abs of the operands and then taking smax/smin on the results of abs. famax/famin (a, b) = smax/smin (abs (a), abs (b)) This fusion of operators is only possible when -march=armv9-a+faminmax flags are passed. We also need to pass -ffast-math flag; if we don't, then a statement like c[i] = __builtin_fmaxf16 (a[i], b[i]); is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin). This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (*aarch64_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test. * gcc.target/aarch64/simd/faminmax-codegen.c: New test. * gcc.target/aarch64/simd/faminmax-no-codegen.c: New test. --- gcc/config/aarch64/aarch64-simd.md| 9 + gcc/config/aarch64/iterators.md | 3 + .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++ .../aarch64/simd/faminmax-codegen.c | 197 + .../aarch64/simd/faminmax-no-codegen.c| 267 ++ 5 files changed, 693 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-no-codegen.c diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 67f0fe26f93..2a44aa3fcc3 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -9920,3 +9920,12 @@ "TARGET_FAMINMAX" "\t%0., %1., %2." 
) + +(define_insn "*aarch64_faminmax_fused" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"))))] + "TARGET_FAMINMAX" + "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 17ac5e073aa..c2fcd18306e 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -4472,3 +4472,6 @@ (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c new file mode 100644 index 000..d77f5a5d19f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c @@ -0,0 +1,217 @@ +/* { dg-do assemble} */ +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmaxnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmaxnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { +a[i] = __builtin_fabsf16 (a[i]); +b[i] = __builtin_fabsf16 (b[i]); +c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** fabs v1.2s, v1.2s +** fabs v0.2s, v0.2s +** fmaxnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t 
+test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmaxnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { +a[i] = __builtin_fabsf32 (a[i]); +b[i] = __builtin_fabsf32 (b[i]); +c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmaxnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vamaxq_f6
[PATCH v3 2/2] aarch64: Add codegen support for SVE2 faminmax
The AArch64 FEAT_FAMINMAX extension introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation for famax and famin in terms of existing unspecs. With this patch: 1. famax can be expressed as taking the absolute value (UNSPEC_COND_FABS) of each of the two operands and then taking UNSPEC_COND_SMAX of the results. 2. famin can be expressed as taking the absolute value (UNSPEC_COND_FABS) of each of the two operands and then taking UNSPEC_COND_SMIN of the results. This fusion of operators is only possible when the -march=armv9-a+faminmax+sve flags are passed. We also need to pass the -ffast-math flag; this is what enables the compiler to use UNSPEC_COND_SMAX and UNSPEC_COND_SMIN. This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-sve2.md (*aarch64_pred_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Iterator and attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/faminmax_1.c: New test. * gcc.target/aarch64/sve/faminmax_2.c: New test. --- gcc/config/aarch64/aarch64-sve2.md| 37 +++ gcc/config/aarch64/iterators.md | 6 ++ .../gcc.target/aarch64/sve/faminmax_1.c | 45 ++ .../gcc.target/aarch64/sve/faminmax_2.c | 61 +++ 4 files changed, 149 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 725092cc95f..5f2697c3179 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2467,6 +2467,43 @@ [(set_attr "movprfx" "yes")] ) +;; - +;; -- [FP] Absolute maximum and minimum +;; - +;; Includes: +;; - FAMAX +;; - FAMIN +;; - +;; Predicated floating-point absolute maximum and minimum. 
+(define_insn_and_rewrite "*aarch64_pred_faminmax_fused" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_operand 5) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 2 "register_operand")] + UNSPEC_COND_FABS) + (unspec:SVE_FULL_F + [(match_operand 6) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 3 "register_operand")] + UNSPEC_COND_FABS)] + SVE_COND_SMAXMIN))] + "TARGET_SVE_FAMINMAX" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ w, Upl , %0 , w ; * ] \t%0., %1/m, %0., %3. + [ ?&w , Upl , w , w ; yes] movprfx\t%0, %2\;\t%0., %1/m, %0., %3. + } + "&& (!rtx_equal_p (operands[1], operands[5]) + || !rtx_equal_p (operands[1], operands[6]))" + { +operands[5] = copy_rtx (operands[1]); +operands[6] = copy_rtx (operands[1]); + } +) + ;; = ;; == Complex arithmetic ;; = diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index c06f8c2c90f..8b18682c341 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -3143,6 +3143,9 @@ UNSPEC_COND_FMIN UNSPEC_COND_FMINNM]) +(define_int_iterator SVE_COND_SMAXMIN [UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) + (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA UNSPEC_COND_FMLS UNSPEC_COND_FNMLA @@ -4503,6 +4506,9 @@ (define_int_iterator FAMINMAX_UNS [UNSPEC_FAMAX UNSPEC_FAMIN]) +(define_int_attr faminmax_cond_uns_op + [(UNSPEC_COND_SMAX "famax") (UNSPEC_COND_SMIN "famin")]) + (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c new file mode 100644 index 000..d54f5d99b5e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -ffast-math" } */ +/* { dg-final { check-function-bodies 
"**" "" } } */ + +#include "arm_sve.h" + +#pragma GCC target "+sve+faminmax" + +#define TEST_FAMAX(TYPE) \ + void fn_famax_##TYPE (TYPE * restrict a,\ + TYPE * restrict b,\ + TYPE * restrict c,\ + int n) { \ +for (int i = 0; i < n; i++) { \ + TYPE temp1 = __builtin_fabs (a[i]);\ + TYPE temp2 = __builtin_fabs (b[i]);\ + c[i] = __b
[PATCH v3 1/2] aarch64: Add SVE2 faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch introduces SVE2 faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * sva[max|min]_[m|x|z] * sva[max|min]_[f16|f32|f64]_[m|x|z] * sva[max|min]_n_[f16|f32|f64]_[m|x|z] gcc/ChangeLog: * config/aarch64/aarch64-sve-builtins-base.cc (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.def (REQUIRED_EXTENSIONS): Add faminmax intrinsics behind a flag. (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.h: Declaring function bases for the new intrinsics. * config/aarch64/aarch64.h (TARGET_SVE_FAMINMAX): New flag for SVE2 faminmax. * config/aarch64/iterators.md: New unspecs, iterators, and attrs for the new intrinsics. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve2/acle/asm/amax_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f64.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f64.c: New test. 
--- .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 18 +- .../aarch64/sve2/acle/asm/amax_f16.c | 312 ++ .../aarch64/sve2/acle/asm/amax_f32.c | 312 ++ .../aarch64/sve2/acle/asm/amax_f64.c | 312 ++ .../aarch64/sve2/acle/asm/amin_f16.c | 311 + .../aarch64/sve2/acle/asm/amin_f32.c | 312 ++ .../aarch64/sve2/acle/asm/amin_f64.c | 312 ++ 11 files changed, 1900 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 4b33585d981..b189818d643 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -3071,6 +3071,10 @@ FUNCTION (svadrb, svadr_bhwd_impl, (0)) FUNCTION (svadrd, svadr_bhwd_impl, (3)) FUNCTION (svadrh, svadr_bhwd_impl, (1)) FUNCTION (svadrw, svadr_bhwd_impl, (2)) +FUNCTION (svamax, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMAX, UNSPEC_FAMAX)) +FUNCTION (svamin, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMIN, UNSPEC_FAMIN)) FUNCTION (svand, rtx_code_function, (AND, AND)) FUNCTION (svandv, reduction, (UNSPEC_ANDV)) FUNCTION (svasr, rtx_code_function, (ASHIFTRT, ASHIFTRT)) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.def b/gcc/config/aarch64/aarch64-sve-builtins-base.def index 65fcba91586..95e04e4393d 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.def +++ 
b/gcc/config/aarch64/aarch64-sve-builtins-base.def @@ -379,3 +379,8 @@ DEF_SVE_FUNCTION (svzip2q, binary, all_data, none) DEF_SVE_FUNCTION (svld1ro, load_replicate, all_data, implicit) DEF_SVE_FUNCTION (svmmla, mmla, d_float, none) #undef REQUIRED_EXTENSIONS + +#define REQUIRED_EXTENSIONS AARCH64_FL_SVE | AARCH64_FL_FAMINMAX +DEF_SVE_FUNCTION (svamax, binary_opt_single_n, all_float, mxz) +DEF_SVE_FUNCTION (svamin, binary_opt_single_n, all_float, mxz) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.h b/gcc/config/aarch64/aarch64-sve-builtins-base.h index 5bbf3569c4b..978cf7013f9 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.h +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.h @@ -37,6 +37,8 @@ namespace aarch64_sve extern const function_base *const svadrd; extern const function_base *const svadrh; extern const function_base *const svadrw; +extern const function_base *const svamax; +extern const function_base *const svamin; extern const function_base *const svand; extern const function_base *const svandv; extern const function_base *const svasr; diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index ec8fde783b3..34f56a4b869 100644 --- a/gcc/co
[PATCH v3 0/2] Add support for SVE2 faminmax
From: Saurabh Jha This patch series is a revised version of: https://gcc.gnu.org/pipermail/gcc-patches/2024-October/664209.html The second commit of the previous patch series was reviewed and has been committed separately. This patch series contains the first and third commits of the previous patch series. The changes are as follows: 1. [First patch]: The test cases are copied from files like gcc.target/aarch64/sve/acle/asm/max_f16.c. There is some hardcoding of registers in the copied tests which I did not change. Let me know if that's a problem. 2. [Third patch]: Incorporated the instruction pattern change as suggested. Also thanks for the explanation. 3. [Third patch]: Agreed and incorporated suggestions on test cases. I completely agree that scan-assembler is better for the test cases in this patch. 4. [Third patch]: Agreed for SVE_COND_FP_SMAXMIN -> SVE_COND_SMAXMIN. Regards, Saurabh Saurabh Jha (2): aarch64: Add SVE2 faminmax intrinsics aarch64: Add codegen support for SVE2 faminmax .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64-sve2.md| 37 +++ gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 24 +- .../gcc.target/aarch64/sve/faminmax_1.c | 45 +++ .../gcc.target/aarch64/sve/faminmax_2.c | 61 .../aarch64/sve2/acle/asm/amax_f16.c | 312 ++ .../aarch64/sve2/acle/asm/amax_f32.c | 312 ++ .../aarch64/sve2/acle/asm/amax_f64.c | 312 ++ .../aarch64/sve2/acle/asm/amin_f16.c | 311 + .../aarch64/sve2/acle/asm/amin_f32.c | 312 ++ .../aarch64/sve2/acle/asm/amin_f64.c | 312 ++ 14 files changed, 2049 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 
gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c -- 2.34.1
[PATCH v4 0/2] Add support for SVE2 faminmax
From: Saurabh Jha This is a revised version of this patch series: https://gcc.gnu.org/pipermail/gcc-patches/2024-October/664329.html Unfortunately, I had test case failures which I missed but shouldn't have. Apologies for that. This version fixes the failing test cases in the second patch with no other changes. Regression tested on aarch64-unknown-linux-gnu and found no regressions. Ok for master? Thanks, Saurabh Saurabh Jha (2): aarch64: Add SVE2 faminmax intrinsics aarch64: Add codegen support for SVE2 faminmax .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64-sve2.md| 37 +++ gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 24 +- .../gcc.target/aarch64/sve/faminmax_1.c | 44 +++ .../gcc.target/aarch64/sve/faminmax_2.c | 60 .../aarch64/sve2/acle/asm/amax_f16.c | 312 ++ .../aarch64/sve2/acle/asm/amax_f32.c | 312 ++ .../aarch64/sve2/acle/asm/amax_f64.c | 312 ++ .../aarch64/sve2/acle/asm/amin_f16.c | 311 + .../aarch64/sve2/acle/asm/amin_f32.c | 312 ++ .../aarch64/sve2/acle/asm/amin_f64.c | 312 ++ 14 files changed, 2047 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c -- 2.34.1
[PATCH v4 2/2] aarch64: Add codegen support for SVE2 faminmax
The AArch64 FEAT_FAMINMAX extension introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch adds code generation for famax and famin in terms of existing unspecs. With this patch: 1. famax can be expressed as taking UNSPEC_COND_SMAX of the two operands and then taking absolute value of their result. 2. famin can be expressed as taking UNSPEC_COND_SMIN of the two operands and then taking absolute value of their result. This fusion of operators is only possible when -march=armv9-a+faminmax+sve flags are passed. We also need to pass -ffast-math flag; this is what enables compiler to use UNSPEC_COND_SMAX and UNSPEC_COND_SMIN. This code generation is only available on -O2 or -O3 as that is when auto-vectorization is enabled. gcc/ChangeLog: * config/aarch64/aarch64-sve2.md (*aarch64_pred_faminmax_fused): Instruction pattern for faminmax codegen. * config/aarch64/iterators.md: Iterator and attribute for faminmax codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/faminmax_1.c: New test. * gcc.target/aarch64/sve/faminmax_2.c: New test. --- gcc/config/aarch64/aarch64-sve2.md| 37 gcc/config/aarch64/iterators.md | 6 ++ .../gcc.target/aarch64/sve/faminmax_1.c | 44 ++ .../gcc.target/aarch64/sve/faminmax_2.c | 60 +++ 4 files changed, 147 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 725092cc95f..5f2697c3179 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2467,6 +2467,43 @@ [(set_attr "movprfx" "yes")] ) +;; - +;; -- [FP] Absolute maximum and minimum +;; - +;; Includes: +;; - FAMAX +;; - FAMIN +;; - +;; Predicated floating-point absolute maximum and minimum. 
+(define_insn_and_rewrite "*aarch64_pred_faminmax_fused" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_operand 5) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 2 "register_operand")] + UNSPEC_COND_FABS) + (unspec:SVE_FULL_F + [(match_operand 6) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 3 "register_operand")] + UNSPEC_COND_FABS)] + SVE_COND_SMAXMIN))] + "TARGET_SVE_FAMINMAX" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ w, Upl , %0 , w ; * ] \t%0., %1/m, %0., %3. + [ ?&w , Upl , w , w ; yes] movprfx\t%0, %2\;\t%0., %1/m, %0., %3. + } + "&& (!rtx_equal_p (operands[1], operands[5]) + || !rtx_equal_p (operands[1], operands[6]))" + { +operands[5] = copy_rtx (operands[1]); +operands[6] = copy_rtx (operands[1]); + } +) + ;; = ;; == Complex arithmetic ;; = diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index c06f8c2c90f..8b18682c341 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -3143,6 +3143,9 @@ UNSPEC_COND_FMIN UNSPEC_COND_FMINNM]) +(define_int_iterator SVE_COND_SMAXMIN [UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) + (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA UNSPEC_COND_FMLS UNSPEC_COND_FNMLA @@ -4503,6 +4506,9 @@ (define_int_iterator FAMINMAX_UNS [UNSPEC_FAMAX UNSPEC_FAMIN]) +(define_int_attr faminmax_cond_uns_op + [(UNSPEC_COND_SMAX "famax") (UNSPEC_COND_SMIN "famin")]) + (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c new file mode 100644 index 000..3b65ccea065 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -ffast-math" } */ + +#include "arm_sve.h" + +#pragma GCC 
target "+sve+faminmax" + +#define TEST_FAMAX(TYPE) \ + void fn_famax_##TYPE (TYPE * restrict a,\ + TYPE * restrict b,\ + TYPE * restrict c,\ + int n) { \ +for (int i = 0; i < n; i++) { \ + TYPE temp1 = __builtin_fabs (a[i]);\ + TYPE temp2 = __builtin_fabs (b[i]);\ + c[i] = __builtin_fmax (temp1, temp2);\ +} \ +
[PATCH v4 1/2] aarch64: Add SVE2 faminmax intrinsics
The AArch64 FEAT_FAMINMAX extension introduces instructions for computing the floating point absolute maximum and minimum of the two vectors element-wise. This patch introduces SVE2 faminmax intrinsics. The intrinsics of this extension are implemented as the following builtin functions: * sva[max|min]_[m|x|z] * sva[max|min]_[f16|f32|f64]_[m|x|z] * sva[max|min]_n_[f16|f32|f64]_[m|x|z] gcc/ChangeLog: * config/aarch64/aarch64-sve-builtins-base.cc (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.def (REQUIRED_EXTENSIONS): Add faminmax intrinsics behind a flag. (svamax): Absolute maximum declaration. (svamin): Absolute minimum declaration. * config/aarch64/aarch64-sve-builtins-base.h: Declaring function bases for the new intrinsics. * config/aarch64/aarch64.h (TARGET_SVE_FAMINMAX): New flag for SVE2 faminmax. * config/aarch64/iterators.md: New unspecs, iterators, and attrs for the new intrinsics. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve2/acle/asm/amax_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amax_f64.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f32.c: New test. * gcc.target/aarch64/sve2/acle/asm/amin_f64.c: New test. 
--- .../aarch64/aarch64-sve-builtins-base.cc | 4 + .../aarch64/aarch64-sve-builtins-base.def | 5 + .../aarch64/aarch64-sve-builtins-base.h | 2 + gcc/config/aarch64/aarch64.h | 1 + gcc/config/aarch64/iterators.md | 18 +- .../aarch64/sve2/acle/asm/amax_f16.c | 312 ++ .../aarch64/sve2/acle/asm/amax_f32.c | 312 ++ .../aarch64/sve2/acle/asm/amax_f64.c | 312 ++ .../aarch64/sve2/acle/asm/amin_f16.c | 311 + .../aarch64/sve2/acle/asm/amin_f32.c | 312 ++ .../aarch64/sve2/acle/asm/amin_f64.c | 312 ++ 11 files changed, 1900 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amax_f64.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f32.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/amin_f64.c diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index 4b33585d981..b189818d643 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -3071,6 +3071,10 @@ FUNCTION (svadrb, svadr_bhwd_impl, (0)) FUNCTION (svadrd, svadr_bhwd_impl, (3)) FUNCTION (svadrh, svadr_bhwd_impl, (1)) FUNCTION (svadrw, svadr_bhwd_impl, (2)) +FUNCTION (svamax, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMAX, UNSPEC_FAMAX)) +FUNCTION (svamin, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMIN, UNSPEC_FAMIN)) FUNCTION (svand, rtx_code_function, (AND, AND)) FUNCTION (svandv, reduction, (UNSPEC_ANDV)) FUNCTION (svasr, rtx_code_function, (ASHIFTRT, ASHIFTRT)) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.def b/gcc/config/aarch64/aarch64-sve-builtins-base.def index 65fcba91586..95e04e4393d 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.def +++ 
b/gcc/config/aarch64/aarch64-sve-builtins-base.def @@ -379,3 +379,8 @@ DEF_SVE_FUNCTION (svzip2q, binary, all_data, none) DEF_SVE_FUNCTION (svld1ro, load_replicate, all_data, implicit) DEF_SVE_FUNCTION (svmmla, mmla, d_float, none) #undef REQUIRED_EXTENSIONS + +#define REQUIRED_EXTENSIONS AARCH64_FL_SVE | AARCH64_FL_FAMINMAX +DEF_SVE_FUNCTION (svamax, binary_opt_single_n, all_float, mxz) +DEF_SVE_FUNCTION (svamin, binary_opt_single_n, all_float, mxz) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.h b/gcc/config/aarch64/aarch64-sve-builtins-base.h index 5bbf3569c4b..978cf7013f9 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.h +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.h @@ -37,6 +37,8 @@ namespace aarch64_sve extern const function_base *const svadrd; extern const function_base *const svadrh; extern const function_base *const svadrw; +extern const function_base *const svamax; +extern const function_base *const svamin; extern const function_base *const svand; extern const function_base *const svandv; extern const function_base *const svasr; diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index ec8fde783b3..34f56a4b869 100644 --- a/gcc/co
[PATCH 1/3] aarch64: Add support for fp8 convert and scale
The AArch64 FEAT_FP8 extension introduces instructions for conversion and scaling. This patch introduces the following intrinsics: 1. vcvt{1|2}_{bf16|high_bf16|low_bf16}_mf8_fpm. 2. vcvt{q}_mf8_f16_fpm. 3. vcvt_{high}_mf8_f32_fpm. 4. vscale{q}_{f16|f32|f64}. We introduced three new aarch64_builtin_signatures enum variants: 1. binary_fpm. 2. ternary_fpm. 3. unary_fpm. We added support for these variants for declaring types and for expanding to RTL. We added new simd_types for integers (s32, s32q, and s64q) and for fp8 (f8, and f8q). Also changed the faminmax intrinsic instruction pattern so that it works better with the new fscale pattern. Because we added support for fp8 intrinsics here, we modified the check in acle/fp8.c that was checking that __ARM_FEATURE_FP8 macro is not defined. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (enum class): New variants to support new signatures. (aarch64_fntype): Handle new signatures. (aarch64_expand_pragma_builtin): Handle new signatures. * config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins): New flag for FP8. * config/aarch64/aarch64-simd-pragma-builtins.def (ENTRY_BINARY_FPM): Macro to declare binary fpm intrinsics. (ENTRY_TERNARY_FPM): Macro to declare ternary fpm intrinsics. (ENTRY_UNARY_FPM): Macro to declare unary fpm intrinsics. (ENTRY_VHSDF_VHSDI): Macro to declare binary intrinsics. * config/aarch64/aarch64-simd.md (@aarch64_): Renamed. (@aarch64_): Renamed. (@aarch64_): Unary fpm pattern. (@aarch64_): Unary fpm pattern. (@aarch64_): Binary fpm pattern. (@aarch64_): Ternary fpm pattern. (@aarch64_): Scale fpm pattern. * config/aarch64/iterators.md: New attributes and iterators. gcc/testsuite/ChangeLog: * gcc.target/aarch64/acle/fp8.c: Remove check that fp8 feature macro doesn't exist. * gcc.target/aarch64/simd/scale_fpm.c: New test. * gcc.target/aarch64/simd/vcvt_fpm.c: New test. 
--- I could not find a way to compress declarations in aarch64-simd-pragma-builtins.def for convert instructions as there was no pattern apart from the repetition for vcvt1/vcvt2 types. Let me know if those declarations can be expressed more concisely. In the scale instructions, I am not doing any casting from float to int modes in the second operand. Let me know if that's a problem. --- gcc/config/aarch64/aarch64-builtins.cc| 132 ++-- gcc/config/aarch64/aarch64-c.cc | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 56 + gcc/config/aarch64/aarch64-simd.md| 72 ++- gcc/config/aarch64/iterators.md | 99 + gcc/testsuite/gcc.target/aarch64/acle/fp8.c | 10 - .../gcc.target/aarch64/simd/scale_fpm.c | 60 ++ .../gcc.target/aarch64/simd/vcvt_fpm.c| 197 ++ 8 files changed, 603 insertions(+), 25 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index ad82c680c6a..df19bff71d0 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -1591,6 +1591,9 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) enum class aarch64_builtin_signatures { binary, + binary_fpm, + ternary_fpm, + unary_fpm, }; namespace { @@ -1602,6 +1605,9 @@ struct simd_type { namespace simd_types { + constexpr simd_type f8 { V8QImode, qualifier_modal_float }; + constexpr simd_type f8q { V16QImode, qualifier_modal_float }; + constexpr simd_type s8 { V8QImode, qualifier_none }; constexpr simd_type u8 { V8QImode, qualifier_unsigned }; constexpr simd_type s8q { V16QImode, qualifier_none }; @@ -1612,6 +1618,11 @@ namespace simd_types { constexpr simd_type s16q { V8HImode, qualifier_none }; constexpr simd_type u16q { V8HImode, qualifier_unsigned }; + constexpr simd_type s32 { V2SImode, qualifier_none }; + constexpr simd_type s32q { V4SImode, qualifier_none }; + + constexpr 
simd_type s64q { V2DImode, qualifier_none }; + constexpr simd_type p8 { V8QImode, qualifier_poly }; constexpr simd_type p8q { V16QImode, qualifier_poly }; constexpr simd_type p16 { V4HImode, qualifier_poly }; @@ -1655,7 +1666,7 @@ static aarch64_pragma_builtins_data aarch64_pragma_builtins[] = { static tree aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data) { - tree type0, type1, type2; + tree type0, type1, type2, type3; switch (builtin_data.signature) { @@ -1668,6 +1679,36 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data) builtin_data.types[2].qualifiers); return build_function_type_l
[PATCH 0/3] aarch64: Add fp8, fp8dot2, fp8dot4, and fp8fma acle
From: Saurabh Jha This patch series has three patches for adding support for fp8, fp8dot2, and fp8dot4 acle AdvSIMD intrinsics. The specific things I need thoughts on are written in each commit message after the "---" which will be omitted when committing. Regression tested on aarch64-unknown-linux-gnu and found no regressions. This series is built on top of the first commit of another patch series and should only be committed after the other patch series is committed: https://gcc.gnu.org/pipermail/gcc-patches/2024-November/667692.html Thanks, Saurabh Saurabh Jha (3): aarch64: Add support for fp8 convert and scale aarch64: Add support for fp8dot2 and fp8dot4 aarch64: Add support for fp8fma instructions gcc/config/aarch64/aarch64-builtins.cc| 276 -- gcc/config/aarch64/aarch64-c.cc | 8 + .../aarch64/aarch64-option-extensions.def | 6 + .../aarch64/aarch64-simd-pragma-builtins.def | 106 ++- gcc/config/aarch64/aarch64-simd.md| 159 +- gcc/config/aarch64/aarch64.h | 9 + gcc/config/aarch64/iterators.md | 135 + gcc/doc/invoke.texi | 6 + gcc/testsuite/gcc.target/aarch64/acle/fp8.c | 10 - .../gcc.target/aarch64/simd/fma_fpm.c | 221 ++ .../gcc.target/aarch64/simd/scale_fpm.c | 60 .../gcc.target/aarch64/simd/vcvt_fpm.c| 197 + .../gcc.target/aarch64/simd/vdot2_fpmdot.c| 77 + .../gcc.target/aarch64/simd/vdot4_fpmdot.c| 77 + 14 files changed, 1317 insertions(+), 30 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fma_fpm.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c -- 2.34.1
[PATCH 2/3] aarch64: Add support for fp8dot2 and fp8dot4
The AArch64 FEAT_FP8DOT2 and FEAT_FP8DOT4 extensions introduce instructions for dot product of vectors. This patch introduces the following intrinsics: 1. vdot{q}_{fp16|fp32}_mf8_fpm. 2. vdot{q}_lane{q}_{fp16|fp32}_mf8_fpm. It introduces two flags: fp8dot2 and fp8dot4. We had to add space for another type in aarch64_pragma_builtins_data struct. The macros were updated to reflect that. We added a new aarch64_builtin_signature variant, ternary_fpm_lane, and added support for it in declaration of types and expansion to RTL. We added a new namespace, function_checker, to implement range checks for functions defined using the new pragma approach. The old intrinsic range checks should remain unaffected. All the new AdvSIMD intrinsics we define that need lane checks should be using the function in this namespace to implement the checks. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Change to handle extra type. (enum class): Added new variant. (struct aarch64_pragma_builtins_data): Add support for another type. (aarch64_fntype): Handle new signature. (require_integer_constant): New function to check whether the operand is an integer constant. (require_immediate_range): New function to validate index ranges. (check_simd_lane_bounds): New function to validate index operands. (aarch64_expand_pragma_builtin): Handle new signature. * config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins): New flags. * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): New flags. * config/aarch64/aarch64-simd-pragma-builtins.def (ENTRY_BINARY): Change to handle extra type. (ENTRY_BINARY_FPM): Change to handle extra type. (ENTRY_TERNARY_FPM_LANE): Macro to declare fpm ternary with lane intrinsics. (ENTRY_VDOT_FPM): Change to handle extra type. (ENTRY_UNARY_FPM): Change to handle extra type. * config/aarch64/aarch64-simd.md: New instruction pattern for fp8dot2 and fp8dot4 instructions. * config/aarch64/aarch64.h (TARGET_FP8DOT2): New flag for fp8dot2 instructions. 
(TARGET_FP8DOT4): New flag for fp8dot4 instructions. * config/aarch64/iterators.md: New attributes and iterators. * doc/invoke.texi: New flag for fp8dot2 and fp8dot4 instructions. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/vdot2_fpmdot.c: New test. * gcc.target/aarch64/simd/vdot4_fpmdot.c: New test. --- Is there a better way to validate indices? --- gcc/config/aarch64/aarch64-builtins.cc| 138 +- gcc/config/aarch64/aarch64-c.cc | 4 + .../aarch64/aarch64-option-extensions.def | 4 + .../aarch64/aarch64-simd-pragma-builtins.def | 39 - gcc/config/aarch64/aarch64-simd.md| 58 gcc/config/aarch64/aarch64.h | 6 + gcc/config/aarch64/iterators.md | 20 ++- gcc/doc/invoke.texi | 4 + .../gcc.target/aarch64/simd/vdot2_fpmdot.c| 77 ++ .../gcc.target/aarch64/simd/vdot4_fpmdot.c| 77 ++ 10 files changed, 415 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index df19bff71d0..ba3bffaa4f9 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -780,7 +780,7 @@ typedef struct AARCH64_SIMD_BUILTIN_##T##_##N##A, #undef ENTRY -#define ENTRY(N, S, M0, M1, M2, M3, U) \ +#define ENTRY(N, S, M0, M1, M2, M3, M4, U) \ AARCH64_##N, enum aarch64_builtins @@ -1593,6 +1593,7 @@ enum class aarch64_builtin_signatures binary, binary_fpm, ternary_fpm, + ternary_fpm_lane, unary_fpm, }; @@ -1643,10 +1644,10 @@ namespace simd_types { } #undef ENTRY -#define ENTRY(N, S, T0, T1, T2, T3, U) \ +#define ENTRY(N, S, T0, T1, T2, T3, T4, U) \ {#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \ -simd_types::T2, simd_types::T3, U, \ -aarch64_required_extensions::REQUIRED_EXTENSIONS}, + simd_types::T2, simd_types::T3, simd_types::T4, U, \ + aarch64_required_extensions::REQUIRED_EXTENSIONS}, /* Initialize pragma builtins. 
*/ @@ -1654,7 +1655,7 @@ struct aarch64_pragma_builtins_data { const char *name; aarch64_builtin_signatures signature; - simd_type types[4]; + simd_type types[5]; int unspec; aarch64_required_extensions required_extensions; }; @@ -1667,6 +1668,7 @@ static tree aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data) { tree type0, type1, type2, type3; + tree immtype = aarch64_simd_builtin_type (SImode, qualifier_lane_index); switc
[PATCH 3/3] aarch64: Add support for fp8fma instructions
The AArch64 FEAT_FP8FMA extension introduces instructions for multiply-add of vectors. This patch introduces the following instructions: 1. {vmlalbq|vmlaltq}_f16_mf8_fpm. 2. {vmlalbq|vmlaltq}_lane{q}_f16_mf8_fpm. 3. {vmlallbbq|vmlallbtq|vmlalltbq|vmlallttq}_f32_mf8_fpm. 4. {vmlallbbq|vmlallbtq|vmlalltbq|vmlallttq}_lane{q}_f32_mf8_fpm. It introduces the fp8fma flag. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (check_simd_lane_bounds): Add support for new unspecs. * config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins): New flags. * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): New flags. * config/aarch64/aarch64-simd-pragma-builtins.def (ENTRY_FMA_FPM): Macro to declare fma intrinsics. * config/aarch64/aarch64-simd.md: New instruction pattern for fp8fma instructions. * config/aarch64/aarch64.h (TARGET_FP8FMA): New flag for fp8fma instructions. * config/aarch64/iterators.md: New attributes and iterators. * doc/invoke.texi: New flag for fp8fma instructions. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/fma_fpm.c: New test. --- In the instruction pattern for handling lanes, I am not doing any casting of the third operand and hardcoding the '.b' suffix in the assembly string. Is that okay? 
--- gcc/config/aarch64/aarch64-builtins.cc| 10 + gcc/config/aarch64/aarch64-c.cc | 2 + .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 19 ++ gcc/config/aarch64/aarch64-simd.md| 29 +++ gcc/config/aarch64/aarch64.h | 3 + gcc/config/aarch64/iterators.md | 18 ++ gcc/doc/invoke.texi | 2 + .../gcc.target/aarch64/simd/fma_fpm.c | 221 ++ 9 files changed, 306 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fma_fpm.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index ba3bffaa4f9..dc996f0563e 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -2593,6 +2593,16 @@ check_simd_lane_bounds (location_t location, const aarch64_pragma_builtins_data vector_to_index_mode_size / 4 - 1); break; + case UNSPEC_FMLALB: + case UNSPEC_FMLALT: + case UNSPEC_FMLALLBB: + case UNSPEC_FMLALLBT: + case UNSPEC_FMLALLTB: + case UNSPEC_FMLALLTT: + require_immediate_range (location, index_arg, 0, + vector_to_index_mode_size - 1); + break; + default: gcc_unreachable (); } diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc index 3e30ba5afd9..4dc7711486f 100644 --- a/gcc/config/aarch64/aarch64-c.cc +++ b/gcc/config/aarch64/aarch64-c.cc @@ -263,6 +263,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (TARGET_FP8DOT4, "__ARM_FEATURE_FP8DOT4", pfile); + aarch64_def_or_undef (TARGET_FP8FMA, "__ARM_FEATURE_FP8FMA", pfile); + aarch64_def_or_undef (TARGET_LS64, "__ARM_FEATURE_LS64", pfile); aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile); diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index fd4d29e5df6..9806801e472 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -238,6 +238,8 @@ AARCH64_OPT_EXTENSION("fp8dot2", FP8DOT2, (SIMD), (), (), "fp8dot2") 
AARCH64_OPT_EXTENSION("fp8dot4", FP8DOT4, (SIMD), (), (), "fp8dot4") +AARCH64_OPT_EXTENSION("fp8fma", FP8FMA, (SIMD), (), (), "fp8fma") + AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax") #undef AARCH64_OPT_FMV_EXTENSION diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def index 9dea2939b47..a85a4c48dbd 100644 --- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def @@ -48,6 +48,15 @@ ENTRY_TERNARY_FPM_LANE (vdotq_laneq_##T##_mf8_fpm, ternary_fpm_lane, T##q, \ T##q, f8q, f8q, U) +#undef ENTRY_FMA_FPM +#define ENTRY_FMA_FPM(N, T, U) \ + ENTRY_TERNARY_FPM (N##_##T##_mf8_fpm, ternary_fpm, \ + T##q, T##q, f8q, f8q, U)\ + ENTRY_TERNARY_FPM_LANE (N##_lane_##T##_mf8_fpm, ternary_fpm_lane, \ + T##q, T##q, f8q, f8, U) \ + ENTRY_TERNARY_FPM_LANE (N##_laneq_##T##_mf8_fpm, ternary_fpm_lane, \ + T##q, T##q, f8q, f8q, U) + #undef ENTRY_UNARY_FPM #define ENTRY_UNARY_FPM(N, S, T0, T1, U) \ ENTRY (N, S, T0, T1, none, none, none, U) @@ -121,3 +130,13 @@ ENTRY_VDOT_FPM (f16, UNSPEC_VDOT2) #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT4) ENTRY_VDOT_FPM (f32, UNSPEC_VDOT4) #undef REQUIRED_EXTENSIONS + +// fp8 multiply-add +#define REQUIRED_EXTENSIO
[PATCH v2 1/3] aarch64: Add support for fp8 convert and scale
The AArch64 FEAT_FP8 extension introduces instructions for conversion and scaling. This patch introduces the following intrinsics: 1. vcvt{1|2}_{bf16|high_bf16|low_bf16}_mf8_fpm. 2. vcvt{q}_mf8_f16_fpm. 3. vcvt_{high}_mf8_f32_fpm. 4. vscale{q}_{f16|f32|f64}. We introduced two aarch64_builtin_signatures enum variants, unary and ternary, and added support for these variants in the functions aarch64_fntype and aarch64_expand_pragma_builtin. We added new simd_types for integers (s32, s32q, and s64q) and for floating points (f8 and f8q). Because we added support for fp8 intrinsics here, we modified the check in acle/fp8.c that was checking that __ARM_FEATURE_FP8 macro is not defined. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Modified to support uses_fpmr flag. (enum class): New variants to support new signatures. (struct aarch64_pragma_builtins_data): Add a new boolean field, uses_fpmr. (aarch64_get_number_of_args): Helper function used in aarch64_fntype and aarch64_expand_pragma_builtin. (aarch64_fntype): Handle new signatures. (aarch64_expand_pragma_builtin): Handle new signatures. * config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins): New flag for FP8. * config/aarch64/aarch64-simd-pragma-builtins.def (ENTRY_BINARY): Macro to declare binary intrinsics. (ENTRY_TERNARY): Macro to declare ternary intrinsics. (ENTRY_UNARY): Macro to declare unary intrinsics. (ENTRY_VHSDF): Macro to declare binary intrinsics. (ENTRY_VHSDF_VHSDI): Macro to declare binary intrinsics. (REQUIRED_EXTENSIONS): Define to declare functions behind command line flags. * config/aarch64/aarch64-simd.md (@aarch64_): Unary pattern. (@aarch64_): Unary pattern. (@aarch64_lower_): Unary pattern. (@aarch64_lower_): Unary pattern. (@aarch64): Binary pattern. (@aarch64_): Unary pattern. (@aarch64_): Binary pattern. * config/aarch64/iterators.md: New attributes and iterators. gcc/testsuite/ChangeLog: * gcc.target/aarch64/acle/fp8.c: Remove check that fp8 feature macro doesn't exist. 
* gcc.target/aarch64/simd/scale_fpm.c: New test. * gcc.target/aarch64/simd/vcvt_fpm.c: New test. --- gcc/config/aarch64/aarch64-builtins.cc| 137 +--- gcc/config/aarch64/aarch64-c.cc | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 67 +- gcc/config/aarch64/aarch64-simd.md| 98 + gcc/config/aarch64/iterators.md | 65 ++ gcc/testsuite/gcc.target/aarch64/acle/fp8.c | 10 - .../gcc.target/aarch64/simd/scale_fpm.c | 60 ++ .../gcc.target/aarch64/simd/vcvt_fpm.c| 197 ++ 8 files changed, 587 insertions(+), 49 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index ad82c680c6a..9b7280a30d0 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -780,7 +780,7 @@ typedef struct AARCH64_SIMD_BUILTIN_##T##_##N##A, #undef ENTRY -#define ENTRY(N, S, M0, M1, M2, M3, U) \ +#define ENTRY(N, S, M0, M1, M2, M3, USES_FPMR, U) \ AARCH64_##N, enum aarch64_builtins @@ -1591,6 +1591,8 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) enum class aarch64_builtin_signatures { binary, + ternary, + unary, }; namespace { @@ -1602,6 +1604,9 @@ struct simd_type { namespace simd_types { + constexpr simd_type f8 { V8QImode, qualifier_modal_float }; + constexpr simd_type f8q { V16QImode, qualifier_modal_float }; + constexpr simd_type s8 { V8QImode, qualifier_none }; constexpr simd_type u8 { V8QImode, qualifier_unsigned }; constexpr simd_type s8q { V16QImode, qualifier_none }; @@ -1612,6 +1617,11 @@ namespace simd_types { constexpr simd_type s16q { V8HImode, qualifier_none }; constexpr simd_type u16q { V8HImode, qualifier_unsigned }; + constexpr simd_type s32 { V2SImode, qualifier_none }; + constexpr simd_type s32q { V4SImode, qualifier_none }; + + constexpr simd_type s64q { V2DImode, qualifier_none }; + constexpr simd_type p8 { V8QImode, qualifier_poly 
}; constexpr simd_type p8q { V16QImode, qualifier_poly }; constexpr simd_type p16 { V4HImode, qualifier_poly }; @@ -1632,10 +1642,10 @@ namespace simd_types { } #undef ENTRY -#define ENTRY(N, S, T0, T1, T2, T3, U) \ +#define ENTRY(N, S, T0, T1, T2, T3, USES_FPMR, U) \ {#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \ -simd_types::T2, simd_types::T3, U, \ -aarch64_required_extensions::REQUIRED_EXTENSIONS}, + simd_types::T2, simd_types::T3, U, USES_FPM
[PATCH v2 3/3] aarch64: Add support for fp8fma instructions
The AArch64 FEAT_FP8FMA extension introduces instructions for multiply-add of vectors. This patch introduces the following instructions: 1. {vmlalbq|vmlaltq}_f16_mf8_fpm. 2. {vmlalbq|vmlaltq}_lane{q}_f16_mf8_fpm. 3. {vmlallbbq|vmlallbtq|vmlalltbq|vmlallttq}_f32_mf8_fpm. 4. {vmlallbbq|vmlallbtq|vmlalltbq|vmlallttq}_lane{q}_f32_mf8_fpm. It introduces the fp8fma flag. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (check_simd_lane_bounds): Add support for new unspecs. (aarch64_expand_pragma_builtins): Add support for new unspecs. * config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins): New flags. * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): New flags. * config/aarch64/aarch64-simd-pragma-builtins.def (ENTRY_FMA_FPM): Macro to declare fma intrinsics. (REQUIRED_EXTENSIONS): Define to declare functions behind command line flags. * config/aarch64/aarch64-simd.md: (@aarch64_diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index a71c8c9a64e..7b2decf671f 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -2562,10 +2562,26 @@ check_simd_lane_bounds (location_t location, const aarch64_pragma_builtins_data = GET_MODE_NUNITS (vector_to_index_mode).to_constant (); auto low = 0; - int high - = builtin_data->unspec == UNSPEC_VDOT2 - ? 
vector_to_index_mode_size / 2 - 1 - : vector_to_index_mode_size / 4 - 1; + int high; + switch (builtin_data->unspec) + { + case UNSPEC_VDOT2: + high = vector_to_index_mode_size / 2 - 1; + break; + case UNSPEC_VDOT4: + high = vector_to_index_mode_size / 4 - 1; + break; + case UNSPEC_FMLALB: + case UNSPEC_FMLALT: + case UNSPEC_FMLALLBB: + case UNSPEC_FMLALLBT: + case UNSPEC_FMLALLTB: + case UNSPEC_FMLALLTT: + high = vector_to_index_mode_size - 1; + break; + default: + gcc_unreachable (); + } require_immediate_range (location, index_arg, low, high); break; } @@ -3552,6 +3568,12 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, case UNSPEC_VDOT2: case UNSPEC_VDOT4: +case UNSPEC_FMLALB: +case UNSPEC_FMLALT: +case UNSPEC_FMLALLBB: +case UNSPEC_FMLALLBT: +case UNSPEC_FMLALLTB: +case UNSPEC_FMLALLTT: if (builtin_data->signature == aarch64_builtin_signatures::ternary) icode = code_for_aarch64 (builtin_data->unspec, builtin_data->types[0].mode, diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc index ae1472e0fcf..03f912cde07 100644 --- a/gcc/config/aarch64/aarch64-c.cc +++ b/gcc/config/aarch64/aarch64-c.cc @@ -264,6 +264,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (TARGET_FP8DOT4, "__ARM_FEATURE_FP8DOT4", pfile); + aarch64_def_or_undef (TARGET_FP8FMA, "__ARM_FEATURE_FP8FMA", pfile); + aarch64_def_or_undef (TARGET_LS64, "__ARM_FEATURE_LS64", pfile); aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile); diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index 44d2e18d46b..8446d1bcd5d 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -240,6 +240,8 @@ AARCH64_OPT_EXTENSION("fp8dot2", FP8DOT2, (SIMD), (), (), "fp8dot2") AARCH64_OPT_EXTENSION("fp8dot4", FP8DOT4, (SIMD), (), (), "fp8dot4") +AARCH64_OPT_EXTENSION("fp8fma", FP8FMA, (SIMD), (), (), "fp8fma") + 
AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax") #undef AARCH64_OPT_FMV_EXTENSION diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def index 4a94a6613f0..c7857123ca0 100644 --- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def @@ -48,6 +48,12 @@ ENTRY_TERNARY_FPM_LANE (vdotq_lane_##T##_mf8_fpm, T##q, T##q, f8q, f8, U) \ ENTRY_TERNARY_FPM_LANE (vdotq_laneq_##T##_mf8_fpm, T##q, T##q, f8q, f8q, U) +#undef ENTRY_FMA_FPM +#define ENTRY_FMA_FPM(N, T, U) \ + ENTRY_TERNARY_FPM (N##_##T##_mf8_fpm, T##q, T##q, f8q, f8q, U) \ + ENTRY_TERNARY_FPM_LANE (N##_lane_##T##_mf8_fpm, T##q, T##q, f8q, f8, U) \ + ENTRY_TERNARY_FPM_LANE (N##_laneq_##T##_mf8_fpm, T##q, T##q, f8q, f8q, U) + #undef ENTRY_VHSDF #define ENTRY_VHSDF(NAME, UNSPEC) \ ENTRY_BINARY (NAME##_f16, f16, f16, f16, UNSPEC) \ @@ -106,3 +112,13 @@ ENTRY_VDOT_FPM (f16, UNSPEC_VDOT2) #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT4) ENTRY_VDOT_FPM (f32, UNSPEC_VDOT4) #undef REQUIRED_EXTENSIONS + +// fp8 multiply-add +#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8FMA) +ENTRY_FMA_FPM (vmlalbq, f16, UNSPEC_FMLALB) +ENTRY_FMA_FPM (vmlaltq, f16, UNSPEC_FMLALT) +ENTRY_FMA_FPM (vmlallbbq, f32, UNSPEC_FMLALLBB) +ENTRY_FMA_FPM (vml
[PATCH v2 2/3] aarch64: Add support for fp8dot2 and fp8dot4
The AArch64 FEAT_FP8DOT2 and FEAT_FP8DOT4 extensions introduce instructions for dot product of vectors. This patch introduces the following intrinsics: 1. vdot{q}_{fp16|fp32}_mf8_fpm. 2. vdot{q}_lane{q}_{fp16|fp32}_mf8_fpm. It introduces two flags: fp8dot2 and fp8dot4. We had to add space for another type in the aarch64_pragma_builtins_data struct. The macros were updated to reflect that. We added a new aarch64_builtin_signatures variant, quaternary, and added support for it in the functions aarch64_fntype and aarch64_expand_pragma_builtin. We added a new namespace, function_checker, to implement range checks for functions defined using the new pragma approach. The old intrinsic range checks will continue to work. All the new AdvSIMD intrinsics we define that need lane checks should be using the function in this namespace to implement the checks. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Change to handle extra type. (enum class): Added new variant. (struct aarch64_pragma_builtins_data): Add support for another type. (aarch64_get_number_of_args): Handle new signature. (require_integer_constant): New function to check whether the operand is an integer constant. (require_immediate_range): New function to validate index ranges. (check_simd_lane_bounds): New function to validate index operands. (aarch64_general_check_builtin_call): Call function_checker::check_simd_lane_bounds. (aarch64_expand_pragma_builtin): Handle new signature. * config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins): New flags. * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): New flags. * config/aarch64/aarch64-simd-pragma-builtins.def (ENTRY_BINARY): Change to handle extra type. (ENTRY_BINARY_FPM): Change to handle extra type. (ENTRY_UNARY_FPM): Change to handle extra type. (ENTRY_TERNARY_FPM_LANE): Macro to declare fpm ternary with lane intrinsics. (ENTRY_VDOT_FPM): Macro to declare vdot intrinsics. 
(REQUIRED_EXTENSIONS): Define to declare functions behind command line flags. * config/aarch64/aarch64-simd.md: (@aarch64_): Instruction pattern for vdot2 intrinsics. (@aarch64_): Instruction pattern for vdot2 intrinsics with lane. (@aarch64_): Instruction pattern for vdot4 intrinsics. (@aarch64_): Instruction pattern for vdo4 intrinsics with lane. * config/aarch64/aarch64.h (TARGET_FP8DOT2): New flag for fp8dot2 instructions. (TARGET_FP8DOT4): New flag for fp8dot4 instructions. * config/aarch64/iterators.md: New attributes and iterators. * doc/invoke.texi: New flag for fp8dot2 and fp8dot4 instructions. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/vdot2_fpmdot.c: New test. * gcc.target/aarch64/simd/vdot4_fpmdot.c: New test. --- gcc/config/aarch64/aarch64-builtins.cc| 107 +- gcc/config/aarch64/aarch64-c.cc | 4 + .../aarch64/aarch64-option-extensions.def | 4 + .../aarch64/aarch64-simd-pragma-builtins.def | 39 +-- gcc/config/aarch64/aarch64-simd.md| 58 ++ gcc/config/aarch64/aarch64.h | 6 + gcc/config/aarch64/iterators.md | 19 +++- gcc/doc/invoke.texi | 4 + .../gcc.target/aarch64/simd/vdot2_fpmdot.c| 77 + .../gcc.target/aarch64/simd/vdot4_fpmdot.c| 77 + 10 files changed, 380 insertions(+), 15 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 9b7280a30d0..a71c8c9a64e 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -780,7 +780,7 @@ typedef struct AARCH64_SIMD_BUILTIN_##T##_##N##A, #undef ENTRY -#define ENTRY(N, S, M0, M1, M2, M3, USES_FPMR, U) \ +#define ENTRY(N, S, M0, M1, M2, M3, M4, USES_FPMR, U) \ AARCH64_##N, enum aarch64_builtins @@ -1590,9 +1590,10 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) enum class aarch64_builtin_signatures { + unary, binary, ternary, - unary, + quaternary, }; 
namespace { @@ -1617,6 +1618,7 @@ namespace simd_types { constexpr simd_type s16q { V8HImode, qualifier_none }; constexpr simd_type u16q { V8HImode, qualifier_unsigned }; + constexpr simd_type s32_index { SImode, qualifier_lane_index }; constexpr simd_type s32 { V2SImode, qualifier_none }; constexpr simd_type s32q { V4SImode, qualifier_none }; @@ -1642,10 +1644,10 @@ namespace simd_types { } #undef ENTRY -#define ENTRY(N, S, T0, T1, T2, T3, USES_FPMR,
[PATCH v2 0/3] aarch64: Add fp8, fp8dot2, fp8dot4, and fp8fma acle
From: Saurabh Jha This patch series is a revised version of: https://gcc.gnu.org/pipermail/gcc-patches/2024-November/667723.html I have addressed comments around building a list of operands while declaring types and while expanding to RTL. I have also removed signatures with "_fpm" and "_lane" as suffixes and am instead representing that information with a boolean flag and with a scalar signed integer type respectively. I have also removed the change to the faminmax pattern. The new structure, which dispatches based on unspecs, should be more extensible as we move existing intrinsics to the new pragma-based framework in future patches. Regression tested on aarch64-unknown-linux-gnu and found no regressions. This series is built on top of the first commit of another patch series and should only be committed after the other patch series is committed: https://gcc.gnu.org/pipermail/gcc-patches/2024-November/667692.html Ok for master? Thanks, Saurabh Saurabh Jha (3): aarch64: Add support for fp8 convert and scale aarch64: Add support for fp8dot2 and fp8dot4 aarch64: Add support for fp8fma instructions gcc/config/aarch64/aarch64-builtins.cc| 256 -- gcc/config/aarch64/aarch64-c.cc | 8 + .../aarch64/aarch64-option-extensions.def | 6 + .../aarch64/aarch64-simd-pragma-builtins.def | 104 ++- gcc/config/aarch64/aarch64-simd.md| 185 + gcc/config/aarch64/aarch64.h | 9 + gcc/config/aarch64/iterators.md | 100 +++ gcc/doc/invoke.texi | 6 + gcc/testsuite/gcc.target/aarch64/acle/fp8.c | 10 - .../gcc.target/aarch64/simd/fma_fpm.c | 221 +++ .../gcc.target/aarch64/simd/scale_fpm.c | 60 .../gcc.target/aarch64/simd/vcvt_fpm.c| 197 ++ .../gcc.target/aarch64/simd/vdot2_fpmdot.c| 77 ++ .../gcc.target/aarch64/simd/vdot4_fpmdot.c| 77 ++ 14 files changed, 1266 insertions(+), 50 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fma_fpm.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c create 
mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c -- 2.34.1
[PATCH v2 0/2] aarch64: Add AdvSIMD lut
From: Saurabh Jha This patch series is a revised version of: https://gcc.gnu.org/pipermail/gcc-patches/2024-November/667692.html In the refactor patch, I redesigned how types are declared and how expand happens. I have taken some ideas from the reviews of the other patch series: https://gcc.gnu.org/pipermail/gcc-patches/2024-November/667723.html In the lut patch, I added support for luti4 and put the second operand inside braces, as the ACLE spec states. Also added a system of checking lanes. In the lut patch, I am now treating index as just another argument, named s32_index. This is leading to simpler code in aarch64-builtins.cc. Regression tested on aarch64-unknown-linux-gnu and found no regressions. Ok for master? Thank you, Saurabh Saurabh Jha (1): aarch64: Add support for AdvSIMD lut Vladimir Miloserdov (1): aarch64: Refactor AdvSIMD intrinsics gcc/config/aarch64/aarch64-builtins.cc| 211 - .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 61 +- gcc/config/aarch64/aarch64-simd.md| 24 + gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 49 +- gcc/doc/invoke.texi | 2 + .../aarch64/simd/lut-incorrect-range.c| 212 + .../gcc.target/aarch64/simd/lut-no-flag.c | 10 + gcc/testsuite/gcc.target/aarch64/simd/lut.c | 849 ++ 10 files changed, 1383 insertions(+), 41 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut.c -- 2.34.1
[PATCH v2 1/2] aarch64: Refactor AdvSIMD intrinsics
Refactor AdvSIMD intrinsics defined using the new pragma-based approach so that it is more extensible. Introduce a new struct, simd_type, which defines types using a mode and qualifiers, and use objects of this struct in the declaration of intrinsics in the aarch64-simd-pragma-builtins.def file. Change aarch64_pragma_builtins_data struct to support return type and argument types. Refactor aarch64_fntype and aarch64_expand_pragma_builtin so that it initialises corresponding vectors in a loop. As we add intrinsics with more arguments, these functions won't need to change to support those. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Modify to add support of return and argument types. (struct simd_type): New struct to declare types using mode and qualifiers. (struct aarch64_pragma_builtins_data): Replace mode with the array of types to support return and argument types. (aarch64_get_number_of_args): New utility to get number of arguments given an aarch64_builtin_signatures variant. (aarch64_fntype): Modify to handle different signatures. (aarch64_expand_pragma_builtin): Modify to handle different signatures. * config/aarch64/aarch64-simd-pragma-builtins.def (ENTRY_VHSDF): Rename to ENTRY_BINARY_VHSDF. (ENTRY_BINARY): New macro to declare binary intrinsics. (ENTRY_BINARY_VHSDF): Remove signature argument and use ENTRY_BINARY. 
--- gcc/config/aarch64/aarch64-builtins.cc| 106 ++ .../aarch64/aarch64-simd-pragma-builtins.def | 22 ++-- 2 files changed, 97 insertions(+), 31 deletions(-) diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index b860e22f01f..f4d719a6b5a 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -780,7 +780,7 @@ typedef struct AARCH64_SIMD_BUILTIN_##T##_##N##A, #undef ENTRY -#define ENTRY(N, S, M, U) \ +#define ENTRY(N, S, T0, T1, T2, U) \ AARCH64_##N, enum aarch64_builtins @@ -1598,10 +1598,47 @@ enum class aarch64_builtin_signatures binary, }; +namespace { + +struct simd_type { + machine_mode mode; + aarch64_type_qualifiers qualifiers; +}; + +namespace simd_types { + + constexpr simd_type p8 { V8QImode, qualifier_poly }; + constexpr simd_type p8q { V16QImode, qualifier_poly }; + constexpr simd_type s8 { V8QImode, qualifier_none }; + constexpr simd_type s8q { V16QImode, qualifier_none }; + constexpr simd_type u8 { V8QImode, qualifier_unsigned }; + constexpr simd_type u8q { V16QImode, qualifier_unsigned }; + + constexpr simd_type f16 { V4HFmode, qualifier_none }; + constexpr simd_type f16q { V8HFmode, qualifier_none }; + constexpr simd_type p16 { V4HImode, qualifier_poly }; + constexpr simd_type p16q { V8HImode, qualifier_poly }; + constexpr simd_type s16 { V4HImode, qualifier_none }; + constexpr simd_type s16q { V8HImode, qualifier_none }; + constexpr simd_type u16 { V4HImode, qualifier_unsigned }; + constexpr simd_type u16q { V8HImode, qualifier_unsigned }; + + constexpr simd_type bf16 { V4BFmode, qualifier_none }; + constexpr simd_type bf16q { V8BFmode, qualifier_none }; + + constexpr simd_type f32 { V2SFmode, qualifier_none }; + constexpr simd_type f32q { V4SFmode, qualifier_none }; + constexpr simd_type f64q { V2DFmode, qualifier_none }; + + constexpr simd_type none { VOIDmode, qualifier_none }; +} + +} + #undef ENTRY -#define ENTRY(N, S, M, U) \ - {#N, 
aarch64_builtin_signatures::S, E_##M##mode, U, \ - aarch64_required_extensions::REQUIRED_EXTENSIONS}, +#define ENTRY(N, S, T0, T1, T2, U) \ + {#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \ + simd_types::T2, U, aarch64_required_extensions::REQUIRED_EXTENSIONS}, /* Initialize pragma builtins. */ @@ -1609,7 +1646,7 @@ struct aarch64_pragma_builtins_data { const char *name; aarch64_builtin_signatures signature; - machine_mode mode; + simd_type types[3]; int unspec; aarch64_required_extensions required_extensions; }; @@ -1618,17 +1655,33 @@ static aarch64_pragma_builtins_data aarch64_pragma_builtins[] = { #include "aarch64-simd-pragma-builtins.def" }; +static unsigned int +aarch64_get_number_of_args (const aarch64_pragma_builtins_data &builtin_data) +{ + if (builtin_data.signature == aarch64_builtin_signatures::binary) +return 2; + else +// No other signature supported. +gcc_unreachable (); +} + static tree aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data) { - auto type = aarch64_simd_builtin_type (builtin_data.mode, qualifier_none); - switch (builtin_data.signature) + tree return_type += aarch64_simd_builtin_type (builtin_data.types[0].mode, + builtin_data.types[0].qualifiers); + + vec *arg_types = NULL; + auto nargs = aarch64_get_number_of_args (builtin_data); + for (unsigned int i = 1; i <= nargs; ++i) { -case aarch64_builti
[PATCH v2 2/2] aarch64: Add support for AdvSIMD lut
The AArch64 FEAT_LUT extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for lookup table reads with bit indices. This patch adds support for AdvSIMD lut intrinsics. The intrinsics for this extension are implemented as the following builtin functions: * vluti2{q}_lane{q}_{u8|s8|p8} * vluti2{q}_lane{q}_{u16|s16|p16|f16|bf16} * vluti4q_lane{q}_{u8|s8|p8} * vluti4q_lane{q}_{u16|s16|p16|f16|bf16}_x2 We also introduced a new approach to do lane checks for AdvSIMD. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Add support for one more type. (enum class): Add enum variant for ternary functions. (struct aarch64_pragma_builtins_data): Add support for one more type. (aarch64_get_number_of_args): Add support for ternary functions. (require_integer_constant): Function to check whether an argument is a const integer. (require_immediate_range): Function to check whether the const integer argument fits in a range. (check_simd_lane_bounds): Main function to check the validity of an index argument. (aarch64_general_check_builtin_call): Call function_checker::check_simd_lane_bounds. (aarch64_expand_pragma_builtin): Add support for lut unspecs. * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): Add lut option. * config/aarch64/aarch64-simd-pragma-builtins.def (ENTRY_BINARY_LANE): Modify to use new ENTRY macro. (ENTRY_TERNARY_VLUT8): Macro to declare lut intrinsics. (ENTRY_TERNARY_VLUT16): Macro to declare lut intrinsics. (REQUIRED_EXTENSIONS): Declare lut intrinsics. * config/aarch64/aarch64-simd.md (@aarch64_): Instruction pattern for luti2 and luti4 intrinsics. (@aarch64_lutx2): Instruction pattern for luti4x2 intrinsics. * config/aarch64/aarch64.h (TARGET_LUT): lut flag. * config/aarch64/iterators.md: Iterators and attributes for lut. * doc/invoke.texi: Document extension in AArch64 Options. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/lut-incorrect-range.c: New test. 
* gcc.target/aarch64/simd/lut-no-flag.c: New test. * gcc.target/aarch64/simd/lut.c: New test. --- gcc/config/aarch64/aarch64-builtins.cc| 129 ++- .../aarch64/aarch64-option-extensions.def | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 40 +- gcc/config/aarch64/aarch64-simd.md| 24 + gcc/config/aarch64/aarch64.h | 4 + gcc/config/aarch64/iterators.md | 55 +- gcc/doc/invoke.texi | 2 + .../aarch64/simd/lut-incorrect-range.c| 212 + .../gcc.target/aarch64/simd/lut-no-flag.c | 10 + gcc/testsuite/gcc.target/aarch64/simd/lut.c | 849 ++ 10 files changed, 1304 insertions(+), 23 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index f4d719a6b5a..45aeca33e3f 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -780,7 +780,7 @@ typedef struct AARCH64_SIMD_BUILTIN_##T##_##N##A, #undef ENTRY -#define ENTRY(N, S, T0, T1, T2, U) \ +#define ENTRY(N, S, T0, T1, T2, T3, U) \ AARCH64_##N, enum aarch64_builtins @@ -1596,6 +1596,7 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) enum class aarch64_builtin_signatures { binary, + ternary, }; namespace { @@ -1616,18 +1617,25 @@ namespace simd_types { constexpr simd_type f16 { V4HFmode, qualifier_none }; constexpr simd_type f16q { V8HFmode, qualifier_none }; + constexpr simd_type f16qx2 { V2x8HFmode, qualifier_none }; constexpr simd_type p16 { V4HImode, qualifier_poly }; constexpr simd_type p16q { V8HImode, qualifier_poly }; + constexpr simd_type p16qx2 { V2x8HImode, qualifier_poly }; constexpr simd_type s16 { V4HImode, qualifier_none }; constexpr simd_type s16q { V8HImode, qualifier_none }; + constexpr simd_type s16qx2 { V2x8HImode, qualifier_none }; constexpr simd_type u16 { V4HImode, qualifier_unsigned }; 
constexpr simd_type u16q { V8HImode, qualifier_unsigned }; + constexpr simd_type u16qx2 { V2x8HImode, qualifier_unsigned }; constexpr simd_type bf16 { V4BFmode, qualifier_none }; constexpr simd_type bf16q { V8BFmode, qualifier_none }; + constexpr simd_type bf16qx2 { V2x8BFmode, qualifier_none }; constexpr simd_type f32 { V2SFmode, qualifier_none }; constexpr simd_type f32q { V4SFmode, qualifier_none }; + constexpr simd_type s32_index { SImode, qualifier_lane_index }; + constexpr simd_type f64q { V2DFmode, qualifier_none }; const
[PATCH] Fix command flags for SVE2 faminmax
Earlier, we were gating SVE2 faminmax behind sve+faminmax. This was incorrect and this patch changes it so that it is gated behind sve2+faminmax. gcc/ChangeLog: * config/aarch64/aarch64-sve2.md: (*aarch64_pred_faminmax_fused): Fix to use the correct flags. * config/aarch64/aarch64.h (TARGET_SVE_FAMINMAX): Remove. * config/aarch64/iterators.md: Fix iterators so that famax and famin use correct flags. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/faminmax_1.c: Fix test to use the correct flags. * gcc.target/aarch64/sve/faminmax_2.c: Fix test to use the correct flags. * gcc.target/aarch64/sve/faminmax_3.c: New test. --- Hey, This patch is in response to Andrew's review here: https://gcc.gnu.org/pipermail/gcc-patches/2025-January/672934.html. Regression tested on aarch64-none-linux-gnu and found no regressions. Ok for master? Thanks, Saurabh --- gcc/config/aarch64/aarch64-sve2.md| 2 +- gcc/config/aarch64/aarch64.h | 1 - gcc/config/aarch64/iterators.md | 8 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c | 2 +- gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c | 2 +- gcc/testsuite/gcc.target/aarch64/sve/faminmax_3.c | 11 +++ 6 files changed, 18 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_3.c diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 60bc03b2650..3e08e092cd0 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2950,7 +2950,7 @@ (match_operand:SVE_FULL_F 3 "register_operand")] UNSPEC_COND_FABS)] SVE_COND_SMAXMIN))] - "TARGET_SVE_FAMINMAX" + "TARGET_FAMINMAX && TARGET_SVE2_OR_SME2" {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] [ w, Upl , %0 , w ; * ] \t%0., %1/m, %0., %3. [ ?&w , Upl , w , w ; yes] movprfx\t%0, %2\;\t%0., %1/m, %0., %3. 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 218868a5246..3c8b972a8fd 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -472,7 +472,6 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED /* Floating Point Absolute Maximum/Minimum extension instructions are enabled through +faminmax. */ #define TARGET_FAMINMAX AARCH64_HAVE_ISA (FAMINMAX) -#define TARGET_SVE_FAMINMAX (TARGET_SVE && TARGET_FAMINMAX) /* Lookup table (LUTI) extension instructions are enabled through +lut. */ #define TARGET_LUT AARCH64_HAVE_ISA (LUT) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index e843c66cf26..9fbd7493988 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -3340,8 +3340,8 @@ (define_int_iterator SVE_COND_FP_BINARY [UNSPEC_COND_FADD - (UNSPEC_COND_FAMAX "TARGET_SVE_FAMINMAX") - (UNSPEC_COND_FAMIN "TARGET_SVE_FAMINMAX") + (UNSPEC_COND_FAMAX "TARGET_FAMINMAX && TARGET_SVE2_OR_SME2") + (UNSPEC_COND_FAMIN "TARGET_FAMINMAX && TARGET_SVE2_OR_SME2") UNSPEC_COND_FDIV UNSPEC_COND_FMAX UNSPEC_COND_FMAXNM @@ -3381,8 +3381,8 @@ UNSPEC_COND_SMIN]) (define_int_iterator SVE_COND_FP_BINARY_REG - [(UNSPEC_COND_FAMAX "TARGET_SVE_FAMINMAX") - (UNSPEC_COND_FAMIN "TARGET_SVE_FAMINMAX") + [(UNSPEC_COND_FAMAX "TARGET_FAMINMAX && TARGET_SVE2_OR_SME2") + (UNSPEC_COND_FAMIN "TARGET_FAMINMAX && TARGET_SVE2_OR_SME2") UNSPEC_COND_FDIV UNSPEC_COND_FMULX]) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c index 3b65ccea065..154dbd9de84 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c @@ -3,7 +3,7 @@ #include "arm_sve.h" -#pragma GCC target "+sve+faminmax" +#pragma GCC target "+sve2+faminmax" #define TEST_FAMAX(TYPE) \ void fn_famax_##TYPE (TYPE * restrict a,\ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c index d80f6eca8f8..44ecef1e087 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c @@ -3,7 +3,7 @@ #include "arm_sve.h" -#pragma GCC target "+sve+faminmax" +#pragma GCC target "+sve2+faminmax" #define TEST_WITH_SVMAX(TYPE) \ TYPE fn_fmax_##TYPE (TYPE x, TYPE y) {\ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_3.c b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_3.c new file mode 100644 index 000..2b01fa48b8e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_3.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ + +#include + +#pragma GCC target ("arch=armv9.2-a+sve2") + +void +test (svbool_t p, svfloat16_t a, svfloat16_t b) +{
[PATCH v5] AArch64: Add LUTI ACLE for SVE2
This patch introduces support for LUTI2/LUTI4 ACLE for SVE2. LUTI instructions are used for efficient table lookups with 2-bit or 4-bit indices. LUTI2 reads indexed 8-bit or 16-bit elements from the low 128 bits of the table vector using packed 2-bit indices, while LUTI4 can read from the low 128 or 256 bits of the table vector or from two table vectors using packed 4-bit indices. These instructions fill the destination vector by copying elements indexed by segments of the source vector, selected by the vector segment index. The changes include the addition of a new AArch64 option extension "lut", __ARM_FEATURE_LUT preprocessor macro, definitions for the new LUTI instruction shapes, and implementations of the svluti2 and svluti4 builtins. --- This is a respin of https://gcc.gnu.org/pipermail/gcc-patches/2025-January/673707.html. The two major changes in this version are: 1. We were allowing u8x2 and s8x2 types with luti4_x2 intrinsics. This was incorrect; removed them. 2. From the review of the last version, added explicit_group_suffix_p in luti_base struct. Also added more tests as suggested by the review. Regression tested on aarch64-unknown-linux-gnu and found no regressions. Ok for master? 
Thanks, Saurabh --- gcc/config/aarch64/aarch64-c.cc | 2 + .../aarch64/aarch64-sve-builtins-shapes.cc| 46 +++ .../aarch64/aarch64-sve-builtins-shapes.h | 2 + .../aarch64/aarch64-sve-builtins-sve2.cc | 17 ++ .../aarch64/aarch64-sve-builtins-sve2.def | 8 + .../aarch64/aarch64-sve-builtins-sve2.h | 2 + gcc/config/aarch64/aarch64-sve-builtins.cc| 8 +- gcc/config/aarch64/aarch64-sve2.md| 33 +++ gcc/config/aarch64/iterators.md | 7 + .../aarch64/sve/acle/asm/test_sve_acle.h | 16 ++ .../aarch64/sve/acle/general-c/lut_1.c| 34 +++ .../aarch64/sve/acle/general-c/lut_2.c| 11 + .../aarch64/sve/acle/general-c/lut_3.c| 92 ++ .../aarch64/sve/acle/general-c/lut_4.c| 262 ++ .../aarch64/sve2/acle/asm/luti2_bf16.c| 50 .../aarch64/sve2/acle/asm/luti2_f16.c | 50 .../aarch64/sve2/acle/asm/luti2_s16.c | 50 .../aarch64/sve2/acle/asm/luti2_s8.c | 50 .../aarch64/sve2/acle/asm/luti2_u16.c | 50 .../aarch64/sve2/acle/asm/luti2_u8.c | 50 .../aarch64/sve2/acle/asm/luti4_bf16.c| 50 .../aarch64/sve2/acle/asm/luti4_bf16_x2.c | 50 .../aarch64/sve2/acle/asm/luti4_f16.c | 50 .../aarch64/sve2/acle/asm/luti4_f16_x2.c | 50 .../aarch64/sve2/acle/asm/luti4_s16.c | 50 .../aarch64/sve2/acle/asm/luti4_s16_x2.c | 50 .../aarch64/sve2/acle/asm/luti4_s8.c | 50 .../aarch64/sve2/acle/asm/luti4_u16.c | 50 .../aarch64/sve2/acle/asm/luti4_u16_x2.c | 50 .../aarch64/sve2/acle/asm/luti4_u8.c | 50 gcc/testsuite/lib/target-supports.exp | 2 +- 31 files changed, 1340 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_3.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_4.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_f16.c create mode 100644 
gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s8.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u8.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s8.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u8.c diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc index 92fcf5389a3..d1e2ab9831d 100644 --- a/gcc/config/aarch64/aarch64-c.cc +++ b/gcc/config/aarch64/aarch64-c.cc @@ -268,6 +268,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (TARGET_SVE_BF16, "__A
[PATCH v6] AArch64: Add LUTI ACLE for SVE2
This patch introduces support for LUTI2/LUTI4 ACLE for SVE2. LUTI instructions are used for efficient table lookups with 2-bit or 4-bit indices. LUTI2 reads indexed 8-bit or 16-bit elements from the low 128 bits of the table vector using packed 2-bit indices, while LUTI4 can read from the low 128 or 256 bits of the table vector or from two table vectors using packed 4-bit indices. These instructions fill the destination vector by copying elements indexed by segments of the source vector, selected by the vector segment index. The changes include the addition of a new AArch64 option extension "lut", __ARM_FEATURE_LUT preprocessor macro, definitions for the new LUTI instruction shapes, and implementations of the svluti2 and svluti4 builtins. gcc/ChangeLog: * config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins): Add new flag TARGET_LUT. * config/aarch64/aarch64-sve-builtins-shapes.cc (struct luti_base): Shape for lut intrinsics. (SHAPE): Specializations for lut shapes for luti2 and luti4. * config/aarch64/aarch64-sve-builtins-shapes.h: Declare lut intrinsics. * config/aarch64/aarch64-sve-builtins-sve2.cc (class svluti_lane_impl): Define expand for lut intrinsics. (FUNCTION): Define expand for lut intrinsics. * config/aarch64/aarch64-sve-builtins-sve2.def (REQUIRED_EXTENSIONS): Declare lut intrinsics behind lut flag. (svluti2_lane): Define intrinsic behind flag. (svluti4_lane): Define intrinsic behind flag. * config/aarch64/aarch64-sve-builtins-sve2.h: Declare lut intrinsics. * config/aarch64/aarch64-sve-builtins.cc (TYPES_bh_data): New type for byte and halfword. (bh_data): Type array for byte and halfword. (h_data): Type array for halfword. * config/aarch64/aarch64-sve2.md (@aarch64_sve_luti): Instruction patterns for lut intrinsics. * config/aarch64/iterators.md: Iterators and attributes for lut intrinsics. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/acle/asm/test_sve_acle.h: New test macro. * lib/target-supports.exp: Add lut flag to the for loop. 
* gcc.target/aarch64/sve/acle/general-c/lut_1.c: New test. * gcc.target/aarch64/sve/acle/general-c/lut_2.c: New test. * gcc.target/aarch64/sve/acle/general-c/lut_3.c: New test. * gcc.target/aarch64/sve/acle/general-c/lut_4.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti2_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti2_s16.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti2_s8.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti2_u16.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti2_u8.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti4_bf16.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti4_f16.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti4_f16_x2.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti4_s16.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti4_s16_x2.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti4_s8.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti4_u16.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti4_u16_x2.c: New test. * gcc.target/aarch64/sve2/acle/asm/luti4_u8.c: New test. --- This is a respin of https://gcc.gnu.org/pipermail/gcc-patches/2025-January/674085.html The only change from the previous version is the addition of ChangeLog in the commit message. Ok for master? 
Thanks, Saurabh --- gcc/config/aarch64/aarch64-c.cc | 2 + .../aarch64/aarch64-sve-builtins-shapes.cc| 46 +++ .../aarch64/aarch64-sve-builtins-shapes.h | 2 + .../aarch64/aarch64-sve-builtins-sve2.cc | 17 ++ .../aarch64/aarch64-sve-builtins-sve2.def | 8 + .../aarch64/aarch64-sve-builtins-sve2.h | 2 + gcc/config/aarch64/aarch64-sve-builtins.cc| 8 +- gcc/config/aarch64/aarch64-sve2.md| 33 +++ gcc/config/aarch64/iterators.md | 7 + .../aarch64/sve/acle/asm/test_sve_acle.h | 16 ++ .../aarch64/sve/acle/general-c/lut_1.c| 34 +++ .../aarch64/sve/acle/general-c/lut_2.c| 11 + .../aarch64/sve/acle/general-c/lut_3.c| 92 ++ .../aarch64/sve/acle/general-c/lut_4.c| 262 ++ .../aarch64/sve2/acle/asm/luti2_bf16.c| 50 .../aarch64/sve2/acle/asm/luti2_f16.c | 50 .../aarch64/sve2/acle/asm/luti2_s16.c | 50 .../aarch64/sve2/acle/asm/luti2_s8.c | 50 .../aarch64/sve2/acle/asm/luti2_u16.c | 50 .../aarch64/sve2/acle/asm/luti2_u8.c | 50 ..
[PATCH v3] AArch64: Add LUTI ACLE for SVE2
This patch introduces support for LUTI2/LUTI4 ACLE for SVE2. LUTI instructions are used for efficient table lookups with 2-bit or 4-bit indices. LUTI2 reads indexed 8-bit or 16-bit elements from the low 128 bits of the table vector using packed 2-bit indices, while LUTI4 can read from the low 128 or 256 bits of the table vector or from two table vectors using packed 4-bit indices. These instructions fill the destination vector by copying elements indexed by segments of the source vector, selected by the vector segment index. The changes include the addition of a new AArch64 option extension "lut", __ARM_FEATURE_LUT preprocessor macro, definitions for the new LUTI instruction shapes, and implementations of the svluti2 and svluti4 builtins. New tests are added as well. --- Hey, This is a respin of https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658015.html. Rebased with master. Regression tested on aarch64-unknown-linux-gnu and found no regressions. Ok for master? Thanks, Saurabh --- gcc/config/aarch64/aarch64-c.cc | 2 + .../aarch64/aarch64-sve-builtins-shapes.cc| 41 + .../aarch64/aarch64-sve-builtins-shapes.h | 2 + .../aarch64/aarch64-sve-builtins-sve2.cc | 17 +++ .../aarch64/aarch64-sve-builtins-sve2.def | 4 ++ .../aarch64/aarch64-sve-builtins-sve2.h | 2 + gcc/config/aarch64/aarch64-sve2.md| 45 +++ gcc/config/aarch64/iterators.md | 10 + .../aarch64/sve/acle/asm/test_sve_acle.h | 16 +++ .../aarch64/sve2/acle/asm/luti2_bf16.c| 40 + .../aarch64/sve2/acle/asm/luti2_f16.c | 40 + .../aarch64/sve2/acle/asm/luti2_s16.c | 40 + .../aarch64/sve2/acle/asm/luti2_s8.c | 40 + .../aarch64/sve2/acle/asm/luti2_u16.c | 40 + .../aarch64/sve2/acle/asm/luti2_u8.c | 40 + .../aarch64/sve2/acle/asm/luti4_bf16.c| 40 + .../aarch64/sve2/acle/asm/luti4_bf16_x2.c | 20 + .../aarch64/sve2/acle/asm/luti4_f16.c | 40 + .../aarch64/sve2/acle/asm/luti4_f16_x2.c | 20 + .../aarch64/sve2/acle/asm/luti4_s16.c | 40 + .../aarch64/sve2/acle/asm/luti4_s16_x2.c | 20 + .../aarch64/sve2/acle/asm/luti4_s8.c | 
30 + .../aarch64/sve2/acle/asm/luti4_u16.c | 40 + .../aarch64/sve2/acle/asm/luti4_u16_x2.c | 20 + .../aarch64/sve2/acle/asm/luti4_u8.c | 30 + gcc/testsuite/lib/target-supports.exp | 12 + 26 files changed, 691 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s8.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u8.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s8.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u8.c diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc index 92fcf5389a3..d1e2ab9831d 100644 --- a/gcc/config/aarch64/aarch64-c.cc +++ b/gcc/config/aarch64/aarch64-c.cc @@ -268,6 +268,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (TARGET_SVE_BF16, "__ARM_FEATURE_SVE_BF16", pfile); + aarch64_def_or_undef (TARGET_LUT, "__ARM_FEATURE_LUT", pfile); + aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile); aarch64_def_or_undef (TARGET_FP8DOT2, 
"__ARM_FEATURE_FP8DOT2", pfile); diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc index ca721dd2c09..0f6d366b2d6 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc @@ -903,6 +903,47 @@ struct load_ext_gather_base : public overloaded_base<1> } }; + +/* sv_t svlut_(sv_t, svuint8_t, uint64_t) + where the final argument is a constant ind
[PATCH v4] AArch64: Add LUTI ACLE for SVE2
This patch introduces support for LUTI2/LUTI4 ACLE for SVE2. LUTI instructions are used for efficient table lookups with 2-bit or 4-bit indices. LUTI2 reads indexed 8-bit or 16-bit elements from the low 128 bits of the table vector using packed 2-bit indices, while LUTI4 can read from the low 128 or 256 bits of the table vector or from two table vectors using packed 4-bit indices. These instructions fill the destination vector by copying elements indexed by segments of the source vector, selected by the vector segment index. The changes include the addition of a new AArch64 option extension "lut", __ARM_FEATURE_LUT preprocessor macro, definitions for the new LUTI instruction shapes, and implementations of the svluti2 and svluti4 builtins. New tests are added as well. --- This is a respin of https://gcc.gnu.org/pipermail/gcc-patches/2025-January/672910.html. Addressed comments on lut. The faminmax comments will be addressed in a separate patch. Regression tested on aarch64-unknown-linux-gnu and found no regressions. Ok for master? 
Thanks, Saurabh --- gcc/config/aarch64/aarch64-c.cc | 2 + .../aarch64/aarch64-sve-builtins-shapes.cc| 44 + .../aarch64/aarch64-sve-builtins-shapes.h | 2 + .../aarch64/aarch64-sve-builtins-sve2.cc | 17 + .../aarch64/aarch64-sve-builtins-sve2.def | 8 +++ .../aarch64/aarch64-sve-builtins-sve2.h | 2 + gcc/config/aarch64/aarch64-sve-builtins.cc| 7 +- gcc/config/aarch64/aarch64-sve2.md| 33 ++ gcc/config/aarch64/iterators.md | 13 .../aarch64/sve/acle/asm/test_sve_acle.h | 16 + .../aarch64/sve/acle/general-c/lut_1.c| 64 +++ .../aarch64/sve/acle/general-c/lut_2.c| 11 .../aarch64/sve/acle/general-c/lut_3.c| 56 .../aarch64/sve2/acle/asm/luti2_bf16.c| 50 +++ .../aarch64/sve2/acle/asm/luti2_f16.c | 50 +++ .../aarch64/sve2/acle/asm/luti2_s16.c | 50 +++ .../aarch64/sve2/acle/asm/luti2_s8.c | 50 +++ .../aarch64/sve2/acle/asm/luti2_u16.c | 50 +++ .../aarch64/sve2/acle/asm/luti2_u8.c | 50 +++ .../aarch64/sve2/acle/asm/luti4_bf16.c| 50 +++ .../aarch64/sve2/acle/asm/luti4_bf16_x2.c | 30 + .../aarch64/sve2/acle/asm/luti4_f16.c | 50 +++ .../aarch64/sve2/acle/asm/luti4_f16_x2.c | 30 + .../aarch64/sve2/acle/asm/luti4_s16.c | 50 +++ .../aarch64/sve2/acle/asm/luti4_s16_x2.c | 30 + .../aarch64/sve2/acle/asm/luti4_s8.c | 50 +++ .../aarch64/sve2/acle/asm/luti4_u16.c | 50 +++ .../aarch64/sve2/acle/asm/luti4_u16_x2.c | 30 + .../aarch64/sve2/acle/asm/luti4_u8.c | 50 +++ gcc/testsuite/lib/target-supports.exp | 2 +- 30 files changed, 995 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_3.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s16.c create mode 100644 
gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_s8.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_u8.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_f16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_s8.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u16_x2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_u8.c diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc index 92fcf5389a3..d1e2ab9831d 100644 --- a/gcc/config/aarch64/aarch64-c.cc +++ b/gcc/config/aarch64/aarch64-c.cc @@ -268,6 +268,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (TARGET_SVE_BF16, "__ARM_FEATURE_SVE_BF16", pfile); + aarch64_def_or_undef (TARGET_LUT, "__ARM_FEATURE_LUT", pfile); + aarch64_def_or_undef (TARGET_FP8, "__ARM_FEAT