Hi all,

This patch adds Bfloat type support to the AArch64 back-end. It also adds
a new machine_mode (BFmode) for this type and the accompanying vector modes
V4BFmode and V8BFmode.
The second patch in this series uses existing target hooks to restrict type use.

Regression testing on aarch64-none-elf passed successfully.

This patch depends on
https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html
for the testsuite effective_target update.

Ok for trunk?

Cheers,
Stam

ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest

Details on Arm Bfloat can be found here:
https://community.arm.com/developer/ip-products/processors/b/ml-ip-blog/posts/bfloat16-processing-for-neural-networks-on-armv8_2d00_a

PS. I don't have commit rights, so if someone could commit on my behalf,
that would be great :)

gcc/ChangeLog:

2019-12-16  Stam Markianos-Wright  <stam.markianos-wri...@arm.com>

	* config.gcc: Add arm_bf16.h.
	* config/aarch64/aarch64-builtins.c
	(aarch64_simd_builtin_std_type): Add BFmode.
	(aarch64_init_simd_builtin_types): Add element types for the Bfloat
	vector types.
	(aarch64_init_bf16_types): New function.
	(aarch64_general_init_builtins): Add aarch64_init_bf16_types
	function call.
	* config/aarch64/aarch64-modes.def: Add BFmode and vector modes.
	* config/aarch64/aarch64-simd-builtin-types.def: Add BF SIMD types.
	* config/aarch64/aarch64-simd.md: Add BF types to NEON move patterns.
	* config/aarch64/aarch64.c (aarch64_classify_vector_mode): Add BF
	modes.
	(aarch64_gimplify_va_arg_expr): Add BFmode.
	* config/aarch64/aarch64.h (AARCH64_VALID_SIMD_DREG_MODE): Add V4BF.
	(AARCH64_VALID_SIMD_QREG_MODE): Add V8BF.
	* config/aarch64/aarch64.md: New fp16_notbf16 and notbf16 values in
	the arches enum and "enabled" attribute.  Add BFmode to the movhf
	pattern.
	* config/aarch64/arm_bf16.h: New file.
	* config/aarch64/arm_neon.h: Add arm_bf16.h and Bfloat vector types.
	* config/aarch64/iterators.md (HFBF, GPF_TF_F16_MOV, VDMOV,
	VQMOV, VALL_F16MOV): New.

gcc/testsuite/ChangeLog:

2019-12-16  Stam Markianos-Wright  <stam.markianos-wri...@arm.com>

	* gcc.target/aarch64/bfloat16_compile.c: New test.
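For reference, here is a minimal sketch of the user-level code this patch
enables (illustrative only, not part of the patch; it assumes a toolchain
built with this series and a target supporting +bf16):

  #include <arm_neon.h>

  /* __bf16 is exposed as bfloat16_t; V4BFmode/V8BFmode back the
     bfloat16x4_t/bfloat16x8_t vector types.  This patch defines no
     arithmetic on these types, so creating, moving, loading and
     storing values is all that is expected to work.  */
  bfloat16_t scalar_copy (bfloat16_t a) { return a; }
  bfloat16x4_t dreg_copy (bfloat16x4_t a) { return a; }
  bfloat16x8_t qreg_copy (bfloat16x8_t a) { return a; }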
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 9802f436e06..b49c110ccaf 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -315,7 +315,7 @@ m32c*-*-*)
 	;;
 aarch64*-*-*)
 	cpu_type=aarch64
-	extra_headers="arm_fp16.h arm_neon.h arm_acle.h arm_sve.h"
+	extra_headers="arm_fp16.h arm_neon.h arm_bf16.h arm_acle.h arm_sve.h"
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	d_target_objs="aarch64-d.o"
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index c35a1b1f029..3ba2f12166f 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -68,6 +68,9 @@
 #define hi_UP    E_HImode
 #define hf_UP    E_HFmode
 #define qi_UP    E_QImode
+#define bf_UP    E_BFmode
+#define v4bf_UP  E_V4BFmode
+#define v8bf_UP  E_V8BFmode
 #define UP(X) X##_UP
 
 #define SIMD_MAX_BUILTIN_ARGS 5
@@ -568,6 +571,10 @@ static tree aarch64_simd_intXI_type_node = NULL_TREE;
 tree aarch64_fp16_type_node = NULL_TREE;
 tree aarch64_fp16_ptr_type_node = NULL_TREE;
 
+/* Back-end node type for brain float (bfloat) types.  */
+tree aarch64_bf16_type_node = NULL_TREE;
+tree aarch64_bf16_ptr_type_node = NULL_TREE;
+
 /* Wrapper around add_builtin_function.  NAME is the name of the built-in
    function, TYPE is the function type, and CODE is the function subcode
    (relative to AARCH64_BUILTIN_GENERAL).  */
@@ -659,6 +666,8 @@ aarch64_simd_builtin_std_type (machine_mode mode,
       return float_type_node;
     case E_DFmode:
       return double_type_node;
+    case E_BFmode:
+      return aarch64_bf16_type_node;
     default:
       gcc_unreachable ();
     }
@@ -750,6 +759,11 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
   aarch64_simd_types[Float64x2_t].eltype = double_type_node;
 
+  /* Init Bfloat vector types with the underlying __bf16 scalar type.  */
+  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
+  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
+
   for (i = 0; i < nelts; i++)
     {
       tree eltype = aarch64_simd_types[i].eltype;
@@ -1059,6 +1073,19 @@ aarch64_init_fp16_types (void)
   aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
 }
 
+/* Initialize the backend REAL_TYPE type supporting bfloat types.  */
+static void
+aarch64_init_bf16_types (void)
+{
+  aarch64_bf16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (aarch64_bf16_type_node) = 16;
+  SET_TYPE_MODE (aarch64_bf16_type_node, BFmode);
+  layout_type (aarch64_bf16_type_node);
+
+  (*lang_hooks.types.register_builtin_type) (aarch64_bf16_type_node, "__bf16");
+  aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node);
+}
+
 /* Pointer authentication builtins that will become NOP on legacy platform.
    Currently, these builtins are for internal use only (libgcc EH unwinder).  */
@@ -1214,6 +1241,8 @@ aarch64_general_init_builtins (void)
 
   aarch64_init_fp16_types ();
 
+  aarch64_init_bf16_types ();
+
   if (TARGET_SIMD)
     aarch64_init_simd_builtins ();
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 3c698b620cd..59f2ec4eaec 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -69,6 +69,13 @@ VECTOR_MODES (FLOAT, 16);     /*            V4SF V2DF.  */
 VECTOR_MODE (FLOAT, DF, 1);   /*                 V1DF.  */
 VECTOR_MODE (FLOAT, HF, 2);   /*                 V2HF.  */
 
+/* Bfloat16 modes.  */
+FLOAT_MODE (BF, 2, 0);
+ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
+
+VECTOR_MODE (FLOAT, BF, 4);   /*                 V4BF.  */
+VECTOR_MODE (FLOAT, BF, 8);   /*                 V8BF.  */
+
 /* Oct Int: 256-bit integer mode needed for 32-byte vector arguments.  */
 INT_MODE (OI, 32);
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index b015694293c..3b387377f38 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -50,3 +50,5 @@
   ENTRY (Float32x4_t, V4SF, none, 13)
   ENTRY (Float64x1_t, V1DF, none, 13)
   ENTRY (Float64x2_t, V2DF, none, 13)
+  ENTRY (Bfloat16x4_t, V4BF, none, 15)
+  ENTRY (Bfloat16x8_t, V8BF, none, 15)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ad4676bc167..c4858ab7cff 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,8 +19,8 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16MOV 0 "nonimmediate_operand")
+	(match_operand:VALL_F16MOV 1 "general_operand"))]
   "TARGET_SIMD"
   "
   /* Force the operand into a register if it is not an
@@ -101,10 +101,10 @@
   [(set_attr "type" "neon_dup<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VD:mode>"
-  [(set (match_operand:VD 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VDMOV:mode>"
+  [(set (match_operand:VDMOV 0 "nonimmediate_operand"
 		"=w, m,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VD 1 "general_operand"
+	(match_operand:VDMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r,  Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -126,13 +126,14 @@
 }
   [(set_attr "type" "neon_load1_1reg<q>, store_8, neon_store1_1reg<q>,\
 		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
-		     mov_reg, neon_move<q>")]
+		     mov_reg, neon_move<q>")
+   (set_attr "arch" "*,notbf16,*,*,*,*,*,notbf16")]
 )
 
-(define_insn "*aarch64_simd_mov<VQ:mode>"
-  [(set (match_operand:VQ 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VQMOV:mode>"
+  [(set (match_operand:VQMOV 0 "nonimmediate_operand"
 		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VQ 1 "general_operand"
+	(match_operand:VQMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r,  Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -161,7 +162,8 @@
   [(set_attr "type" "neon_load1_1reg<q>, store_16, neon_store1_1reg<q>,\
 		     neon_logic<q>, multiple, multiple,\
 		     multiple, neon_move<q>")
-   (set_attr "length" "4,4,4,4,8,8,8,4")]
+   (set_attr "length" "4,4,4,4,8,8,8,4")
+   (set_attr "arch" "*,notbf16,*,*,*,*,*,notbf16")]
 )
 
 ;; When storing lane zero we can use the normal STR and its more permissive
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b0aca03bcb4..f57469b6e23 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1692,6 +1692,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V2SImode:
     /* ...E_V1DImode doesn't exist.  */
     case E_V4HFmode:
+    case E_V4BFmode:
     case E_V2SFmode:
     case E_V1DFmode:
     /* 128-bit Advanced SIMD vectors.  */
@@ -1700,6 +1701,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V4SImode:
     case E_V2DImode:
     case E_V8HFmode:
+    case E_V8BFmode:
     case E_V4SFmode:
     case E_V2DFmode:
       return TARGET_SIMD ? VEC_ADVSIMD : 0;
@@ -15548,6 +15550,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
 	  field_t = aarch64_fp16_type_node;
 	  field_ptr_t = aarch64_fp16_ptr_type_node;
 	  break;
+	case E_BFmode:
+	  field_t = aarch64_bf16_type_node;
+	  field_ptr_t = aarch64_bf16_ptr_type_node;
+	  break;
 	case E_V2SImode:
 	case E_V4SImode:
 	  {
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2bb5a208720..857e2b8f90e 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1120,13 +1120,13 @@ extern enum aarch64_code_model aarch64_cmodel;
 #define AARCH64_VALID_SIMD_DREG_MODE(MODE) \
   ((MODE) == V2SImode || (MODE) == V4HImode || (MODE) == V8QImode \
    || (MODE) == V2SFmode || (MODE) == V4HFmode || (MODE) == DImode \
-   || (MODE) == DFmode)
+   || (MODE) == DFmode || (MODE) == V4BFmode)
 
 /* Modes valid for AdvSIMD Q registers.  */
 #define AARCH64_VALID_SIMD_QREG_MODE(MODE) \
   ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \
    || (MODE) == V4SFmode || (MODE) == V8HFmode || (MODE) == V2DImode \
-   || (MODE) == V2DFmode)
+   || (MODE) == V2DFmode || (MODE) == V8BFmode)
 
 #define ENDIAN_LANE_N(NUNITS, N) \
   (BYTES_BIG_ENDIAN ? NUNITS - 1 - N : N)
@@ -1174,6 +1174,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 extern tree aarch64_fp16_type_node;
 extern tree aarch64_fp16_ptr_type_node;
 
+/* This type is the user-visible __bf16, and a pointer to that type.  We
+   need it in many places in the backend.  Defined in aarch64-builtins.c.  */
+extern tree aarch64_bf16_type_node;
+extern tree aarch64_bf16_ptr_type_node;
+
 /* The generic unwind code in libgcc does not initialize the frame pointer.
    So in order to unwind a function using a frame pointer, the very first
    function that is unwound must save the frame pointer.  That way the frame
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b11ead7ab23..6c1cd76bb16 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -344,7 +344,7 @@
 ;; alternative). This attribute is used to compute attribute "enabled",
 ;; use type "any" to enable an alternative in all cases.
-(define_enum "arches" [ any rcpc8_4 fp simd sve fp16])
+(define_enum "arches" [ any rcpc8_4 fp simd sve fp16 fp16_notbf16 notbf16])
 
 (define_enum_attr "arch" "arches" (const_string "any"))
 
@@ -378,6 +378,12 @@
 	(and (eq_attr "arch" "fp16")
 	     (match_test "TARGET_FP_F16INST"))
 
+	(and (eq_attr "arch" "fp16_notbf16")
+	     (match_test "TARGET_FP_F16INST && !TARGET_BF16_FP"))
+
+	(and (eq_attr "arch" "notbf16")
+	     (match_test "!TARGET_BF16_SIMD"))
+
 	(and (eq_attr "arch" "sve")
 	     (match_test "TARGET_SVE")))
     (const_string "yes")
@@ -1304,8 +1310,8 @@
 })
 
 (define_expand "mov<mode>"
-  [(set (match_operand:GPF_TF_F16 0 "nonimmediate_operand")
-	(match_operand:GPF_TF_F16 1 "general_operand"))]
+  [(set (match_operand:GPF_TF_F16_MOV 0 "nonimmediate_operand")
+	(match_operand:GPF_TF_F16_MOV 1 "general_operand"))]
   ""
   {
     if (!TARGET_FLOAT)
@@ -1321,11 +1327,11 @@
   }
 )
 
-(define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
-  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
-    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+(define_insn "*mov<mode>_aarch64"
+  [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HFBF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
+  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
+    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
    movi\\t%0.4h, #0
    fmov\\t%h0, %w1
@@ -1341,7 +1347,7 @@
    mov\\t%w0, %w1"
   [(set_attr "type" "neon_move,f_mcr,neon_move,neon_to_gp, neon_move,fconsts, \
 		     neon_move,f_loads,f_stores,load_4,store_4,mov_reg")
-   (set_attr "arch" "simd,fp16,simd,simd,simd,fp16,simd,*,*,*,*,*")]
+   (set_attr "arch" "simd,fp16,simd,simd,simd,fp16_notbf16,simd,*,*,*,*,*")]
 )
 
 (define_insn "*movsf_aarch64"
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
new file mode 100644
index 00000000000..aedb0972735
--- /dev/null
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -0,0 +1,42 @@
+/* Arm BF16 intrinsics include file.
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   Contributed by Arm.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _AARCH64_BF16_H_
+#define _AARCH64_BF16_H_
+
+#include <stdint.h>
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+
+typedef __bf16 bfloat16_t;
+
+#endif
+#pragma GCC pop_options
+
+#endif
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 8b861601a48..5996df0a612 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -73,6 +73,10 @@ typedef __fp16 float16_t;
 typedef float float32_t;
 typedef double float64_t;
 
+typedef __bf16 bfloat16_t;
+typedef __Bfloat16x4_t bfloat16x4_t;
+typedef __Bfloat16x8_t bfloat16x8_t;
+
 typedef struct int8x8x2_t
 {
   int8x8_t val[2];
@@ -34606,6 +34610,8 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #pragma GCC pop_options
 
+#include "arm_bf16.h"
+
 #undef __aarch64_vget_lane_any
 #undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 1ca5ed1ef1b..9480efef47c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -57,9 +57,17 @@
 ;; Iterator for all scalar floating point modes (HF, SF, DF)
 (define_mode_iterator GPF_HF [HF SF DF])
 
+;; Iterator for all 16-bit scalar floating point modes (HF, BF)
+(define_mode_iterator HFBF [HF BF])
+
 ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
 (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
 
+;; Iterator for all scalar floating point modes suitable for moving, including
+;; the special BF type (HF, SF, DF, TF and BF).
+(define_mode_iterator GPF_TF_F16_MOV [(HF "") (BF "TARGET_BF16_FP") (SF "")
+				      (DF "") (TF "")])
+
 ;; Double vector modes.
 (define_mode_iterator VDF [V2SF V4HF])
@@ -79,6 +87,9 @@
 ;; Double vector modes.
 (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
 
+;; Double vector modes suitable for moving.  Includes V4BF.
+(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
+
 ;; All modes stored in registers d0-d31.
 (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
@@ -94,6 +105,9 @@
 ;; Quad vector modes.
 (define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
+;; Quad vector modes suitable for moving.  Includes V8BF.
+(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
+
 ;; Copy of the above.
 (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
@@ -160,6 +174,15 @@
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing,
+;; including the special Bfloat vector types.
+(define_mode_iterator VALL_F16MOV [(V8QI "") (V16QI "") (V4HI "") (V8HI "")
+				   (V2SI "") (V4SI "") (V2DI "")
+				   (V4HF "") (V8HF "")
+				   (V4BF "TARGET_BF16_SIMD")
+				   (V8BF "TARGET_BF16_SIMD")
+				   (V2SF "") (V4SF "") (V2DF "")])
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
 				       V4HF V8HF V2SF V4SF])
@@ -885,7 +908,8 @@
 			  (V8HF "16b") (V2SF  "8b")
 			  (V4SF "16b") (V2DF "16b")
 			  (DI   "8b")  (DF    "8b")
-			  (SI   "8b")  (SF    "8b")])
+			  (SI   "8b")  (SF    "8b")
+			  (V4BF "8b")  (V8BF "16b")])
 
 ;; Define element mode for each vector mode.
 (define_mode_attr VEL [(V8QI "QI") (V16QI "QI")
@@ -1265,6 +1289,7 @@
 		      (V2SI "")   (V4SI  "_q")
 		      (DI   "")   (V2DI  "_q")
 		      (V4HF "")   (V8HF  "_q")
+		      (V4BF "")   (V8BF  "_q")
 		      (V2SF "")   (V4SF  "_q")
 		      (V2DF "_q")
 		      (QI "") (HI "") (SI "") (DI "")
 		      (HF "") (SF "") (DF "")])
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c
new file mode 100644
index 00000000000..f2bef671deb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c
@@ -0,0 +1,51 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**stacktest1:
+**	...
+**	str	h0, \[sp, [0-9]+\]
+**	ldr	h0, \[sp, [0-9]+\]
+**	...
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	...
+**	str	d0, \[sp, [0-9]+\]
+**	ldr	d0, \[sp, [0-9]+\]
+**	...
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	...
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	...
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
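PPS. For background on the format itself: arm_bfloat_half_format describes a
16-bit type laid out as the high half of an IEEE binary32 value (1 sign bit,
8 exponent bits, 7 mantissa bits).  A rough illustration of that relationship
(the helper name is hypothetical, and truncation is shown for simplicity;
hardware conversions would normally round to nearest even):

  #include <stdint.h>
  #include <string.h>

  static uint16_t
  float_to_bf16_bits (float f)
  {
    uint32_t bits;
    memcpy (&bits, &f, sizeof (bits));  /* bit-copy without aliasing UB */
    return (uint16_t) (bits >> 16);     /* keep sign, exponent, top mantissa */
  }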