On 1/9/20 3:42 PM, Richard Sandiford wrote:
> Thanks for the update, looks great.
>
> Stam Markianos-Wright <stam.markianos-wri...@arm.com> writes:
>> diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..884b6f3bc7a28c516e54c26a71b1b769f55867a7
>> --- /dev/null
>> +++ b/gcc/config/aarch64/arm_bf16.h
>> @@ -0,0 +1,32 @@
>> +/* Arm BF16 instrinsics include file.
>> +
>> +   Copyright (C) 2019 Free Software Foundation, Inc.
>> +   Contributed by Arm.
>
> Needs to include 2020 now :-)  Maybe 2019-2020 since it was posted
> in 2019 and would have been changed to 2019-2020 in the automatic update.
>
> Which reminds me to update my patches too...
>
> OK for trunk with that change, thanks.
Done and committed as r280129. Diff attached for reference (and as an
attempt to keep myself sane and not mix it all up!)

Cheers,
Stam

>
> Richard
>
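For the archive, a minimal usage sketch (not part of the patch): as far as
this commit goes, the new __bf16/bfloat16_t scalar type and the
bfloat16x4_t/bfloat16x8_t vector types only need to support moves, loads
and stores, which is exactly what the new tests below exercise — no
arithmetic or conversion intrinsics are added here. The function names
below are illustrative only, and the -march=armv8.2-a+bf16 option spelling
is an assumption pieced together from the -march flags and the "+bf16"
target pragma used in those tests.

  #include <arm_neon.h>  /* arm_neon.h now pulls in arm_bf16.h, per the diff.  */

  /* Scalar bfloat16_t: a plain 16-bit load, store and move.
     No conversion to or from float is implied by this patch.  */
  bfloat16_t
  copy_bf16 (const bfloat16_t *src, bfloat16_t *dst)
  {
    *dst = *src;
    return *dst;
  }

  /* The vector types live in the usual Advanced SIMD registers
     (bfloat16x4_t in d-regs, bfloat16x8_t in q-regs).  */
  bfloat16x8_t
  copy_bf16x8 (bfloat16x8_t v)
  {
    bfloat16x8_t b = v;  /* Whole-register move.  */
    return b;
  }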
diff --git a/gcc/config.gcc b/gcc/config.gcc
index c3d6464f3e6adaa1db818a61de00cff8e00ae08e..075e46072d1643302b9587d4e3f14f2e29b4ec8d 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -315,7 +315,7 @@ m32c*-*-*)
 	;;
 aarch64*-*-*)
 	cpu_type=aarch64
-	extra_headers="arm_fp16.h arm_neon.h arm_acle.h arm_sve.h"
+	extra_headers="arm_fp16.h arm_neon.h arm_bf16.h arm_acle.h arm_sve.h"
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	d_target_objs="aarch64-d.o"
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 1bd2640a1ced352de232fed1cf134b46c69b80f7..b2d6b761489183c262320d62293bec343b315c11 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -68,6 +68,9 @@
 #define hi_UP    E_HImode
 #define hf_UP    E_HFmode
 #define qi_UP    E_QImode
+#define bf_UP    E_BFmode
+#define v4bf_UP  E_V4BFmode
+#define v8bf_UP  E_V8BFmode
 #define UP(X) X##_UP
 
 #define SIMD_MAX_BUILTIN_ARGS 5
@@ -568,6 +571,10 @@ static tree aarch64_simd_intXI_type_node = NULL_TREE;
 tree aarch64_fp16_type_node = NULL_TREE;
 tree aarch64_fp16_ptr_type_node = NULL_TREE;
 
+/* Back-end node type for brain float (bfloat) types.  */
+tree aarch64_bf16_type_node = NULL_TREE;
+tree aarch64_bf16_ptr_type_node = NULL_TREE;
+
 /* Wrapper around add_builtin_function.  NAME is the name of the built-in
    function, TYPE is the function type, and CODE is the function subcode
    (relative to AARCH64_BUILTIN_GENERAL).  */
@@ -659,6 +666,8 @@ aarch64_simd_builtin_std_type (machine_mode mode,
       return float_type_node;
     case E_DFmode:
       return double_type_node;
+    case E_BFmode:
+      return aarch64_bf16_type_node;
     default:
       gcc_unreachable ();
     }
@@ -750,6 +759,10 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
   aarch64_simd_types[Float64x2_t].eltype = double_type_node;
 
+  /* Init Bfloat vector types with underlying __bf16 type.  */
+  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
+  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
+
   for (i = 0; i < nelts; i++)
     {
       tree eltype = aarch64_simd_types[i].eltype;
@@ -1059,6 +1072,19 @@ aarch64_init_fp16_types (void)
   aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
 }
 
+/* Initialize the backend REAL_TYPE type supporting bfloat types.  */
+static void
+aarch64_init_bf16_types (void)
+{
+  aarch64_bf16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (aarch64_bf16_type_node) = 16;
+  SET_TYPE_MODE (aarch64_bf16_type_node, BFmode);
+  layout_type (aarch64_bf16_type_node);
+
+  lang_hooks.types.register_builtin_type (aarch64_bf16_type_node, "__bf16");
+  aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node);
+}
+
 /* Pointer authentication builtins that will become NOP on legacy platform.
    Currently, these builtins are for internal use only (libgcc EH unwinder).  */
 
@@ -1214,6 +1240,8 @@ aarch64_general_init_builtins (void)
 
   aarch64_init_fp16_types ();
 
+  aarch64_init_bf16_types ();
+
   if (TARGET_SIMD)
     aarch64_init_simd_builtins ();
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 6cd8ed0972ad7029e0319aad71d3afbda5684a4f..1eeb8d884520b1a53b8a580f165d42858c03228c 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -69,6 +69,13 @@ VECTOR_MODES (FLOAT, 16);     /* V4SF V2DF.  */
 VECTOR_MODE (FLOAT, DF, 1);   /* V1DF.  */
 VECTOR_MODE (FLOAT, HF, 2);   /* V2HF.  */
 
+/* Bfloat16 modes.  */
+FLOAT_MODE (BF, 2, 0);
+ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
+
+VECTOR_MODE (FLOAT, BF, 4);   /* V4BF.  */
+VECTOR_MODE (FLOAT, BF, 8);   /* V8BF.  */
+
 /* Oct Int: 256-bit integer mode needed for 32-byte vector arguments.  */
 INT_MODE (OI, 32);
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index 76d4d130013d7498a23728337b63875958273a54..e885755bc927d1174dce8d490636df463b76d2f8 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -50,3 +50,5 @@
   ENTRY (Float32x4_t, V4SF, none, 13)
   ENTRY (Float64x1_t, V1DF, none, 13)
   ENTRY (Float64x2_t, V2DF, none, 13)
+  ENTRY (Bfloat16x4_t, V4BF, none, 14)
+  ENTRY (Bfloat16x8_t, V8BF, none, 14)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4e28cf97516df19e1d502e56c776f6b34f15c116..cea9592695ac8bd2f4e625f8b769ddaf716e9091 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,8 +19,8 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16MOV 0 "nonimmediate_operand")
+	(match_operand:VALL_F16MOV 1 "general_operand"))]
   "TARGET_SIMD"
   "
   /* Force the operand into a register if it is not an
@@ -101,10 +101,10 @@
   [(set_attr "type" "neon_dup<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VD:mode>"
-  [(set (match_operand:VD 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VDMOV:mode>"
+  [(set (match_operand:VDMOV 0 "nonimmediate_operand"
 		"=w, m,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VD 1 "general_operand"
+	(match_operand:VDMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -129,10 +129,10 @@
 			     mov_reg, neon_move<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VQ:mode>"
-  [(set (match_operand:VQ 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VQMOV:mode>"
+  [(set (match_operand:VQMOV 0 "nonimmediate_operand"
 		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VQ 1 "general_operand"
+	(match_operand:VQMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -234,8 +234,8 @@
 
 (define_split
-  [(set (match_operand:VQ 0 "register_operand" "")
-	(match_operand:VQ 1 "register_operand" ""))]
+  [(set (match_operand:VQMOV 0 "register_operand" "")
+	(match_operand:VQMOV 1 "register_operand" ""))]
   "TARGET_SIMD && reload_completed
    && GP_REGNUM_P (REGNO (operands[0]))
    && GP_REGNUM_P (REGNO (operands[1]))"
   [(const_int 0)]
@@ -246,8 +246,8 @@
 })
 
 (define_split
-  [(set (match_operand:VQ 0 "register_operand" "")
-	(match_operand:VQ 1 "register_operand" ""))]
+  [(set (match_operand:VQMOV 0 "register_operand" "")
+	(match_operand:VQMOV 1 "register_operand" ""))]
   "TARGET_SIMD && reload_completed
    && ((FP_REGNUM_P (REGNO (operands[0])) && GP_REGNUM_P (REGNO (operands[1])))
       || (GP_REGNUM_P (REGNO (operands[0])) && FP_REGNUM_P (REGNO (operands[1]))))"
@@ -258,8 +258,8 @@
 })
 
 (define_expand "@aarch64_split_simd_mov<mode>"
-  [(set (match_operand:VQ 0)
-	(match_operand:VQ 1))]
+  [(set (match_operand:VQMOV 0)
+	(match_operand:VQMOV 1))]
   "TARGET_SIMD"
   {
     rtx dst = operands[0];
@@ -295,8 +295,8 @@
 (define_insn "aarch64_simd_mov_from_<mode>low"
   [(set (match_operand:<VHALF> 0 "register_operand" "=r")
 	(vec_select:<VHALF>
-	  (match_operand:VQ 1 "register_operand" "w")
-	  (match_operand:VQ 2 "vect_par_cnst_lo_half" "")))]
+	  (match_operand:VQMOV 1 "register_operand" "w")
+	  (match_operand:VQMOV 2 "vect_par_cnst_lo_half" "")))]
   "TARGET_SIMD && reload_completed"
   "umov\t%0, %1.d[0]"
   [(set_attr "type" "neon_to_gp<q>")
@@ -306,8 +306,8 @@
 (define_insn "aarch64_simd_mov_from_<mode>high"
   [(set (match_operand:<VHALF> 0 "register_operand" "=r")
 	(vec_select:<VHALF>
-	  (match_operand:VQ 1 "register_operand" "w")
-	  (match_operand:VQ 2 "vect_par_cnst_hi_half" "")))]
+	  (match_operand:VQMOV 1 "register_operand" "w")
+	  (match_operand:VQMOV 2 "vect_par_cnst_hi_half" "")))]
   "TARGET_SIMD && reload_completed"
   "umov\t%0, %1.d[1]"
   [(set_attr "type" "neon_to_gp<q>")
@@ -1471,8 +1471,8 @@
 ;; On big-endian this is { zeroes, operand }
 
 (define_insn "move_lo_quad_internal_<mode>"
-  [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w")
-	(vec_concat:VQ_NO2E
+  [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w")
+	(vec_concat:VQMOV_NO2E
 	  (match_operand:<VHALF> 1 "register_operand" "w,r,r")
 	  (vec_duplicate:<VHALF> (const_int 0))))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
@@ -1501,8 +1501,8 @@
 )
 
 (define_insn "move_lo_quad_internal_be_<mode>"
-  [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w")
-	(vec_concat:VQ_NO2E
+  [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w")
+	(vec_concat:VQMOV_NO2E
 	  (vec_duplicate:<VHALF> (const_int 0))
 	  (match_operand:<VHALF> 1 "register_operand" "w,r,r")))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
@@ -1531,8 +1531,8 @@
 )
 
 (define_expand "move_lo_quad_<mode>"
-  [(match_operand:VQ 0 "register_operand")
-   (match_operand:VQ 1 "register_operand")]
+  [(match_operand:VQMOV 0 "register_operand")
+   (match_operand:VQMOV 1 "register_operand")]
   "TARGET_SIMD"
 {
   if (BYTES_BIG_ENDIAN)
@@ -1549,11 +1549,11 @@
 ;; For big-endian this is { operand1, operand2 }
 
 (define_insn "aarch64_simd_move_hi_quad_<mode>"
-  [(set (match_operand:VQ 0 "register_operand" "+w,w")
-        (vec_concat:VQ
+  [(set (match_operand:VQMOV 0 "register_operand" "+w,w")
+        (vec_concat:VQMOV
           (vec_select:<VHALF>
                 (match_dup 0)
-                (match_operand:VQ 2 "vect_par_cnst_lo_half" ""))
+                (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))
 	  (match_operand:<VHALF> 1 "register_operand" "w,r")))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
   "@
@@ -1563,12 +1563,12 @@
 )
 
 (define_insn "aarch64_simd_move_hi_quad_be_<mode>"
-  [(set (match_operand:VQ 0 "register_operand" "+w,w")
-        (vec_concat:VQ
+  [(set (match_operand:VQMOV 0 "register_operand" "+w,w")
+        (vec_concat:VQMOV
           (match_operand:<VHALF> 1 "register_operand" "w,r")
           (vec_select:<VHALF>
                 (match_dup 0)
-                (match_operand:VQ 2 "vect_par_cnst_lo_half" ""))))]
+                (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
   "@
    ins\\t%0.d[1], %1.d[0]
@@ -1577,7 +1577,7 @@
 )
 
 (define_expand "move_hi_quad_<mode>"
- [(match_operand:VQ 0 "register_operand")
+ [(match_operand:VQMOV 0 "register_operand")
   (match_operand:<VHALF> 1 "register_operand")]
  "TARGET_SIMD"
 {
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 04dabd46437cf650e38e085d219c4e629b537e67..b04922056106ad2060d72b99fb49d57fd2b50f4b 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1120,13 +1120,13 @@ extern enum aarch64_code_model aarch64_cmodel;
 #define AARCH64_VALID_SIMD_DREG_MODE(MODE) \
   ((MODE) == V2SImode || (MODE) == V4HImode || (MODE) == V8QImode \
    || (MODE) == V2SFmode || (MODE) == V4HFmode || (MODE) == DImode \
-   || (MODE) == DFmode)
+   || (MODE) == DFmode || (MODE) == V4BFmode)
 
 /* Modes valid for AdvSIMD Q registers.  */
 #define AARCH64_VALID_SIMD_QREG_MODE(MODE) \
   ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \
    || (MODE) == V4SFmode || (MODE) == V8HFmode || (MODE) == V2DImode \
-   || (MODE) == V2DFmode)
+   || (MODE) == V2DFmode || (MODE) == V8BFmode)
 
 #define ENDIAN_LANE_N(NUNITS, N) \
   (BYTES_BIG_ENDIAN ? NUNITS - 1 - N : N)
@@ -1174,6 +1174,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 extern tree aarch64_fp16_type_node;
 extern tree aarch64_fp16_ptr_type_node;
 
+/* This type is the user-visible __bf16, and a pointer to that type.  Defined
+   in aarch64-builtins.c.  */
+extern tree aarch64_bf16_type_node;
+extern tree aarch64_bf16_ptr_type_node;
+
 /* The generic unwind code in libgcc does not initialize the frame pointer.
    So in order to unwind a function using a frame pointer, the very first
    function that is unwound must save the frame pointer.  That way the frame
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 85cadef1be819b3c1ad68ae70e755e0150ad6469..ebd3f6cf45bc0b5118c4c39e323e6380d64c885e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1692,6 +1692,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V2SImode:
     /* ...E_V1DImode doesn't exist.  */
     case E_V4HFmode:
+    case E_V4BFmode:
     case E_V2SFmode:
     case E_V1DFmode:
     /* 128-bit Advanced SIMD vectors.  */
@@ -1700,6 +1701,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V4SImode:
     case E_V2DImode:
     case E_V8HFmode:
+    case E_V8BFmode:
     case E_V4SFmode:
     case E_V2DFmode:
       return TARGET_SIMD ? VEC_ADVSIMD : 0;
@@ -15603,6 +15605,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
 	  field_t = aarch64_fp16_type_node;
 	  field_ptr_t = aarch64_fp16_ptr_type_node;
 	  break;
+	case E_BFmode:
+	  field_t = aarch64_bf16_type_node;
+	  field_ptr_t = aarch64_bf16_ptr_type_node;
+	  break;
 	case E_V2SImode:
 	case E_V4SImode:
 	  {
@@ -16116,6 +16122,8 @@ aarch64_vq_mode (scalar_mode mode)
       return V4SFmode;
     case E_HFmode:
       return V8HFmode;
+    case E_BFmode:
+      return V8BFmode;
     case E_SImode:
       return V4SImode;
     case E_HImode:
@@ -16149,6 +16157,8 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
 	  return V2SFmode;
 	case E_HFmode:
 	  return V4HFmode;
+	case E_BFmode:
+	  return V4BFmode;
 	case E_SImode:
 	  return V2SImode;
 	case E_HImode:
@@ -16263,9 +16273,14 @@ aarch64_mangle_type (const_tree type)
   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
     return "St9__va_list";
 
-  /* Half-precision float.  */
+  /* Half-precision floating point types.  */
   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
-    return "Dh";
+    {
+      if (TYPE_MODE (type) == BFmode)
+	return "u6__bf16";
+      else
+	return "Dh";
+    }
 
   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
      builtin types.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 34cb99e28975de2ef10d7f4202417e2f05a870a2..85106910f7459d1211e729c73f222f99f04e6d7f 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1304,8 +1304,8 @@
 })
 
 (define_expand "mov<mode>"
-  [(set (match_operand:GPF_TF_F16 0 "nonimmediate_operand")
-	(match_operand:GPF_TF_F16 1 "general_operand"))]
+  [(set (match_operand:GPF_TF_F16_MOV 0 "nonimmediate_operand")
+	(match_operand:GPF_TF_F16_MOV 1 "general_operand"))]
   ""
   {
     if (!TARGET_FLOAT)
@@ -1321,11 +1321,11 @@
   }
 )
 
-(define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
-  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
-    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+(define_insn "*mov<mode>_aarch64"
+  [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HFBF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
+  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
+    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
    movi\\t%0.4h, #0
    fmov\\t%h0, %w1
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
new file mode 100644
index 0000000000000000000000000000000000000000..3759c0d1cb449a7f0125cc2a1433127564d66622
--- /dev/null
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -0,0 +1,32 @@
+/* Arm BF16 instrinsics include file.
+
+   Copyright (C) 2019-2020 Free Software Foundation, Inc.
+   Contributed by Arm.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _AARCH64_BF16_H_
+#define _AARCH64_BF16_H_
+
+typedef __bf16 bfloat16_t;
+
+#endif
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c7425346b86b5f5310a7148c465497b53ac75bf5..eaba156e26cf35b07b96972fe2741a9c00d6caa9 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -73,6 +73,9 @@ typedef __fp16 float16_t;
 typedef float float32_t;
 typedef double float64_t;
 
+typedef __Bfloat16x4_t bfloat16x4_t;
+typedef __Bfloat16x8_t bfloat16x8_t;
+
 typedef struct int8x8x2_t
 {
   int8x8_t val[2];
@@ -34606,6 +34609,8 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #pragma GCC pop_options
 
+#include "arm_bf16.h"
+
 #undef __aarch64_vget_lane_any
 #undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e5fa31f6748ee81d4323f11544fd8edb19d9af43..b0be5492e5e928daae93fde08c046150eab631e2 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -57,9 +57,16 @@
 ;; Iterator for all scalar floating point modes (HF, SF, DF)
 (define_mode_iterator GPF_HF [HF SF DF])
 
+;; Iterator for all 16-bit scalar floating point modes (HF, BF)
+(define_mode_iterator HFBF [HF BF])
+
 ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
 (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
 
+;; Iterator for all scalar floating point modes suitable for moving, including
+;; special BF type (HF, SF, DF, TF and BF)
+(define_mode_iterator GPF_TF_F16_MOV [HF BF SF DF TF])
+
 ;; Double vector modes.
 (define_mode_iterator VDF [V2SF V4HF])
 
@@ -79,6 +86,9 @@
 ;; Double vector modes.
 (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
 
+;; Double vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
+
 ;; All modes stored in registers d0-d31.
 (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
 
@@ -97,6 +107,12 @@
 ;; Copy of the above.
 (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
+;; Quad vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
+
+;; VQMOV without 2-element modes.
+(define_mode_iterator VQMOV_NO2E [V16QI V8HI V4SI V8HF V8BF V4SF])
+
 ;; Quad integer vector modes.
 (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
 
@@ -160,6 +176,11 @@
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing,
+;; including special Bfloat vector types.
+(define_mode_iterator VALL_F16MOV [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				   V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
 				V4HF V8HF V2SF V4SF])
@@ -745,6 +766,7 @@
 			  (V2SI "2") (V4SI "4")
 				     (V2DI "2")
 			  (V4HF "4") (V8HF "8")
+			  (V4BF "4") (V8BF "8")
 			  (V2SF "2") (V4SF "4")
 			  (V1DF "1") (V2DF "2")
 			  (DI "1") (DF "1")])
@@ -885,7 +907,8 @@
 		 (V8HF "16b") (V2SF  "8b")
 		 (V4SF "16b") (V2DF  "16b")
 		 (DI   "8b")  (DF    "8b")
-		 (SI   "8b")  (SF    "8b")])
+		 (SI   "8b")  (SF    "8b")
+		 (V4BF "8b")  (V8BF  "16b")])
 
 ;; Define element mode for each vector mode.
 (define_mode_attr VEL [(V8QI "QI") (V16QI "QI")
@@ -965,12 +988,13 @@
 			 (V2SI "SI")    (V4SI  "V2SI")
 			 (V2DI "DI")    (V2SF  "SF")
 			 (V4SF "V2SF")  (V4HF "V2HF")
-			 (V8HF "V4HF")  (V2DF  "DF")])
+			 (V8HF "V4HF")  (V2DF  "DF")
+			 (V8BF "V4BF")])
 
 ;; Half modes of all vector modes, in lower-case.
 (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
			 (V4HI "v2hi")  (V8HI  "v4hi")
-			 (V8HF  "v4hf")
+			 (V8HF  "v4hf") (V8BF  "v4bf")
			 (V2SI "si")    (V4SI  "v2si")
			 (V2DI "di")    (V2SF  "sf")
			 (V4SF "v2sf")  (V2DF  "df")])
@@ -1265,6 +1289,7 @@
 		      (V2SI "") (V4SI  "_q")
 		      (DI   "") (V2DI  "_q")
 		      (V4HF "") (V8HF "_q")
+		      (V4BF "") (V8BF "_q")
 		      (V2SF "") (V4SF  "_q")
 				(V2DF  "_q")
 		      (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")])
diff --git a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
index 5740c0281b2fdf8bbc11d9428ca2f6ba8f1760a0..50c1452ed83c8a2f4ad3b162931292db328813c6 100644
--- a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
+++ b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
@@ -14,6 +14,7 @@ void f4 (uint16x4_t a) {}
 void f5 (uint32x2_t a) {}
 void f23 (uint64x1_t a) {}
 void f61 (float16x4_t a) {}
+void f62 (bfloat16x4_t a) {}
 void f6 (float32x2_t a) {}
 void f7 (poly8x8_t a) {}
 void f8 (poly16x4_t a) {}
@@ -27,6 +28,7 @@ void f14 (uint16x8_t a) {}
 void f15 (uint32x4_t a) {}
 void f16 (uint64x2_t a) {}
 void f171 (float16x8_t a) {}
+void f172 (bfloat16x8_t a) {}
 void f17 (float32x4_t a) {}
 void f18 (float64x2_t a) {}
 void f19 (poly8x16_t a) {}
@@ -45,6 +47,7 @@ void g1 (int8x16_t, int8x16_t) {}
 // { dg-final { scan-assembler "_Z2f512__Uint32x2_t:" } }
 // { dg-final { scan-assembler "_Z3f2312__Uint64x1_t:" } }
 // { dg-final { scan-assembler "_Z3f6113__Float16x4_t:" } }
+// { dg-final { scan-assembler "_Z3f6214__Bfloat16x4_t:" } }
 // { dg-final { scan-assembler "_Z2f613__Float32x2_t:" } }
 // { dg-final { scan-assembler "_Z2f711__Poly8x8_t:" } }
 // { dg-final { scan-assembler "_Z2f812__Poly16x4_t:" } }
@@ -57,6 +60,7 @@ void g1 (int8x16_t, int8x16_t) {}
 // { dg-final { scan-assembler "_Z3f1512__Uint32x4_t:" } }
 // { dg-final { scan-assembler "_Z3f1612__Uint64x2_t:" } }
 // { dg-final { scan-assembler "_Z4f17113__Float16x8_t:" } }
+// { dg-final { scan-assembler "_Z4f17214__Bfloat16x8_t:" } }
 // { dg-final { scan-assembler "_Z3f1713__Float32x4_t:" } }
 // { dg-final { scan-assembler "_Z3f1813__Float64x2_t:" } }
 // { dg-final { scan-assembler "_Z3f1912__Poly8x16_t:" } }
diff --git a/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C b/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C
new file mode 100644
index 0000000000000000000000000000000000000000..5426a1814b842db9d73d556bcc228d19f970f466
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C
@@ -0,0 +1,13 @@
+/* { dg-do compile { target aarch64*-*-* } } */
+
+/* Test mangling */
+
+/* { dg-final { scan-assembler "\t.global\t_Z1fPu6__bf16" } } */
+void f (__bf16 *x) { }
+
+/* { dg-final { scan-assembler "\t.global\t_Z1gPu6__bf16S_" } } */
+void g (__bf16 *x, __bf16 *y) { }
+
+/* { dg-final { scan-assembler "\t.global\t_ZN1SIu6__bf16u6__bf16E1iE" } } */
+template <typename T, typename U> struct S { static int i; };
+template <> int S<__bf16, __bf16>::i = 3;
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..ef43766495c8f7bc628e658b2818bdc5b8bea247
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c
@@ -0,0 +1,102 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	mov	v1.h\[0\], v2.h\[0\]
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	dup	v1.4h, w1
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	umov	w1, v1.h\[0\]
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	mov	w1, w2
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	strh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_rm (bfloat16_t *ptr)
+{
+  register bfloat16_t x asm ("w2");
+  asm volatile ("" : "=r" (x));
+  *ptr = x;
+}
+
+/*
+**bfloat_mov_mr:
+**	ldrh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_mr (bfloat16_t *ptr)
+{
+  register bfloat16_t y asm ("w2");
+  y = *ptr;
+  asm volatile ("" :: "r" (y));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..df8e7518c24c6534f04f1e1b3c50e2655f69bf95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c
@@ -0,0 +1,106 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+#pragma GCC push_options
+#pragma GCC target ("+bf16")
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	mov	v1.h\[0\], v2.h\[0\]
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	dup	v1.4h, w1
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	umov	w1, v1.h\[0\]
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	mov	w1, w2
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	strh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_rm (bfloat16_t *ptr)
+{
+  register bfloat16_t x asm ("w2");
+  asm volatile ("" : "=r" (x));
+  *ptr = x;
+}
+
+/*
+**bfloat_mov_mr:
+**	ldrh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_mr (bfloat16_t *ptr)
+{
+  register bfloat16_t y asm ("w2");
+  y = *ptr;
+  asm volatile ("" :: "r" (y));
+}
+
+#pragma GCC pop_options
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..5d7a4317ceefbdd411062fe506e3bf9461d98bf8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c
@@ -0,0 +1,101 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	mov	v1.h\[0\], v2.h\[0\]
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	dup	v1.4h, w1
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	umov	w1, v1.h\[0\]
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	mov	w1, w2
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	strh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_rm (bfloat16_t *ptr)
+{
+  register bfloat16_t x asm ("w2");
+  asm volatile ("" : "=r" (x));
+  *ptr = x;
+}
+
+/*
+**bfloat_mov_mr:
+**	ldrh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_mr (bfloat16_t *ptr)
+{
+  register bfloat16_t y asm ("w2");
+  y = *ptr;
+  asm volatile ("" :: "r" (y));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..b812011c223b257fe405ef210d24bf5edc3535c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c
@@ -0,0 +1,16 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-std=c99 -pedantic-errors -O3 --save-temps" } */
+
+#include <arm_bf16.h>
+
+_Complex bfloat16_t stacktest1 (_Complex bfloat16_t __a)
+{
+  volatile _Complex bfloat16_t b = __a;
+  return b;
+}
+
+/* { dg-error {ISO C does not support plain 'complex' meaning 'double complex'} "" { target *-*-* } 8 } */
+/* { dg-error {expected '=', ',', ';', 'asm' or '__attribute__' before 'stacktest1'} "" { target *-*-* } 8 } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..6cad557ebf2cd8e9b2f063d1cc7e9ad4a3e6ac31
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c
@@ -0,0 +1,93 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub	sp, sp, #16
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub	sp, sp, #16
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/* Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..3891dcfc900ab942bf29eb638d16660a194597e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c
@@ -0,0 +1,97 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+#pragma GCC push_options
+#pragma GCC target ("+bf16")
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub	sp, sp, #16
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub	sp, sp, #16
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/* Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..b35f5e527be1fe7a6fd928bcd326b57fb376596a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c
@@ -0,0 +1,92 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub	sp, sp, #16
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub	sp, sp, #16
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/* Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }