fpetrogalli created this revision.
fpetrogalli added reviewers: sdesmalen, kmclaughlin, c-rhodes, ctetreau.
Herald added subscribers: llvm-commits, cfe-commits, psnobl, rkruppe, hiraditya, tschuett.
Herald added a reviewer: efriedma.
Herald added projects: clang, LLVM.

The following intrinsics have been extended to support the brain floating-point type (bfloat16):

svbfloat16_t svclasta[_bf16](svbool_t pg, svbfloat16_t fallback, svbfloat16_t data)
bfloat16_t svclasta[_n_bf16](svbool_t pg, bfloat16_t fallback, svbfloat16_t data)
bfloat16_t svlasta[_bf16](svbool_t pg, svbfloat16_t op)

svbfloat16_t svclastb[_bf16](svbool_t pg, svbfloat16_t fallback, svbfloat16_t data)
bfloat16_t svclastb[_n_bf16](svbool_t pg, bfloat16_t fallback, svbfloat16_t data)
bfloat16_t svlastb[_bf16](svbool_t pg, svbfloat16_t op)

svbfloat16_t svdup[_n]_bf16(bfloat16_t op)
svbfloat16_t svdup[_n]_bf16_m(svbfloat16_t inactive, svbool_t pg, bfloat16_t op)
svbfloat16_t svdup[_n]_bf16_x(svbool_t pg, bfloat16_t op)
svbfloat16_t svdup[_n]_bf16_z(svbool_t pg, bfloat16_t op)

svbfloat16_t svdupq[_n]_bf16(bfloat16_t x0, bfloat16_t x1, bfloat16_t x2, bfloat16_t x3, bfloat16_t x4, bfloat16_t x5, bfloat16_t x6, bfloat16_t x7)
svbfloat16_t svdupq_lane[_bf16](svbfloat16_t data, uint64_t index)

svbfloat16_t svinsr[_n_bf16](svbfloat16_t op1, bfloat16_t op2)
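
For illustration, a minimal usage sketch of the non-overloaded forms is shown below. It assumes a compiler that provides these extensions through <arm_sve.h> and is invoked with the SVE and BF16 features enabled (for example -march=armv8.6-a+sve+bf16); the wrapper function names are made up for this example.

  #include <arm_sve.h>

  // Broadcast a scalar bfloat16 into every element of a scalable vector
  // (svdup[_n]_bf16).
  svbfloat16_t splat_bf16(bfloat16_t v) {
    return svdup_n_bf16(v);
  }

  // Return the element of 'data' that follows the last active element of
  // 'pg', or 'fallback' when no element is active (svclasta[_n_bf16]).
  bfloat16_t after_last_bf16(svbool_t pg, bfloat16_t fallback, svbfloat16_t data) {
    return svclasta_n_bf16(pg, fallback, data);
  }

  // Shift the elements of 'v' up by one position and insert the scalar 's'
  // at element 0 (svinsr[_n_bf16]).
  svbfloat16_t insert_bf16(svbfloat16_t v, bfloat16_t s) {
    return svinsr_n_bf16(v, s);
  }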


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D82345

Files:
  clang/include/clang/Basic/arm_sve.td
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr.c
  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/lib/Target/AArch64/SVEInstrFormats.td
  llvm/test/CodeGen/AArch64/sve-intrinsics-dup-bfloat.ll
  llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
  llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
  llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
  llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll

Index: llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -165,6 +165,14 @@
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @insr_bf16(<vscale x 8 x bfloat> %a, bfloat %b) {
+; CHECK-LABEL: insr_bf16:
+; CHECK: insr z0.h, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.insr.nxv8bf16(<vscale x 8 x bfloat> %a, bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @insr_f32(<vscale x 4 x float> %a, float %b) {
 ; CHECK-LABEL: insr_f32:
 ; CHECK: insr z0.s, s1
@@ -348,6 +356,7 @@
 declare <vscale x 4 x i32> @llvm.aarch64.sve.insr.nxv4i32(<vscale x 4 x i32>, i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.insr.nxv2i64(<vscale x 2 x i64>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.insr.nxv8f16(<vscale x 8 x half>, half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.insr.nxv8bf16(<vscale x 8 x bfloat>, bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.insr.nxv4f32(<vscale x 4 x float>, float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.insr.nxv2f64(<vscale x 2 x double>, double)
 
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -57,6 +57,16 @@
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @dup_bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, bfloat %b) {
+; CHECK-LABEL: dup_bf16:
+; CHECK: mov z0.h, p0/m, h1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> %a,
+                                                                   <vscale x 8 x i1> %pg,
+                                                                   bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @dup_f32(<vscale x 4 x float> %a, <vscale x 4 x i1> %pg, float %b) {
 ; CHECK-LABEL: dup_f32:
 ; CHECK: mov z0.s, p0/m, s1
@@ -82,5 +92,6 @@
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -57,6 +57,16 @@
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @clasta_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: clasta_bf16:
+; CHECK: clasta z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.clasta.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                                      <vscale x 8 x bfloat> %a,
+                                                                      <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @clasta_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clasta_f32:
 ; CHECK: clasta z0.s, p0, z0.s, z1.s
@@ -131,6 +141,16 @@
   ret half %out
 }
 
+define bfloat @clasta_n_bf16(<vscale x 8 x i1> %pg, bfloat %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: clasta_n_bf16:
+; CHECK: clasta h0, p0, h0, z1.h
+; CHECK-NEXT: ret
+  %out = call bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                         bfloat %a,
+                                                         <vscale x 8 x bfloat> %b)
+  ret bfloat %out
+}
+
 define float @clasta_n_f32(<vscale x 4 x i1> %pg, float %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clasta_n_f32:
 ; CHECK: clasta s0, p0, s0, z1.s
@@ -205,6 +225,16 @@
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @clastb_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: clastb_bf16:
+; CHECK: clastb z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.clastb.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                                      <vscale x 8 x bfloat> %a,
+                                                                      <vscale x 8 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @clastb_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clastb_f32:
 ; CHECK: clastb z0.s, p0, z0.s, z1.s
@@ -279,6 +309,16 @@
   ret half %out
 }
 
+define bfloat @clastb_n_bf16(<vscale x 8 x i1> %pg, bfloat %a, <vscale x 8 x bfloat> %b) {
+; CHECK-LABEL: clastb_n_bf16:
+; CHECK: clastb h0, p0, h0, z1.h
+; CHECK-NEXT: ret
+  %out = call bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                         bfloat %a,
+                                                        <vscale x 8 x bfloat> %b)
+  ret bfloat %out
+}
+
 define float @clastb_n_f32(<vscale x 4 x i1> %pg, float %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: clastb_n_f32:
 ; CHECK: clastb s0, p0, s0, z1.s
@@ -343,6 +383,14 @@
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @dupq_bf16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: dupq_bf16:
+; CHECK: mov z0.q, q0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %a, i64 0)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 4 x float> @dupq_f32(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: dupq_f32:
 ; CHECK: mov z0.q, z0.q[1]
@@ -433,6 +481,20 @@
 }
 
 ; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 8 x bfloat> @dupq_lane_bf16(<vscale x 8 x bfloat> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_bf16:
+; CHECK-DAG:  index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG:  and   [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG:  add   [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG:  mov   [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK:      add   [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %a, i64 %idx)
+  ret <vscale x 8 x bfloat> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
 define <vscale x 4 x float> @dupq_lane_f32(<vscale x 4 x float> %a, i64 %idx) {
 ; CHECK-LABEL: dupq_lane_f32:
 ; CHECK-DAG:  index [[Z1:z[0-9]+]].d, #0, #1
@@ -595,6 +657,15 @@
   ret half %res
 }
 
+define bfloat @lasta_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: lasta_bf16
+; CHECK: lasta h0, p0, z0.h
+; CHECK-NEXT: ret
+  %res = call bfloat @llvm.aarch64.sve.lasta.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                      <vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
 define float @lasta_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
 ; CHECK-LABEL: lasta_f32
 ; CHECK: lasta s0, p0, z0.s
@@ -671,6 +742,15 @@
   ret half %res
 }
 
+define bfloat @lastb_bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: lastb_bf16
+; CHECK: lastb h0, p0, z0.h
+; CHECK-NEXT: ret
+  %res = call bfloat @llvm.aarch64.sve.lastb.nxv8bf16(<vscale x 8 x i1> %pg,
+                                                      <vscale x 8 x bfloat> %a)
+  ret bfloat %res
+}
+
 define float @lastb_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
 ; CHECK-LABEL: lastb_f32
 ; CHECK: lastb s0, p0, z0.s
@@ -1760,6 +1840,7 @@
 declare <vscale x 4 x i32> @llvm.aarch64.sve.clasta.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.clasta.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare <vscale x 8 x half> @llvm.aarch64.sve.clasta.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.clasta.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 4 x float> @llvm.aarch64.sve.clasta.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.clasta.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
 
@@ -1768,6 +1849,7 @@
 declare i32 @llvm.aarch64.sve.clasta.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.clasta.n.nxv2i64(<vscale x 2 x i1>, i64, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.clasta.n.nxv8f16(<vscale x 8 x i1>, half, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.clasta.n.nxv8bf16(<vscale x 8 x i1>, bfloat, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.clasta.n.nxv4f32(<vscale x 4 x i1>, float, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.clasta.n.nxv2f64(<vscale x 2 x i1>, double, <vscale x 2 x double>)
 
@@ -1776,6 +1858,7 @@
 declare <vscale x 4 x i32> @llvm.aarch64.sve.clastb.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.clastb.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare <vscale x 8 x half> @llvm.aarch64.sve.clastb.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.clastb.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 4 x float> @llvm.aarch64.sve.clastb.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.clastb.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
 
@@ -1784,6 +1867,7 @@
 declare i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1>, i64, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1>, half, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.clastb.n.nxv8bf16(<vscale x 8 x i1>, bfloat, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1>, float, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1>, double, <vscale x 2 x double>)
 
@@ -1797,6 +1881,7 @@
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
 
@@ -1813,6 +1898,7 @@
 declare i32 @llvm.aarch64.sve.lasta.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.lasta.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.lasta.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.lasta.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.lasta.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
 declare float @llvm.aarch64.sve.lasta.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.lasta.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
@@ -1822,6 +1908,7 @@
 declare i32 @llvm.aarch64.sve.lastb.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>)
 declare i64 @llvm.aarch64.sve.lastb.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>)
 declare half @llvm.aarch64.sve.lastb.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>)
+declare bfloat @llvm.aarch64.sve.lastb.nxv8bf16(<vscale x 8 x i1>, <vscale x 8 x bfloat>)
 declare float @llvm.aarch64.sve.lastb.nxv2f32(<vscale x 2 x i1>, <vscale x 2 x float>)
 declare float @llvm.aarch64.sve.lastb.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
 declare double @llvm.aarch64.sve.lastb.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-dup-x.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sve -asm-verbose=0 < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sve,+bf16 -asm-verbose=0 < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -81,6 +81,14 @@
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 8 x bfloat> @dup_bf16(bfloat %b) {
+; CHECK-LABEL: dup_bf16:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %b)
+  ret <vscale x 8 x bfloat> %out
+}
+
 define <vscale x 8 x half> @dup_imm_f16(half %b) {
 ; CHECK-LABEL: dup_imm_f16:
 ; CHECK: mov z0.h, #16.00000000
@@ -126,5 +134,6 @@
 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
 declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat)
 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-dup-bfloat.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-dup-bfloat.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
+
+define <vscale x 8 x bfloat> @test_svdup_n_bf16_z(<vscale x 16 x i1> %pg, bfloat %op) local_unnamed_addr #0 {
+; CHECK-LABEL: test_svdup_n_bf16_z:
+entry:
+  %0 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  %1 = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x i1> %0, bfloat %op)
+  ret <vscale x 8 x bfloat> %1
+}
+
+; Function Attrs: nounwind readnone
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat) #1
+
+
+define <vscale x 8 x bfloat> @test_svdup_n_bf16_m(<vscale x 8 x bfloat> %inactive, <vscale x 16 x i1> %pg, bfloat %op) local_unnamed_addr #0 {
+; CHECK-LABEL: test_svdup_n_bf16_m:
+entry:
+  %0 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  %1 = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %0, bfloat %op)
+  ret <vscale x 8 x bfloat> %1
+}
+
+
+define <vscale x 8 x bfloat> @test_svdup_n_bf16_x(<vscale x 16 x i1> %pg, bfloat %op) local_unnamed_addr #0 {
+; CHECK-LABEL: test_svdup_n_bf16_x:
+entry:
+  %0 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  %1 = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> %0, bfloat %op)
+  ret <vscale x 8 x bfloat> %1
+}
+
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -1237,6 +1237,7 @@
   def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64>;
 
   def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, f16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, bf16, !cast<Instruction>(NAME # _H)>;
   def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, f32, !cast<Instruction>(NAME # _S)>;
   def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, f64, !cast<Instruction>(NAME # _D)>;
 }
@@ -5669,9 +5670,10 @@
   def _S : sve_int_perm_clast_vz<0b10, ab, asm, ZPR32, FPR32>;
   def _D : sve_int_perm_clast_vz<0b11, ab, asm, ZPR64, FPR64>;
 
-  def : SVE_3_Op_Pat<f16, op, nxv8i1,  f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_3_Op_Pat<f32, op, nxv4i1,  f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_3_Op_Pat<f64, op, nxv2i1,  f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+  def : SVE_3_Op_Pat<f16,  op, nxv8i1, f16,  nxv8f16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<bf16, op, nxv8i1, bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<f32,  op, nxv4i1, f32,  nxv4f32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<f64,  op, nxv2i1, f64,  nxv2f64, !cast<Instruction>(NAME # _D)>;
 }
 
 class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm,
@@ -5708,9 +5710,10 @@
   def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1,  nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
   def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1,  nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
 
-  def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+  def : SVE_3_Op_Pat<nxv8f16,  op, nxv8i1, nxv8f16,  nxv8f16,  !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4f32,  op, nxv4i1, nxv4f32,  nxv4f32,  !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2f64,  op, nxv2i1, nxv2f64,  nxv2f64,  !cast<Instruction>(NAME # _D)>;
 }
 
 class sve_int_perm_last_r<bits<2> sz8_64, bit ab, string asm,
@@ -5769,10 +5772,11 @@
   def _S : sve_int_perm_last_v<0b10, ab, asm, ZPR32, FPR32>;
   def _D : sve_int_perm_last_v<0b11, ab, asm, ZPR64, FPR64>;
 
-  def : SVE_2_Op_Pat<f16, op, nxv8i1,  nxv8f16, !cast<Instruction>(NAME # _H)>;
-  def : SVE_2_Op_Pat<f32, op, nxv4i1,  nxv4f32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_2_Op_Pat<f32, op, nxv2i1,  nxv2f32, !cast<Instruction>(NAME # _S)>;
-  def : SVE_2_Op_Pat<f64, op, nxv2i1,  nxv2f64, !cast<Instruction>(NAME # _D)>;
+  def : SVE_2_Op_Pat<f16,  op, nxv8i1, nxv8f16,  !cast<Instruction>(NAME # _H)>;
+  def : SVE_2_Op_Pat<bf16, op, nxv8i1, nxv8bf16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_2_Op_Pat<f32,  op, nxv4i1, nxv4f32,  !cast<Instruction>(NAME # _S)>;
+  def : SVE_2_Op_Pat<f32,  op, nxv2i1, nxv2f32,  !cast<Instruction>(NAME # _S)>;
+  def : SVE_2_Op_Pat<f64,  op, nxv2i1, nxv2f64,  !cast<Instruction>(NAME # _D)>;
 }
 
 class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty>
@@ -5979,6 +5983,7 @@
                   (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>;
 
   def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8i1, f16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i1, bf16, !cast<Instruction>(NAME # _H)>;
   def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4i1, f32, !cast<Instruction>(NAME # _S)>;
   def : SVE_3_Op_Pat<nxv2f32, op, nxv2f32, nxv2i1, f32, !cast<Instruction>(NAME # _S)>;
   def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2i1, f64, !cast<Instruction>(NAME # _D)>;
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -403,6 +403,8 @@
   // Duplicate FP scalar into all vector elements
   def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))),
             (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+  def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
+            (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
   def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))),
             (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
   def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))),
@@ -415,12 +417,13 @@
             (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>;
 
   // Duplicate +0.0 into all vector elements
-  def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
-  def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
-  def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
-  def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
-  def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
-  def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
+  def : Pat<(nxv8f16  (AArch64dup (f16 fpimm0))),  (DUP_ZI_H 0, 0)>;
+  def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
+  def : Pat<(nxv4f16  (AArch64dup (f16 fpimm0))),  (DUP_ZI_H 0, 0)>;
+  def : Pat<(nxv2f16  (AArch64dup (f16 fpimm0))),  (DUP_ZI_H 0, 0)>;
+  def : Pat<(nxv4f32  (AArch64dup (f32 fpimm0))),  (DUP_ZI_S 0, 0)>;
+  def : Pat<(nxv2f32  (AArch64dup (f32 fpimm0))),  (DUP_ZI_S 0, 0)>;
+  def : Pat<(nxv2f64  (AArch64dup (f64 fpimm0))),  (DUP_ZI_D 0, 0)>;
 
   // Duplicate Int immediate into all vector elements
   def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
@@ -1430,12 +1433,12 @@
     def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
     def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
     def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
     def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
     def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
 
     def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
     def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
-    def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
     def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
     def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
     def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
@@ -1454,6 +1457,9 @@
     def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
     def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
     def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+
+    def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+    def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
   }
 
   def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr.c
===================================================================
--- clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr.c
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_insr.c
@@ -4,6 +4,9 @@
 // RUN: %clang_cc1 -D__ARM_FEATURE_SVE -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
 // RUN: FileCheck --check-prefix=ASM --allow-empty %s <%t
 
+// RUN: %clang_cc1 -DENABLE_BF16_TESTS -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECKBF16
+// RUN: %clang_cc1 -DENABLE_BF16_TESTS -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECKBF16
+
 // If this check fails please read test/CodeGen/aarch64-sve-intrinsics/README for instructions on how to resolve it.
 // ASM-NOT: warning
 #include <arm_sve.h>
@@ -87,6 +90,16 @@
   return SVE_ACLE_FUNC(svinsr,_n_f16,,)(op1, op2);
 }
 
+#ifdef ENABLE_BF16_TESTS
+svbfloat16_t test_svinsr_n_bf16(svbfloat16_t op1, bfloat16_t op2)
+{
+  // CHECKBF16-LABEL: test_svinsr_n_bf16
+  // CHECKBF16: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.insr.nxv8bf16(<vscale x 8 x bfloat> %op1, bfloat %op2)
+  // CHECKBF16: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svinsr,_n_bf16,,)(op1, op2);
+}
+#endif
+
 svfloat32_t test_svinsr_n_f32(svfloat32_t op1, float32_t op2)
 {
   // CHECK-LABEL: test_svinsr_n_f32
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c
===================================================================
--- clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c
@@ -1,6 +1,10 @@
 // RUN: %clang_cc1 -D__ARM_FEATURE_SVE -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -D__ARM_FEATURE_SVE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
 
+// RUN: %clang_cc1 -DENABLE_BF16_TESTS -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECKBF16
+// RUN: %clang_cc1 -DENABLE_BF16_TESTS -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECKBF16
+
+
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
@@ -82,6 +86,16 @@
   return SVE_ACLE_FUNC(svdupq_lane,_f16,,)(data, index);
 }
 
+#ifdef ENABLE_BF16_TESTS
+svbfloat16_t test_svdupq_lane_bf16(svbfloat16_t data, uint64_t index)
+{
+  // CHECKBF16-LABEL: test_svdupq_lane_bf16
+  // CHECKBF16: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %data, i64 %index)
+  // CHECKBF16: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svdupq_lane,_bf16,,)(data, index);
+}
+#endif
+
 svfloat32_t test_svdupq_lane_f32(svfloat32_t data, uint64_t index)
 {
   // CHECK-LABEL: test_svdupq_lane_f32
@@ -249,6 +263,25 @@
   return SVE_ACLE_FUNC(svdupq,_n,_f16,)(x0, x1, x2, x3, x4, x5, x6, x7);
 }
 
+#ifdef ENABLE_BF16_TESTS
+svbfloat16_t test_svdupq_n_bf16(bfloat16_t x0, bfloat16_t x1, bfloat16_t x2, bfloat16_t x3,
+                                bfloat16_t x4, bfloat16_t x5, bfloat16_t x6, bfloat16_t x7)
+{
+  // CHECKBF16-LABEL: test_svdupq_n_bf16
+  // CHECKBF16: %[[ALLOCA:.*]] = alloca [8 x bfloat], align 16
+  // CHECKBF16-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x bfloat], [8 x bfloat]* %[[ALLOCA]], i64 0, i64 0
+  // CHECKBF16-DAG: store bfloat %x0, bfloat* %[[BASE]], align 16
+  // (stores of x1..x6 are omitted from the checks)
+  // CHECKBF16-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x bfloat], [8 x bfloat]* %[[ALLOCA]], i64 0, i64 7
+  // CHECKBF16: store bfloat %x7, bfloat* %[[GEP]], align 2
+  // CHECKBF16-NOT: store
+  // CHECKBF16: call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  // CHECKBF16: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %{{.*}}, bfloat* nonnull %[[BASE]])
+  // CHECKBF16: ret <vscale x 8 x bfloat> %[[LOAD]]
+  return SVE_ACLE_FUNC(svdupq,_n,_bf16,)(x0, x1, x2, x3, x4, x5, x6, x7);
+}
+#endif
+
 svfloat32_t test_svdupq_n_f32(float32_t x0, float32_t x1, float32_t x2, float32_t x3)
 {
   // CHECK-LABEL: test_svdupq_n_f32
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup.c
===================================================================
--- clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup.c
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dup.c
@@ -1,6 +1,9 @@
 // RUN: %clang_cc1 -D__ARM_FEATURE_SVE -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -D__ARM_FEATURE_SVE -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
 
+// RUN: %clang_cc1 -DENABLE_BF16_TESTS -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECKBF16
+// RUN: %clang_cc1 -DENABLE_BF16_TESTS -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECKBF16
+
 #include <arm_sve.h>
 
 #ifdef SVE_OVERLOADED_FORMS
@@ -82,6 +85,16 @@
   return SVE_ACLE_FUNC(svdup,_n,_f16,)(op);
 }
 
+#ifdef ENABLE_BF16_TESTS
+svbfloat16_t test_svdup_n_bf16(bfloat16_t op)
+{
+  // CHECKBF16-LABEL: test_svdup_n_bf16
+  // CHECKBF16: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.x.nxv8bf16(bfloat %op)
+  // CHECKBF16: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svdup,_n,_bf16,)(op);
+}
+#endif
+
 svfloat32_t test_svdup_n_f32(float32_t op)
 {
   // CHECK-LABEL: test_svdup_n_f32
@@ -177,6 +190,17 @@
   return SVE_ACLE_FUNC(svdup,_n,_f16_z,)(pg, op);
 }
 
+#ifdef ENABLE_BF16_TESTS
+svbfloat16_t test_svdup_n_bf16_z(svbool_t pg, bfloat16_t op)
+{
+  // CHECKBF16-LABEL: test_svdup_n_bf16_z
+  // CHECKBF16: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECKBF16: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x i1> %[[PG]], bfloat %op)
+  // CHECKBF16: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svdup,_n,_bf16_z,)(pg, op);
+}
+#endif
+
 svfloat32_t test_svdup_n_f32_z(svbool_t pg, float32_t op)
 {
   // CHECK-LABEL: test_svdup_n_f32_z
@@ -274,6 +298,17 @@
   return SVE_ACLE_FUNC(svdup,_n,_f16_m,)(inactive, pg, op);
 }
 
+#ifdef ENABLE_BF16_TESTS
+svbfloat16_t test_svdup_n_bf16_m(svbfloat16_t inactive, svbool_t pg, bfloat16_t op)
+{
+  // CHECKBF16-LABEL: test_svdup_n_bf16_m
+  // CHECKBF16: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECKBF16: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> %inactive, <vscale x 8 x i1> %[[PG]], bfloat %op)
+  // CHECKBF16: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svdup,_n,_bf16_m,)(inactive, pg, op);
+}
+#endif
+
 svfloat32_t test_svdup_n_f32_m(svfloat32_t inactive, svbool_t pg, float32_t op)
 {
   // CHECK-LABEL: test_svdup_n_f32_m
@@ -371,6 +406,17 @@
   return SVE_ACLE_FUNC(svdup,_n,_f16_x,)(pg, op);
 }
 
+#ifdef ENABLE_BF16_TESTS
+svbfloat16_t test_svdup_n_bf16_x(svbool_t pg, bfloat16_t op)
+{
+  // CHECKBF16-LABEL: test_svdup_n_bf16_x
+  // CHECKBF16: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECKBF16: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.dup.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> %[[PG]], bfloat %op)
+  // CHECKBF16: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  return SVE_ACLE_FUNC(svdup,_n,_bf16_x,)(pg, op);
+}
+#endif
+
 svfloat32_t test_svdup_n_f32_x(svbool_t pg, float32_t op)
 {
   // CHECK-LABEL: test_svdup_n_f32_x
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -7720,6 +7720,9 @@
     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
   case SVETypeFlags::EltTyBool64:
     return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
+
+  case SVETypeFlags::EltTyBFloat16:
+    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
   }
 }
 
@@ -8363,6 +8366,7 @@
   case SVE::BI__builtin_sve_svdupq_n_s64:
   case SVE::BI__builtin_sve_svdupq_n_u16:
   case SVE::BI__builtin_sve_svdupq_n_f16:
+  case SVE::BI__builtin_sve_svdupq_n_bf16:
   case SVE::BI__builtin_sve_svdupq_n_s16:
   case SVE::BI__builtin_sve_svdupq_n_u32:
   case SVE::BI__builtin_sve_svdupq_n_f32:
Index: clang/include/clang/Basic/arm_sve.td
===================================================================
--- clang/include/clang/Basic/arm_sve.td
+++ clang/include/clang/Basic/arm_sve.td
@@ -678,13 +678,23 @@
 
 def SVDUPQ_8  : SInst<"svdupq[_n]_{d}", "dssssssssssssssss",  "cUc", MergeNone>;
 def SVDUPQ_16 : SInst<"svdupq[_n]_{d}", "dssssssss",  "sUsh", MergeNone>;
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16) && defined(__ARM_FEATURE_BF16_SCALAR_ARITHMETIC)" in {
+  def SVDUPQ_BF16 : SInst<"svdupq[_n]_{d}", "dssssssss",  "b", MergeNone>;
+}
 def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss",  "iUif", MergeNone>;
 def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss",  "lUld", MergeNone>;
 
-def SVDUP   : SInst<"svdup[_n]_{d}", "ds",   "csilUcUsUiUlhfd", MergeNone,    "aarch64_sve_dup_x">;
-def SVDUP_M : SInst<"svdup[_n]_{d}", "ddPs", "csilUcUsUiUlhfd", MergeOp1,     "aarch64_sve_dup">;
-def SVDUP_X : SInst<"svdup[_n]_{d}", "dPs",  "csilUcUsUiUlhfd", MergeAnyExp,  "aarch64_sve_dup">;
-def SVDUP_Z : SInst<"svdup[_n]_{d}", "dPs",  "csilUcUsUiUlhfd", MergeZeroExp, "aarch64_sve_dup">;
+multiclass svdup_base<string n, string p, MergeType mt, string i> {
+  def NAME   : SInst<n, p,   "csilUcUsUiUlhfd", mt,    i>;
+  let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16) && defined(__ARM_FEATURE_BF16_SCALAR_ARITHMETIC)" in {
+    def _BF16: SInst<n, p,   "b", mt,    i>;
+  }
+}
+
+defm SVDUP   : svdup_base<"svdup[_n]_{d}", "ds",   MergeNone,    "aarch64_sve_dup_x">;
+defm SVDUP_M : svdup_base<"svdup[_n]_{d}", "ddPs", MergeOp1,     "aarch64_sve_dup">;
+defm SVDUP_X : svdup_base<"svdup[_n]_{d}", "dPs",  MergeAnyExp,  "aarch64_sve_dup">;
+defm SVDUP_Z : svdup_base<"svdup[_n]_{d}", "dPs",  MergeZeroExp, "aarch64_sve_dup">;
 
 def SVINDEX : SInst<"svindex_{d}",   "dss",  "csilUcUsUiUl",    MergeNone,    "aarch64_sve_index">;
 
@@ -803,8 +813,11 @@
 def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil",            MergeOp1,  "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
 def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil",            MergeAny,  "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
 def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil",            MergeZero, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>;
-def SVINSR   : SInst<"svinsr[_n_{d}]", "dds",  "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr">;
 
+def SVINSR   : SInst<"svinsr[_n_{d}]", "dds",  "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr">;
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16) && defined(__ARM_FEATURE_BF16_SCALAR_ARITHMETIC)" in {
+  def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds",  "b", MergeNone, "aarch64_sve_insr">;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // Integer reductions
@@ -1133,6 +1146,9 @@
 // instruction's immediate.
 def SVDUP_LANE   : SInst<"svdup_lane[_{d}]",  "ddL",  "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl">;
 def SVDUPQ_LANE  : SInst<"svdupq_lane[_{d}]", "ddn",  "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_dupq_lane">;
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16) && defined(__ARM_FEATURE_BF16_SCALAR_ARITHMETIC)" in {
+  def SVDUPQ_LANE_BF16  : SInst<"svdupq_lane[_{d}]", "ddn",  "b", MergeNone, "aarch64_sve_dupq_lane">;
+}
 def SVEXT        : SInst<"svext[_{d}]",       "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>;
 def SVLASTA      : SInst<"svlasta[_{d}]",     "sPd",  "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_lasta">;
 def SVLASTB      : SInst<"svlastb[_{d}]",     "sPd",  "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_lastb">;