[PATCH] D81740: [AArch32]: BFloat MatMul Intrinsics&CodeGen

Luke Geeson via Phabricator via cfe-commits Mon, 15 Jun 2020 04:54:14 -0700

LukeGeeson updated this revision to Diff 270720.
LukeGeeson added a comment.


Removed redundancy in the patch.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D81740/new/

https://reviews.llvm.org/D81740

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
  llvm/include/llvm/IR/IntrinsicsARM.td
  llvm/lib/Target/ARM/ARMInstrNEON.td
  llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll

Index: llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll
@@ -0,0 +1,150 @@
+; RUN: llc -mtriple armv8.6a-arm-none-eabi  -mattr=+bf16 -float-abi=hard %s -o - | FileCheck %s --check-prefix=CHECK
+
+; CHECK-LABEL: test_vbfdot_f32
+; CHECK: vdot.bf16       d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast <4 x bfloat> %a to <8 x i8>
+  %1 = bitcast <4 x bfloat> %b to <8 x i8>
+  %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) #3
+  ret <2 x float> %vbfdot1.i
+}
+
+; CHECK-LABEL: test_vbfdotq_f32
+; CHECK: vdot.bf16       q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 {
+entry:
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+  %vbfdot1.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+  ret <4 x float> %vbfdot1.i
+}
+
+; CHECK-LABEL: test_vbfdot_lane_f32
+; CHECK: vdot.bf16       d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[0]
+define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast <4 x bfloat> %b to <2 x float>
+  %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
+  %1 = bitcast <4 x bfloat> %a to <8 x i8>
+  %2 = bitcast <2 x float> %shuffle to <8 x i8>
+  %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) #3
+  ret <2 x float> %vbfdot1.i
+}
+
+; CHECK-LABEL: test_vbfdotq_laneq_f32
+; CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[1]
+; CHECK: vdot.bf16       q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 {
+entry:
+  %0 = bitcast <8 x bfloat> %b to <4 x float>
+  %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %1 = bitcast <8 x bfloat> %a to <16 x i8>
+  %2 = bitcast <4 x float> %shuffle to <16 x i8>
+  %vbfdot1.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) #3
+  ret <4 x float> %vbfdot1.i
+}
+
+; CHECK-LABEL: test_vbfdot_laneq_f32
+; CHECK: vdot.bf16       d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[1]
+define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 {
+entry:
+  %0 = bitcast <8 x bfloat> %b to <4 x float>
+  %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+  %1 = bitcast <4 x bfloat> %a to <8 x i8>
+  %2 = bitcast <2 x float> %shuffle to <8 x i8>
+  %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) #3
+  ret <2 x float> %vbfdot1.i
+}
+
+; CHECK-LABEL: test_vbfdotq_lane_f32
+; CHECK: vdot.bf16       q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
+define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) local_unnamed_addr #1 {
+entry:
+  %0 = bitcast <4 x bfloat> %b to <2 x float>
+  %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
+  %1 = bitcast <8 x bfloat> %a to <16 x i8>
+  %2 = bitcast <4 x float> %shuffle to <16 x i8>
+  %vbfdot1.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) #3
+  ret <4 x float> %vbfdot1.i
+}
+
+; CHECK-LABEL: test_vbfmmlaq_f32
+; CHECK: vmmla.bf16      q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 {
+entry:
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+  %vbfmmla1.i = tail call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+  ret <4 x float> %vbfmmla1.i
+}
+
+; CHECK-LABEL: test_vbfmlalbq_f32
+; CHECK: vfmab.bf16      q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 {
+entry:
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+  %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+  ret <4 x float> %vbfmlalb1.i
+}
+
+; CHECK-LABEL: test_vbfmlaltq_f32
+; CHECK: vfmat.bf16      q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 {
+entry:
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+  %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+  ret <4 x float> %vbfmlalt1.i
+}
+
+; CHECK-LABEL: test_vbfmlalbq_lane_f32
+; CHECK: vfmab.bf16      q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
+define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) local_unnamed_addr #1 {
+entry:
+  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+  %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+  ret <4 x float> %vbfmlalb1.i
+}
+
+; CHECK-LABEL: test_vbfmlalbq_laneq_f32
+; CHECK: vfmab.bf16      q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[3]
+define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 {
+entry:
+  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+  %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+  ret <4 x float> %vbfmlalb1.i
+}
+
+; CHECK-LABEL: test_vbfmlaltq_lane_f32
+; CHECK: vfmat.bf16      q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
+define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) local_unnamed_addr #1 {
+entry:
+  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+  %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+  ret <4 x float> %vbfmlalt1.i
+}
+
+; CHECK-LABEL: test_vbfmlaltq_laneq_f32
+; CHECK: vfmat.bf16      q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[3]
+define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 {
+entry:
+  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
+  %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+  ret <4 x float> %vbfmlalt1.i
+}
+
+declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2
+declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
+declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
+declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
+declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
Index: llvm/lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrNEON.td
+++ llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -9035,15 +9035,20 @@
 // ARMv8.6a BFloat16 instructions.
 let Predicates = [HasBF16, HasNEON] in {
 class BF16VDOT<bits<5> op27_23, bits<2> op21_20, bit op6,
-               dag oops, dag iops>
+               dag oops, dag iops, list<dag> pattern>
    : N3Vnp<op27_23, op21_20, 0b1101, op6, 0, oops, iops,
-           N3RegFrm, IIC_VDOTPROD, "", "", []> {
+           N3RegFrm, IIC_VDOTPROD, "", "", pattern>
+{
     let DecoderNamespace = "VFPV8";
 }
 
 class BF16VDOTS<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy, ValueType InputTy>
    : BF16VDOT<0b11000, 0b00,  Q, (outs RegTy:$dst),
-              (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm)> {
+              (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
+            [(set (AccumTy RegTy:$dst),
+                  (int_arm_neon_bfdot (AccumTy RegTy:$Vd),
+                                      (InputTy RegTy:$Vn),
+                                      (InputTy RegTy:$Vm)))]> {
   let Constraints = "$dst = $Vd";
   let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
     let DecoderNamespace = "VFPV8";
@@ -9054,7 +9059,7 @@
 
   def "" : BF16VDOT<0b11100, 0b00, Q, (outs RegTy:$dst),
                     (ins RegTy:$Vd, RegTy:$Vn,
-                    DPR_VFP2:$Vm, VectorIndex32:$lane)> {
+                    DPR_VFP2:$Vm, VectorIndex32:$lane), []> {
     bit lane;
     let Inst{5} = lane;
     let Constraints = "$dst = $Vd";
@@ -9062,6 +9067,13 @@
     let DecoderNamespace = "VFPV8";
   }
 
+  def : Pat<
+    (AccumTy (int_arm_neon_bfdot (AccumTy RegTy:$Vd),
+                                 (InputTy RegTy:$Vn),
+                                 (InputTy (bitconvert (AccumTy
+                                          (ARMvduplane (AccumTy RegTy:$Vm),
+                                                        VectorIndex32:$lane)))))),
+    (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
 }
 
 def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>;
@@ -9074,7 +9086,10 @@
              string opc>
    : N3Vnp<0b11000, 0b00, 0b1100, Q, 0,
            (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
-           N3RegFrm, IIC_VDOTPROD, "", "", []> {
+           N3RegFrm, IIC_VDOTPROD, "", "",
+                [(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
+                                                (v16i8 QPR:$Vn),
+                                                (v16i8 QPR:$Vm)))]> {
    let Constraints = "$dst = $Vd";
    let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
    let DecoderNamespace = "VFPV8";
@@ -9082,19 +9097,22 @@
 
 def VMMLA : BF16MM<1, QPR, "vmmla">;
 
-class VBF16MALQ<bit T, string suffix>
+class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode>
   : N3VCP8<0b00, 0b11, T, 1,
            (outs QPR:$dst), (ins QPR:$Vd, QPR:$Vn, QPR:$Vm),
            NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
-              []> { // TODO: Add intrinsics
+                [(set (v4f32 QPR:$dst),
+                      (OpNode (v4f32 QPR:$Vd),
+                              (v16i8 QPR:$Vn),
+                              (v16i8 QPR:$Vm)))]> {
   let Constraints = "$dst = $Vd";
   let DecoderNamespace = "VFPV8";
 }
 
-def VBF16MALTQ: VBF16MALQ<1, "t">;
-def VBF16MALBQ: VBF16MALQ<0, "b">;
+def VBF16MALTQ: VBF16MALQ<1, "t", int_arm_neon_bfmlalt>;
+def VBF16MALBQ: VBF16MALQ<0, "b", int_arm_neon_bfmlalb>;
 
-multiclass VBF16MALQI<bit T, string suffix> {
+multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> {
   def "" : N3VLaneCP8<0, 0b11, T, 1, (outs QPR:$dst),
               (ins QPR:$Vd, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$idx),
                IIC_VMACD, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm$idx", "", []> {
@@ -9105,10 +9123,17 @@
   let DecoderNamespace = "VFPV8";
   }
 
+  def : Pat<
+    (v4f32 (OpNode (v4f32 QPR:$Vd),
+                   (v16i8 QPR:$Vn),
+                   (v16i8 (bitconvert (v8bf16
+                          (ARMvduplane (v8bf16 QPR:$Vm),
+                                        VectorIndex16:$lane)))))),
+    (!cast<Instruction>(NAME) QPR:$Vd, QPR:$Vn, (EXTRACT_SUBREG QPR:$Vm, dsub_0), VectorIndex32:$lane)>;
 }
 
-defm VBF16MALTQI: VBF16MALQI<1, "t">;
-defm VBF16MALBQI: VBF16MALQI<0, "b">;
+defm VBF16MALTQI: VBF16MALQI<1, "t", int_arm_neon_bfmlalt>;
+defm VBF16MALBQI: VBF16MALQI<0, "b", int_arm_neon_bfmlalb>;
 
 def BF16_VCVT :  N2V<0b11, 0b11, 0b01, 0b10, 0b01100, 1, 0,
                     (outs DPR:$Vd), (ins QPR:$Vm),
Index: llvm/include/llvm/IR/IntrinsicsARM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsARM.td
+++ llvm/include/llvm/IR/IntrinsicsARM.td
@@ -785,7 +785,16 @@
 def int_arm_neon_usdot  : Neon_Dot_Intrinsic;
 
 // v8.6-A Bfloat Intrinsics
+def int_arm_neon_bfdot : Neon_Dot_Intrinsic;
+def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic;
 
+class Neon_FML_Intrinsic
+  : Intrinsic<[llvm_anyvector_ty],
+              [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+              [IntrNoMem]>;
+def int_arm_neon_bfmlalb : Neon_FML_Intrinsic;
+def int_arm_neon_bfmlalt : Neon_FML_Intrinsic;
+ 
 def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
 
Index: clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
@@ -0,0 +1,203 @@
+// RUN: %clang_cc1 -triple armv8-arm-none-eabi \
+// RUN:   -O2 -target-feature +neon -target-feature +bf16 \
+// RUN:   -mfloat-abi hard \
+// RUN:   -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BF16
+// RUN: %clang_cc1 -triple armv8-arm-none-eabi \
+// RUN:   -O2 -target-feature +neon -target-feature +bf16 \
+// RUN:   -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NO-BF16
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: test_vbfdot_f32
+// CHECK-BF16:  %0 = bitcast <4 x bfloat> %a to <8 x i8>
+// CHECK-BF16:  %1 = bitcast <4 x bfloat> %b to <8 x i8>
+// CHECK-BF16:  %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
+// CHECK-BF16:  ret <2 x float> %vbfdot1.i
+// CHECK-NO-BF16:  %0 = bitcast <2 x i32> %a.coerce to <8 x i8>
+// CHECK-NO-BF16:  %1 = bitcast <2 x i32> %b.coerce to <8 x i8>
+// CHECK-NO-BF16:  %vbfdot3.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) #3
+// CHECK-NO-BF16:  ret <2 x float> %vbfdot3.i
+float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
+  return vbfdot_f32(r, a, b);
+}
+
+// CHECK-LABEL: test_vbfdotq_f32
+// CHECK-BF16:  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+// CHECK-BF16:  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+// CHECK-BF16:  %vbfdot1.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+// CHECK-BF16:  ret <4 x float> %vbfdot1.i
+// CHECK-NO-BF16:  %0 = bitcast <4 x i32> %a.coerce to <16 x i8>
+// CHECK-NO-BF16:  %1 = bitcast <4 x i32> %b.coerce to <16 x i8>
+// CHECK-NO-BF16:  %vbfdot3.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+// CHECK-NO-BF16:  ret <4 x float> %vbfdot3.i
+float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
+  return vbfdotq_f32(r, a, b);
+}
+
+// CHECK-LABEL: test_vbfdot_lane_f32
+// CHECK-BF16:  %0 = bitcast <4 x bfloat> %b to <2 x float>
+// CHECK-BF16:  %lane = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
+// CHECK-BF16:  %1 = bitcast <4 x bfloat> %a to <8 x i8>
+// CHECK-BF16:  %2 = bitcast <2 x float> %lane to <8 x i8>
+// CHECK-BF16:  %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
+// CHECK-BF16:  ret <2 x float> %vbfdot1.i
+// CHECK-NO-BF16:  %0 = shufflevector <2 x i32> %b.coerce, <2 x i32> undef, <2 x i32> zeroinitializer
+// CHECK-NO-BF16:  %1 = bitcast <2 x i32> %a.coerce to <8 x i8>
+// CHECK-NO-BF16:  %2 = bitcast <2 x i32> %0 to <8 x i8>
+// CHECK-NO-BF16:  %vbfdot3.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) #3
+// CHECK-NO-BF16:  ret <2 x float> %vbfdot3.i
+float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
+  return vbfdot_lane_f32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vbfdotq_laneq_f32
+// CHECK-BF16:  %0 = bitcast <8 x bfloat> %b to <4 x float>
+// CHECK-BF16:  %lane = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-BF16:  %1 = bitcast <8 x bfloat> %a to <16 x i8>
+// CHECK-BF16:  %2 = bitcast <4 x float> %lane to <16 x i8>
+// CHECK-BF16:  %vbfdot1.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
+// CHECK-BF16:  ret <4 x float> %vbfdot1.i
+// CHECK-NO-BF16:  %0 = shufflevector <4 x i32> %b.coerce, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NO-BF16:  %1 = bitcast <4 x i32> %a.coerce to <16 x i8>
+// CHECK-NO-BF16:  %2 = bitcast <4 x i32> %0 to <16 x i8>
+// CHECK-NO-BF16:  %vbfdot3.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) #3
+// CHECK-NO-BF16:  ret <4 x float> %vbfdot3.i
+float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+  return vbfdotq_laneq_f32(r, a, b, 3);
+}
+
+// CHECK-LABEL: test_vbfdot_laneq_f32
+// CHECK-BF16:  %0 = bitcast <8 x bfloat> %b to <4 x float>
+// CHECK-BF16:  %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+// CHECK-BF16:  %1 = bitcast <4 x bfloat> %a to <8 x i8>
+// CHECK-BF16:  %2 = bitcast <2 x float> %lane to <8 x i8>
+// CHECK-BF16:  %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
+// CHECK-BF16:  ret <2 x float> %vbfdot1.i
+// CHECK-NO-BF16:  %0 = bitcast <4 x i32> %b.coerce to <4 x float>
+// CHECK-NO-BF16:  %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+// CHECK-NO-BF16:  %1 = bitcast <2 x i32> %a.coerce to <8 x i8>
+// CHECK-NO-BF16:  %2 = bitcast <2 x float> %lane to <8 x i8>
+// CHECK-NO-BF16:  %vbfdot3.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) #3
+// CHECK-NO-BF16:  ret <2 x float> %vbfdot3.i
+float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
+  return vbfdot_laneq_f32(r, a, b, 3);
+}
+
+// CHECK-LABEL: test_vbfdotq_lane_f32
+// CHECK-BF16:  %0 = bitcast <4 x bfloat> %b to <2 x float>
+// CHECK-BF16:  %lane = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
+// CHECK-BF16:  %1 = bitcast <8 x bfloat> %a to <16 x i8>
+// CHECK-BF16:  %2 = bitcast <4 x float> %lane to <16 x i8>
+// CHECK-BF16:  %vbfdot1.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
+// CHECK-BF16:  ret <4 x float> %vbfdot1.i
+// CHECK-NO-BF16:  %0 = bitcast <2 x i32> %b.coerce to <2 x float>
+// CHECK-NO-BF16:  %lane = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
+// CHECK-NO-BF16:  %1 = bitcast <4 x i32> %a.coerce to <16 x i8>
+// CHECK-NO-BF16:  %2 = bitcast <4 x float> %lane to <16 x i8>
+// CHECK-NO-BF16:  %vbfdot3.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) #3
+// CHECK-NO-BF16:  ret <4 x float> %vbfdot3.i
+float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
+  return vbfdotq_lane_f32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vbfmmlaq_f32
+// CHECK-BF16:  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+// CHECK-BF16:  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+// CHECK-BF16:  %vbfmmla1.i = tail call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+// CHECK-BF16:  ret <4 x float> %vbfmmla1.i
+// CHECK-NO-BF16:  %0 = bitcast <4 x i32> %a.coerce to <16 x i8>
+// CHECK-NO-BF16:  %1 = bitcast <4 x i32> %b.coerce to <16 x i8>
+// CHECK-NO-BF16:  %vbfmmla3.i = tail call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+// CHECK-NO-BF16:  ret <4 x float> %vbfmmla3.i
+float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+  return vbfmmlaq_f32(r, a, b);
+}
+
+// CHECK-LABEL: test_vbfmlalbq_f32
+// CHECK-BF16:  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+// CHECK-BF16:  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+// CHECK-BF16:  %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+// CHECK-BF16:  ret <4 x float> %vbfmlalb1.i
+// CHECK-NO-BF16:  %0 = bitcast <4 x i32> %a.coerce to <16 x i8>
+// CHECK-NO-BF16:  %1 = bitcast <4 x i32> %b.coerce to <16 x i8>
+// CHECK-NO-BF16:  %vbfmlalb3.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+// CHECK-NO-BF16:  ret <4 x float> %vbfmlalb3.i
+float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+  return vbfmlalbq_f32(r, a, b);
+}
+
+// CHECK-LABEL: test_vbfmlaltq_f32
+// CHECK-BF16:  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+// CHECK-BF16:  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+// CHECK-BF16:  %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+// CHECK-BF16:  ret <4 x float> %vbfmlalt1.i
+// CHECK-NO-BF16:  %0 = bitcast <4 x i32> %a.coerce to <16 x i8>
+// CHECK-NO-BF16:  %1 = bitcast <4 x i32> %b.coerce to <16 x i8>
+// CHECK-NO-BF16:  %vbfmlalt3.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+// CHECK-NO-BF16:  ret <4 x float> %vbfmlalt3.i
+float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+  return vbfmlaltq_f32(r, a, b);
+}
+
+// CHECK-LABEL: test_vbfmlalbq_lane_f32
+// CHECK-BF16:  %vecinit71 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
+// CHECK-BF16:  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+// CHECK-BF16:  %1 = bitcast <8 x bfloat> %vecinit71 to <16 x i8>
+// CHECK-BF16:  %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+// CHECK-BF16:  ret <4 x float> %vbfmlalb1.i
+// CHECK-NO-BF16:  %bc = bitcast <2 x i32> %b.coerce to <4 x bfloat>
+// CHECK-NO-BF16:  %vecinit73 = shufflevector <4 x bfloat> %bc, <4 x bfloat> undef, <8 x i32> zeroinitializer
+// CHECK-NO-BF16:  %0 = bitcast <4 x i32> %a.coerce to <16 x i8>
+// CHECK-NO-BF16:  %1 = bitcast <8 x bfloat> %vecinit73 to <16 x i8>
+// CHECK-NO-BF16:  %vbfmlalb3.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+// CHECK-NO-BF16:  ret <4 x float> %vbfmlalb3.i
+float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
+  return vbfmlalbq_lane_f32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vbfmlalbq_laneq_f32
+// CHECK-BF16:  %vecinit71 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-BF16:  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+// CHECK-BF16:  %1 = bitcast <8 x bfloat> %vecinit71 to <16 x i8>
+// CHECK-BF16:  %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+// CHECK-NO-BF16:  %bc = bitcast <4 x i32> %b.coerce to <8 x bfloat>
+// CHECK-NO-BF16:  %vecinit73 = shufflevector <8 x bfloat> %bc, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NO-BF16:  %0 = bitcast <4 x i32> %a.coerce to <16 x i8>
+// CHECK-NO-BF16:  %1 = bitcast <8 x bfloat> %vecinit73 to <16 x i8>
+// CHECK-NO-BF16:  %vbfmlalb3.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+// CHECK-NO-BF16:  ret <4 x float> %vbfmlalb3.i
+float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+  return vbfmlalbq_laneq_f32(r, a, b, 3);
+}
+
+// CHECK-LABEL: test_vbfmlaltq_lane_f32
+// CHECK-BF16:  %vecinit71 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
+// CHECK-BF16:  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+// CHECK-BF16:  %1 = bitcast <8 x bfloat> %vecinit71 to <16 x i8>
+// CHECK-BF16:  %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+// CHECK-BF16:  ret <4 x float> %vbfmlalt1.i
+// CHECK-NO-BF16:  %bc = bitcast <2 x i32> %b.coerce to <4 x bfloat>
+// CHECK-NO-BF16:  %vecinit73 = shufflevector <4 x bfloat> %bc, <4 x bfloat> undef, <8 x i32> zeroinitializer
+// CHECK-NO-BF16:  %0 = bitcast <4 x i32> %a.coerce to <16 x i8>
+// CHECK-NO-BF16:  %1 = bitcast <8 x bfloat> %vecinit73 to <16 x i8>
+// CHECK-NO-BF16:  %vbfmlalt3.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+// CHECK-NO-BF16:  ret <4 x float> %vbfmlalt3.i
+float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
+  return vbfmlaltq_lane_f32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vbfmlaltq_laneq_f32
+// CHECK-BF16:  %vecinit71 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-BF16:  %0 = bitcast <8 x bfloat> %a to <16 x i8>
+// CHECK-BF16:  %1 = bitcast <8 x bfloat> %vecinit71 to <16 x i8>
+// CHECK-BF16:  %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+// CHECK-BF16:  ret <4 x float> %vbfmlalt1.i
+// CHECK-NO-BF16:  %bc = bitcast <4 x i32> %b.coerce to <8 x bfloat>
+// CHECK-NO-BF16:  %vecinit73 = shufflevector <8 x bfloat> %bc, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NO-BF16:  %0 = bitcast <4 x i32> %a.coerce to <16 x i8>
+// CHECK-NO-BF16:  %1 = bitcast <8 x bfloat> %vecinit73 to <16 x i8>
+// CHECK-NO-BF16:  %vbfmlalt3.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3
+// CHECK-NO-BF16:  ret <4 x float> %vbfmlalt3.i
+float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
+  return vbfmlaltq_laneq_f32(r, a, b, 3);
+}
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -4681,6 +4681,11 @@
   NEONMAP1(vaeseq_v, arm_neon_aese, 0),
   NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0),
   NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0),
+  NEONMAP1(vbfdot_v, arm_neon_bfdot, 0),
+  NEONMAP1(vbfdotq_v, arm_neon_bfdot, 0),
+  NEONMAP1(vbfmlalbq_v, arm_neon_bfmlalb, 0),
+  NEONMAP1(vbfmlaltq_v, arm_neon_bfmlalt, 0),
+  NEONMAP1(vbfmmlaq_v, arm_neon_bfmmla, 0),
   NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
   NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
   NEONMAP1(vcadd_rot270_v, arm_neon_vcadd_rot270, Add1ArgType),

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D81740: [AArch32]: BFloat MatMul Intrinsics&CodeGen

Reply via email to