LukeGeeson updated this revision to Diff 270720. LukeGeeson added a comment.
Removed redundancy in patch.

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D81740/new/

https://reviews.llvm.org/D81740

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
  llvm/include/llvm/IR/IntrinsicsARM.td
  llvm/lib/Target/ARM/ARMInstrNEON.td
  llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll
Index: llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll @@ -0,0 +1,150 @@ +; RUN: llc -mtriple armv8.6a-arm-none-eabi -mattr=+bf16 -float-abi=hard %s -o - | FileCheck %s --check-prefix=CHECK + +; CHECK-LABEL: test_vbfdot_f32 +; CHECK: vdot.bf16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) local_unnamed_addr #0 { +entry: + %0 = bitcast <4 x bfloat> %a to <8 x i8> + %1 = bitcast <4 x bfloat> %b to <8 x i8> + %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) #3 + ret <2 x float> %vbfdot1.i +} + +; CHECK-LABEL: test_vbfdotq_f32 +; CHECK: vdot.bf16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 { +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfdot1.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 + ret <4 x float> %vbfdot1.i +} + +; CHECK-LABEL: test_vbfdot_lane_f32 +; CHECK: vdot.bf16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[0] +define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) local_unnamed_addr #0 { +entry: + %0 = bitcast <4 x bfloat> %b to <2 x float> + %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer + %1 = bitcast <4 x bfloat> %a to <8 x i8> + %2 = bitcast <2 x float> %shuffle to <8 x i8> + %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) #3 + ret <2 x float> %vbfdot1.i +} + +; CHECK-LABEL: test_vbfdotq_laneq_f32 +; CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[1] +; CHECK: vdot.bf16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +define <4 x 
float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 { +entry: + %0 = bitcast <8 x bfloat> %b to <4 x float> + %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %1 = bitcast <8 x bfloat> %a to <16 x i8> + %2 = bitcast <4 x float> %shuffle to <16 x i8> + %vbfdot1.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) #3 + ret <4 x float> %vbfdot1.i +} + +; CHECK-LABEL: test_vbfdot_laneq_f32 +; CHECK: vdot.bf16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[1] +define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 { +entry: + %0 = bitcast <8 x bfloat> %b to <4 x float> + %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3> + %1 = bitcast <4 x bfloat> %a to <8 x i8> + %2 = bitcast <2 x float> %shuffle to <8 x i8> + %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) #3 + ret <2 x float> %vbfdot1.i +} + +; CHECK-LABEL: test_vbfdotq_lane_f32 +; CHECK: vdot.bf16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0] +define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) local_unnamed_addr #1 { +entry: + %0 = bitcast <4 x bfloat> %b to <2 x float> + %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer + %1 = bitcast <8 x bfloat> %a to <16 x i8> + %2 = bitcast <4 x float> %shuffle to <16 x i8> + %vbfdot1.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) #3 + ret <4 x float> %vbfdot1.i +} + +; CHECK-LABEL: test_vbfmmlaq_f32 +; CHECK: vmmla.bf16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 { +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x 
i8> + %vbfmmla1.i = tail call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 + ret <4 x float> %vbfmmla1.i +} + +; CHECK-LABEL: test_vbfmlalbq_f32 +; CHECK: vfmab.bf16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 { +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 + ret <4 x float> %vbfmlalb1.i +} + +; CHECK-LABEL: test_vbfmlaltq_f32 +; CHECK: vfmat.bf16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 { +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 + ret <4 x float> %vbfmlalt1.i +} + +; CHECK-LABEL: test_vbfmlalbq_lane_f32 +; CHECK: vfmab.bf16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0] +define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) local_unnamed_addr #1 { +entry: + %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 + ret <4 x float> %vbfmlalb1.i +} + +; CHECK-LABEL: test_vbfmlalbq_laneq_f32 +; CHECK: vfmab.bf16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[3] +define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 { +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, 
i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 + ret <4 x float> %vbfmlalb1.i +} + +; CHECK-LABEL: test_vbfmlaltq_lane_f32 +; CHECK: vfmat.bf16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0] +define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) local_unnamed_addr #1 { +entry: + %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 + ret <4 x float> %vbfmlalt1.i +} + +; CHECK-LABEL: test_vbfmlaltq_laneq_f32 +; CHECK: vfmat.bf16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[3] +define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) local_unnamed_addr #1 { +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 + ret <4 x float> %vbfmlalt1.i +} + +declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2 +declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 +declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 +declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 +declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 Index: llvm/lib/Target/ARM/ARMInstrNEON.td 
=================================================================== --- llvm/lib/Target/ARM/ARMInstrNEON.td +++ llvm/lib/Target/ARM/ARMInstrNEON.td @@ -9035,15 +9035,20 @@ // ARMv8.6a BFloat16 instructions. let Predicates = [HasBF16, HasNEON] in { class BF16VDOT<bits<5> op27_23, bits<2> op21_20, bit op6, - dag oops, dag iops> + dag oops, dag iops, list<dag> pattern> : N3Vnp<op27_23, op21_20, 0b1101, op6, 0, oops, iops, - N3RegFrm, IIC_VDOTPROD, "", "", []> { + N3RegFrm, IIC_VDOTPROD, "", "", pattern> +{ let DecoderNamespace = "VFPV8"; } class BF16VDOTS<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy, ValueType InputTy> : BF16VDOT<0b11000, 0b00, Q, (outs RegTy:$dst), - (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm)> { + (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), + [(set (AccumTy RegTy:$dst), + (int_arm_neon_bfdot (AccumTy RegTy:$Vd), + (InputTy RegTy:$Vn), + (InputTy RegTy:$Vm)))]> { let Constraints = "$dst = $Vd"; let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm"); let DecoderNamespace = "VFPV8"; @@ -9054,7 +9059,7 @@ def "" : BF16VDOT<0b11100, 0b00, Q, (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, - DPR_VFP2:$Vm, VectorIndex32:$lane)> { + DPR_VFP2:$Vm, VectorIndex32:$lane), []> { bit lane; let Inst{5} = lane; let Constraints = "$dst = $Vd"; @@ -9062,6 +9067,13 @@ let DecoderNamespace = "VFPV8"; } + def : Pat< + (AccumTy (int_arm_neon_bfdot (AccumTy RegTy:$Vd), + (InputTy RegTy:$Vn), + (InputTy (bitconvert (AccumTy + (ARMvduplane (AccumTy RegTy:$Vm), + VectorIndex32:$lane)))))), + (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>; } def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>; @@ -9074,7 +9086,10 @@ string opc> : N3Vnp<0b11000, 0b00, 0b1100, Q, 0, (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), - N3RegFrm, IIC_VDOTPROD, "", "", []> { + N3RegFrm, IIC_VDOTPROD, "", "", + [(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd), + (v16i8 QPR:$Vn), + (v16i8 QPR:$Vm)))]> { let Constraints = "$dst = $Vd"; let 
AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm"); let DecoderNamespace = "VFPV8"; @@ -9082,19 +9097,22 @@ def VMMLA : BF16MM<1, QPR, "vmmla">; -class VBF16MALQ<bit T, string suffix> +class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode> : N3VCP8<0b00, 0b11, T, 1, (outs QPR:$dst), (ins QPR:$Vd, QPR:$Vn, QPR:$Vm), NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "", - []> { // TODO: Add intrinsics + [(set (v4f32 QPR:$dst), + (OpNode (v4f32 QPR:$Vd), + (v16i8 QPR:$Vn), + (v16i8 QPR:$Vm)))]> { let Constraints = "$dst = $Vd"; let DecoderNamespace = "VFPV8"; } -def VBF16MALTQ: VBF16MALQ<1, "t">; -def VBF16MALBQ: VBF16MALQ<0, "b">; +def VBF16MALTQ: VBF16MALQ<1, "t", int_arm_neon_bfmlalt>; +def VBF16MALBQ: VBF16MALQ<0, "b", int_arm_neon_bfmlalb>; -multiclass VBF16MALQI<bit T, string suffix> { +multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> { def "" : N3VLaneCP8<0, 0b11, T, 1, (outs QPR:$dst), (ins QPR:$Vd, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$idx), IIC_VMACD, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm$idx", "", []> { @@ -9105,10 +9123,17 @@ let DecoderNamespace = "VFPV8"; } + def : Pat< + (v4f32 (OpNode (v4f32 QPR:$Vd), + (v16i8 QPR:$Vn), + (v16i8 (bitconvert (v8bf16 + (ARMvduplane (v8bf16 QPR:$Vm), + VectorIndex16:$lane)))))), + (!cast<Instruction>(NAME) QPR:$Vd, QPR:$Vn, (EXTRACT_SUBREG QPR:$Vm, dsub_0), VectorIndex32:$lane)>; } -defm VBF16MALTQI: VBF16MALQI<1, "t">; -defm VBF16MALBQI: VBF16MALQI<0, "b">; +defm VBF16MALTQI: VBF16MALQI<1, "t", int_arm_neon_bfmlalt>; +defm VBF16MALBQI: VBF16MALQI<0, "b", int_arm_neon_bfmlalb>; def BF16_VCVT : N2V<0b11, 0b11, 0b01, 0b10, 0b01100, 1, 0, (outs DPR:$Vd), (ins QPR:$Vm), Index: llvm/include/llvm/IR/IntrinsicsARM.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsARM.td +++ llvm/include/llvm/IR/IntrinsicsARM.td @@ -785,7 +785,16 @@ def int_arm_neon_usdot : Neon_Dot_Intrinsic; // v8.6-A Bfloat Intrinsics +def int_arm_neon_bfdot : 
Neon_Dot_Intrinsic; +def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic; +class Neon_FML_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem]>; +def int_arm_neon_bfmlalb : Neon_FML_Intrinsic; +def int_arm_neon_bfmlalt : Neon_FML_Intrinsic; + def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; Index: clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c =================================================================== --- /dev/null +++ clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c @@ -0,0 +1,203 @@ +// RUN: %clang_cc1 -triple armv8-arm-none-eabi \ +// RUN: -O2 -target-feature +neon -target-feature +bf16 \ +// RUN: -mfloat-abi hard \ +// RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BF16 +// RUN: %clang_cc1 -triple armv8-arm-none-eabi \ +// RUN: -O2 -target-feature +neon -target-feature +bf16 \ +// RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NO-BF16 + +#include <arm_neon.h> + +// CHECK-LABEL: test_vbfdot_f32 +// CHECK-BF16: %0 = bitcast <4 x bfloat> %a to <8 x i8> +// CHECK-BF16: %1 = bitcast <4 x bfloat> %b to <8 x i8> +// CHECK-BF16: %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) +// CHECK-BF16: ret <2 x float> %vbfdot1.i +// CHECK-NO-BF16: %0 = bitcast <2 x i32> %a.coerce to <8 x i8> +// CHECK-NO-BF16: %1 = bitcast <2 x i32> %b.coerce to <8 x i8> +// CHECK-NO-BF16: %vbfdot3.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) #3 +// CHECK-NO-BF16: ret <2 x float> %vbfdot3.i +float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { + return vbfdot_f32(r, a, b); +} + +// CHECK-LABEL: test_vbfdotq_f32 +// CHECK-BF16: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK-BF16: %1 = bitcast <8 x bfloat> %b to <16 x i8> +// 
CHECK-BF16: %vbfdot1.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK-BF16: ret <4 x float> %vbfdot1.i +// CHECK-NO-BF16: %0 = bitcast <4 x i32> %a.coerce to <16 x i8> +// CHECK-NO-BF16: %1 = bitcast <4 x i32> %b.coerce to <16 x i8> +// CHECK-NO-BF16: %vbfdot3.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 +// CHECK-NO-BF16: ret <4 x float> %vbfdot3.i +float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ + return vbfdotq_f32(r, a, b); +} + +// CHECK-LABEL: test_vbfdot_lane_f32 +// CHECK-BF16: %0 = bitcast <4 x bfloat> %b to <2 x float> +// CHECK-BF16: %lane = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer +// CHECK-BF16: %1 = bitcast <4 x bfloat> %a to <8 x i8> +// CHECK-BF16: %2 = bitcast <2 x float> %lane to <8 x i8> +// CHECK-BF16: %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) +// CHECK-BF16: ret <2 x float> %vbfdot1.i +// CHECK-NO-BF16: %0 = shufflevector <2 x i32> %b.coerce, <2 x i32> undef, <2 x i32> zeroinitializer +// CHECK-NO-BF16: %1 = bitcast <2 x i32> %a.coerce to <8 x i8> +// CHECK-NO-BF16: %2 = bitcast <2 x i32> %0 to <8 x i8> +// CHECK-NO-BF16: %vbfdot3.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) #3 +// CHECK-NO-BF16: ret <2 x float> %vbfdot3.i +float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ + return vbfdot_lane_f32(r, a, b, 0); +} + +// CHECK-LABEL: test_vbfdotq_laneq_f32 +// CHECK-BF16: %0 = bitcast <8 x bfloat> %b to <4 x float> +// CHECK-BF16: %lane = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-BF16: %1 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK-BF16: %2 = bitcast <4 x float> %lane to <16 x i8> +// CHECK-BF16: %vbfdot1.i = tail call <4 x float> 
@llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) +// CHECK-BF16: ret <4 x float> %vbfdot1.i +// CHECK-NO-BF16: %0 = shufflevector <4 x i32> %b.coerce, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NO-BF16: %1 = bitcast <4 x i32> %a.coerce to <16 x i8> +// CHECK-NO-BF16: %2 = bitcast <4 x i32> %0 to <16 x i8> +// CHECK-NO-BF16: %vbfdot3.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) #3 +// CHECK-NO-BF16: ret <4 x float> %vbfdot3.i +float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfdotq_laneq_f32(r, a, b, 3); +} + +// CHECK-LABEL: test_vbfdot_laneq_f32 +// CHECK-BF16: %0 = bitcast <8 x bfloat> %b to <4 x float> +// CHECK-BF16: %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3> +// CHECK-BF16: %1 = bitcast <4 x bfloat> %a to <8 x i8> +// CHECK-BF16: %2 = bitcast <2 x float> %lane to <8 x i8> +// CHECK-BF16: %vbfdot1.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) +// CHECK-BF16: ret <2 x float> %vbfdot1.i +// CHECK-NO-BF16: %0 = bitcast <4 x i32> %b.coerce to <4 x float> +// CHECK-NO-BF16: %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3> +// CHECK-NO-BF16: %1 = bitcast <2 x i32> %a.coerce to <8 x i8> +// CHECK-NO-BF16: %2 = bitcast <2 x float> %lane to <8 x i8> +// CHECK-NO-BF16: %vbfdot3.i = tail call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) #3 +// CHECK-NO-BF16: ret <2 x float> %vbfdot3.i +float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) { + return vbfdot_laneq_f32(r, a, b, 3); +} + +// CHECK-LABEL: test_vbfdotq_lane_f32 +// CHECK-BF16: %0 = bitcast <4 x bfloat> %b to <2 x float> +// CHECK-BF16: %lane = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer +// CHECK-BF16: %1 = bitcast <8 x bfloat> %a to <16 x 
i8> +// CHECK-BF16: %2 = bitcast <4 x float> %lane to <16 x i8> +// CHECK-BF16: %vbfdot1.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) +// CHECK-BF16: ret <4 x float> %vbfdot1.i +// CHECK-NO-BF16: %0 = bitcast <2 x i32> %b.coerce to <2 x float> +// CHECK-NO-BF16: %lane = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer +// CHECK-NO-BF16: %1 = bitcast <4 x i32> %a.coerce to <16 x i8> +// CHECK-NO-BF16: %2 = bitcast <4 x float> %lane to <16 x i8> +// CHECK-NO-BF16: %vbfdot3.i = tail call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) #3 +// CHECK-NO-BF16: ret <4 x float> %vbfdot3.i +float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { + return vbfdotq_lane_f32(r, a, b, 0); +} + +// CHECK-LABEL: test_vbfmmlaq_f32 +// CHECK-BF16: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK-BF16: %1 = bitcast <8 x bfloat> %b to <16 x i8> +// CHECK-BF16: %vbfmmla1.i = tail call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK-BF16: ret <4 x float> %vbfmmla1.i +// CHECK-NO-BF16: %0 = bitcast <4 x i32> %a.coerce to <16 x i8> +// CHECK-NO-BF16: %1 = bitcast <4 x i32> %b.coerce to <16 x i8> +// CHECK-NO-BF16: %vbfmmla3.i = tail call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 +// CHECK-NO-BF16: ret <4 x float> %vbfmmla3.i +float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfmmlaq_f32(r, a, b); +} + +// CHECK-LABEL: test_vbfmlalbq_f32 +// CHECK-BF16: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK-BF16: %1 = bitcast <8 x bfloat> %b to <16 x i8> +// CHECK-BF16: %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK-BF16: ret <4 x float> %vbfmlalb1.i +// CHECK-NO-BF16: %0 = bitcast <4 x i32> %a.coerce to <16 x i8> +// 
CHECK-NO-BF16: %1 = bitcast <4 x i32> %b.coerce to <16 x i8> +// CHECK-NO-BF16: %vbfmlalb3.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 +// CHECK-NO-BF16: ret <4 x float> %vbfmlalb3.i +float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfmlalbq_f32(r, a, b); +} + +// CHECK-LABEL: test_vbfmlaltq_f32 +// CHECK-BF16: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK-BF16: %1 = bitcast <8 x bfloat> %b to <16 x i8> +// CHECK-BF16: %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK-BF16: ret <4 x float> %vbfmlalt1.i +// CHECK-NO-BF16: %0 = bitcast <4 x i32> %a.coerce to <16 x i8> +// CHECK-NO-BF16: %1 = bitcast <4 x i32> %b.coerce to <16 x i8> +// CHECK-NO-BF16: %vbfmlalt3.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 +// CHECK-NO-BF16: ret <4 x float> %vbfmlalt3.i +float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfmlaltq_f32(r, a, b); +} + +// CHECK-LABEL: test_vbfmlalbq_lane_f32 +// CHECK-BF16: %vecinit71 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK-BF16: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK-BF16: %1 = bitcast <8 x bfloat> %vecinit71 to <16 x i8> +// CHECK-BF16: %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK-BF16: ret <4 x float> %vbfmlalb1.i +// CHECK-NO-BF16: %bc = bitcast <2 x i32> %b.coerce to <4 x bfloat> +// CHECK-NO-BF16: %vecinit73 = shufflevector <4 x bfloat> %bc, <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK-NO-BF16: %0 = bitcast <4 x i32> %a.coerce to <16 x i8> +// CHECK-NO-BF16: %1 = bitcast <8 x bfloat> %vecinit73 to <16 x i8> +// CHECK-NO-BF16: %vbfmlalb3.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, 
<16 x i8> %0, <16 x i8> %1) #3 +// CHECK-NO-BF16: ret <4 x float> %vbfmlalb3.i +float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { + return vbfmlalbq_lane_f32(r, a, b, 0); +} + +// CHECK-LABEL: test_vbfmlalbq_laneq_f32 +// CHECK-BF16: %vecinit71 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-BF16: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK-BF16: %1 = bitcast <8 x bfloat> %vecinit71 to <16 x i8> +// CHECK-BF16: %vbfmlalb1.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK-NO-BF16: %bc = bitcast <4 x i32> %b.coerce to <8 x bfloat> +// CHECK-NO-BF16: %vecinit73 = shufflevector <8 x bfloat> %bc, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NO-BF16: %0 = bitcast <4 x i32> %a.coerce to <16 x i8> +// CHECK-NO-BF16: %1 = bitcast <8 x bfloat> %vecinit73 to <16 x i8> +// CHECK-NO-BF16: %vbfmlalb3.i = tail call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 +// CHECK-NO-BF16: ret <4 x float> %vbfmlalb3.i +float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfmlalbq_laneq_f32(r, a, b, 3); +} + +// CHECK-LABEL: test_vbfmlaltq_lane_f32 +// CHECK-BF16: %vecinit71 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK-BF16: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK-BF16: %1 = bitcast <8 x bfloat> %vecinit71 to <16 x i8> +// CHECK-BF16: %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK-BF16: ret <4 x float> %vbfmlalt1.i +// CHECK-NO-BF16: %bc = bitcast <2 x i32> %b.coerce to <4 x bfloat> +// CHECK-NO-BF16: %vecinit73 = shufflevector <4 x bfloat> %bc, <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK-NO-BF16: %0 = bitcast <4 x i32> 
%a.coerce to <16 x i8> +// CHECK-NO-BF16: %1 = bitcast <8 x bfloat> %vecinit73 to <16 x i8> +// CHECK-NO-BF16: %vbfmlalt3.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 +// CHECK-NO-BF16: ret <4 x float> %vbfmlalt3.i +float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { + return vbfmlaltq_lane_f32(r, a, b, 0); +} + +// CHECK-LABEL: test_vbfmlaltq_laneq_f32 +// CHECK-BF16: %vecinit71 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-BF16: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK-BF16: %1 = bitcast <8 x bfloat> %vecinit71 to <16 x i8> +// CHECK-BF16: %vbfmlalt1.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK-BF16: ret <4 x float> %vbfmlalt1.i +// CHECK-NO-BF16: %bc = bitcast <4 x i32> %b.coerce to <8 x bfloat> +// CHECK-NO-BF16: %vecinit73 = shufflevector <8 x bfloat> %bc, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NO-BF16: %0 = bitcast <4 x i32> %a.coerce to <16 x i8> +// CHECK-NO-BF16: %1 = bitcast <8 x bfloat> %vecinit73 to <16 x i8> +// CHECK-NO-BF16: %vbfmlalt3.i = tail call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) #3 +// CHECK-NO-BF16: ret <4 x float> %vbfmlalt3.i +float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfmlaltq_laneq_f32(r, a, b, 3); +} Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -4681,6 +4681,11 @@ NEONMAP1(vaeseq_v, arm_neon_aese, 0), NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0), NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0), + NEONMAP1(vbfdot_v, arm_neon_bfdot, 0), + NEONMAP1(vbfdotq_v, arm_neon_bfdot, 0), + 
NEONMAP1(vbfmlalbq_v, arm_neon_bfmlalb, 0), + NEONMAP1(vbfmlaltq_v, arm_neon_bfmlalt, 0), + NEONMAP1(vbfmmlaq_v, arm_neon_bfmmla, 0), NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType), NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType), NEONMAP1(vcadd_rot270_v, arm_neon_vcadd_rot270, Add1ArgType),
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits