sanwou01 created this revision.
Herald added subscribers: llvm-commits, cfe-commits, jdoerfert, hiraditya,
kristof.beyls.
Herald added projects: clang, LLVM.
sanwou01 added reviewers: SjoerdMeijer, dmgreen, t.p.northover.
Currently, sqdmulh_lane and friends from the ACLE (implemented in arm_neon.h),
are represented in LLVM IR as a by vector sqdmulh and a vector of (repeated)
indices, like so:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32
3, i32 3, i32 3>
%vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>
%a, <4 x i16> %shuffle)
When %v's values are known, the shufflevector is optimized away and we are no
longer able to select the lane variant of sqdmulh in the backend.
This makes it impossible to do a neat trick when using NEON intrinsics: one can
load a number of constants using a single vector load, which are then repeatedly
used to multiply whole vectors by one of the constants. This trick is used for a
nice performance upside (2.5% to 4% on one microbenchmark) in libjpeg-turbo.
This patch adds four LLVM IR intrinsics to the AArch64 backend:
- sqdmulh_lane
- sqdmulh_laneq
- sqrdmulh_lane
- sqrdmulh_laneq.
This prevents the constant propagation when it is not wanted.
In order to represent the type of these intrinsics, the patch adds
LLVMNarrowType and LLVMWideType to the IntrinsicEmitter. The second parameter
of the 'lane' variants is a vector with the same element type, restricted to a
total of 64 bits. Similarly, the 'laneq' variants' second parameter is widened
to a total of 128 bits.
The 'lane' variants also need an additional register class. The second argument
must be in the lower half of the 64-bit NEON register file, but only when
operating on i16 elements.
Note that the existing patterns for shufflevector and sqdmulh into sqdmulh_lane
(etc.) remain, so code that does not rely on NEON intrinsics to generate these
instructions is not affected.
This patch also changes clang to emit the new LLVM IR intrinsics for the
corresponding NEON intrinsics (AArch64 only).
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D71469
Files:
clang/include/clang/Basic/arm_neon.td
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGen/aarch64-neon-2velem.c
llvm/include/llvm/IR/Intrinsics.h
llvm/include/llvm/IR/Intrinsics.td
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/IR/Function.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64InstrFormats.td
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
llvm/lib/Target/AArch64/AArch64RegisterInfo.td
llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
llvm/utils/TableGen/IntrinsicEmitter.cpp
Index: llvm/utils/TableGen/IntrinsicEmitter.cpp
===================================================================
--- llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -224,7 +224,9 @@
IIT_SCALABLE_VEC = 43,
IIT_SUBDIVIDE2_ARG = 44,
IIT_SUBDIVIDE4_ARG = 45,
- IIT_VEC_OF_BITCASTS_TO_INT = 46
+ IIT_VEC_OF_BITCASTS_TO_INT = 46,
+ IIT_NARROW_VEC = 47,
+ IIT_WIDE_VEC = 48
};
static void EncodeFixedValueType(MVT::SimpleValueType VT,
@@ -302,6 +304,10 @@
Sig.push_back(IIT_SUBDIVIDE4_ARG);
else if (R->isSubClassOf("LLVMVectorOfBitcastsToInt"))
Sig.push_back(IIT_VEC_OF_BITCASTS_TO_INT);
+ else if (R->isSubClassOf("LLVMNarrowType"))
+ Sig.push_back(IIT_NARROW_VEC);
+ else if (R->isSubClassOf("LLVMWideType"))
+ Sig.push_back(IIT_WIDE_VEC);
else
Sig.push_back(IIT_ARG);
return Sig.push_back((Number << 3) | 7 /*IITDescriptor::AK_MatchType*/);
Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -9,20 +9,36 @@
declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32(<4 x i32>, <2 x i32>, i32)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32(<4 x i32>, <4 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32(<2 x i32>, <2 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32(<2 x i32>, <4 x i32>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16(<8 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16(<4 x i16>, <8 x i16>, i32)
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32(<4 x i32>, <2 x i32>, i32)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32(<4 x i32>, <4 x i32>, i32)
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32(<2 x i32>, <2 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32(<2 x i32>, <4 x i32>, i32)
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16(<8 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16(<8 x i16>, <8 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16>, <8 x i16>, i32)
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
@@ -1515,6 +1531,37 @@
ret <4 x i16> %vqdmulh2.i
}
+define <4 x i16> @test_vqdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
+ ret <4 x i16> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16> %a, <8 x i16> %v, i32 3)
+ ret <4 x i16> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16> %a, <8 x i16> %v, i32 7)
+ ret <4 x i16> %vqdmulh2.i
+}
+
define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1527,6 +1574,37 @@
ret <8 x i16> %vqdmulh2.i
}
+define <8 x i16> @test_vqdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16(<8 x i16> %a, <4 x i16> %v, i32 3)
+ ret <8 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
+ ret <8 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
+ ret <8 x i16> %vqdmulh2.i
+}
+
define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulh_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1539,6 +1617,37 @@
ret <2 x i32> %vqdmulh2.i
}
+define <2 x i32> @test_vqdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
+ ret <2 x i32> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32(<2 x i32> %a, <4 x i32> %v, i32 1)
+ ret <2 x i32> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32(<2 x i32> %a, <4 x i32> %v, i32 3)
+ ret <2 x i32> %vqdmulh2.i
+}
+
define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqdmulhq_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1551,6 +1660,37 @@
ret <4 x i32> %vqdmulh2.i
}
+define <4 x i32> @test_vqdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32(<4 x i32> %a, <2 x i32> %v, i32 1)
+ ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
+ ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+ %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
+ ret <4 x i32> %vqdmulh2.i
+}
+
define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1563,6 +1703,37 @@
ret <4 x i16> %vqrdmulh2.i
}
+define <4 x i16> @test_vqrdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
+ ret <4 x i16> %vqrdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16(<4 x i16> %a, <8 x i16> %v, i32 3)
+ ret <4 x i16> %vqrdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16(<4 x i16> %a, <8 x i16> %v, i32 7)
+ ret <4 x i16> %vqrdmulh2.i
+}
+
define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s16:
; CHECK: // %bb.0: // %entry
@@ -1575,6 +1746,37 @@
ret <8 x i16> %vqrdmulh2.i
}
+define <8 x i16> @test_vqrdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16(<8 x i16> %a, <4 x i16> %v, i32 3)
+ ret <8 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
+ ret <8 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
+ ret <8 x i16> %vqrdmulh2.i
+}
+
define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulh_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1587,6 +1789,37 @@
ret <2 x i32> %vqrdmulh2.i
}
+define <2 x i32> @test_vqrdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
+ ret <2 x i32> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32(<2 x i32> %a, <4 x i32> %v, i32 1)
+ ret <2 x i32> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32(<2 x i32> %a, <4 x i32> %v, i32 3)
+ ret <2 x i32> %vqrdmulh2.i
+}
+
define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulhq_lane_s32:
; CHECK: // %bb.0: // %entry
@@ -1599,6 +1832,37 @@
ret <4 x i32> %vqrdmulh2.i
}
+define <4 x i32> @test_vqrdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32(<4 x i32> %a, <2 x i32> %v, i32 1)
+ ret <4 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
+ ret <4 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
+ ret <4 x i32> %vqrdmulh2.i
+}
+
define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
; CHECK-LABEL: test_vmul_lane_f32:
; CHECK: // %bb.0: // %entry
Index: llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
===================================================================
--- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1033,8 +1033,10 @@
bool isNeonVectorRegLo() const {
return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
- AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
- Reg.RegNum);
+ (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+ Reg.RegNum) ||
+ AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains(
+ Reg.RegNum));
}
template <unsigned Class> bool isSVEVectorReg() const {
Index: llvm/lib/Target/AArch64/AArch64RegisterInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -429,6 +429,10 @@
def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
v1i64, v4f16],
64, (sequence "D%u", 0, 31)>;
+def FPR64_lo : RegisterClass<"AArch64",
+ [v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64],
+ 64, (trunc FPR64, 16)>;
+
// We don't (yet) have an f128 legal type, so don't use that here. We
// normalize 128-bit vectors to v2f64 for arg passing and such, so use
// that here.
@@ -503,6 +507,9 @@
let Name = "VectorRegLo";
let PredicateMethod = "isNeonVectorRegLo";
}
+def V64_lo : RegisterOperand<FPR64_lo, "printVRegOperand"> {
+ let ParserMatchClass = VectorRegLoAsmOperand;
+}
def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
let ParserMatchClass = VectorRegLoAsmOperand;
}
Index: llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -572,6 +572,7 @@
return 32;
case AArch64::FPR128_loRegClassID:
+ case AArch64::FPR64_loRegClassID:
return 16;
}
}
Index: llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -229,6 +229,7 @@
case AArch64::FPR16RegClassID:
case AArch64::FPR32RegClassID:
case AArch64::FPR64RegClassID:
+ case AArch64::FPR64_loRegClassID:
case AArch64::FPR128RegClassID:
case AArch64::FPR128_loRegClassID:
case AArch64::DDRegClassID:
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5577,6 +5577,11 @@
defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
+ int_aarch64_neon_sqdmulh_laneq>;
+defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane,
+ int_aarch64_neon_sqrdmulh_laneq>;
+
// Generated by MachineCombine
defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -358,6 +358,9 @@
def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
+def UImmS1XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
def UImmS2XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
}]>;
@@ -7869,6 +7872,64 @@
}
}
+multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
+ SDPatternOperator OpNodeLaneQ> {
+
+ def : Pat<(v4i16 (OpNodeLane
+ (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v4i16_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i16 (OpNodeLaneQ
+ (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
+ VectorIndexH32b:$idx)),
+ (!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v8i16 (OpNodeLane
+ (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v8i16_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), $Rm, dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v8i16 (OpNodeLaneQ
+ (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
+ VectorIndexH32b:$idx)),
+ (!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v2i32 (OpNodeLane
+ (v2i32 V64:$Rn), (v2i32 V64:$Rm),
+ VectorIndexD32b:$idx)),
+ (!cast<Instruction>(NAME # v2i32_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v2i32 (OpNodeLaneQ
+ (v2i32 V64:$Rn), (v4i32 V128:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i32 (OpNodeLane
+ (v4i32 V128:$Rn), (v2i32 V64:$Rm),
+ VectorIndexD32b:$idx)),
+ (!cast<Instruction>(NAME # v4i32_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), $Rm, dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i32 (OpNodeLaneQ
+ (v4i32 V128:$Rn),
+ (v4i32 V128:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+}
+
multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6050,6 +6050,8 @@
break;
if (VT.isScalableVector())
return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
+ if (VT.getSizeInBits() == 64)
+ return std::make_pair(0U, &AArch64::FPR64_loRegClass);
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128_loRegClass);
break;
Index: llvm/lib/IR/Function.cpp
===================================================================
--- llvm/lib/IR/Function.cpp
+++ llvm/lib/IR/Function.cpp
@@ -707,7 +707,9 @@
IIT_SCALABLE_VEC = 43,
IIT_SUBDIVIDE2_ARG = 44,
IIT_SUBDIVIDE4_ARG = 45,
- IIT_VEC_OF_BITCASTS_TO_INT = 46
+ IIT_VEC_OF_BITCASTS_TO_INT = 46,
+ IIT_NARROW_VEC = 47,
+ IIT_WIDE_VEC = 48
};
static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
@@ -902,6 +904,17 @@
ArgInfo));
return;
}
+ case IIT_NARROW_VEC: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(
+ IITDescriptor::get(IITDescriptor::NarrowVec, ArgInfo));
+ return;
+ }
+ case IIT_WIDE_VEC: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::WideVec, ArgInfo));
+ return;
+ }
}
llvm_unreachable("unhandled");
}
@@ -1042,6 +1055,22 @@
return VectorType::get(Ty->getVectorElementType(),
{ Ty->getVectorNumElements(), true });
}
+ case IITDescriptor::NarrowVec: {
+ Type *Ty = Tys[D.getArgumentNumber()];
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ Type *ElTy = VTy->getElementType();
+ return VectorType::get(ElTy, 64 / ElTy->getIntegerBitWidth());
+ }
+ llvm_unreachable("Expected an argument of Vector Type");
+ }
+ case IITDescriptor::WideVec: {
+ Type *Ty = Tys[D.getArgumentNumber()];
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ Type *ElTy = VTy->getElementType();
+ return VectorType::get(ElTy, 128 / ElTy->getIntegerBitWidth());
+ }
+ llvm_unreachable("Expected an argument of Vector Type");
+ }
}
llvm_unreachable("unhandled");
}
@@ -1336,6 +1365,28 @@
return true;
return ThisArgVecTy != VectorType::getInteger(ReferenceType);
}
+ case IITDescriptor::NarrowVec: {
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return IsDeferredCheck || DeferCheck(Ty);
+
+ auto *VTy = dyn_cast<VectorType>(Ty);
+ auto *RefVTy = dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
+ if (!VTy || !RefVTy || VTy->getBitWidth() != 64)
+ return true;
+
+ return VTy->getElementType() != RefVTy->getElementType();
+ }
+ case IITDescriptor::WideVec: {
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return IsDeferredCheck || DeferCheck(Ty);
+
+ auto *VTy = dyn_cast<VectorType>(Ty);
+ auto *RefVTy = dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
+ if (!VTy || !RefVTy || VTy->getBitWidth() != 128)
+ return true;
+
+ return VTy->getElementType() != RefVTy->getElementType();
+ }
}
llvm_unreachable("unhandled");
}
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -133,6 +133,14 @@
: Intrinsic<[llvm_anyvector_ty],
[LLVMHalfElementsVectorType<0>, llvm_anyvector_ty],
[IntrNoMem]>;
+ class AdvSIMD_2VectorArg_Narrow_Lane_Intrinsic
+ : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMNarrowType<0>, llvm_i32_ty],
+ [IntrNoMem]>;
+ class AdvSIMD_2VectorArg_Wide_Lane_Intrinsic
+ : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMWideType<0>, llvm_i32_ty],
+ [IntrNoMem]>;
class AdvSIMD_3VectorArg_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
@@ -207,9 +215,13 @@
// Vector Saturating Doubling Multiply High
def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic;
+ def int_aarch64_neon_sqdmulh_lane : AdvSIMD_2VectorArg_Narrow_Lane_Intrinsic;
+ def int_aarch64_neon_sqdmulh_laneq : AdvSIMD_2VectorArg_Wide_Lane_Intrinsic;
// Vector Saturating Rounding Doubling Multiply High
def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic;
+ def int_aarch64_neon_sqrdmulh_lane : AdvSIMD_2VectorArg_Narrow_Lane_Intrinsic;
+ def int_aarch64_neon_sqrdmulh_laneq : AdvSIMD_2VectorArg_Wide_Lane_Intrinsic;
// Vector Polynominal Multiply
def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -183,6 +183,9 @@
class LLVMVectorOfAnyPointersToElt<int num> : LLVMMatchType<num>;
class LLVMVectorElementType<int num> : LLVMMatchType<num>;
+class LLVMNarrowType<int num> : LLVMMatchType<num>;
+class LLVMWideType<int num> : LLVMMatchType<num>;
+
// Match the type of another intrinsic parameter that is expected to be a
// vector type, but change the element count to be half as many
class LLVMHalfElementsVectorType<int num> : LLVMMatchType<num>;
Index: llvm/include/llvm/IR/Intrinsics.h
===================================================================
--- llvm/include/llvm/IR/Intrinsics.h
+++ llvm/include/llvm/IR/Intrinsics.h
@@ -101,7 +101,7 @@
Argument, ExtendArgument, TruncArgument, HalfVecArgument,
SameVecWidthArgument, PtrToArgument, PtrToElt, VecOfAnyPtrsToElt,
VecElementArgument, ScalableVecArgument, Subdivide2Argument,
- Subdivide4Argument, VecOfBitcastsToInt
+ Subdivide4Argument, VecOfBitcastsToInt, WideVec, NarrowVec
} Kind;
union {
@@ -128,7 +128,8 @@
Kind == SameVecWidthArgument || Kind == PtrToArgument ||
Kind == PtrToElt || Kind == VecElementArgument ||
Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
- Kind == VecOfBitcastsToInt);
+ Kind == VecOfBitcastsToInt || Kind == WideVec ||
+ Kind == NarrowVec);
return Argument_Info >> 3;
}
ArgKind getArgumentKind() const {
Index: clang/test/CodeGen/aarch64-neon-2velem.c
===================================================================
--- clang/test/CodeGen/aarch64-neon-2velem.c
+++ clang/test/CodeGen/aarch64-neon-2velem.c
@@ -1440,12 +1440,14 @@
// CHECK-LABEL: @test_vqdmulh_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT: [[VQDMULH_LANE_V3:%.*]] = bitcast <4 x i16> [[VQDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANE_V3]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
//
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
return vqdmulh_lane_s16(a, v, 3);
@@ -1453,12 +1455,14 @@
// CHECK-LABEL: @test_vqdmulhq_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT: [[VQDMULHQ_LANE_V3:%.*]] = bitcast <8 x i16> [[VQDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANE_V3]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
return vqdmulhq_lane_s16(a, v, 3);
@@ -1466,12 +1470,14 @@
// CHECK-LABEL: @test_vqdmulh_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT: [[VQDMULH_LANE_V3:%.*]] = bitcast <2 x i32> [[VQDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANE_V3]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
//
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
return vqdmulh_lane_s32(a, v, 1);
@@ -1479,12 +1485,14 @@
// CHECK-LABEL: @test_vqdmulhq_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT: [[VQDMULHQ_LANE_V3:%.*]] = bitcast <4 x i32> [[VQDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANE_V3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
return vqdmulhq_lane_s32(a, v, 1);
@@ -1492,12 +1500,14 @@
// CHECK-LABEL: @test_vqrdmulh_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT: [[VQRDMULH_LANE_V3:%.*]] = bitcast <4 x i16> [[VQRDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANE_V3]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
//
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
return vqrdmulh_lane_s16(a, v, 3);
@@ -1505,12 +1515,14 @@
// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V3:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANE_V3]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
return vqrdmulhq_lane_s16(a, v, 3);
@@ -1518,12 +1530,14 @@
// CHECK-LABEL: @test_vqrdmulh_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT: [[VQRDMULH_LANE_V3:%.*]] = bitcast <2 x i32> [[VQRDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANE_V3]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
//
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
return vqrdmulh_lane_s32(a, v, 1);
@@ -1531,12 +1545,14 @@
// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V3:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANE_V3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
return vqrdmulhq_lane_s32(a, v, 1);
@@ -3066,12 +3082,14 @@
// CHECK-LABEL: @test_vqdmulh_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULH_LANE_V3:%.*]] = bitcast <4 x i16> [[VQDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANE_V3]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
//
int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
return vqdmulh_lane_s16(a, v, 0);
@@ -3079,12 +3097,14 @@
// CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULHQ_LANE_V3:%.*]] = bitcast <8 x i16> [[VQDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANE_V3]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
return vqdmulhq_lane_s16(a, v, 0);
@@ -3092,12 +3112,14 @@
// CHECK-LABEL: @test_vqdmulh_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULH_LANE_V3:%.*]] = bitcast <2 x i32> [[VQDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANE_V3]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
//
int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
return vqdmulh_lane_s32(a, v, 0);
@@ -3105,12 +3127,14 @@
// CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULHQ_LANE_V3:%.*]] = bitcast <4 x i32> [[VQDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANE_V3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
return vqdmulhq_lane_s32(a, v, 0);
@@ -3118,12 +3142,14 @@
// CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULH_LANE_V3:%.*]] = bitcast <4 x i16> [[VQRDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANE_V3]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
//
int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
return vqrdmulh_lane_s16(a, v, 0);
@@ -3131,12 +3157,14 @@
// CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V3:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANE_V3]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
return vqrdmulhq_lane_s16(a, v, 0);
@@ -3144,12 +3172,14 @@
// CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULH_LANE_V3:%.*]] = bitcast <2 x i32> [[VQRDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANE_V3]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
//
int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
return vqrdmulh_lane_s32(a, v, 0);
@@ -3157,12 +3187,14 @@
// CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V3:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANE_V3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
return vqrdmulhq_lane_s32(a, v, 0);
@@ -4753,12 +4785,14 @@
// CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V3:%.*]] = bitcast <4 x i16> [[VQDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANEQ_V3]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
//
int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
return vqdmulh_laneq_s16(a, v, 0);
@@ -4766,12 +4800,14 @@
// CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V3:%.*]] = bitcast <8 x i16> [[VQDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANEQ_V3]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
return vqdmulhq_laneq_s16(a, v, 0);
@@ -4779,12 +4815,14 @@
// CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V3:%.*]] = bitcast <2 x i32> [[VQDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANEQ_V3]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
//
int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
return vqdmulh_laneq_s32(a, v, 0);
@@ -4792,12 +4830,14 @@
// CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V3:%.*]] = bitcast <4 x i32> [[VQDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANEQ_V3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
return vqdmulhq_laneq_s32(a, v, 0);
@@ -4805,12 +4845,14 @@
// CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V3:%.*]] = bitcast <4 x i16> [[VQRDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANEQ_V3]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
//
int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
return vqrdmulh_laneq_s16(a, v, 0);
@@ -4818,12 +4860,14 @@
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V3:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANEQ_V3]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
return vqrdmulhq_laneq_s16(a, v, 0);
@@ -4831,12 +4875,14 @@
// CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V3:%.*]] = bitcast <2 x i32> [[VQRDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANEQ_V3]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
//
int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
return vqrdmulh_laneq_s32(a, v, 0);
@@ -4844,12 +4890,14 @@
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V3:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANEQ_V3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
return vqrdmulhq_laneq_s32(a, v, 0);
@@ -5149,12 +5197,14 @@
// CHECK-LABEL: @test_vqdmulh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V3:%.*]] = bitcast <4 x i16> [[VQDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANEQ_V3]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
//
int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
return vqdmulh_laneq_s16(a, v, 7);
@@ -5162,12 +5212,14 @@
// CHECK-LABEL: @test_vqdmulhq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V3:%.*]] = bitcast <8 x i16> [[VQDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANEQ_V3]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
return vqdmulhq_laneq_s16(a, v, 7);
@@ -5175,12 +5227,14 @@
// CHECK-LABEL: @test_vqdmulh_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V3:%.*]] = bitcast <2 x i32> [[VQDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANEQ_V3]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
//
int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
return vqdmulh_laneq_s32(a, v, 3);
@@ -5188,12 +5242,14 @@
// CHECK-LABEL: @test_vqdmulhq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V3:%.*]] = bitcast <4 x i32> [[VQDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANEQ_V3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
return vqdmulhq_laneq_s32(a, v, 3);
@@ -5201,12 +5257,14 @@
// CHECK-LABEL: @test_vqrdmulh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V3:%.*]] = bitcast <4 x i16> [[VQRDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANEQ_V3]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
//
int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
return vqrdmulh_laneq_s16(a, v, 7);
@@ -5214,12 +5272,14 @@
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V3:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANEQ_V3]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
//
int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
return vqrdmulhq_laneq_s16(a, v, 7);
@@ -5227,12 +5287,14 @@
// CHECK-LABEL: @test_vqrdmulh_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V3:%.*]] = bitcast <2 x i32> [[VQRDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANEQ_V3]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
//
int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
return vqrdmulh_laneq_s32(a, v, 3);
@@ -5240,12 +5302,14 @@
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V3:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANEQ_V3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
return vqrdmulhq_laneq_s32(a, v, 3);
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -4937,14 +4937,22 @@
NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
+ NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, Add1ArgType),
+ NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, Add1ArgType),
NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
+ NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, Add1ArgType),
+ NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, Add1ArgType),
NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
+ NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, Add1ArgType),
+ NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, Add1ArgType),
NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
+ NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, Add1ArgType),
+ NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, Add1ArgType),
NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
Index: clang/include/clang/Basic/arm_neon.td
===================================================================
--- clang/include/clang/Basic/arm_neon.td
+++ clang/include/clang/Basic/arm_neon.td
@@ -528,9 +528,16 @@
def VQDMULL_N : SOpInst<"vqdmull_n", "(>Q).1", "si", OP_QDMULL_N>;
def VQDMULL_LANE : SOpInst<"vqdmull_lane", "(>Q)..I", "si", OP_QDMULL_LN>;
def VQDMULH_N : SOpInst<"vqdmulh_n", "..1", "siQsQi", OP_QDMULH_N>;
-def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
def VQRDMULH_N : SOpInst<"vqrdmulh_n", "..1", "siQsQi", OP_QRDMULH_N>;
+
+let ArchGuard = "!defined(__aarch64__)" in {
+def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>;
+}
+let ArchGuard = "defined(__aarch64__)" in {
+def A64_VQDMULH_LANE : SInst<"vqdmulh_lane", "..qI", "siQsQi">;
+def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..qI", "siQsQi">;
+}
let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in {
def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "...qI", "siQsQi", OP_QRDMLAH_LN>;
@@ -951,9 +958,10 @@
def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si",
OP_QDMULLHi_LN>;
-def VQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "..QI", "siQsQi", OP_QDMULH_LN>;
-def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "..QI", "siQsQi", OP_QRDMULH_LN>;
-
+let isLaneQ = 1 in {
+def VQDMULH_LANEQ : SInst<"vqdmulh_laneq", "..QI", "siQsQi">;
+def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">;
+}
let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in {
def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>;
def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>;
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits