sanwou01 created this revision.
Herald added subscribers: llvm-commits, cfe-commits, jdoerfert, hiraditya, kristof.beyls.
Herald added projects: clang, LLVM.
sanwou01 added reviewers: SjoerdMeijer, dmgreen, t.p.northover.

Currently, sqdmulh_lane and friends from the ACLE (implemented in arm_neon.h)
are represented in LLVM IR as a by-vector sqdmulh whose second operand is a
shufflevector with a vector of (repeated) lane indices, like so:

  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)

When %v's values are known, the shufflevector is optimized away and we are no
longer able to select the lane variant of sqdmulh in the backend.

This makes it impossible to use a neat trick with NEON intrinsics: load a
number of constants with a single vector load, then repeatedly multiply whole
vectors by one of those constants, selected by lane. libjpeg-turbo uses this
trick for a nice performance upside (2.5% to 4% on one microbenchmark); a
sketch follows.
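
A minimal sketch of the trick (the constants, names, and function below are
hypothetical, for illustration only):

  #include <arm_neon.h>

  /* Hypothetical illustration: one vector load supplies four multiplier
     constants; each whole-vector multiply then selects one of them by lane.
     Under the old lowering, a known constant vector lets the shuffle be
     folded away, and the lane form of sqdmulh can no longer be selected. */
  static const int16_t kConsts[4] = {11585, 15137, 6270, 4433};

  void scale_rows(int16x8_t *rows, int n) {
    int16x4_t c = vld1_s16(kConsts);              /* single vector load */
    for (int i = 0; i < n; ++i)
      rows[i] = vqdmulhq_lane_s16(rows[i], c, 0); /* multiply by lane 0 */
  }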

This patch adds four LLVM IR intrinsics to the AArch64 backend:

- sqdmulh_lane
- sqdmulh_laneq
- sqrdmulh_lane
- sqrdmulh_laneq

Because the lane index is now an explicit immediate operand of the intrinsic
rather than encoded in a shufflevector mask, the unwanted constant propagation
can no longer occur.
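
With the new intrinsics, the example above becomes a single call carrying the
lane index (taken from the updated backend test below):

  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)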

In order to represent the types of these intrinsics, the patch adds
LLVMNarrowType and LLVMWideType to the IntrinsicEmitter.  The second parameter
of the 'lane' variants is a vector with the same element type, restricted to a
total of 64 bits; the 'laneq' variants' second parameter is similarly widened
to a total of 128 bits.
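
For example, for a v4i16 result the 'lane' variant takes a <4 x i16> second
operand while the 'laneq' variant takes <8 x i16> (declarations as added to
the backend test below):

  declare <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16(<4 x i16>, <4 x i16>, i32)
  declare <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16>, <8 x i16>, i32)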

The 'lane' variants also need an additional register class: when operating on
i16 elements, the second argument must live in the lower half of the 64-bit
NEON register file.

Note that the existing patterns matching a shufflevector plus sqdmulh into
sqdmulh_lane (etc.) remain, so code that does not rely on NEON intrinsics to
generate these instructions is unaffected.

This patch also changes clang to emit the new LLVM IR intrinsics for the
corresponding NEON intrinsics (AArch64 only).


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D71469

Files:
  clang/include/clang/Basic/arm_neon.td
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/aarch64-neon-2velem.c
  llvm/include/llvm/IR/Intrinsics.h
  llvm/include/llvm/IR/Intrinsics.td
  llvm/include/llvm/IR/IntrinsicsAArch64.td
  llvm/lib/IR/Function.cpp
  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
  llvm/lib/Target/AArch64/AArch64InstrFormats.td
  llvm/lib/Target/AArch64/AArch64InstrInfo.td
  llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
  llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
  llvm/lib/Target/AArch64/AArch64RegisterInfo.td
  llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
  llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
  llvm/utils/TableGen/IntrinsicEmitter.cpp

Index: llvm/utils/TableGen/IntrinsicEmitter.cpp
===================================================================
--- llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -224,7 +224,9 @@
   IIT_SCALABLE_VEC = 43,
   IIT_SUBDIVIDE2_ARG = 44,
   IIT_SUBDIVIDE4_ARG = 45,
-  IIT_VEC_OF_BITCASTS_TO_INT = 46
+  IIT_VEC_OF_BITCASTS_TO_INT = 46,
+  IIT_NARROW_VEC = 47,
+  IIT_WIDE_VEC = 48
 };
 
 static void EncodeFixedValueType(MVT::SimpleValueType VT,
@@ -302,6 +304,10 @@
       Sig.push_back(IIT_SUBDIVIDE4_ARG);
     else if (R->isSubClassOf("LLVMVectorOfBitcastsToInt"))
       Sig.push_back(IIT_VEC_OF_BITCASTS_TO_INT);
+    else if (R->isSubClassOf("LLVMNarrowType"))
+      Sig.push_back(IIT_NARROW_VEC);
+    else if (R->isSubClassOf("LLVMWideType"))
+      Sig.push_back(IIT_WIDE_VEC);
     else
       Sig.push_back(IIT_ARG);
     return Sig.push_back((Number << 3) | 7 /*IITDescriptor::AK_MatchType*/);
Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -9,20 +9,36 @@
 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
 
 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32(<4 x i32>, <2 x i32>, i32)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32(<4 x i32>, <4 x i32>, i32)
 
 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32(<2 x i32>, <2 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32(<2 x i32>, <4 x i32>, i32)
 
 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16(<8 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16(<8 x i16>, <8 x i16>, i32)
 
 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16(<4 x i16>, <8 x i16>, i32)
 
 declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32(<4 x i32>, <2 x i32>, i32)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32(<4 x i32>, <4 x i32>, i32)
 
 declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32(<2 x i32>, <2 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32(<2 x i32>, <4 x i32>, i32)
 
 declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16(<8 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16(<8 x i16>, <8 x i16>, i32)
 
 declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16>, <8 x i16>, i32)
 
 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
 
@@ -1515,6 +1531,37 @@
   ret <4 x i16> %vqdmulh2.i
 }
 
+define <4 x i16> @test_vqdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_lane_s16_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
+  ret <4 x i16> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16> %a, <8 x i16> %v, i32 3)
+  ret <4 x i16> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16> %a, <8 x i16> %v, i32 7)
+  ret <4 x i16> %vqdmulh2.i
+}
+
 define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s16:
 ; CHECK:       // %bb.0: // %entry
@@ -1527,6 +1574,37 @@
   ret <8 x i16> %vqdmulh2.i
 }
 
+define <8 x i16> @test_vqdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_lane_s16_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16(<8 x i16> %a, <4 x i16> %v, i32 3)
+  ret <8 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
+  ret <8 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
+  ret <8 x i16> %vqdmulh2.i
+}
+
 define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulh_lane_s32:
 ; CHECK:       // %bb.0: // %entry
@@ -1539,6 +1617,37 @@
   ret <2 x i32> %vqdmulh2.i
 }
 
+define <2 x i32> @test_vqdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_lane_s32_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
+  ret <2 x i32> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32(<2 x i32> %a, <4 x i32> %v, i32 1)
+  ret <2 x i32> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32(<2 x i32> %a, <4 x i32> %v, i32 3)
+  ret <2 x i32> %vqdmulh2.i
+}
+
 define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s32:
 ; CHECK:       // %bb.0: // %entry
@@ -1551,6 +1660,37 @@
   ret <4 x i32> %vqdmulh2.i
 }
 
+define <4 x i32> @test_vqdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_lane_s32_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32(<4 x i32> %a, <2 x i32> %v, i32 1)
+  ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
+  ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmulh v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
+  ret <4 x i32> %vqdmulh2.i
+}
+
 define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s16:
 ; CHECK:       // %bb.0: // %entry
@@ -1563,6 +1703,37 @@
   ret <4 x i16> %vqrdmulh2.i
 }
 
+define <4 x i16> @test_vqrdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_lane_s16_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
+  ret <4 x i16> %vqrdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16(<4 x i16> %a, <8 x i16> %v, i32 3)
+  ret <4 x i16> %vqrdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16(<4 x i16> %a, <8 x i16> %v, i32 7)
+  ret <4 x i16> %vqrdmulh2.i
+}
+
 define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s16:
 ; CHECK:       // %bb.0: // %entry
@@ -1575,6 +1746,37 @@
   ret <8 x i16> %vqrdmulh2.i
 }
 
+define <8 x i16> @test_vqrdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_lane_s16_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16(<8 x i16> %a, <4 x i16> %v, i32 3)
+  ret <8 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
+  ret <8 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
+  ret <8 x i16> %vqrdmulh2.i
+}
+
 define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s32:
 ; CHECK:       // %bb.0: // %entry
@@ -1587,6 +1789,37 @@
   ret <2 x i32> %vqrdmulh2.i
 }
 
+define <2 x i32> @test_vqrdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_lane_s32_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
+  ret <2 x i32> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32(<2 x i32> %a, <4 x i32> %v, i32 1)
+  ret <2 x i32> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32(<2 x i32> %a, <4 x i32> %v, i32 3)
+  ret <2 x i32> %vqrdmulh2.i
+}
+
 define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s32:
 ; CHECK:       // %bb.0: // %entry
@@ -1599,6 +1832,37 @@
   ret <4 x i32> %vqrdmulh2.i
 }
 
+define <4 x i32> @test_vqrdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_lane_s32_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32(<4 x i32> %a, <2 x i32> %v, i32 1)
+  ret <4 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_lo:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
+  ret <4 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_hi:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmulh v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT:    ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
+  ret <4 x i32> %vqrdmulh2.i
+}
+
 define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
 ; CHECK-LABEL: test_vmul_lane_f32:
 ; CHECK:       // %bb.0: // %entry
Index: llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
===================================================================
--- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1033,8 +1033,10 @@
 
   bool isNeonVectorRegLo() const {
     return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
-           AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
-               Reg.RegNum);
+           (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+                Reg.RegNum) ||
+            AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains(
+                Reg.RegNum));
   }
 
   template <unsigned Class> bool isSVEVectorReg() const {
Index: llvm/lib/Target/AArch64/AArch64RegisterInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -429,6 +429,10 @@
 def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
                                     v1i64, v4f16],
                                     64, (sequence "D%u", 0, 31)>;
+def FPR64_lo : RegisterClass<"AArch64",
+                             [v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64],
+                             64, (trunc FPR64, 16)>;
+
 // We don't (yet) have an f128 legal type, so don't use that here. We
 // normalize 128-bit vectors to v2f64 for arg passing and such, so use
 // that here.
@@ -503,6 +507,9 @@
   let Name = "VectorRegLo";
   let PredicateMethod = "isNeonVectorRegLo";
 }
+def V64_lo : RegisterOperand<FPR64_lo, "printVRegOperand"> {
+  let ParserMatchClass = VectorRegLoAsmOperand;
+}
 def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
   let ParserMatchClass = VectorRegLoAsmOperand;
 }
Index: llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -572,6 +572,7 @@
     return 32;
 
   case AArch64::FPR128_loRegClassID:
+  case AArch64::FPR64_loRegClassID:
     return 16;
   }
 }
Index: llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -229,6 +229,7 @@
   case AArch64::FPR16RegClassID:
   case AArch64::FPR32RegClassID:
   case AArch64::FPR64RegClassID:
+  case AArch64::FPR64_loRegClassID:
   case AArch64::FPR128RegClassID:
   case AArch64::FPR128_loRegClassID:
   case AArch64::DDRegClassID:
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5577,6 +5577,11 @@
 defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
 defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
 
+defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
+                                     int_aarch64_neon_sqdmulh_laneq>;
+defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane,
+                                      int_aarch64_neon_sqrdmulh_laneq>;
+
 // Generated by MachineCombine
 defm MLA   : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
 defm MLS   : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -358,6 +358,9 @@
 def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
 def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
 
+def UImmS1XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
 def UImmS2XForm : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
 }]>;
@@ -7869,6 +7872,64 @@
   }
 }
 
+multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
+                                 SDPatternOperator OpNodeLaneQ> {
+
+  def : Pat<(v4i16 (OpNodeLane
+                     (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v4i16_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v4i16 (OpNodeLaneQ
+                     (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
+                     VectorIndexH32b:$idx)),
+            (!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v8i16 (OpNodeLane
+                     (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v8i16_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), $Rm, dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v8i16 (OpNodeLaneQ
+                     (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
+                     VectorIndexH32b:$idx)),
+            (!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v2i32 (OpNodeLane
+                     (v2i32 V64:$Rn), (v2i32 V64:$Rm),
+                     VectorIndexD32b:$idx)),
+            (!cast<Instruction>(NAME # v2i32_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), (v2i32 V64:$Rm), dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v2i32 (OpNodeLaneQ
+                     (v2i32 V64:$Rn), (v4i32 V128:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v4i32 (OpNodeLane
+                     (v4i32 V128:$Rn), (v2i32 V64:$Rm),
+                     VectorIndexD32b:$idx)),
+            (!cast<Instruction>(NAME # v4i32_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), $Rm, dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v4i32 (OpNodeLaneQ
+                     (v4i32 V128:$Rn),
+                     (v4i32 V128:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+}
+
 multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
                          SDPatternOperator OpNode> {
   def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6050,6 +6050,8 @@
         break;
       if (VT.isScalableVector())
         return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
+      if (VT.getSizeInBits() == 64)
+        return std::make_pair(0U, &AArch64::FPR64_loRegClass);
       if (VT.getSizeInBits() == 128)
         return std::make_pair(0U, &AArch64::FPR128_loRegClass);
       break;
Index: llvm/lib/IR/Function.cpp
===================================================================
--- llvm/lib/IR/Function.cpp
+++ llvm/lib/IR/Function.cpp
@@ -707,7 +707,9 @@
   IIT_SCALABLE_VEC = 43,
   IIT_SUBDIVIDE2_ARG = 44,
   IIT_SUBDIVIDE4_ARG = 45,
-  IIT_VEC_OF_BITCASTS_TO_INT = 46
+  IIT_VEC_OF_BITCASTS_TO_INT = 46,
+  IIT_NARROW_VEC = 47,
+  IIT_WIDE_VEC = 48
 };
 
 static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
@@ -902,6 +904,17 @@
                                              ArgInfo));
     return;
   }
+  case IIT_NARROW_VEC: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(
+        IITDescriptor::get(IITDescriptor::NarrowVec, ArgInfo));
+    return;
+  }
+  case IIT_WIDE_VEC: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::WideVec, ArgInfo));
+    return;
+  }
   }
   llvm_unreachable("unhandled");
 }
@@ -1042,6 +1055,22 @@
     return VectorType::get(Ty->getVectorElementType(),
                            { Ty->getVectorNumElements(), true });
   }
+  case IITDescriptor::NarrowVec: {
+    Type *Ty = Tys[D.getArgumentNumber()];
+    if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+      Type *ElTy = VTy->getElementType();
+      return VectorType::get(ElTy, 64 / ElTy->getIntegerBitWidth());
+    }
+    llvm_unreachable("Expected an argument of Vector Type");
+  }
+  case IITDescriptor::WideVec: {
+    Type *Ty = Tys[D.getArgumentNumber()];
+    if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+      Type *ElTy = VTy->getElementType();
+      return VectorType::get(ElTy, 128 / ElTy->getIntegerBitWidth());
+    }
+    llvm_unreachable("Expected an argument of Vector Type");
+  }
   }
   llvm_unreachable("unhandled");
 }
@@ -1336,6 +1365,28 @@
         return true;
       return ThisArgVecTy != VectorType::getInteger(ReferenceType);
     }
+    case IITDescriptor::NarrowVec: {
+      if (D.getArgumentNumber() >= ArgTys.size())
+        return IsDeferredCheck || DeferCheck(Ty);
+
+      auto *VTy = dyn_cast<VectorType>(Ty);
+      auto *RefVTy = dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
+      if (!VTy || !RefVTy || VTy->getBitWidth() != 64)
+        return true;
+
+      return VTy->getElementType() != RefVTy->getElementType();
+    }
+    case IITDescriptor::WideVec: {
+      if (D.getArgumentNumber() >= ArgTys.size())
+        return IsDeferredCheck || DeferCheck(Ty);
+
+      auto *VTy = dyn_cast<VectorType>(Ty);
+      auto *RefVTy = dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
+      if (!VTy || !RefVTy || VTy->getBitWidth() != 128)
+        return true;
+
+      return VTy->getElementType() != RefVTy->getElementType();
+    }
   }
   llvm_unreachable("unhandled");
 }
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -133,6 +133,14 @@
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty],
                 [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Narrow_Lane_Intrinsic
+    : Intrinsic<[llvm_anyint_ty],
+                [LLVMMatchType<0>, LLVMNarrowType<0>, llvm_i32_ty],
+                [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Wide_Lane_Intrinsic
+    : Intrinsic<[llvm_anyint_ty],
+                [LLVMMatchType<0>, LLVMWideType<0>, llvm_i32_ty],
+                [IntrNoMem]>;
 
   class AdvSIMD_3VectorArg_Intrinsic
       : Intrinsic<[llvm_anyvector_ty],
@@ -207,9 +215,13 @@
 
   // Vector Saturating Doubling Multiply High
   def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic;
+  def int_aarch64_neon_sqdmulh_lane : AdvSIMD_2VectorArg_Narrow_Lane_Intrinsic;
+  def int_aarch64_neon_sqdmulh_laneq : AdvSIMD_2VectorArg_Wide_Lane_Intrinsic;
 
   // Vector Saturating Rounding Doubling Multiply High
   def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic;
+  def int_aarch64_neon_sqrdmulh_lane : AdvSIMD_2VectorArg_Narrow_Lane_Intrinsic;
+  def int_aarch64_neon_sqrdmulh_laneq : AdvSIMD_2VectorArg_Wide_Lane_Intrinsic;
 
   // Vector Polynominal Multiply
   def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -183,6 +183,9 @@
 class LLVMVectorOfAnyPointersToElt<int num> : LLVMMatchType<num>;
 class LLVMVectorElementType<int num> : LLVMMatchType<num>;
 
+class LLVMNarrowType<int num> : LLVMMatchType<num>;
+class LLVMWideType<int num> : LLVMMatchType<num>;
+
 // Match the type of another intrinsic parameter that is expected to be a
 // vector type, but change the element count to be half as many
 class LLVMHalfElementsVectorType<int num> : LLVMMatchType<num>;
Index: llvm/include/llvm/IR/Intrinsics.h
===================================================================
--- llvm/include/llvm/IR/Intrinsics.h
+++ llvm/include/llvm/IR/Intrinsics.h
@@ -101,7 +101,7 @@
       Argument, ExtendArgument, TruncArgument, HalfVecArgument,
       SameVecWidthArgument, PtrToArgument, PtrToElt, VecOfAnyPtrsToElt,
       VecElementArgument, ScalableVecArgument, Subdivide2Argument,
-      Subdivide4Argument, VecOfBitcastsToInt
+      Subdivide4Argument, VecOfBitcastsToInt, WideVec, NarrowVec
     } Kind;
 
     union {
@@ -128,7 +128,8 @@
              Kind == SameVecWidthArgument || Kind == PtrToArgument ||
              Kind == PtrToElt || Kind == VecElementArgument ||
              Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
-             Kind == VecOfBitcastsToInt);
+             Kind == VecOfBitcastsToInt || Kind == WideVec ||
+             Kind == NarrowVec);
       return Argument_Info >> 3;
     }
     ArgKind getArgumentKind() const {
Index: clang/test/CodeGen/aarch64-neon-2velem.c
===================================================================
--- clang/test/CodeGen/aarch64-neon-2velem.c
+++ clang/test/CodeGen/aarch64-neon-2velem.c
@@ -1440,12 +1440,14 @@
 
 // CHECK-LABEL: @test_vqdmulh_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT:    [[VQDMULH_LANE_V3:%.*]] = bitcast <4 x i16> [[VQDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANE_V3]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
 //
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
   return vqdmulh_lane_s16(a, v, 3);
@@ -1453,12 +1455,14 @@
 
 // CHECK-LABEL: @test_vqdmulhq_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V3:%.*]] = bitcast <8 x i16> [[VQDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANE_V3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
   return vqdmulhq_lane_s16(a, v, 3);
@@ -1466,12 +1470,14 @@
 
 // CHECK-LABEL: @test_vqdmulh_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT:    [[VQDMULH_LANE_V3:%.*]] = bitcast <2 x i32> [[VQDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANE_V3]] to <2 x i32>
+// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
 //
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
   return vqdmulh_lane_s32(a, v, 1);
@@ -1479,12 +1485,14 @@
 
 // CHECK-LABEL: @test_vqdmulhq_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V3:%.*]] = bitcast <4 x i32> [[VQDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANE_V3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
   return vqdmulhq_lane_s32(a, v, 1);
@@ -1492,12 +1500,14 @@
 
 // CHECK-LABEL: @test_vqrdmulh_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT:    [[VQRDMULH_LANE_V3:%.*]] = bitcast <4 x i16> [[VQRDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANE_V3]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
 //
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
   return vqrdmulh_lane_s16(a, v, 3);
@@ -1505,12 +1515,14 @@
 
 // CHECK-LABEL: @test_vqrdmulhq_lane_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V3:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANE_V3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
   return vqrdmulhq_lane_s16(a, v, 3);
@@ -1518,12 +1530,14 @@
 
 // CHECK-LABEL: @test_vqrdmulh_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT:    [[VQRDMULH_LANE_V3:%.*]] = bitcast <2 x i32> [[VQRDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANE_V3]] to <2 x i32>
+// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
 //
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
   return vqrdmulh_lane_s32(a, v, 1);
@@ -1531,12 +1545,14 @@
 
 // CHECK-LABEL: @test_vqrdmulhq_lane_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V3:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANE_V3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
   return vqrdmulhq_lane_s32(a, v, 1);
@@ -3066,12 +3082,14 @@
 
 // CHECK-LABEL: @test_vqdmulh_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT:    [[VQDMULH_LANE_V3:%.*]] = bitcast <4 x i16> [[VQDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANE_V3]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
 //
 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
   return vqdmulh_lane_s16(a, v, 0);
@@ -3079,12 +3097,14 @@
 
 // CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V3:%.*]] = bitcast <8 x i16> [[VQDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANE_V3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
   return vqdmulhq_lane_s16(a, v, 0);
@@ -3092,12 +3112,14 @@
 
 // CHECK-LABEL: @test_vqdmulh_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT:    [[VQDMULH_LANE_V3:%.*]] = bitcast <2 x i32> [[VQDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANE_V3]] to <2 x i32>
+// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
 //
 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
   return vqdmulh_lane_s32(a, v, 0);
@@ -3105,12 +3127,14 @@
 
 // CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT:    [[VQDMULHQ_LANE_V3:%.*]] = bitcast <4 x i32> [[VQDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANE_V3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
   return vqdmulhq_lane_s32(a, v, 0);
@@ -3118,12 +3142,14 @@
 
 // CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT:    [[VQRDMULH_LANE_V3:%.*]] = bitcast <4 x i16> [[VQRDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANE_V3]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
 //
 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
   return vqrdmulh_lane_s16(a, v, 0);
@@ -3131,12 +3157,14 @@
 
 // CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V3:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANE_V3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
   return vqrdmulhq_lane_s16(a, v, 0);
@@ -3144,12 +3172,14 @@
 
 // CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT:    [[VQRDMULH_LANE_V3:%.*]] = bitcast <2 x i32> [[VQRDMULH_LANE_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANE_V3]] to <2 x i32>
+// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
 //
 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
   return vqrdmulh_lane_s32(a, v, 0);
@@ -3157,12 +3187,14 @@
 
 // CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V3:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_LANE_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANE_V3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
   return vqrdmulhq_lane_s32(a, v, 0);
@@ -4753,12 +4785,14 @@
 
 // CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V3:%.*]] = bitcast <4 x i16> [[VQDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANEQ_V3]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
 //
 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
   return vqdmulh_laneq_s16(a, v, 0);
@@ -4766,12 +4800,14 @@
 
 // CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V3:%.*]] = bitcast <8 x i16> [[VQDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANEQ_V3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
   return vqdmulhq_laneq_s16(a, v, 0);
@@ -4779,12 +4815,14 @@
 
 // CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V3:%.*]] = bitcast <2 x i32> [[VQDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANEQ_V3]] to <2 x i32>
+// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
 //
 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
   return vqdmulh_laneq_s32(a, v, 0);
@@ -4792,12 +4830,14 @@
 
 // CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V3:%.*]] = bitcast <4 x i32> [[VQDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANEQ_V3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
   return vqdmulhq_laneq_s32(a, v, 0);
@@ -4805,12 +4845,14 @@
 
 // CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V3:%.*]] = bitcast <4 x i16> [[VQRDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANEQ_V3]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
 //
 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
   return vqrdmulh_laneq_s16(a, v, 0);
@@ -4818,12 +4860,14 @@
 
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V3:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANEQ_V3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
   return vqrdmulhq_laneq_s16(a, v, 0);
@@ -4831,12 +4875,14 @@
 
 // CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V3:%.*]] = bitcast <2 x i32> [[VQRDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANEQ_V3]] to <2 x i32>
+// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
 //
 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
   return vqrdmulh_laneq_s32(a, v, 0);
@@ -4844,12 +4890,14 @@
 
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V3:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANEQ_V3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
   return vqrdmulhq_laneq_s32(a, v, 0);
@@ -5149,12 +5197,14 @@
 
 // CHECK-LABEL: @test_vqdmulh_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V3:%.*]] = bitcast <4 x i16> [[VQDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANEQ_V3]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
 //
 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
   return vqdmulh_laneq_s16(a, v, 7);
@@ -5162,12 +5212,14 @@
 
 // CHECK-LABEL: @test_vqdmulhq_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V3:%.*]] = bitcast <8 x i16> [[VQDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANEQ_V3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
   return vqdmulhq_laneq_s16(a, v, 7);
@@ -5175,12 +5227,14 @@
 
 // CHECK-LABEL: @test_vqdmulh_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT:    [[VQDMULH_LANEQ_V3:%.*]] = bitcast <2 x i32> [[VQDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_LANEQ_V3]] to <2 x i32>
+// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
 //
 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
   return vqdmulh_laneq_s32(a, v, 3);
@@ -5188,12 +5242,14 @@
 
 // CHECK-LABEL: @test_vqdmulhq_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT:    [[VQDMULHQ_LANEQ_V3:%.*]] = bitcast <4 x i32> [[VQDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_LANEQ_V3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
   return vqdmulhq_laneq_s32(a, v, 3);
@@ -5201,12 +5257,14 @@
 
 // CHECK-LABEL: @test_vqrdmulh_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V3:%.*]] = bitcast <4 x i16> [[VQRDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANEQ_V3]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[TMP2]]
 //
 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
   return vqrdmulh_laneq_s16(a, v, 7);
@@ -5214,12 +5272,14 @@
 
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V3:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANEQ_V3]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 //
 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
   return vqrdmulhq_laneq_s16(a, v, 7);
@@ -5227,12 +5287,14 @@
 
 // CHECK-LABEL: @test_vqrdmulh_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT:    [[VQRDMULH_LANEQ_V3:%.*]] = bitcast <2 x i32> [[VQRDMULH_LANEQ_V2]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_LANEQ_V3]] to <2 x i32>
+// CHECK-NEXT:    ret <2 x i32> [[TMP2]]
 //
 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
   return vqrdmulh_laneq_s32(a, v, 3);
@@ -5240,12 +5302,14 @@
 
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT:    [[VQRDMULHQ_LANEQ_V3:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_LANEQ_V2]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_LANEQ_V3]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
   return vqrdmulhq_laneq_s32(a, v, 3);
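
(For orientation, the net effect on the emitted IR, distilled from the CHECK
lines above; the value names are illustrative, taken from one of the s16
tests:

  ; before: the lane is broadcast up front, feeding the plain intrinsic
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
  %r = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)

  ; after: the lane index stays an immediate operand of the laneq intrinsic
  %r = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16(<4 x i16> %a, <8 x i16> %v, i32 7)

With no shufflevector in the IR, there is nothing for constant propagation to
fold away, so the backend can always select the lane form.)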
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -4937,14 +4937,22 @@
   NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
   NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
   NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
+  NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, Add1ArgType),
+  NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, Add1ArgType),
   NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
+  NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, Add1ArgType),
+  NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, Add1ArgType),
   NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
   NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
   NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
   NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
   NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
   NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
+  NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, Add1ArgType),
+  NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, Add1ArgType),
   NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
+  NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, Add1ArgType),
+  NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, Add1ArgType),
   NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
   NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
   NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
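
(Note that the 64-bit and 128-bit builtins, e.g. vqdmulh_lane_v and
vqdmulhq_lane_v, map to the same intrinsic name: Add1ArgType overloads the
intrinsic on the result vector type, so the two forms differ only in their
type suffix. A sketch of the resulting declarations for the i16 case; the
v8i16 form is inferred from the pattern of the tests above:

  declare <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16(<4 x i16>, <4 x i16>, i32)
  declare <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16(<8 x i16>, <4 x i16>, i32)

In both cases the second operand stays a 64-bit vector, since the lane source
must fit the lower half of the register file.)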
Index: clang/include/clang/Basic/arm_neon.td
===================================================================
--- clang/include/clang/Basic/arm_neon.td
+++ clang/include/clang/Basic/arm_neon.td
@@ -528,9 +528,16 @@
 def VQDMULL_N     : SOpInst<"vqdmull_n", "(>Q).1", "si", OP_QDMULL_N>;
 def VQDMULL_LANE  : SOpInst<"vqdmull_lane", "(>Q)..I", "si", OP_QDMULL_LN>;
 def VQDMULH_N     : SOpInst<"vqdmulh_n", "..1", "siQsQi", OP_QDMULH_N>;
-def VQDMULH_LANE  : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
 def VQRDMULH_N    : SOpInst<"vqrdmulh_n", "..1", "siQsQi", OP_QRDMULH_N>;
+
+let ArchGuard = "!defined(__aarch64__)" in {
+def VQDMULH_LANE  : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
 def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>;
+}
+let ArchGuard = "defined(__aarch64__)" in {
+def A64_VQDMULH_LANE  : SInst<"vqdmulh_lane", "..qI", "siQsQi">;
+def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..qI", "siQsQi">;
+}
 
 let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in {
 def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "...qI", "siQsQi", OP_QRDMLAH_LN>;
@@ -951,9 +958,10 @@
 def VQDMULL_HIGH_LANEQ  : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si",
                                   OP_QDMULLHi_LN>;
 
-def VQDMULH_LANEQ  : SOpInst<"vqdmulh_laneq", "..QI", "siQsQi", OP_QDMULH_LN>;
-def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "..QI", "siQsQi", OP_QRDMULH_LN>;
-
+let isLaneQ = 1 in {
+def VQDMULH_LANEQ  : SInst<"vqdmulh_laneq", "..QI", "siQsQi">;
+def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">;
+}
 let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in {
 def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>;
 def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>;
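
(On the arm_neon.td side, the AArch64 lane/laneq definitions switch from
SOpInst, which expands to the shuffle-plus-multiply pattern in the generated
header, to SInst, so clang emits a single builtin call and the lane index
survives into the IR; the isLaneQ flag feeds the immediate range check for
the wider 128-bit source vector. A minimal usage sketch mirroring the tests
above; the lane index must be a compile-time constant:

  #include <arm_neon.h>

  // multiply a by lane 7 of a vector of constants, with saturating doubling
  int16x4_t scale_by_lane(int16x4_t a, int16x8_t consts) {
    return vqdmulh_laneq_s16(a, consts, 7);
  }

This is the shape of the libjpeg-turbo-style usage: load the constants once,
then index them by lane in repeated multiplies.)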