kmclaughlin created this revision.
kmclaughlin added reviewers: sdesmalen, dancgr, efriedma, c-rhodes.
Herald added subscribers: psnobl, rkruppe, hiraditya, kristof.beyls, tschuett.
Herald added a reviewer: rengolin.
Herald added a project: LLVM.
kmclaughlin added a parent revision: D73719: [AArch64][SVE] Add SVE2 intrinsics for widening DSP operations.
Implements the following intrinsics:

  - llvm.aarch64.sve.[s|u]mullb_lane
  - llvm.aarch64.sve.[s|u]mullt_lane
  - llvm.aarch64.sve.sqdmullb_lane
  - llvm.aarch64.sve.sqdmullt_lane
  - llvm.aarch64.sve.[s|u]addwb
  - llvm.aarch64.sve.[s|u]addwt
  - llvm.aarch64.sve.[s|u]shllb
  - llvm.aarch64.sve.[s|u]shllt
  - llvm.aarch64.sve.[s|u]subwb
  - llvm.aarch64.sve.[s|u]subwt

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D73903

Files:
  llvm/include/llvm/IR/IntrinsicsAArch64.td
  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/lib/Target/AArch64/SVEInstrFormats.td
  llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll
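For reference, a minimal IR sketch of how one of the new indexed intrinsics is expected to be called, following the pattern used in the updated test file below (the function name is illustrative only; types and lane-immediate constraints are as in sve2-intrinsics-widening-dsp.ll):

  ; Widening multiply of the bottom (even-numbered) i16 elements of %a by an
  ; indexed element of %b; the lane index (here 4) must be an immediate.
  define <vscale x 4 x i32> @example_smullb_lane(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
    %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16> %a,
                                                                         <vscale x 8 x i16> %b,
                                                                         i32 4)
    ret <vscale x 4 x i32> %out
  }

  declare <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)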
Index: llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll +++ llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll @@ -193,6 +193,69 @@ } ; +; SADDWB +; + +define <vscale x 8 x i16> @saddwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: saddwb_b: +; CHECK: saddwb z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.saddwb.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @saddwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: saddwb_h: +; CHECK: saddwb z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @saddwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: saddwb_s: +; CHECK: saddwb z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.saddwb.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; +; SADDWT +; + +define <vscale x 8 x i16> @saddwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: saddwt_b: +; CHECK: saddwt z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.saddwt.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @saddwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: saddwt_h: +; CHECK: saddwt z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.saddwt.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @saddwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: saddwt_s: +; CHECK: saddwt z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.saddwt.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + + +; ; SMULLB (Vectors) ; @@ -224,6 +287,30 @@ } ; +; SMULLB (Indexed) +; + +define <vscale x 4 x i32> @smullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: smullb_lane_h: +; CHECK: smullb z0.s, z0.h, z1.h[4] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 4) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @smullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: smullb_lane_s: +; CHECK: smullb z0.d, z0.s, z1.s[3] +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.smullb.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 3) + ret <vscale x 2 x i64> %out +} + +; ; SMULLT (Vectors) ; @@ -255,6 +342,30 @@ } ; +; SMULLT (Indexed) +; + +define <vscale x 4 x i32> @smullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: smullt_lane_h: +; CHECK: smullt z0.s, z0.h, z1.h[5] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smullt.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 5) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @smullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: smullt_lane_s: +; CHECK: smullt z0.d, z0.s, z1.s[2] +; CHECK-NEXT: ret + %out = call 
<vscale x 2 x i64> @llvm.aarch64.sve.smullt.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 2) + ret <vscale x 2 x i64> %out +} + +; ; SQDMULLB (Vectors) ; @@ -286,6 +397,30 @@ } ; +; SQDMULLB (Indexed) +; + +define <vscale x 4 x i32> @sqdmullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: sqdmullb_lane_h: +; CHECK: sqdmullb z0.s, z0.h, z1.h[2] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 2) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @sqdmullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: sqdmullb_lane_s: +; CHECK: sqdmullb z0.d, z0.s, z1.s[1] +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 1) + ret <vscale x 2 x i64> %out +} + +; ; SQDMULLT (Vectors) ; @@ -317,6 +452,30 @@ } ; +; SQDMULLT (Indexed) +; + +define <vscale x 4 x i32> @sqdmullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: sqdmullt_lane_h: +; CHECK: sqdmullt z0.s, z0.h, z1.h[3] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 3) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @sqdmullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: sqdmullt_lane_s: +; CHECK: sqdmullt z0.d, z0.s, z1.s[0] +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 0) + ret <vscale x 2 x i64> %out +} + +; ; SSUBLB ; @@ -348,6 +507,62 @@ } ; +; SSHLLB +; + +define <vscale x 8 x i16> @sshllb_b(<vscale x 16 x i8> %a) { +; CHECK-LABEL: sshllb_b: +; CHECK: sshllb z0.h, z0.b, #0 +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sshllb.nxv8i16(<vscale x 16 x i8> %a, i32 0) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @sshllb_h(<vscale x 8 x i16> %a) { +; CHECK-LABEL: sshllb_h: +; CHECK: sshllb z0.s, z0.h, #1 +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16> %a, i32 1) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @sshllb_s(<vscale x 4 x i32> %a) { +; CHECK-LABEL: sshllb_s: +; CHECK: sshllb z0.d, z0.s, #2 +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sshllb.nxv2i64(<vscale x 4 x i32> %a, i32 2) + ret <vscale x 2 x i64> %out +} + +; +; SSHLLT +; + +define <vscale x 8 x i16> @sshllt_b(<vscale x 16 x i8> %a) { +; CHECK-LABEL: sshllt_b: +; CHECK: sshllt z0.h, z0.b, #3 +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sshllt.nxv8i16(<vscale x 16 x i8> %a, i32 3) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @sshllt_h(<vscale x 8 x i16> %a) { +; CHECK-LABEL: sshllt_h: +; CHECK: sshllt z0.s, z0.h, #4 +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sshllt.nxv4i32(<vscale x 8 x i16> %a, i32 4) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @sshllt_s(<vscale x 4 x i32> %a) { +; CHECK-LABEL: sshllt_s: +; CHECK: sshllt z0.d, z0.s, #5 +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sshllt.nxv2i64(<vscale x 4 x i32> %a, i32 5) + ret <vscale x 2 x i64> %out +} + +; ; SSUBLT ; @@ -379,6 +594,68 @@ } ; +; SSUBWB +; + +define <vscale x 8 x i16> @ssubwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: ssubwb_b: 
+; CHECK: ssubwb z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ssubwb.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @ssubwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: ssubwb_h: +; CHECK: ssubwb z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ssubwb.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @ssubwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: ssubwb_s: +; CHECK: ssubwb z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ssubwb.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; +; SSUBWT +; + +define <vscale x 8 x i16> @ssubwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: ssubwt_b: +; CHECK: ssubwt z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ssubwt.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @ssubwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: ssubwt_h: +; CHECK: ssubwt z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ssubwt.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @ssubwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: ssubwt_s: +; CHECK: ssubwt z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ssubwt.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; ; UABALB ; @@ -571,6 +848,68 @@ } ; +; UADDWB +; + +define <vscale x 8 x i16> @uaddwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: uaddwb_b: +; CHECK: uaddwb z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uaddwb.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @uaddwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: uaddwb_h: +; CHECK: uaddwb z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uaddwb.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @uaddwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: uaddwb_s: +; CHECK: uaddwb z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uaddwb.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; +; UADDWT +; + +define <vscale x 8 x i16> @uaddwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: uaddwt_b: +; CHECK: uaddwt z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uaddwt.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @uaddwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: uaddwt_h: +; CHECK: uaddwt z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uaddwt.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @uaddwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: uaddwt_s: +; CHECK: uaddwt 
z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uaddwt.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; ; UMULLB (Vectors) ; @@ -602,6 +941,31 @@ } ; +; UMULLB (Indexed) +; + +define <vscale x 4 x i32> @umullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: umullb_lane_h: +; CHECK: umullb z0.s, z0.h, z1.h[0] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.umullb.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 0) + ret <vscale x 4 x i32> %out +} + + +define <vscale x 2 x i64> @umullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: umullb_lane_s: +; CHECK: umullb z0.d, z0.s, z1.s[3] +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.umullb.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 3) + ret <vscale x 2 x i64> %out +} + +; ; UMULLT (Vectors) ; @@ -633,6 +997,86 @@ } ; +; UMULLT (Indexed) +; + +define <vscale x 4 x i32> @umullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: umullt_lane_h: +; CHECK: umullt z0.s, z0.h, z1.h[1] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.umullt.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 1) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @umullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: umullt_lane_s: +; CHECK: umullt z0.d, z0.s, z1.s[2] +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.umullt.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 2) + ret <vscale x 2 x i64> %out +} + +; +; USHLLB +; + +define <vscale x 8 x i16> @ushllb_b(<vscale x 16 x i8> %a) { +; CHECK-LABEL: ushllb_b: +; CHECK: ushllb z0.h, z0.b, #6 +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ushllb.nxv8i16(<vscale x 16 x i8> %a, i32 6) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @ushllb_h(<vscale x 8 x i16> %a) { +; CHECK-LABEL: ushllb_h: +; CHECK: ushllb z0.s, z0.h, #7 +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ushllb.nxv4i32(<vscale x 8 x i16> %a, i32 7) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @ushllb_s(<vscale x 4 x i32> %a) { +; CHECK-LABEL: ushllb_s: +; CHECK: ushllb z0.d, z0.s, #8 +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ushllb.nxv2i64(<vscale x 4 x i32> %a, i32 8) + ret <vscale x 2 x i64> %out +} + +; +; USHLLT +; + +define <vscale x 8 x i16> @ushllt_b(<vscale x 16 x i8> %a) { +; CHECK-LABEL: ushllt_b: +; CHECK: ushllt z0.h, z0.b, #7 +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ushllt.nxv8i16(<vscale x 16 x i8> %a, i32 7) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @ushllt_h(<vscale x 8 x i16> %a) { +; CHECK-LABEL: ushllt_h: +; CHECK: ushllt z0.s, z0.h, #15 +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ushllt.nxv4i32(<vscale x 8 x i16> %a, i32 15) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @ushllt_s(<vscale x 4 x i32> %a) { +; CHECK-LABEL: ushllt_s: +; CHECK: ushllt z0.d, z0.s, #31 +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ushllt.nxv2i64(<vscale x 4 x i32> %a, i32 31) + ret <vscale x 2 x i64> %out +} + +; ; USUBLB ; @@ -694,6 +1138,68 @@ ret <vscale x 2 x i64> %out } +; +; USUBWB +; + +define <vscale x 8 x i16> @usubwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: usubwb_b: +; 
CHECK: usubwb z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.usubwb.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @usubwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: usubwb_h: +; CHECK: usubwb z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.usubwb.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @usubwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: usubwb_s: +; CHECK: usubwb z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.usubwb.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; +; USUBWT +; + +define <vscale x 8 x i16> @usubwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: usubwt_b: +; CHECK: usubwt z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.usubwt.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @usubwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: usubwt_h: +; CHECK: usubwt z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.usubwt.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @usubwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: usubwt_s: +; CHECK: usubwt z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.usubwt.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + declare <vscale x 8 x i16> @llvm.aarch64.sve.sabalb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.sabalb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.sabalb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>, <vscale x 4 x i32>) @@ -718,22 +1224,50 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.saddlt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.saddlt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 8 x i16> @llvm.aarch64.sve.saddwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.saddwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.saddwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.saddwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.saddwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + declare <vscale x 8 x i16> @llvm.aarch64.sve.smullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.smullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.smullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.smullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> 
@llvm.aarch64.sve.smullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.smullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.smullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.smullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.smullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> @llvm.aarch64.sve.sqdmullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> @llvm.aarch64.sve.sqdmullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.sshllb.nxv8i16(<vscale x 16 x i8>, i32) +declare <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.sshllb.nxv2i64(<vscale x 4 x i32>, i32) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.sshllt.nxv8i16(<vscale x 16 x i8>, i32) +declare <vscale x 4 x i32> @llvm.aarch64.sve.sshllt.nxv4i32(<vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.sshllt.nxv2i64(<vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> @llvm.aarch64.sve.ssublb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.ssublb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.ssublb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) @@ -742,6 +1276,14 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.ssublt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.ssublt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 8 x i16> @llvm.aarch64.sve.ssubwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.ssubwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.ssubwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.ssubwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.ssubwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.ssubwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + declare <vscale x 8 x i16> @llvm.aarch64.sve.uabalb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.uabalb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.uabalb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x 
i32>, <vscale x 4 x i32>) @@ -766,14 +1308,36 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddlt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddlt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + declare <vscale x 8 x i16> @llvm.aarch64.sve.umullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.umullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.umullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.umullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.umullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> @llvm.aarch64.sve.umullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.umullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.umullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.umullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.umullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.ushllb.nxv8i16(<vscale x 16 x i8>, i32) +declare <vscale x 4 x i32> @llvm.aarch64.sve.ushllb.nxv4i32(<vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.ushllb.nxv2i64(<vscale x 4 x i32>, i32) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.ushllt.nxv8i16(<vscale x 16 x i8>, i32) +declare <vscale x 4 x i32> @llvm.aarch64.sve.ushllt.nxv4i32(<vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.ushllt.nxv2i64(<vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> @llvm.aarch64.sve.usublb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.usublb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.usublb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) @@ -781,3 +1345,11 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.usublt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.usublt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.usublt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.usubwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.usubwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.usubwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.usubwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.usubwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x 
i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.usubwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -2703,9 +2703,10 @@ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> { +multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm, + SDPatternOperator op> { def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm, - ZPR32, ZPR16, ZPR3b16, VectorIndexH> { + ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{20-19} = iop{2-1}; @@ -2713,13 +2714,16 @@ let Inst{11} = iop{0}; } def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm, - ZPR64, ZPR32, ZPR4b32, VectorIndexS> { + ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> { bits<4> Zm; bits<2> iop; let Inst{20} = iop{1}; let Inst{19-16} = Zm; let Inst{11} = iop{0}; } + + def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2866,10 +2870,15 @@ def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm> { +multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm, + SDPatternOperator op> { def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>; def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>; def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>; + + def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>; } multiclass sve2_pmul_long<bits<1> opc, string asm> { @@ -2959,7 +2968,8 @@ let Inst{4-0} = Zd; } -multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> { +multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm, + SDPatternOperator op> { def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm, ZPR16, ZPR8, vecshiftL8>; def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm, @@ -2970,6 +2980,9 @@ ZPR64, ZPR32, vecshiftL32> { let Inst{20-19} = imm{4-3}; } + def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Imm_Pat<nxv4i32, op, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Imm_Pat<nxv2i64, op, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1457,14 +1457,14 @@ defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah">; // SVE2 integer multiply long (indexed) - defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">; - defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">; - defm 
UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb">; - defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">; + defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>; + defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>; + defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>; + defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>; // SVE2 saturating multiply (indexed) - defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">; - defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">; + defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>; + defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>; // SVE2 integer multiply-add long (indexed) defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>; @@ -1575,14 +1575,14 @@ defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>; // SVE2 integer add/subtract wide - defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">; - defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">; - defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">; - defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">; - defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">; - defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">; - defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">; - defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">; + defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>; + defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>; + defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>; + defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>; + defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>; + defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>; + defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>; + defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>; // SVE2 integer multiply long defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>; @@ -1675,10 +1675,10 @@ defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">; // SVE2 bitwise shift left long - defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">; - defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">; - defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">; - defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">; + defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>; + defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>; + defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb", int_aarch64_sve_ushllb>; + defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>; // SVE2 integer add/subtract interleaved long defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt">; Index: llvm/include/llvm/IR/IntrinsicsAArch64.td =================================================================== --- 
llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1040,12 +1040,31 @@ LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + class SVE2_1VectorArg_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMSubdivide2VectorType<0>, + llvm_i32_ty], + [IntrNoMem, ImmArg<1>]>; + class SVE2_2VectorArg_Long_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>], [IntrNoMem]>; + class SVE2_2VectorArgIndexed_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMSubdivide2VectorType<0>, + LLVMSubdivide2VectorType<0>, + llvm_i32_ty], + [IntrNoMem, ImmArg<2>]>; + + class SVE2_2VectorArg_Wide_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMSubdivide2VectorType<0>], + [IntrNoMem]>; + class SVE2_2VectorArg_Pred_Long_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, @@ -1078,6 +1097,7 @@ [LLVMSubdivide2VectorType<0>, llvm_anyvector_ty], [IntrNoMem]>; + class SVE2_2VectorArg_Narrowing_Intrinsic : Intrinsic< [LLVMSubdivide2VectorType<0>], @@ -1731,28 +1751,70 @@ // SVE2 - Widening DSP operations // -def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_saddwb : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_saddwt : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_smlalb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_smlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_smlalt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_smlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_smlslb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_smlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_smlslt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_smlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def 
int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_smullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_smullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_sqdmlalb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_sqdmlalt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_sqdmlslb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_sqdmlslt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_sshllb : SVE2_1VectorArg_Long_Intrinsic; +def int_aarch64_sve_sshllt : SVE2_1VectorArg_Long_Intrinsic; +def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_ssubwb : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_ssubwt : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_uaddwb : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_uaddwt : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_umlalb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_umlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_umlalt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_umlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_umlslb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_umlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_umlslt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_umlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_umullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_umullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_ushllb : SVE2_1VectorArg_Long_Intrinsic; +def int_aarch64_sve_ushllt : SVE2_1VectorArg_Long_Intrinsic; +def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_usubwb : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_usubwt : SVE2_2VectorArg_Wide_Intrinsic; // // SVE2 - Non-widening pairwise arithmetic @@ -1849,34 +1911,6 @@ def int_aarch64_sve_sqrshrunb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic; def int_aarch64_sve_sqrshrunt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic; -// SVE2 MLA LANE. 
-def int_aarch64_sve_smlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; - -// SVE2 MLA Unpredicated. -def int_aarch64_sve_smlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlslt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlslt : SVE2_3VectorArg_Long_Intrinsic; - -def int_aarch64_sve_sqdmlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlslt : SVE2_3VectorArg_Long_Intrinsic; def int_aarch64_sve_sqdmlalbt : SVE2_3VectorArg_Long_Intrinsic; def int_aarch64_sve_sqdmlslbt : SVE2_3VectorArg_Long_Intrinsic;