kmclaughlin created this revision.
kmclaughlin added reviewers: sdesmalen, dancgr, efriedma, c-rhodes.
Herald added subscribers: psnobl, rkruppe, hiraditya, kristof.beyls, tschuett.
Herald added a reviewer: rengolin.
Herald added a project: LLVM.
kmclaughlin added a parent revision: D73719: [AArch64][SVE] Add SVE2 intrinsics for widening DSP operations.
Implements the following intrinsics:

  - llvm.aarch64.sve.[s|u]mullb_lane
  - llvm.aarch64.sve.[s|u]mullt_lane
  - llvm.aarch64.sve.sqdmullb_lane
  - llvm.aarch64.sve.sqdmullt_lane
  - llvm.aarch64.sve.[s|u]addwb
  - llvm.aarch64.sve.[s|u]addwt
  - llvm.aarch64.sve.[s|u]shllb
  - llvm.aarch64.sve.[s|u]shllt
  - llvm.aarch64.sve.[s|u]subwb
  - llvm.aarch64.sve.[s|u]subwt

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D73903

Files:
  llvm/include/llvm/IR/IntrinsicsAArch64.td
  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/lib/Target/AArch64/SVEInstrFormats.td
  llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll
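For reference, a minimal IR sketch of how one of the new indexed intrinsics is expected to be called, following the pattern used in the updated test file below (the function name is illustrative only; types and lane-immediate constraints are as in sve2-intrinsics-widening-dsp.ll):

  ; Widening multiply of the bottom (even-numbered) i16 elements of %a by an
  ; indexed element of %b; the lane index (here 4) must be an immediate.
  define <vscale x 4 x i32> @example_smullb_lane(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
    %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16> %a,
                                                                         <vscale x 8 x i16> %b,
                                                                         i32 4)
    ret <vscale x 4 x i32> %out
  }

  declare <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)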
Index: llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll +++ llvm/test/CodeGen/AArch64/sve2-intrinsics-widening-dsp.ll @@ -193,6 +193,69 @@ } ; +; SADDWB +; + +define <vscale x 8 x i16> @saddwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: saddwb_b: +; CHECK: saddwb z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.saddwb.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @saddwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: saddwb_h: +; CHECK: saddwb z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @saddwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: saddwb_s: +; CHECK: saddwb z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.saddwb.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; +; SADDWT +; + +define <vscale x 8 x i16> @saddwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: saddwt_b: +; CHECK: saddwt z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.saddwt.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @saddwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: saddwt_h: +; CHECK: saddwt z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.saddwt.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @saddwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: saddwt_s: +; CHECK: saddwt z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.saddwt.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + + +; ; SMULLB (Vectors) ; @@ -224,6 +287,30 @@ } ; +; SMULLB (Indexed) +; + +define <vscale x 4 x i32> @smullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: smullb_lane_h: +; CHECK: smullb z0.s, z0.h, z1.h[4] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 4) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @smullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: smullb_lane_s: +; CHECK: smullb z0.d, z0.s, z1.s[3] +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.smullb.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 3) + ret <vscale x 2 x i64> %out +} + +; ; SMULLT (Vectors) ; @@ -255,6 +342,30 @@ } ; +; SMULLT (Indexed) +; + +define <vscale x 4 x i32> @smullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: smullt_lane_h: +; CHECK: smullt z0.s, z0.h, z1.h[5] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.smullt.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 5) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @smullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: smullt_lane_s: +; CHECK: smullt z0.d, z0.s, z1.s[2] +; CHECK-NEXT: ret + %out = call 
<vscale x 2 x i64> @llvm.aarch64.sve.smullt.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 2) + ret <vscale x 2 x i64> %out +} + +; ; SQDMULLB (Vectors) ; @@ -286,6 +397,30 @@ } ; +; SQDMULLB (Indexed) +; + +define <vscale x 4 x i32> @sqdmullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: sqdmullb_lane_h: +; CHECK: sqdmullb z0.s, z0.h, z1.h[2] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 2) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @sqdmullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: sqdmullb_lane_s: +; CHECK: sqdmullb z0.d, z0.s, z1.s[1] +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 1) + ret <vscale x 2 x i64> %out +} + +; ; SQDMULLT (Vectors) ; @@ -317,6 +452,30 @@ } ; +; SQDMULLT (Indexed) +; + +define <vscale x 4 x i32> @sqdmullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: sqdmullt_lane_h: +; CHECK: sqdmullt z0.s, z0.h, z1.h[3] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 3) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @sqdmullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: sqdmullt_lane_s: +; CHECK: sqdmullt z0.d, z0.s, z1.s[0] +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 0) + ret <vscale x 2 x i64> %out +} + +; ; SSUBLB ; @@ -348,6 +507,62 @@ } ; +; SSHLLB +; + +define <vscale x 8 x i16> @sshllb_b(<vscale x 16 x i8> %a) { +; CHECK-LABEL: sshllb_b: +; CHECK: sshllb z0.h, z0.b, #0 +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sshllb.nxv8i16(<vscale x 16 x i8> %a, i32 0) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @sshllb_h(<vscale x 8 x i16> %a) { +; CHECK-LABEL: sshllb_h: +; CHECK: sshllb z0.s, z0.h, #1 +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16> %a, i32 1) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @sshllb_s(<vscale x 4 x i32> %a) { +; CHECK-LABEL: sshllb_s: +; CHECK: sshllb z0.d, z0.s, #2 +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sshllb.nxv2i64(<vscale x 4 x i32> %a, i32 2) + ret <vscale x 2 x i64> %out +} + +; +; SSHLLT +; + +define <vscale x 8 x i16> @sshllt_b(<vscale x 16 x i8> %a) { +; CHECK-LABEL: sshllt_b: +; CHECK: sshllt z0.h, z0.b, #3 +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sshllt.nxv8i16(<vscale x 16 x i8> %a, i32 3) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @sshllt_h(<vscale x 8 x i16> %a) { +; CHECK-LABEL: sshllt_h: +; CHECK: sshllt z0.s, z0.h, #4 +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sshllt.nxv4i32(<vscale x 8 x i16> %a, i32 4) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @sshllt_s(<vscale x 4 x i32> %a) { +; CHECK-LABEL: sshllt_s: +; CHECK: sshllt z0.d, z0.s, #5 +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sshllt.nxv2i64(<vscale x 4 x i32> %a, i32 5) + ret <vscale x 2 x i64> %out +} + +; ; SSUBLT ; @@ -379,6 +594,68 @@ } ; +; SSUBWB +; + +define <vscale x 8 x i16> @ssubwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: ssubwb_b: 
+; CHECK: ssubwb z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ssubwb.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @ssubwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: ssubwb_h: +; CHECK: ssubwb z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ssubwb.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @ssubwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: ssubwb_s: +; CHECK: ssubwb z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ssubwb.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; +; SSUBWT +; + +define <vscale x 8 x i16> @ssubwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: ssubwt_b: +; CHECK: ssubwt z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ssubwt.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @ssubwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: ssubwt_h: +; CHECK: ssubwt z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ssubwt.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @ssubwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: ssubwt_s: +; CHECK: ssubwt z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ssubwt.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; ; UABALB ; @@ -571,6 +848,68 @@ } ; +; UADDWB +; + +define <vscale x 8 x i16> @uaddwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: uaddwb_b: +; CHECK: uaddwb z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uaddwb.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @uaddwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: uaddwb_h: +; CHECK: uaddwb z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uaddwb.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @uaddwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: uaddwb_s: +; CHECK: uaddwb z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uaddwb.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; +; UADDWT +; + +define <vscale x 8 x i16> @uaddwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: uaddwt_b: +; CHECK: uaddwt z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uaddwt.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @uaddwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: uaddwt_h: +; CHECK: uaddwt z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uaddwt.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @uaddwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: uaddwt_s: +; CHECK: uaddwt 
z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.uaddwt.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; ; UMULLB (Vectors) ; @@ -602,6 +941,31 @@ } ; +; UMULLB (Indexed) +; + +define <vscale x 4 x i32> @umullb_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: umullb_lane_h: +; CHECK: umullb z0.s, z0.h, z1.h[0] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.umullb.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 0) + ret <vscale x 4 x i32> %out +} + + +define <vscale x 2 x i64> @umullb_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: umullb_lane_s: +; CHECK: umullb z0.d, z0.s, z1.s[3] +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.umullb.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 3) + ret <vscale x 2 x i64> %out +} + +; ; UMULLT (Vectors) ; @@ -633,6 +997,86 @@ } ; +; UMULLT (Indexed) +; + +define <vscale x 4 x i32> @umullt_lane_h(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: umullt_lane_h: +; CHECK: umullt z0.s, z0.h, z1.h[1] +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.umullt.lane.nxv4i32(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + i32 1) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @umullt_lane_s(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: umullt_lane_s: +; CHECK: umullt z0.d, z0.s, z1.s[2] +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.umullt.lane.nxv2i64(<vscale x 4 x i32> %a, + <vscale x 4 x i32> %b, + i32 2) + ret <vscale x 2 x i64> %out +} + +; +; USHLLB +; + +define <vscale x 8 x i16> @ushllb_b(<vscale x 16 x i8> %a) { +; CHECK-LABEL: ushllb_b: +; CHECK: ushllb z0.h, z0.b, #6 +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ushllb.nxv8i16(<vscale x 16 x i8> %a, i32 6) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @ushllb_h(<vscale x 8 x i16> %a) { +; CHECK-LABEL: ushllb_h: +; CHECK: ushllb z0.s, z0.h, #7 +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ushllb.nxv4i32(<vscale x 8 x i16> %a, i32 7) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @ushllb_s(<vscale x 4 x i32> %a) { +; CHECK-LABEL: ushllb_s: +; CHECK: ushllb z0.d, z0.s, #8 +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ushllb.nxv2i64(<vscale x 4 x i32> %a, i32 8) + ret <vscale x 2 x i64> %out +} + +; +; USHLLT +; + +define <vscale x 8 x i16> @ushllt_b(<vscale x 16 x i8> %a) { +; CHECK-LABEL: ushllt_b: +; CHECK: ushllt z0.h, z0.b, #7 +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.ushllt.nxv8i16(<vscale x 16 x i8> %a, i32 7) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @ushllt_h(<vscale x 8 x i16> %a) { +; CHECK-LABEL: ushllt_h: +; CHECK: ushllt z0.s, z0.h, #15 +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.ushllt.nxv4i32(<vscale x 8 x i16> %a, i32 15) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @ushllt_s(<vscale x 4 x i32> %a) { +; CHECK-LABEL: ushllt_s: +; CHECK: ushllt z0.d, z0.s, #31 +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.ushllt.nxv2i64(<vscale x 4 x i32> %a, i32 31) + ret <vscale x 2 x i64> %out +} + +; ; USUBLB ; @@ -694,6 +1138,68 @@ ret <vscale x 2 x i64> %out } +; +; USUBWB +; + +define <vscale x 8 x i16> @usubwb_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: usubwb_b: +; 
CHECK: usubwb z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.usubwb.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @usubwb_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: usubwb_h: +; CHECK: usubwb z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.usubwb.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @usubwb_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: usubwb_s: +; CHECK: usubwb z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.usubwb.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + +; +; USUBWT +; + +define <vscale x 8 x i16> @usubwt_b(<vscale x 8 x i16> %a, <vscale x 16 x i8> %b) { +; CHECK-LABEL: usubwt_b: +; CHECK: usubwt z0.h, z0.h, z1.b +; CHECK-NEXT: ret + %out = call <vscale x 8 x i16> @llvm.aarch64.sve.usubwt.nxv8i16(<vscale x 8 x i16> %a, + <vscale x 16 x i8> %b) + ret <vscale x 8 x i16> %out +} + +define <vscale x 4 x i32> @usubwt_h(<vscale x 4 x i32> %a, <vscale x 8 x i16> %b) { +; CHECK-LABEL: usubwt_h: +; CHECK: usubwt z0.s, z0.s, z1.h +; CHECK-NEXT: ret + %out = call <vscale x 4 x i32> @llvm.aarch64.sve.usubwt.nxv4i32(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b) + ret <vscale x 4 x i32> %out +} + +define <vscale x 2 x i64> @usubwt_s(<vscale x 2 x i64> %a, <vscale x 4 x i32> %b) { +; CHECK-LABEL: usubwt_s: +; CHECK: usubwt z0.d, z0.d, z1.s +; CHECK-NEXT: ret + %out = call <vscale x 2 x i64> @llvm.aarch64.sve.usubwt.nxv2i64(<vscale x 2 x i64> %a, + <vscale x 4 x i32> %b) + ret <vscale x 2 x i64> %out +} + declare <vscale x 8 x i16> @llvm.aarch64.sve.sabalb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.sabalb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.sabalb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>, <vscale x 4 x i32>) @@ -718,22 +1224,50 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.saddlt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.saddlt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 8 x i16> @llvm.aarch64.sve.saddwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.saddwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.saddwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.saddwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.saddwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.saddwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + declare <vscale x 8 x i16> @llvm.aarch64.sve.smullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.smullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.smullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.smullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.smullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> 
@llvm.aarch64.sve.smullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.smullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.smullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.smullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.smullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> @llvm.aarch64.sve.sqdmullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> @llvm.aarch64.sve.sqdmullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.sqdmullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.sqdmullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.sshllb.nxv8i16(<vscale x 16 x i8>, i32) +declare <vscale x 4 x i32> @llvm.aarch64.sve.sshllb.nxv4i32(<vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.sshllb.nxv2i64(<vscale x 4 x i32>, i32) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.sshllt.nxv8i16(<vscale x 16 x i8>, i32) +declare <vscale x 4 x i32> @llvm.aarch64.sve.sshllt.nxv4i32(<vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.sshllt.nxv2i64(<vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> @llvm.aarch64.sve.ssublb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.ssublb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.ssublb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) @@ -742,6 +1276,14 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.ssublt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.ssublt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 8 x i16> @llvm.aarch64.sve.ssubwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.ssubwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.ssubwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.ssubwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.ssubwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.ssubwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + declare <vscale x 8 x i16> @llvm.aarch64.sve.uabalb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.uabalb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.uabalb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x 
i32>, <vscale x 4 x i32>) @@ -766,14 +1308,36 @@ declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddlt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddlt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.uaddwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.uaddwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + declare <vscale x 8 x i16> @llvm.aarch64.sve.umullb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.umullb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.umullb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.umullb.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.umullb.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> @llvm.aarch64.sve.umullt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.umullt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.umullt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.umullt.lane.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.umullt.lane.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>, i32) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.ushllb.nxv8i16(<vscale x 16 x i8>, i32) +declare <vscale x 4 x i32> @llvm.aarch64.sve.ushllb.nxv4i32(<vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.ushllb.nxv2i64(<vscale x 4 x i32>, i32) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.ushllt.nxv8i16(<vscale x 16 x i8>, i32) +declare <vscale x 4 x i32> @llvm.aarch64.sve.ushllt.nxv4i32(<vscale x 8 x i16>, i32) +declare <vscale x 2 x i64> @llvm.aarch64.sve.ushllt.nxv2i64(<vscale x 4 x i32>, i32) + declare <vscale x 8 x i16> @llvm.aarch64.sve.usublb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.usublb.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.usublb.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) @@ -781,3 +1345,11 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.usublt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>) declare <vscale x 4 x i32> @llvm.aarch64.sve.usublt.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>) declare <vscale x 2 x i64> @llvm.aarch64.sve.usublt.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.usubwb.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.usubwb.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.usubwb.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) + +declare <vscale x 8 x i16> @llvm.aarch64.sve.usubwt.nxv8i16(<vscale x 8 x i16>, <vscale x 16 x i8>) +declare <vscale x 4 x i32> @llvm.aarch64.sve.usubwt.nxv4i32(<vscale x 4 x i32>, <vscale x 8 x 
i16>) +declare <vscale x 2 x i64> @llvm.aarch64.sve.usubwt.nxv2i64(<vscale x 2 x i64>, <vscale x 4 x i32>) Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -2703,9 +2703,10 @@ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> { +multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm, + SDPatternOperator op> { def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm, - ZPR32, ZPR16, ZPR3b16, VectorIndexH> { + ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{20-19} = iop{2-1}; @@ -2713,13 +2714,16 @@ let Inst{11} = iop{0}; } def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm, - ZPR64, ZPR32, ZPR4b32, VectorIndexS> { + ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> { bits<4> Zm; bits<2> iop; let Inst{20} = iop{1}; let Inst{19-16} = Zm; let Inst{11} = iop{0}; } + + def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2866,10 +2870,15 @@ def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm> { +multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm, + SDPatternOperator op> { def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>; def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>; def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>; + + def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>; } multiclass sve2_pmul_long<bits<1> opc, string asm> { @@ -2959,7 +2968,8 @@ let Inst{4-0} = Zd; } -multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> { +multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm, + SDPatternOperator op> { def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm, ZPR16, ZPR8, vecshiftL8>; def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm, @@ -2970,6 +2980,9 @@ ZPR64, ZPR32, vecshiftL32> { let Inst{20-19} = imm{4-3}; } + def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Imm_Pat<nxv4i32, op, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Imm_Pat<nxv2i64, op, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1457,14 +1457,14 @@ defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah">; // SVE2 integer multiply long (indexed) - defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">; - defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">; - defm 
UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb">; - defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">; + defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>; + defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>; + defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>; + defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>; // SVE2 saturating multiply (indexed) - defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">; - defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">; + defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>; + defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>; // SVE2 integer multiply-add long (indexed) defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>; @@ -1575,14 +1575,14 @@ defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>; // SVE2 integer add/subtract wide - defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">; - defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">; - defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">; - defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">; - defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">; - defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">; - defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">; - defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">; + defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>; + defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>; + defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>; + defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>; + defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>; + defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>; + defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>; + defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>; // SVE2 integer multiply long defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>; @@ -1675,10 +1675,10 @@ defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">; // SVE2 bitwise shift left long - defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">; - defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">; - defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">; - defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">; + defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>; + defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>; + defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb", int_aarch64_sve_ushllb>; + defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>; // SVE2 integer add/subtract interleaved long defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt">; Index: llvm/include/llvm/IR/IntrinsicsAArch64.td =================================================================== --- 
llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1040,12 +1040,31 @@ LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + class SVE2_1VectorArg_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMSubdivide2VectorType<0>, + llvm_i32_ty], + [IntrNoMem, ImmArg<1>]>; + class SVE2_2VectorArg_Long_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>], [IntrNoMem]>; + class SVE2_2VectorArgIndexed_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMSubdivide2VectorType<0>, + LLVMSubdivide2VectorType<0>, + llvm_i32_ty], + [IntrNoMem, ImmArg<2>]>; + + class SVE2_2VectorArg_Wide_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMSubdivide2VectorType<0>], + [IntrNoMem]>; + class SVE2_2VectorArg_Pred_Long_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, @@ -1078,6 +1097,7 @@ [LLVMSubdivide2VectorType<0>, llvm_anyvector_ty], [IntrNoMem]>; + class SVE2_2VectorArg_Narrowing_Intrinsic : Intrinsic< [LLVMSubdivide2VectorType<0>], @@ -1731,28 +1751,70 @@ // SVE2 - Widening DSP operations // -def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic; -def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_sabalb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sabalt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sabdlb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_sabdlt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_saddlb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_saddlt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_saddwb : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_saddwt : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_smlalb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_smlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_smlalt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_smlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_smlslb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_smlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_smlslt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_smlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def 
int_aarch64_sve_smullb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_smullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_smullt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_smullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_sqdmlalb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_sqdmlalt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_sqdmlslb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_sqdmlslt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_sqdmullb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_sqdmullt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_sqdmullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_sshllb : SVE2_1VectorArg_Long_Intrinsic; +def int_aarch64_sve_sshllt : SVE2_1VectorArg_Long_Intrinsic; +def int_aarch64_sve_ssublb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_ssublt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_ssubwb : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_ssubwt : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_uabalb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_uabalt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_uabdlb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_uabdlt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_uaddlb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_uaddlt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_uaddwb : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_uaddwt : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_umlalb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_umlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_umlalt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_umlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_umlslb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_umlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_umlslt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_umlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; +def int_aarch64_sve_umullb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_umullb_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_umullt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_umullt_lane : SVE2_2VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_ushllb : SVE2_1VectorArg_Long_Intrinsic; +def int_aarch64_sve_ushllt : SVE2_1VectorArg_Long_Intrinsic; +def int_aarch64_sve_usublb : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_usublt : SVE2_2VectorArg_Long_Intrinsic; +def int_aarch64_sve_usubwb : SVE2_2VectorArg_Wide_Intrinsic; +def int_aarch64_sve_usubwt : SVE2_2VectorArg_Wide_Intrinsic; // // SVE2 - Non-widening pairwise arithmetic @@ -1849,34 +1911,6 @@ def int_aarch64_sve_sqrshrunb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic; def int_aarch64_sve_sqrshrunt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic; -// SVE2 MLA LANE. 
-def int_aarch64_sve_smlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_smlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_umlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlalb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlalt_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlslb_lane : SVE2_3VectorArg_Indexed_Intrinsic; -def int_aarch64_sve_sqdmlslt_lane : SVE2_3VectorArg_Indexed_Intrinsic; - -// SVE2 MLA Unpredicated. -def int_aarch64_sve_smlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_smlslt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_umlslt : SVE2_3VectorArg_Long_Intrinsic; - -def int_aarch64_sve_sqdmlalb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlalt : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlslb : SVE2_3VectorArg_Long_Intrinsic; -def int_aarch64_sve_sqdmlslt : SVE2_3VectorArg_Long_Intrinsic; def int_aarch64_sve_sqdmlalbt : SVE2_3VectorArg_Long_Intrinsic; def int_aarch64_sve_sqdmlslbt : SVE2_3VectorArg_Long_Intrinsic;