https://github.com/TelGome created https://github.com/llvm/llvm-project/pull/207110
This PR adds support for the RISC-V P extension [Packed Merge](https://github.com/riscv/riscv-p-spec/blob/master/P-ext-intrinsics.adoc#packed-merge) intrinsics. >From 129f400205f23a4619ffa105fdd6683045f0e3cf Mon Sep 17 00:00:00 2001 From: Dongyan Chen <[email protected]> Date: Mon, 29 Jun 2026 07:07:50 +0000 Subject: [PATCH] Support Packed Merge --- clang/include/clang/Basic/BuiltinsRISCV.td | 14 + clang/lib/CodeGen/TargetBuiltins/RISCV.cpp | 25 +- clang/lib/Headers/riscv_packed_simd.h | 21 ++ clang/test/CodeGen/RISCV/rvp-intrinsics.c | 244 ++++++++++++++++ .../riscv_packed_simd.c | 198 +++++++++++++ llvm/include/llvm/IR/IntrinsicsRISCV.td | 8 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 52 +++- llvm/test/CodeGen/RISCV/rvp-simd-32.ll | 109 +++++++ llvm/test/CodeGen/RISCV/rvp-simd-64.ll | 271 ++++++++++++++++++ 9 files changed, 937 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td index 3f84528fdca70..785e41324ad78 100644 --- a/clang/include/clang/Basic/BuiltinsRISCV.td +++ b/clang/include/clang/Basic/BuiltinsRISCV.td @@ -233,6 +233,20 @@ def predsumu_u16x4_u64 : RISCVBuiltin<"uint64_t(_Vector<4, unsigned short>, uint def predsum_i32x2_i64 : RISCVBuiltin<"int64_t(_Vector<2, int>, int64_t)">; def predsumu_u32x2_u64 : RISCVBuiltin<"uint64_t(_Vector<2, unsigned int>, uint64_t)">; +// Packed Merge (32-bit) +def pmerge_u8x4 : RISCVBuiltin<"_Vector<4, unsigned char>(_Vector<4, unsigned char>, _Vector<4, unsigned char>, _Vector<4, unsigned char>)">; +def pmerge_i8x4 : RISCVBuiltin<"_Vector<4, signed char>(_Vector<4, signed char>, _Vector<4, signed char>, _Vector<4, unsigned char>)">; +def pmerge_u16x2 : RISCVBuiltin<"_Vector<2, unsigned short>(_Vector<2, unsigned short>, _Vector<2, unsigned short>, _Vector<2, unsigned short>)">; +def pmerge_i16x2 : RISCVBuiltin<"_Vector<2, short>(_Vector<2, short>, _Vector<2, short>, _Vector<2, unsigned short>)">; + +// Packed Merge (64-bit) +def pmerge_u8x8 : 
RISCVBuiltin<"_Vector<8, unsigned char>(_Vector<8, unsigned char>, _Vector<8, unsigned char>, _Vector<8, unsigned char>)">; +def pmerge_i8x8 : RISCVBuiltin<"_Vector<8, signed char>(_Vector<8, signed char>, _Vector<8, signed char>, _Vector<8, unsigned char>)">; +def pmerge_u16x4 : RISCVBuiltin<"_Vector<4, unsigned short>(_Vector<4, unsigned short>, _Vector<4, unsigned short>, _Vector<4, unsigned short>)">; +def pmerge_i16x4 : RISCVBuiltin<"_Vector<4, short>(_Vector<4, short>, _Vector<4, short>, _Vector<4, unsigned short>)">; +def pmerge_u32x2 : RISCVBuiltin<"_Vector<2, unsigned int>(_Vector<2, unsigned int>, _Vector<2, unsigned int>, _Vector<2, unsigned int>)">; +def pmerge_i32x2 : RISCVBuiltin<"_Vector<2, int>(_Vector<2, int>, _Vector<2, int>, _Vector<2, unsigned int>)">; + } // Features = "experimental-p" //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp index bb8fa86e7a564..588e229499082 100644 --- a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp @@ -1247,7 +1247,18 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID, case RISCV::BI__builtin_riscv_pabdu_u8x4: case RISCV::BI__builtin_riscv_pabdu_u16x2: case RISCV::BI__builtin_riscv_pabdu_u8x8: - case RISCV::BI__builtin_riscv_pabdu_u16x4: { + case RISCV::BI__builtin_riscv_pabdu_u16x4: + // Packed Merge + case RISCV::BI__builtin_riscv_pmerge_u8x4: + case RISCV::BI__builtin_riscv_pmerge_i8x4: + case RISCV::BI__builtin_riscv_pmerge_u16x2: + case RISCV::BI__builtin_riscv_pmerge_i16x2: + case RISCV::BI__builtin_riscv_pmerge_u8x8: + case RISCV::BI__builtin_riscv_pmerge_i8x8: + case RISCV::BI__builtin_riscv_pmerge_u16x4: + case RISCV::BI__builtin_riscv_pmerge_i16x4: + case RISCV::BI__builtin_riscv_pmerge_u32x2: + case RISCV::BI__builtin_riscv_pmerge_i32x2: { switch (BuiltinID) { default: llvm_unreachable("unexpected builtin ID"); @@ 
-1321,6 +1332,18 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID, case RISCV::BI__builtin_riscv_pabdu_u16x4: ID = Intrinsic::riscv_pabdu; break; + case RISCV::BI__builtin_riscv_pmerge_u8x4: + case RISCV::BI__builtin_riscv_pmerge_i8x4: + case RISCV::BI__builtin_riscv_pmerge_u16x2: + case RISCV::BI__builtin_riscv_pmerge_i16x2: + case RISCV::BI__builtin_riscv_pmerge_u8x8: + case RISCV::BI__builtin_riscv_pmerge_i8x8: + case RISCV::BI__builtin_riscv_pmerge_u16x4: + case RISCV::BI__builtin_riscv_pmerge_i16x4: + case RISCV::BI__builtin_riscv_pmerge_u32x2: + case RISCV::BI__builtin_riscv_pmerge_i32x2: + ID = Intrinsic::riscv_pmerge; + break; } IntrinsicTypes = {ResultType}; diff --git a/clang/lib/Headers/riscv_packed_simd.h b/clang/lib/Headers/riscv_packed_simd.h index c61e156ca6a7f..cdfa7ec5812ba 100644 --- a/clang/lib/Headers/riscv_packed_simd.h +++ b/clang/lib/Headers/riscv_packed_simd.h @@ -109,6 +109,12 @@ typedef uint32_t uint32x2_t __attribute__((__vector_size__(8))); return builtin(__rs1, __rs2); \ } +#define __packed_merge_builtin(name, ty, mask_ty, builtin) \ + static __inline__ ty __DEFAULT_FN_ATTRS __riscv_##name( \ + ty __rs1, ty __rs2, mask_ty __rd) { \ + return (ty)builtin(__rs1, __rs2, __rd); \ + } + // clang-format off: macro call sites have no trailing semicolons, which // confuses clang-format into a deeply nested expression. 
@@ -448,6 +454,20 @@ __packed_reduction(predsumu_u16x4_u64, uint64_t, uint16x4_t, __builtin_riscv_pre __packed_reduction(predsum_i32x2_i64, int64_t, int32x2_t, __builtin_riscv_predsum_i32x2_i64) __packed_reduction(predsumu_u32x2_u64, uint64_t, uint32x2_t, __builtin_riscv_predsumu_u32x2_u64) +/* Packed Merge (32-bit) */ +__packed_merge_builtin(pmerge_u8x4, uint8x4_t, uint8x4_t, __builtin_riscv_pmerge_u8x4) +__packed_merge_builtin(pmerge_i8x4, int8x4_t, uint8x4_t, __builtin_riscv_pmerge_i8x4) +__packed_merge_builtin(pmerge_u16x2, uint16x2_t, uint16x2_t, __builtin_riscv_pmerge_u16x2) +__packed_merge_builtin(pmerge_i16x2, int16x2_t, uint16x2_t, __builtin_riscv_pmerge_i16x2) + +/* Packed Merge (64-bit) */ +__packed_merge_builtin(pmerge_u8x8, uint8x8_t, uint8x8_t, __builtin_riscv_pmerge_u8x8) +__packed_merge_builtin(pmerge_i8x8, int8x8_t, uint8x8_t, __builtin_riscv_pmerge_i8x8) +__packed_merge_builtin(pmerge_u16x4, uint16x4_t, uint16x4_t, __builtin_riscv_pmerge_u16x4) +__packed_merge_builtin(pmerge_i16x4, int16x4_t, uint16x4_t, __builtin_riscv_pmerge_i16x4) +__packed_merge_builtin(pmerge_u32x2, uint32x2_t, uint32x2_t, __builtin_riscv_pmerge_u32x2) +__packed_merge_builtin(pmerge_i32x2, int32x2_t, uint32x2_t, __builtin_riscv_pmerge_i32x2) + // clang-format on #undef __packed_splat2 @@ -468,6 +488,7 @@ __packed_reduction(predsumu_u32x2_u64, uint64_t, uint32x2_t, __builtin_riscv_pre #undef __packed_pabs #undef __packed_binary_builtin_cast #undef __packed_reduction +#undef __packed_merge_builtin #undef __DEFAULT_FN_ATTRS #if defined(__cplusplus) diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c index 290f61787ceff..ec555e52d62fe 100644 --- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c +++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c @@ -6143,3 +6143,247 @@ int64_t test_predsum_i32x2_i64(int32x2_t rs1, int64_t rs2) { uint64_t test_predsumu_u32x2_u64(uint32x2_t rs1, uint64_t rs2) { return __riscv_predsumu_u32x2_u64(rs1, rs2); } + 
+// Packed Merge (32-bit) + +// RV32-LABEL: define dso_local i32 @test_pmerge_u8x4( +// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2_COERCE:%.*]], i32 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8> +// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[RS2_COERCE]] to <4 x i8> +// RV32-NEXT: [[TMP2:%.*]] = bitcast i32 [[RD_COERCE]] to <4 x i8> +// RV32-NEXT: [[TMP3:%.*]] = call <4 x i8> @llvm.riscv.pmerge.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]], <4 x i8> [[TMP2]]) +// RV32-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[TMP3]] to i32 +// RV32-NEXT: ret i32 [[TMP4]] +// +// RV64-LABEL: define dso_local i32 @test_pmerge_u8x4( +// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2_COERCE:%.*]], i32 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8> +// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[RS2_COERCE]] to <4 x i8> +// RV64-NEXT: [[TMP2:%.*]] = bitcast i32 [[RD_COERCE]] to <4 x i8> +// RV64-NEXT: [[TMP3:%.*]] = call <4 x i8> @llvm.riscv.pmerge.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]], <4 x i8> [[TMP2]]) +// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[TMP3]] to i32 +// RV64-NEXT: ret i32 [[TMP4]] +// +uint8x4_t test_pmerge_u8x4(uint8x4_t rs1, uint8x4_t rs2, uint8x4_t rd) { + return __riscv_pmerge_u8x4(rs1, rs2, rd); +} + +// RV32-LABEL: define dso_local i32 @test_pmerge_i8x4( +// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2_COERCE:%.*]], i32 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8> +// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[RS2_COERCE]] to <4 x i8> +// RV32-NEXT: [[TMP2:%.*]] = bitcast i32 [[RD_COERCE]] to <4 x i8> +// RV32-NEXT: [[TMP3:%.*]] = call <4 x i8> @llvm.riscv.pmerge.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]], <4 x i8> [[TMP2]]) +// RV32-NEXT: [[TMP4:%.*]] = 
bitcast <4 x i8> [[TMP3]] to i32 +// RV32-NEXT: ret i32 [[TMP4]] +// +// RV64-LABEL: define dso_local i32 @test_pmerge_i8x4( +// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2_COERCE:%.*]], i32 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8> +// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[RS2_COERCE]] to <4 x i8> +// RV64-NEXT: [[TMP2:%.*]] = bitcast i32 [[RD_COERCE]] to <4 x i8> +// RV64-NEXT: [[TMP3:%.*]] = call <4 x i8> @llvm.riscv.pmerge.v4i8(<4 x i8> [[TMP0]], <4 x i8> [[TMP1]], <4 x i8> [[TMP2]]) +// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[TMP3]] to i32 +// RV64-NEXT: ret i32 [[TMP4]] +// +int8x4_t test_pmerge_i8x4(int8x4_t rs1, int8x4_t rs2, uint8x4_t rd) { + return __riscv_pmerge_i8x4(rs1, rs2, rd); +} + +// RV32-LABEL: define dso_local i32 @test_pmerge_u16x2( +// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2_COERCE:%.*]], i32 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16> +// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[RS2_COERCE]] to <2 x i16> +// RV32-NEXT: [[TMP2:%.*]] = bitcast i32 [[RD_COERCE]] to <2 x i16> +// RV32-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.riscv.pmerge.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <2 x i16> [[TMP2]]) +// RV32-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[TMP3]] to i32 +// RV32-NEXT: ret i32 [[TMP4]] +// +// RV64-LABEL: define dso_local i32 @test_pmerge_u16x2( +// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2_COERCE:%.*]], i32 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16> +// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[RS2_COERCE]] to <2 x i16> +// RV64-NEXT: [[TMP2:%.*]] = bitcast i32 [[RD_COERCE]] to <2 x i16> +// RV64-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.riscv.pmerge.v2i16(<2 x i16> [[TMP0]], <2 x i16> 
[[TMP1]], <2 x i16> [[TMP2]]) +// RV64-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[TMP3]] to i32 +// RV64-NEXT: ret i32 [[TMP4]] +// +uint16x2_t test_pmerge_u16x2(uint16x2_t rs1, uint16x2_t rs2, uint16x2_t rd) { + return __riscv_pmerge_u16x2(rs1, rs2, rd); +} + +// RV32-LABEL: define dso_local i32 @test_pmerge_i16x2( +// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2_COERCE:%.*]], i32 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16> +// RV32-NEXT: [[TMP1:%.*]] = bitcast i32 [[RS2_COERCE]] to <2 x i16> +// RV32-NEXT: [[TMP2:%.*]] = bitcast i32 [[RD_COERCE]] to <2 x i16> +// RV32-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.riscv.pmerge.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <2 x i16> [[TMP2]]) +// RV32-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[TMP3]] to i32 +// RV32-NEXT: ret i32 [[TMP4]] +// +// RV64-LABEL: define dso_local i32 @test_pmerge_i16x2( +// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2_COERCE:%.*]], i32 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16> +// RV64-NEXT: [[TMP1:%.*]] = bitcast i32 [[RS2_COERCE]] to <2 x i16> +// RV64-NEXT: [[TMP2:%.*]] = bitcast i32 [[RD_COERCE]] to <2 x i16> +// RV64-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.riscv.pmerge.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <2 x i16> [[TMP2]]) +// RV64-NEXT: [[TMP4:%.*]] = bitcast <2 x i16> [[TMP3]] to i32 +// RV64-NEXT: ret i32 [[TMP4]] +// +int16x2_t test_pmerge_i16x2(int16x2_t rs1, int16x2_t rs2, uint16x2_t rd) { + return __riscv_pmerge_i16x2(rs1, rs2, rd); +} + +// Packed Merge (64-bit) + +// RV32-LABEL: define dso_local i64 @test_pmerge_u8x8( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8> +// 
RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <8 x i8> +// RV32-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <8 x i8> +// RV32-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.riscv.pmerge.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) +// RV32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 +// RV32-NEXT: ret i64 [[TMP4]] +// +// RV64-LABEL: define dso_local i64 @test_pmerge_u8x8( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8> +// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <8 x i8> +// RV64-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <8 x i8> +// RV64-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.riscv.pmerge.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) +// RV64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 +// RV64-NEXT: ret i64 [[TMP4]] +// +uint8x8_t test_pmerge_u8x8(uint8x8_t rs1, uint8x8_t rs2, uint8x8_t rd) { + return __riscv_pmerge_u8x8(rs1, rs2, rd); +} + +// RV32-LABEL: define dso_local i64 @test_pmerge_i8x8( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8> +// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <8 x i8> +// RV32-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <8 x i8> +// RV32-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.riscv.pmerge.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) +// RV32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 +// RV32-NEXT: ret i64 [[TMP4]] +// +// RV64-LABEL: define dso_local i64 @test_pmerge_i8x8( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = 
bitcast i64 [[RS1_COERCE]] to <8 x i8> +// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <8 x i8> +// RV64-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <8 x i8> +// RV64-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.riscv.pmerge.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) +// RV64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 +// RV64-NEXT: ret i64 [[TMP4]] +// +int8x8_t test_pmerge_i8x8(int8x8_t rs1, int8x8_t rs2, uint8x8_t rd) { + return __riscv_pmerge_i8x8(rs1, rs2, rd); +} + +// RV32-LABEL: define dso_local i64 @test_pmerge_u16x4( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <4 x i16> +// RV32-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <4 x i16> +// RV32-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.riscv.pmerge.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) +// RV32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to i64 +// RV32-NEXT: ret i64 [[TMP4]] +// +// RV64-LABEL: define dso_local i64 @test_pmerge_u16x4( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <4 x i16> +// RV64-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <4 x i16> +// RV64-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.riscv.pmerge.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) +// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to i64 +// RV64-NEXT: ret i64 [[TMP4]] +// +uint16x4_t test_pmerge_u16x4(uint16x4_t rs1, uint16x4_t rs2, uint16x4_t rd) { + return __riscv_pmerge_u16x4(rs1, rs2, rd); +} + +// RV32-LABEL: define dso_local i64 
@test_pmerge_i16x4( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <4 x i16> +// RV32-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <4 x i16> +// RV32-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.riscv.pmerge.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) +// RV32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to i64 +// RV32-NEXT: ret i64 [[TMP4]] +// +// RV64-LABEL: define dso_local i64 @test_pmerge_i16x4( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <4 x i16> +// RV64-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <4 x i16> +// RV64-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.riscv.pmerge.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) +// RV64-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to i64 +// RV64-NEXT: ret i64 [[TMP4]] +// +int16x4_t test_pmerge_i16x4(int16x4_t rs1, int16x4_t rs2, uint16x4_t rd) { + return __riscv_pmerge_i16x4(rs1, rs2, rd); +} + +// RV32-LABEL: define dso_local i64 @test_pmerge_u32x2( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32> +// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <2 x i32> +// RV32-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <2 x i32> +// RV32-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.riscv.pmerge.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) +// RV32-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 
+// RV32-NEXT: ret i64 [[TMP4]] +// +// RV64-LABEL: define dso_local i64 @test_pmerge_u32x2( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32> +// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <2 x i32> +// RV64-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <2 x i32> +// RV64-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.riscv.pmerge.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) +// RV64-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +// RV64-NEXT: ret i64 [[TMP4]] +// +uint32x2_t test_pmerge_u32x2(uint32x2_t rs1, uint32x2_t rs2, uint32x2_t rd) { + return __riscv_pmerge_u32x2(rs1, rs2, rd); +} + +// RV32-LABEL: define dso_local i64 @test_pmerge_i32x2( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32> +// RV32-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <2 x i32> +// RV32-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <2 x i32> +// RV32-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.riscv.pmerge.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) +// RV32-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +// RV32-NEXT: ret i64 [[TMP4]] +// +// RV64-LABEL: define dso_local i64 @test_pmerge_i32x2( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2_COERCE:%.*]], i64 noundef [[RD_COERCE:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32> +// RV64-NEXT: [[TMP1:%.*]] = bitcast i64 [[RS2_COERCE]] to <2 x i32> +// RV64-NEXT: [[TMP2:%.*]] = bitcast i64 [[RD_COERCE]] to <2 x i32> +// RV64-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.riscv.pmerge.v2i32(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> 
[[TMP2]]) +// RV64-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +// RV64-NEXT: ret i64 [[TMP4]] +// +int32x2_t test_pmerge_i32x2(int32x2_t rs1, int32x2_t rs2, uint32x2_t rd) { + return __riscv_pmerge_i32x2(rs1, rs2, rd); +} diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c index 4b939675cbeb5..2efccb8ab3ea5 100644 --- a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c +++ b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c @@ -2082,3 +2082,201 @@ int64_t test_predsum_i32x2_i64(int32x2_t a, int64_t b) { uint64_t test_predsumu_u32x2_u64(uint32x2_t a, uint64_t b) { return __riscv_predsumu_u32x2_u64(a, b); } + +// CHECK-LABEL: test_pmerge_merge_u8x4: +// CHECK: merge +uint8x4_t test_pmerge_merge_u8x4(uint8x4_t rd, uint8x4_t rs1, uint8x4_t rs2) { + return __riscv_pmerge_u8x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvm_u8x4: +// CHECK: mvm +uint8x4_t test_pmerge_mvm_u8x4(uint8x4_t rs1, uint8x4_t rd, uint8x4_t rs2) { + return __riscv_pmerge_u8x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvmn_u8x4: +// CHECK: mvmn +uint8x4_t test_pmerge_mvmn_u8x4(uint8x4_t rs2, uint8x4_t rs1, uint8x4_t rd) { + return __riscv_pmerge_u8x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_merge_i8x4: +// CHECK: merge +int8x4_t test_pmerge_merge_i8x4(uint8x4_t rd, int8x4_t rs1, int8x4_t rs2) { + return __riscv_pmerge_i8x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvm_i8x4: +// CHECK: mvm +int8x4_t test_pmerge_mvm_i8x4(int8x4_t rs1, uint8x4_t rd, int8x4_t rs2) { + return __riscv_pmerge_i8x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvmn_i8x4: +// CHECK: mvmn +int8x4_t test_pmerge_mvmn_i8x4(int8x4_t rs2, int8x4_t rs1, uint8x4_t rd) { + return __riscv_pmerge_i8x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_merge_u16x2: +// CHECK: merge +uint16x2_t test_pmerge_merge_u16x2(uint16x2_t rd, uint16x2_t rs1, uint16x2_t rs2) { + return 
__riscv_pmerge_u16x2(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvm_u16x2: +// CHECK: mvm +uint16x2_t test_pmerge_mvm_u16x2(uint16x2_t rs1, uint16x2_t rd, uint16x2_t rs2) { + return __riscv_pmerge_u16x2(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvmn_u16x2: +// CHECK: mvmn +uint16x2_t test_pmerge_mvmn_u16x2(uint16x2_t rs2, uint16x2_t rs1, uint16x2_t rd) { + return __riscv_pmerge_u16x2(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_merge_i16x2: +// CHECK: merge +int16x2_t test_pmerge_merge_i16x2(uint16x2_t rd, int16x2_t rs1, int16x2_t rs2) { + return __riscv_pmerge_i16x2(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvm_i16x2: +// CHECK: mvm +int16x2_t test_pmerge_mvm_i16x2(int16x2_t rs1, uint16x2_t rd, int16x2_t rs2) { + return __riscv_pmerge_i16x2(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvmn_i16x2: +// CHECK: mvmn +int16x2_t test_pmerge_mvmn_i16x2(int16x2_t rs2, int16x2_t rs1, uint16x2_t rd) { + return __riscv_pmerge_i16x2(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_merge_u8x8: +// RV32-COUNT-2: merge +// RV64: merge +uint8x8_t test_pmerge_merge_u8x8(uint8x8_t rd, uint8x8_t rs1, uint8x8_t rs2) { + return __riscv_pmerge_u8x8(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvm_u8x8: +// RV32-COUNT-2: mvm +// RV64: mvm +uint8x8_t test_pmerge_mvm_u8x8(uint8x8_t rs1, uint8x8_t rd, uint8x8_t rs2) { + return __riscv_pmerge_u8x8(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvmn_u8x8: +// RV32-COUNT-2: mvmn +// RV64: mvmn +uint8x8_t test_pmerge_mvmn_u8x8(uint8x8_t rs2, uint8x8_t rs1, uint8x8_t rd) { + return __riscv_pmerge_u8x8(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_merge_i8x8: +// RV32-COUNT-2: merge +// RV64: merge +int8x8_t test_pmerge_merge_i8x8(uint8x8_t rd, int8x8_t rs1, int8x8_t rs2) { + return __riscv_pmerge_i8x8(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvm_i8x8: +// RV32-COUNT-2: mvm +// RV64: mvm +int8x8_t test_pmerge_mvm_i8x8(int8x8_t rs1, uint8x8_t rd, int8x8_t rs2) { + return __riscv_pmerge_i8x8(rs1, rs2, 
rd); +} + +// CHECK-LABEL: test_pmerge_mvmn_i8x8: +// RV32-COUNT-2: mvmn +// RV64: mvmn +int8x8_t test_pmerge_mvmn_i8x8(int8x8_t rs2, int8x8_t rs1, uint8x8_t rd) { + return __riscv_pmerge_i8x8(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_merge_u16x4: +// RV32-COUNT-2: merge +// RV64: merge +uint16x4_t test_pmerge_merge_u16x4(uint16x4_t rd, uint16x4_t rs1, uint16x4_t rs2) { + return __riscv_pmerge_u16x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvm_u16x4: +// RV32-COUNT-2: mvm +// RV64: mvm +uint16x4_t test_pmerge_mvm_u16x4(uint16x4_t rs1, uint16x4_t rd, uint16x4_t rs2) { + return __riscv_pmerge_u16x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvmn_u16x4: +// RV32-COUNT-2: mvmn +// RV64: mvmn +uint16x4_t test_pmerge_mvmn_u16x4(uint16x4_t rs2, uint16x4_t rs1, uint16x4_t rd) { + return __riscv_pmerge_u16x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_merge_i16x4: +// RV32-COUNT-2: merge +// RV64: merge +int16x4_t test_pmerge_merge_i16x4(uint16x4_t rd, int16x4_t rs1, int16x4_t rs2) { + return __riscv_pmerge_i16x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvm_i16x4: +// RV32-COUNT-2: mvm +// RV64: mvm +int16x4_t test_pmerge_mvm_i16x4(int16x4_t rs1, uint16x4_t rd, int16x4_t rs2) { + return __riscv_pmerge_i16x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvmn_i16x4: +// RV32-COUNT-2: mvmn +// RV64: mvmn +int16x4_t test_pmerge_mvmn_i16x4(int16x4_t rs2, int16x4_t rs1, uint16x4_t rd) { + return __riscv_pmerge_i16x4(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_merge_u32x2: +// RV32-COUNT-2: merge +// RV64: merge +uint32x2_t test_pmerge_merge_u32x2(uint32x2_t rd, uint32x2_t rs1, uint32x2_t rs2) { + return __riscv_pmerge_u32x2(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvm_u32x2: +// RV32-COUNT-2: mvm +// RV64: mvm +uint32x2_t test_pmerge_mvm_u32x2(uint32x2_t rs1, uint32x2_t rd, uint32x2_t rs2) { + return __riscv_pmerge_u32x2(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvmn_u32x2: +// RV32-COUNT-2: mvmn +// RV64: mvmn +uint32x2_t 
test_pmerge_mvmn_u32x2(uint32x2_t rs2, uint32x2_t rs1, uint32x2_t rd) { + return __riscv_pmerge_u32x2(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_merge_i32x2: +// RV32-COUNT-2: merge +// RV64: merge +int32x2_t test_pmerge_merge_i32x2(uint32x2_t rd, int32x2_t rs1, int32x2_t rs2) { + return __riscv_pmerge_i32x2(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvm_i32x2: +// RV32-COUNT-2: mvm +// RV64: mvm +int32x2_t test_pmerge_mvm_i32x2(int32x2_t rs1, uint32x2_t rd, int32x2_t rs2) { + return __riscv_pmerge_i32x2(rs1, rs2, rd); +} + +// CHECK-LABEL: test_pmerge_mvmn_i32x2: +// RV32-COUNT-2: mvmn +// RV64: mvmn +int32x2_t test_pmerge_mvmn_i32x2(int32x2_t rs2, int32x2_t rs1, uint32x2_t rd) { + return __riscv_pmerge_i32x2(rs1, rs2, rd); +} diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index f201c0e38e94e..fb7f8d3000514 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -2076,6 +2076,14 @@ class RVPBinaryIntrinsic [IntrNoMem, IntrSpeculatable]>; def int_riscv_predsum : RVPReductionIntrinsic; def int_riscv_predsumu : RVPReductionIntrinsic; + + // Packed Merge + class RVPTernaryIntrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable]>; + def int_riscv_pmerge : RVPTernaryIntrinsic; } // TargetPrefix = "riscv" //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a994b9fc5961f..dd5bbabf3b87f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -11851,6 +11851,39 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(Opc, DL, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::riscv_pmerge: { + EVT VT = Op.getValueType(); + auto 
buildMerge = [&](SDValue Rs1, SDValue Rs2, SDValue Mask, EVT ResultVT) { + MVT IntVT = MVT::getIntegerVT(ResultVT.getSizeInBits()); + SDValue Res = DAG.getNode(RISCVISD::MERGE, DL, IntVT, + DAG.getBitcast(IntVT, Mask), + DAG.getBitcast(IntVT, Rs1), + DAG.getBitcast(IntVT, Rs2)); + return DAG.getBitcast(ResultVT, Res); + }; + + // 64-bit packed types on RV32: split into two 32-bit halves. v2i32 has no + // legal 32-bit vector half, so bitcast it to v4i16 (same 64 bits) first; + // the merge result is identical. + if (!Subtarget.is64Bit() && + (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)) { + EVT WorkVT = VT == MVT::v2i32 ? (EVT)MVT::v4i16 : VT; + SDValue Rs1 = DAG.getBitcast(WorkVT, Op.getOperand(1)); + SDValue Rs2 = DAG.getBitcast(WorkVT, Op.getOperand(2)); + SDValue Mask = DAG.getBitcast(WorkVT, Op.getOperand(3)); + MVT HalfVT = WorkVT == MVT::v8i8 ? MVT::v4i8 : MVT::v2i16; + auto [Rs1Lo, Rs1Hi] = DAG.SplitVector(Rs1, DL, HalfVT, HalfVT); + auto [Rs2Lo, Rs2Hi] = DAG.SplitVector(Rs2, DL, HalfVT, HalfVT); + auto [MaskLo, MaskHi] = DAG.SplitVector(Mask, DL, HalfVT, HalfVT); + SDValue ResLo = buildMerge(Rs1Lo, Rs2Lo, MaskLo, HalfVT); + SDValue ResHi = buildMerge(Rs1Hi, Rs2Hi, MaskHi, HalfVT); + SDValue Res = + DAG.getNode(ISD::CONCAT_VECTORS, DL, WorkVT, ResLo, ResHi); + return DAG.getBitcast(VT, Res); + } + + return buildMerge(Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), VT); + } case Intrinsic::experimental_get_vector_length: return lowerGetVectorLength(Op.getNode(), DAG, Subtarget); case Intrinsic::riscv_vmv_x_s: { @@ -15766,7 +15799,8 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, case Intrinsic::riscv_psas: case Intrinsic::riscv_pssa: case Intrinsic::riscv_paas: - case Intrinsic::riscv_pasa: { + case Intrinsic::riscv_pasa: + case Intrinsic::riscv_pmerge: { EVT VT = N->getValueType(0); if (!Subtarget.is64Bit() || (VT != MVT::v4i8 && VT != MVT::v2i16)) return; @@ -15792,6 +15826,8 @@ void 
RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Opc = ISD::ABDU; break; default: + // pas/psa/psas/pssa/paas/pasa and pmerge: re-emit at the widened type + // rather than lowering to a generic node. Opc = ISD::INTRINSIC_WO_CHAIN; break; } @@ -15803,10 +15839,18 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SDValue Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, N->getOperand(2), Undef); SDValue Res; - if (Opc == ISD::INTRINSIC_WO_CHAIN) - Res = DAG.getNode(Opc, DL, WideVT, N->getOperand(0), Op0, Op1); - else + if (Opc == ISD::INTRINSIC_WO_CHAIN) { + SmallVector<SDValue, 5> Ops; + Ops.push_back(N->getOperand(0)); + Ops.push_back(Op0); + Ops.push_back(Op1); + if (N->getNumOperands() > 3) + Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, + N->getOperand(3), Undef)); + Res = DAG.getNode(Opc, DL, WideVT, Ops); + } else { Res = DAG.getNode(Opc, DL, WideVT, Op0, Op1); + } Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getVectorIdxConstant(0, DL))); return; diff --git a/llvm/test/CodeGen/RISCV/rvp-simd-32.ll b/llvm/test/CodeGen/RISCV/rvp-simd-32.ll index 2b62ce13c5003..8f2b97da5f5c3 100644 --- a/llvm/test/CodeGen/RISCV/rvp-simd-32.ll +++ b/llvm/test/CodeGen/RISCV/rvp-simd-32.ll @@ -2522,3 +2522,112 @@ define i32 @test_predsumu_u16x2_u32(<2 x i16> %a, i32 %b) { %res = call i32 @llvm.riscv.predsumu.i32.v2i16(<2 x i16> %a, i32 %b) ret i32 %res } + +; Packed Merge +define <4 x i8> @test_pmerge_merge_u8x4(<4 x i8> %rd, <4 x i8> %rs1, <4 x i8> %rs2) { +; CHECK-LABEL: test_pmerge_merge_u8x4: +; CHECK: # %bb.0: +; CHECK-NEXT: merge a0, a1, a2 +; CHECK-NEXT: ret + %res = call <4 x i8> @llvm.riscv.pmerge.v4i8(<4 x i8> %rs1, <4 x i8> %rs2, <4 x i8> %rd) + ret <4 x i8> %res +} + +define <4 x i8> @test_pmerge_mvm_u8x4(<4 x i8> %rs1, <4 x i8> %rd, <4 x i8> %rs2) { +; CHECK-LABEL: test_pmerge_mvm_u8x4: +; CHECK: # %bb.0: +; CHECK-NEXT: mvm a0, a2, a1 +; CHECK-NEXT: ret + %res = call <4 x i8> @llvm.riscv.pmerge.v4i8(<4 x i8> %rs1, <4 x 
i8> %rs2, <4 x i8> %rd) + ret <4 x i8> %res +} + +define <4 x i8> @test_pmerge_mvmn_u8x4(<4 x i8> %rs2, <4 x i8> %rs1, <4 x i8> %rd) { +; CHECK-LABEL: test_pmerge_mvmn_u8x4: +; CHECK: # %bb.0: +; CHECK-NEXT: mvmn a0, a1, a2 +; CHECK-NEXT: ret + %res = call <4 x i8> @llvm.riscv.pmerge.v4i8(<4 x i8> %rs1, <4 x i8> %rs2, <4 x i8> %rd) + ret <4 x i8> %res +} + +define <4 x i8> @test_pmerge_merge_i8x4(<4 x i8> %rd, <4 x i8> %rs1, <4 x i8> %rs2) { +; CHECK-LABEL: test_pmerge_merge_i8x4: +; CHECK: # %bb.0: +; CHECK-NEXT: merge a0, a1, a2 +; CHECK-NEXT: ret + %res = call <4 x i8> @llvm.riscv.pmerge.v4i8(<4 x i8> %rs1, <4 x i8> %rs2, <4 x i8> %rd) + ret <4 x i8> %res +} + +define <4 x i8> @test_pmerge_mvm_i8x4(<4 x i8> %rs1, <4 x i8> %rd, <4 x i8> %rs2) { +; CHECK-LABEL: test_pmerge_mvm_i8x4: +; CHECK: # %bb.0: +; CHECK-NEXT: mvm a0, a2, a1 +; CHECK-NEXT: ret + %res = call <4 x i8> @llvm.riscv.pmerge.v4i8(<4 x i8> %rs1, <4 x i8> %rs2, <4 x i8> %rd) + ret <4 x i8> %res +} + +define <4 x i8> @test_pmerge_mvmn_i8x4(<4 x i8> %rs2, <4 x i8> %rs1, <4 x i8> %rd) { +; CHECK-LABEL: test_pmerge_mvmn_i8x4: +; CHECK: # %bb.0: +; CHECK-NEXT: mvmn a0, a1, a2 +; CHECK-NEXT: ret + %res = call <4 x i8> @llvm.riscv.pmerge.v4i8(<4 x i8> %rs1, <4 x i8> %rs2, <4 x i8> %rd) + ret <4 x i8> %res +} + +define <2 x i16> @test_pmerge_merge_u16x2(<2 x i16> %rd, <2 x i16> %rs1, <2 x i16> %rs2) { +; CHECK-LABEL: test_pmerge_merge_u16x2: +; CHECK: # %bb.0: +; CHECK-NEXT: merge a0, a1, a2 +; CHECK-NEXT: ret + %res = call <2 x i16> @llvm.riscv.pmerge.v2i16(<2 x i16> %rs1, <2 x i16> %rs2, <2 x i16> %rd) + ret <2 x i16> %res +} + +define <2 x i16> @test_pmerge_mvm_u16x2(<2 x i16> %rs1, <2 x i16> %rd, <2 x i16> %rs2) { +; CHECK-LABEL: test_pmerge_mvm_u16x2: +; CHECK: # %bb.0: +; CHECK-NEXT: mvm a0, a2, a1 +; CHECK-NEXT: ret + %res = call <2 x i16> @llvm.riscv.pmerge.v2i16(<2 x i16> %rs1, <2 x i16> %rs2, <2 x i16> %rd) + ret <2 x i16> %res +} + +define <2 x i16> @test_pmerge_mvmn_u16x2(<2 x i16> %rs2, <2 x 
i16> %rs1, <2 x i16> %rd) { +; CHECK-LABEL: test_pmerge_mvmn_u16x2: +; CHECK: # %bb.0: +; CHECK-NEXT: mvmn a0, a1, a2 +; CHECK-NEXT: ret + %res = call <2 x i16> @llvm.riscv.pmerge.v2i16(<2 x i16> %rs1, <2 x i16> %rs2, <2 x i16> %rd) + ret <2 x i16> %res +} + +define <2 x i16> @test_pmerge_merge_i16x2(<2 x i16> %rd, <2 x i16> %rs1, <2 x i16> %rs2) { +; CHECK-LABEL: test_pmerge_merge_i16x2: +; CHECK: # %bb.0: +; CHECK-NEXT: merge a0, a1, a2 +; CHECK-NEXT: ret + %res = call <2 x i16> @llvm.riscv.pmerge.v2i16(<2 x i16> %rs1, <2 x i16> %rs2, <2 x i16> %rd) + ret <2 x i16> %res +} + +define <2 x i16> @test_pmerge_mvm_i16x2(<2 x i16> %rs1, <2 x i16> %rd, <2 x i16> %rs2) { +; CHECK-LABEL: test_pmerge_mvm_i16x2: +; CHECK: # %bb.0: +; CHECK-NEXT: mvm a0, a2, a1 +; CHECK-NEXT: ret + %res = call <2 x i16> @llvm.riscv.pmerge.v2i16(<2 x i16> %rs1, <2 x i16> %rs2, <2 x i16> %rd) + ret <2 x i16> %res +} + +define <2 x i16> @test_pmerge_mvmn_i16x2(<2 x i16> %rs2, <2 x i16> %rs1, <2 x i16> %rd) { +; CHECK-LABEL: test_pmerge_mvmn_i16x2: +; CHECK: # %bb.0: +; CHECK-NEXT: mvmn a0, a1, a2 +; CHECK-NEXT: ret + %res = call <2 x i16> @llvm.riscv.pmerge.v2i16(<2 x i16> %rs1, <2 x i16> %rs2, <2 x i16> %rd) + ret <2 x i16> %res +} diff --git a/llvm/test/CodeGen/RISCV/rvp-simd-64.ll b/llvm/test/CodeGen/RISCV/rvp-simd-64.ll index 5fefbb394404b..2b6b631e39fe4 100644 --- a/llvm/test/CodeGen/RISCV/rvp-simd-64.ll +++ b/llvm/test/CodeGen/RISCV/rvp-simd-64.ll @@ -5483,3 +5483,274 @@ define i64 @test_predsumu_u32x2_u64(<2 x i32> %a, i64 %b) { %res = call i64 @llvm.riscv.predsumu.i64.v2i32(<2 x i32> %a, i64 %b) ret i64 %res } + +; Packed Merge +define <8 x i8> @test_pmerge_merge_u8x8(<8 x i8> %rd, <8 x i8> %rs1, <8 x i8> %rs2) { +; RV32-LABEL: test_pmerge_merge_u8x8: +; RV32: # %bb.0: +; RV32-NEXT: merge a1, a3, a5 +; RV32-NEXT: merge a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_merge_u8x8: +; RV64: # %bb.0: +; RV64-NEXT: merge a0, a1, a2 +; RV64-NEXT: ret + %res = call <8 x i8> 
@llvm.riscv.pmerge.v8i8(<8 x i8> %rs1, <8 x i8> %rs2, <8 x i8> %rd) + ret <8 x i8> %res +} + +define <8 x i8> @test_pmerge_mvm_u8x8(<8 x i8> %rs1, <8 x i8> %rd, <8 x i8> %rs2) { +; RV32-LABEL: test_pmerge_mvm_u8x8: +; RV32: # %bb.0: +; RV32-NEXT: mvm a1, a5, a3 +; RV32-NEXT: mvm a0, a4, a2 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvm_u8x8: +; RV64: # %bb.0: +; RV64-NEXT: mvm a0, a2, a1 +; RV64-NEXT: ret + %res = call <8 x i8> @llvm.riscv.pmerge.v8i8(<8 x i8> %rs1, <8 x i8> %rs2, <8 x i8> %rd) + ret <8 x i8> %res +} + +define <8 x i8> @test_pmerge_mvmn_u8x8(<8 x i8> %rs2, <8 x i8> %rs1, <8 x i8> %rd) { +; RV32-LABEL: test_pmerge_mvmn_u8x8: +; RV32: # %bb.0: +; RV32-NEXT: mvmn a1, a3, a5 +; RV32-NEXT: mvmn a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvmn_u8x8: +; RV64: # %bb.0: +; RV64-NEXT: mvmn a0, a1, a2 +; RV64-NEXT: ret + %res = call <8 x i8> @llvm.riscv.pmerge.v8i8(<8 x i8> %rs1, <8 x i8> %rs2, <8 x i8> %rd) + ret <8 x i8> %res +} + +define <8 x i8> @test_pmerge_merge_i8x8(<8 x i8> %rd, <8 x i8> %rs1, <8 x i8> %rs2) { +; RV32-LABEL: test_pmerge_merge_i8x8: +; RV32: # %bb.0: +; RV32-NEXT: merge a1, a3, a5 +; RV32-NEXT: merge a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_merge_i8x8: +; RV64: # %bb.0: +; RV64-NEXT: merge a0, a1, a2 +; RV64-NEXT: ret + %res = call <8 x i8> @llvm.riscv.pmerge.v8i8(<8 x i8> %rs1, <8 x i8> %rs2, <8 x i8> %rd) + ret <8 x i8> %res +} + +define <8 x i8> @test_pmerge_mvm_i8x8(<8 x i8> %rs1, <8 x i8> %rd, <8 x i8> %rs2) { +; RV32-LABEL: test_pmerge_mvm_i8x8: +; RV32: # %bb.0: +; RV32-NEXT: mvm a1, a5, a3 +; RV32-NEXT: mvm a0, a4, a2 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvm_i8x8: +; RV64: # %bb.0: +; RV64-NEXT: mvm a0, a2, a1 +; RV64-NEXT: ret + %res = call <8 x i8> @llvm.riscv.pmerge.v8i8(<8 x i8> %rs1, <8 x i8> %rs2, <8 x i8> %rd) + ret <8 x i8> %res +} + +define <8 x i8> @test_pmerge_mvmn_i8x8(<8 x i8> %rs2, <8 x i8> %rs1, <8 x i8> %rd) { +; RV32-LABEL: test_pmerge_mvmn_i8x8: +; RV32: # 
%bb.0: +; RV32-NEXT: mvmn a1, a3, a5 +; RV32-NEXT: mvmn a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvmn_i8x8: +; RV64: # %bb.0: +; RV64-NEXT: mvmn a0, a1, a2 +; RV64-NEXT: ret + %res = call <8 x i8> @llvm.riscv.pmerge.v8i8(<8 x i8> %rs1, <8 x i8> %rs2, <8 x i8> %rd) + ret <8 x i8> %res +} + +define <4 x i16> @test_pmerge_merge_u16x4(<4 x i16> %rd, <4 x i16> %rs1, <4 x i16> %rs2) { +; RV32-LABEL: test_pmerge_merge_u16x4: +; RV32: # %bb.0: +; RV32-NEXT: merge a1, a3, a5 +; RV32-NEXT: merge a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_merge_u16x4: +; RV64: # %bb.0: +; RV64-NEXT: merge a0, a1, a2 +; RV64-NEXT: ret + %res = call <4 x i16> @llvm.riscv.pmerge.v4i16(<4 x i16> %rs1, <4 x i16> %rs2, <4 x i16> %rd) + ret <4 x i16> %res +} + +define <4 x i16> @test_pmerge_mvm_u16x4(<4 x i16> %rs1, <4 x i16> %rd, <4 x i16> %rs2) { +; RV32-LABEL: test_pmerge_mvm_u16x4: +; RV32: # %bb.0: +; RV32-NEXT: mvm a1, a5, a3 +; RV32-NEXT: mvm a0, a4, a2 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvm_u16x4: +; RV64: # %bb.0: +; RV64-NEXT: mvm a0, a2, a1 +; RV64-NEXT: ret + %res = call <4 x i16> @llvm.riscv.pmerge.v4i16(<4 x i16> %rs1, <4 x i16> %rs2, <4 x i16> %rd) + ret <4 x i16> %res +} + +define <4 x i16> @test_pmerge_mvmn_u16x4(<4 x i16> %rs2, <4 x i16> %rs1, <4 x i16> %rd) { +; RV32-LABEL: test_pmerge_mvmn_u16x4: +; RV32: # %bb.0: +; RV32-NEXT: mvmn a1, a3, a5 +; RV32-NEXT: mvmn a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvmn_u16x4: +; RV64: # %bb.0: +; RV64-NEXT: mvmn a0, a1, a2 +; RV64-NEXT: ret + %res = call <4 x i16> @llvm.riscv.pmerge.v4i16(<4 x i16> %rs1, <4 x i16> %rs2, <4 x i16> %rd) + ret <4 x i16> %res +} + +define <4 x i16> @test_pmerge_merge_i16x4(<4 x i16> %rd, <4 x i16> %rs1, <4 x i16> %rs2) { +; RV32-LABEL: test_pmerge_merge_i16x4: +; RV32: # %bb.0: +; RV32-NEXT: merge a1, a3, a5 +; RV32-NEXT: merge a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_merge_i16x4: +; RV64: # %bb.0: +; RV64-NEXT: merge a0, a1, a2 
+; RV64-NEXT: ret + %res = call <4 x i16> @llvm.riscv.pmerge.v4i16(<4 x i16> %rs1, <4 x i16> %rs2, <4 x i16> %rd) + ret <4 x i16> %res +} + +define <4 x i16> @test_pmerge_mvm_i16x4(<4 x i16> %rs1, <4 x i16> %rd, <4 x i16> %rs2) { +; RV32-LABEL: test_pmerge_mvm_i16x4: +; RV32: # %bb.0: +; RV32-NEXT: mvm a1, a5, a3 +; RV32-NEXT: mvm a0, a4, a2 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvm_i16x4: +; RV64: # %bb.0: +; RV64-NEXT: mvm a0, a2, a1 +; RV64-NEXT: ret + %res = call <4 x i16> @llvm.riscv.pmerge.v4i16(<4 x i16> %rs1, <4 x i16> %rs2, <4 x i16> %rd) + ret <4 x i16> %res +} + +define <4 x i16> @test_pmerge_mvmn_i16x4(<4 x i16> %rs2, <4 x i16> %rs1, <4 x i16> %rd) { +; RV32-LABEL: test_pmerge_mvmn_i16x4: +; RV32: # %bb.0: +; RV32-NEXT: mvmn a1, a3, a5 +; RV32-NEXT: mvmn a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvmn_i16x4: +; RV64: # %bb.0: +; RV64-NEXT: mvmn a0, a1, a2 +; RV64-NEXT: ret + %res = call <4 x i16> @llvm.riscv.pmerge.v4i16(<4 x i16> %rs1, <4 x i16> %rs2, <4 x i16> %rd) + ret <4 x i16> %res +} + +define <2 x i32> @test_pmerge_merge_u32x2(<2 x i32> %rd, <2 x i32> %rs1, <2 x i32> %rs2) { +; RV32-LABEL: test_pmerge_merge_u32x2: +; RV32: # %bb.0: +; RV32-NEXT: merge a1, a3, a5 +; RV32-NEXT: merge a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_merge_u32x2: +; RV64: # %bb.0: +; RV64-NEXT: merge a0, a1, a2 +; RV64-NEXT: ret + %res = call <2 x i32> @llvm.riscv.pmerge.v2i32(<2 x i32> %rs1, <2 x i32> %rs2, <2 x i32> %rd) + ret <2 x i32> %res +} + +define <2 x i32> @test_pmerge_mvm_u32x2(<2 x i32> %rs1, <2 x i32> %rd, <2 x i32> %rs2) { +; RV32-LABEL: test_pmerge_mvm_u32x2: +; RV32: # %bb.0: +; RV32-NEXT: mvm a1, a5, a3 +; RV32-NEXT: mvm a0, a4, a2 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvm_u32x2: +; RV64: # %bb.0: +; RV64-NEXT: mvm a0, a2, a1 +; RV64-NEXT: ret + %res = call <2 x i32> @llvm.riscv.pmerge.v2i32(<2 x i32> %rs1, <2 x i32> %rs2, <2 x i32> %rd) + ret <2 x i32> %res +} + +define <2 x i32> 
@test_pmerge_mvmn_u32x2(<2 x i32> %rs2, <2 x i32> %rs1, <2 x i32> %rd) { +; RV32-LABEL: test_pmerge_mvmn_u32x2: +; RV32: # %bb.0: +; RV32-NEXT: mvmn a1, a3, a5 +; RV32-NEXT: mvmn a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvmn_u32x2: +; RV64: # %bb.0: +; RV64-NEXT: mvmn a0, a1, a2 +; RV64-NEXT: ret + %res = call <2 x i32> @llvm.riscv.pmerge.v2i32(<2 x i32> %rs1, <2 x i32> %rs2, <2 x i32> %rd) + ret <2 x i32> %res +} + +define <2 x i32> @test_pmerge_merge_i32x2(<2 x i32> %rd, <2 x i32> %rs1, <2 x i32> %rs2) { +; RV32-LABEL: test_pmerge_merge_i32x2: +; RV32: # %bb.0: +; RV32-NEXT: merge a1, a3, a5 +; RV32-NEXT: merge a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_merge_i32x2: +; RV64: # %bb.0: +; RV64-NEXT: merge a0, a1, a2 +; RV64-NEXT: ret + %res = call <2 x i32> @llvm.riscv.pmerge.v2i32(<2 x i32> %rs1, <2 x i32> %rs2, <2 x i32> %rd) + ret <2 x i32> %res +} + +define <2 x i32> @test_pmerge_mvm_i32x2(<2 x i32> %rs1, <2 x i32> %rd, <2 x i32> %rs2) { +; RV32-LABEL: test_pmerge_mvm_i32x2: +; RV32: # %bb.0: +; RV32-NEXT: mvm a1, a5, a3 +; RV32-NEXT: mvm a0, a4, a2 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvm_i32x2: +; RV64: # %bb.0: +; RV64-NEXT: mvm a0, a2, a1 +; RV64-NEXT: ret + %res = call <2 x i32> @llvm.riscv.pmerge.v2i32(<2 x i32> %rs1, <2 x i32> %rs2, <2 x i32> %rd) + ret <2 x i32> %res +} + +define <2 x i32> @test_pmerge_mvmn_i32x2(<2 x i32> %rs2, <2 x i32> %rs1, <2 x i32> %rd) { +; RV32-LABEL: test_pmerge_mvmn_i32x2: +; RV32: # %bb.0: +; RV32-NEXT: mvmn a1, a3, a5 +; RV32-NEXT: mvmn a0, a2, a4 +; RV32-NEXT: ret +; +; RV64-LABEL: test_pmerge_mvmn_i32x2: +; RV64: # %bb.0: +; RV64-NEXT: mvmn a0, a1, a2 +; RV64-NEXT: ret + %res = call <2 x i32> @llvm.riscv.pmerge.v2i32(<2 x i32> %rs1, <2 x i32> %rs2, <2 x i32> %rd) + ret <2 x i32> %res +} _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
