https://github.com/sihuan created https://github.com/llvm/llvm-project/pull/206441
Add the __riscv_predsum/predsumu_* header wrappers over new __builtin_riscv_* builtins, lowering to the llvm.riscv.predsum/predsumu intrinsics. Stacked on #206430 (its commit appears first here); please review/merge that PR first. Once it lands I will rebase so only the Clang commit remains. >From ec949aea717dbf613d46176d09e4ca92d9019b68 Mon Sep 17 00:00:00 2001 From: SiHuaN <[email protected]> Date: Mon, 29 Jun 2026 07:30:05 +0000 Subject: [PATCH 1/2] [RISCV][P-ext] Avoid redundant accumulator extend for reduction sum For a reduction sum with an i32 accumulator on RV64, the result is computed at i64 and truncated, so the accumulator's upper bits are unused. Any-extend it instead of sign-/zero-extending, dropping a redundant sext.w/zext.w. Follow-up to #206004. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +++++- llvm/test/CodeGen/RISCV/rvp-simd-32.ll | 4 ---- llvm/test/CodeGen/RISCV/rvp-simd-64.ll | 4 ---- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ff14ecbd4917c..3a4ec65e4085f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -15861,7 +15861,11 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Vec = DAG.getBitcast(WideVT, Wide); } - SDValue Res = RedSum(MVT::i64, Vec, Ext(N->getOperand(2))); + // The result is truncated to i32, so the accumulator's upper bits are + // unused and need no sign/zero extension. + SDValue Acc = + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2)); + SDValue Res = RedSum(MVT::i64, Vec, Acc); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); return; } diff --git a/llvm/test/CodeGen/RISCV/rvp-simd-32.ll b/llvm/test/CodeGen/RISCV/rvp-simd-32.ll index 36a3430be3a1b..2b62ce13c5003 100644 --- a/llvm/test/CodeGen/RISCV/rvp-simd-32.ll +++ b/llvm/test/CodeGen/RISCV/rvp-simd-32.ll @@ -2471,7 +2471,6 @@ define i32 @test_predsum_i8x4_i32(<4 x i8> %a, i32 %b) { ; ; RV64-LABEL: test_predsum_i8x4_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a1, a1 ; RV64-NEXT: zext.w a0, a0 ; RV64-NEXT: predsum.bs a0, a0, a1 ; RV64-NEXT: ret @@ -2487,7 +2486,6 @@ define i32 @test_predsumu_u8x4_u32(<4 x i8> %a, i32 %b) { ; ; RV64-LABEL: test_predsumu_u8x4_u32: ; RV64: # %bb.0: -; RV64-NEXT: zext.w a1, a1 ; RV64-NEXT: zext.w a0, a0 ; RV64-NEXT: predsumu.bs a0, a0, a1 ; RV64-NEXT: ret @@ -2503,7 +2501,6 @@ define i32 @test_predsum_i16x2_i32(<2 x i16> %a, i32 %b) { ; ; RV64-LABEL: test_predsum_i16x2_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a1, a1 ; RV64-NEXT: zext.w a0, a0 ; RV64-NEXT: predsum.hs a0, a0, a1 ; RV64-NEXT: ret @@ -2519,7 +2516,6 @@ define i32 @test_predsumu_u16x2_u32(<2 x i16> %a, i32 %b) { ; ; RV64-LABEL: test_predsumu_u16x2_u32: ; RV64: # %bb.0: -; RV64-NEXT: zext.w a1, a1 ; RV64-NEXT: zext.w a0, a0 ; RV64-NEXT: predsumu.hs a0, a0, a1 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvp-simd-64.ll b/llvm/test/CodeGen/RISCV/rvp-simd-64.ll index 76838b44a9827..5fefbb394404b 100644 --- a/llvm/test/CodeGen/RISCV/rvp-simd-64.ll +++ b/llvm/test/CodeGen/RISCV/rvp-simd-64.ll @@ -5342,7 +5342,6 @@ define i32 @test_predsum_i8x8_i32(<8 x i8> %a, i32 %b) { ; ; RV64-LABEL: test_predsum_i8x8_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a1, a1 ; RV64-NEXT: predsum.bs a0, a0, a1 ; RV64-NEXT: ret %res = call i32 @llvm.riscv.predsum.i32.v8i8(<8 x i8> %a, i32 %b) @@ -5357,7 +5356,6 @@ define i32 @test_predsumu_u8x8_u32(<8 x i8> %a, i32 %b) { ; ; RV64-LABEL: test_predsumu_u8x8_u32: ; RV64: # %bb.0: -; RV64-NEXT: zext.w a1, a1 ; RV64-NEXT: predsumu.bs a0, a0, a1 ; RV64-NEXT: ret %res = call i32 @llvm.riscv.predsumu.i32.v8i8(<8 x i8> %a, i32 %b) @@ -5404,7 +5402,6 @@ define i32 @test_predsum_i16x4_i32(<4 x i16> %a, i32 %b) { ; ; RV64-LABEL: test_predsum_i16x4_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a1, a1 ; RV64-NEXT: predsum.hs a0, a0, a1 ; RV64-NEXT: ret %res = call i32 @llvm.riscv.predsum.i32.v4i16(<4 x i16> %a, i32 %b) @@ -5419,7 +5416,6 @@ define i32 @test_predsumu_u16x4_u32(<4 x i16> %a, i32 %b) { ; ; RV64-LABEL: test_predsumu_u16x4_u32: ; RV64: # %bb.0: -; RV64-NEXT: zext.w a1, a1 ; RV64-NEXT: predsumu.hs a0, a0, a1 ; RV64-NEXT: ret %res = call i32 @llvm.riscv.predsumu.i32.v4i16(<4 x i16> %a, i32 %b) >From 2b0ce4a9fc8af5fd89ede7f343f79dd43cc25463 Mon Sep 17 00:00:00 2001 From: SiHuaN <[email protected]> Date: Mon, 29 Jun 2026 07:39:36 +0000 Subject: [PATCH 2/2] [Clang][RISCV] packed reduction sum intrinsics Add the __riscv_predsum/predsumu_* header wrappers over new __builtin_riscv_* builtins, lowering to the llvm.riscv.predsum/predsumu intrinsics. --- clang/include/clang/Basic/BuiltinsRISCV.td | 18 ++ clang/lib/CodeGen/TargetBuiltins/RISCV.cpp | 42 +++ clang/lib/Headers/riscv_packed_simd.h | 25 ++ clang/test/CodeGen/RISCV/rvp-intrinsics.c | 254 ++++++++++++++++++ .../riscv_packed_simd.c | 114 ++++++++ 5 files changed, 453 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td index ee20fefadd7c3..3f84528fdca70 100644 --- a/clang/include/clang/Basic/BuiltinsRISCV.td +++ b/clang/include/clang/Basic/BuiltinsRISCV.td @@ -215,6 +215,24 @@ def pabd_i16x4 : RISCVBuiltin<"_Vector<4, unsigned short>(_Vector<4, short>, _Ve def pabdu_u8x8 : RISCVBuiltin<"_Vector<8, unsigned char>(_Vector<8, unsigned char>, _Vector<8, unsigned char>)">; def pabdu_u16x4 : RISCVBuiltin<"_Vector<4, unsigned short>(_Vector<4, unsigned short>, _Vector<4, unsigned short>)">; +// Packed Reduction Sum (32-bit) +def predsum_i8x4_i32 : RISCVBuiltin<"int(_Vector<4, signed char>, int)">; +def predsumu_u8x4_u32 : RISCVBuiltin<"unsigned int(_Vector<4, unsigned char>, unsigned int)">; +def predsum_i16x2_i32 : RISCVBuiltin<"int(_Vector<2, short>, int)">; +def predsumu_u16x2_u32 : RISCVBuiltin<"unsigned int(_Vector<2, unsigned short>, unsigned int)">; + +// Packed Reduction Sum (64-bit) +def predsum_i8x8_i32 : RISCVBuiltin<"int(_Vector<8, signed char>, int)">; +def predsumu_u8x8_u32 : RISCVBuiltin<"unsigned int(_Vector<8, unsigned char>, unsigned int)">; +def predsum_i16x4_i32 : RISCVBuiltin<"int(_Vector<4, short>, int)">; +def predsumu_u16x4_u32 : RISCVBuiltin<"unsigned int(_Vector<4, unsigned short>, unsigned int)">; +def predsum_i8x8_i64 : RISCVBuiltin<"int64_t(_Vector<8, signed char>, int64_t)">; +def predsumu_u8x8_u64 : RISCVBuiltin<"uint64_t(_Vector<8, unsigned char>, uint64_t)">; +def predsum_i16x4_i64 : RISCVBuiltin<"int64_t(_Vector<4, short>, int64_t)">; +def predsumu_u16x4_u64 : RISCVBuiltin<"uint64_t(_Vector<4, unsigned short>, uint64_t)">; +def predsum_i32x2_i64 : RISCVBuiltin<"int64_t(_Vector<2, int>, int64_t)">; +def predsumu_u32x2_u64 : RISCVBuiltin<"uint64_t(_Vector<2, unsigned int>, uint64_t)">; + } // Features = "experimental-p" //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp index d5b027fe5f8fe..bb8fa86e7a564 100644 --- a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp @@ -1327,6 +1327,48 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID, break; } + // Packed Reduction Sum + case RISCV::BI__builtin_riscv_predsum_i8x4_i32: + case RISCV::BI__builtin_riscv_predsum_i16x2_i32: + case RISCV::BI__builtin_riscv_predsum_i8x8_i32: + case RISCV::BI__builtin_riscv_predsum_i16x4_i32: + case RISCV::BI__builtin_riscv_predsum_i8x8_i64: + case RISCV::BI__builtin_riscv_predsum_i16x4_i64: + case RISCV::BI__builtin_riscv_predsum_i32x2_i64: + case RISCV::BI__builtin_riscv_predsumu_u8x4_u32: + case RISCV::BI__builtin_riscv_predsumu_u16x2_u32: + case RISCV::BI__builtin_riscv_predsumu_u8x8_u32: + case RISCV::BI__builtin_riscv_predsumu_u16x4_u32: + case RISCV::BI__builtin_riscv_predsumu_u8x8_u64: + case RISCV::BI__builtin_riscv_predsumu_u16x4_u64: + case RISCV::BI__builtin_riscv_predsumu_u32x2_u64: { + switch (BuiltinID) { + default: + llvm_unreachable("unexpected builtin ID"); + case RISCV::BI__builtin_riscv_predsum_i8x4_i32: + case RISCV::BI__builtin_riscv_predsum_i16x2_i32: + case RISCV::BI__builtin_riscv_predsum_i8x8_i32: + case RISCV::BI__builtin_riscv_predsum_i16x4_i32: + case RISCV::BI__builtin_riscv_predsum_i8x8_i64: + case RISCV::BI__builtin_riscv_predsum_i16x4_i64: + case RISCV::BI__builtin_riscv_predsum_i32x2_i64: + ID = Intrinsic::riscv_predsum; + break; + case RISCV::BI__builtin_riscv_predsumu_u8x4_u32: + case RISCV::BI__builtin_riscv_predsumu_u16x2_u32: + case RISCV::BI__builtin_riscv_predsumu_u8x8_u32: + case RISCV::BI__builtin_riscv_predsumu_u16x4_u32: + case RISCV::BI__builtin_riscv_predsumu_u8x8_u64: + case RISCV::BI__builtin_riscv_predsumu_u16x4_u64: + case RISCV::BI__builtin_riscv_predsumu_u32x2_u64: + ID = Intrinsic::riscv_predsumu; + break; + } + + IntrinsicTypes = {ResultType, Ops[0]->getType()}; + break; + } + // Zk builtins // Zknh diff --git a/clang/lib/Headers/riscv_packed_simd.h b/clang/lib/Headers/riscv_packed_simd.h index 5aa00f1519671..c61e156ca6a7f 100644 --- a/clang/lib/Headers/riscv_packed_simd.h +++ b/clang/lib/Headers/riscv_packed_simd.h @@ -103,6 +103,12 @@ typedef uint32_t uint32x2_t __attribute__((__vector_size__(8))); return (rty)builtin(__rs1, __rs2); \ } +#define __packed_reduction(name, rty, ty, builtin) \ + static __inline__ rty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1, \ + rty __rs2) { \ + return builtin(__rs1, __rs2); \ + } + // clang-format off: macro call sites have no trailing semicolons, which // confuses clang-format into a deeply nested expression. @@ -424,6 +430,24 @@ __packed_binary_builtin_cast(pabd_i16x4, int16x4_t, uint16x4_t, __builtin_riscv_ __packed_binary_builtin_cast(pabdu_u8x8, uint8x8_t, uint8x8_t, __builtin_riscv_pabdu_u8x8) __packed_binary_builtin_cast(pabdu_u16x4, uint16x4_t, uint16x4_t, __builtin_riscv_pabdu_u16x4) +/* Packed Reduction Sum (32-bit) */ +__packed_reduction(predsum_i8x4_i32, int32_t, int8x4_t, __builtin_riscv_predsum_i8x4_i32) +__packed_reduction(predsumu_u8x4_u32, uint32_t, uint8x4_t, __builtin_riscv_predsumu_u8x4_u32) +__packed_reduction(predsum_i16x2_i32, int32_t, int16x2_t, __builtin_riscv_predsum_i16x2_i32) +__packed_reduction(predsumu_u16x2_u32, uint32_t, uint16x2_t, __builtin_riscv_predsumu_u16x2_u32) + +/* Packed Reduction Sum (64-bit) */ +__packed_reduction(predsum_i8x8_i32, int32_t, int8x8_t, __builtin_riscv_predsum_i8x8_i32) +__packed_reduction(predsumu_u8x8_u32, uint32_t, uint8x8_t, __builtin_riscv_predsumu_u8x8_u32) +__packed_reduction(predsum_i16x4_i32, int32_t, int16x4_t, __builtin_riscv_predsum_i16x4_i32) +__packed_reduction(predsumu_u16x4_u32, uint32_t, uint16x4_t, __builtin_riscv_predsumu_u16x4_u32) +__packed_reduction(predsum_i8x8_i64, int64_t, int8x8_t, __builtin_riscv_predsum_i8x8_i64) +__packed_reduction(predsumu_u8x8_u64, uint64_t, uint8x8_t, __builtin_riscv_predsumu_u8x8_u64) +__packed_reduction(predsum_i16x4_i64, int64_t, int16x4_t, __builtin_riscv_predsum_i16x4_i64) +__packed_reduction(predsumu_u16x4_u64, uint64_t, uint16x4_t, __builtin_riscv_predsumu_u16x4_u64) +__packed_reduction(predsum_i32x2_i64, int64_t, int32x2_t, __builtin_riscv_predsum_i32x2_i64) +__packed_reduction(predsumu_u32x2_u64, uint64_t, uint32x2_t, __builtin_riscv_predsumu_u32x2_u64) + // clang-format on #undef __packed_splat2 @@ -443,6 +467,7 @@ __packed_binary_builtin_cast(pabdu_u16x4, uint16x4_t, uint16x4_t, __builtin_risc #undef __packed_cmp #undef __packed_pabs #undef __packed_binary_builtin_cast +#undef __packed_reduction #undef __DEFAULT_FN_ATTRS #if defined(__cplusplus) diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c index d3f153109b904..290f61787ceff 100644 --- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c +++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c @@ -5889,3 +5889,257 @@ uint8x8_t test_pabdu_u8x8(uint8x8_t rs1, uint8x8_t rs2) { uint16x4_t test_pabdu_u16x4(uint16x4_t rs1, uint16x4_t rs2) { return __riscv_pabdu_u16x4(rs1, rs2); } + +/* Packed Reduction Sum (32-bit) */ +// RV32-LABEL: define dso_local i32 @test_predsum_i8x4_i32( +// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8> +// RV32-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i8(<4 x i8> [[TMP0]], i32 [[RS2]]) +// RV32-NEXT: ret i32 [[TMP1]] +// +// RV64-LABEL: define dso_local signext i32 @test_predsum_i8x4_i32( +// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8> +// RV64-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i8(<4 x i8> [[TMP0]], i32 [[RS2]]) +// RV64-NEXT: ret i32 [[TMP1]] +// +int32_t test_predsum_i8x4_i32(int8x4_t rs1, int32_t rs2) { + return __riscv_predsum_i8x4_i32(rs1, rs2); +} + +// RV32-LABEL: define dso_local i32 @test_predsumu_u8x4_u32( +// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8> +// RV32-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i8(<4 x i8> [[TMP0]], i32 [[RS2]]) +// RV32-NEXT: ret i32 [[TMP1]] +// +// RV64-LABEL: define dso_local signext i32 @test_predsumu_u8x4_u32( +// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8> +// RV64-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i8(<4 x i8> [[TMP0]], i32 [[RS2]]) +// RV64-NEXT: ret i32 [[TMP1]] +// +uint32_t test_predsumu_u8x4_u32(uint8x4_t rs1, uint32_t rs2) { + return __riscv_predsumu_u8x4_u32(rs1, rs2); +} + +// RV32-LABEL: define dso_local i32 @test_predsum_i16x2_i32( +// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16> +// RV32-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v2i16(<2 x i16> [[TMP0]], i32 [[RS2]]) +// RV32-NEXT: ret i32 [[TMP1]] +// +// RV64-LABEL: define dso_local signext i32 @test_predsum_i16x2_i32( +// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16> +// RV64-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v2i16(<2 x i16> [[TMP0]], i32 [[RS2]]) +// RV64-NEXT: ret i32 [[TMP1]] +// +int32_t test_predsum_i16x2_i32(int16x2_t rs1, int32_t rs2) { + return __riscv_predsum_i16x2_i32(rs1, rs2); +} + +// RV32-LABEL: define dso_local i32 @test_predsumu_u16x2_u32( +// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16> +// RV32-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v2i16(<2 x i16> [[TMP0]], i32 [[RS2]]) +// RV32-NEXT: ret i32 [[TMP1]] +// +// RV64-LABEL: define dso_local signext i32 @test_predsumu_u16x2_u32( +// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16> +// RV64-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v2i16(<2 x i16> [[TMP0]], i32 [[RS2]]) +// RV64-NEXT: ret i32 [[TMP1]] +// +uint32_t test_predsumu_u16x2_u32(uint16x2_t rs1, uint32_t rs2) { + return __riscv_predsumu_u16x2_u32(rs1, rs2); +} + +/* Packed Reduction Sum (64-bit) */ +// RV32-LABEL: define dso_local i32 @test_predsum_i8x8_i32( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8> +// RV32-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v8i8(<8 x i8> [[TMP0]], i32 [[RS2]]) +// RV32-NEXT: ret i32 [[TMP1]] +// +// RV64-LABEL: define dso_local signext i32 @test_predsum_i8x8_i32( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8> +// RV64-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v8i8(<8 x i8> [[TMP0]], i32 [[RS2]]) +// RV64-NEXT: ret i32 [[TMP1]] +// +int32_t test_predsum_i8x8_i32(int8x8_t rs1, int32_t rs2) { + return __riscv_predsum_i8x8_i32(rs1, rs2); +} + +// RV32-LABEL: define dso_local i32 @test_predsumu_u8x8_u32( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8> +// RV32-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v8i8(<8 x i8> [[TMP0]], i32 [[RS2]]) +// RV32-NEXT: ret i32 [[TMP1]] +// +// RV64-LABEL: define dso_local signext i32 @test_predsumu_u8x8_u32( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8> +// RV64-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v8i8(<8 x i8> [[TMP0]], i32 [[RS2]]) +// RV64-NEXT: ret i32 [[TMP1]] +// +uint32_t test_predsumu_u8x8_u32(uint8x8_t rs1, uint32_t rs2) { + return __riscv_predsumu_u8x8_u32(rs1, rs2); +} + +// RV32-LABEL: define dso_local i32 @test_predsum_i16x4_i32( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV32-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i16(<4 x i16> [[TMP0]], i32 [[RS2]]) +// RV32-NEXT: ret i32 [[TMP1]] +// +// RV64-LABEL: define dso_local signext i32 @test_predsum_i16x4_i32( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV64-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i16(<4 x i16> [[TMP0]], i32 [[RS2]]) +// RV64-NEXT: ret i32 [[TMP1]] +// +int32_t test_predsum_i16x4_i32(int16x4_t rs1, int32_t rs2) { + return __riscv_predsum_i16x4_i32(rs1, rs2); +} + +// RV32-LABEL: define dso_local i32 @test_predsumu_u16x4_u32( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV32-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i16(<4 x i16> [[TMP0]], i32 [[RS2]]) +// RV32-NEXT: ret i32 [[TMP1]] +// +// RV64-LABEL: define dso_local signext i32 @test_predsumu_u16x4_u32( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV64-NEXT: [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i16(<4 x i16> [[TMP0]], i32 [[RS2]]) +// RV64-NEXT: ret i32 [[TMP1]] +// +uint32_t test_predsumu_u16x4_u32(uint16x4_t rs1, uint32_t rs2) { + return __riscv_predsumu_u16x4_u32(rs1, rs2); +} + +// RV32-LABEL: define dso_local i64 @test_predsum_i8x8_i64( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8> +// RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v8i8(<8 x i8> [[TMP0]], i64 [[RS2]]) +// RV32-NEXT: ret i64 [[TMP1]] +// +// RV64-LABEL: define dso_local i64 @test_predsum_i8x8_i64( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8> +// RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v8i8(<8 x i8> [[TMP0]], i64 [[RS2]]) +// RV64-NEXT: ret i64 [[TMP1]] +// +int64_t test_predsum_i8x8_i64(int8x8_t rs1, int64_t rs2) { + return __riscv_predsum_i8x8_i64(rs1, rs2); +} + +// RV32-LABEL: define dso_local i64 @test_predsumu_u8x8_u64( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8> +// RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v8i8(<8 x i8> [[TMP0]], i64 [[RS2]]) +// RV32-NEXT: ret i64 [[TMP1]] +// +// RV64-LABEL: define dso_local i64 @test_predsumu_u8x8_u64( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8> +// RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v8i8(<8 x i8> [[TMP0]], i64 [[RS2]]) +// RV64-NEXT: ret i64 [[TMP1]] +// +uint64_t test_predsumu_u8x8_u64(uint8x8_t rs1, uint64_t rs2) { + return __riscv_predsumu_u8x8_u64(rs1, rs2); +} + +// RV32-LABEL: define dso_local i64 @test_predsum_i16x4_i64( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v4i16(<4 x i16> [[TMP0]], i64 [[RS2]]) +// RV32-NEXT: ret i64 [[TMP1]] +// +// RV64-LABEL: define dso_local i64 @test_predsum_i16x4_i64( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v4i16(<4 x i16> [[TMP0]], i64 [[RS2]]) +// RV64-NEXT: ret i64 [[TMP1]] +// +int64_t test_predsum_i16x4_i64(int16x4_t rs1, int64_t rs2) { + return __riscv_predsum_i16x4_i64(rs1, rs2); +} + +// RV32-LABEL: define dso_local i64 @test_predsumu_u16x4_u64( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v4i16(<4 x i16> [[TMP0]], i64 [[RS2]]) +// RV32-NEXT: ret i64 [[TMP1]] +// +// RV64-LABEL: define dso_local i64 @test_predsumu_u16x4_u64( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16> +// RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v4i16(<4 x i16> [[TMP0]], i64 [[RS2]]) +// RV64-NEXT: ret i64 [[TMP1]] +// +uint64_t test_predsumu_u16x4_u64(uint16x4_t rs1, uint64_t rs2) { + return __riscv_predsumu_u16x4_u64(rs1, rs2); +} + +// RV32-LABEL: define dso_local i64 @test_predsum_i32x2_i64( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32> +// RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v2i32(<2 x i32> [[TMP0]], i64 [[RS2]]) +// RV32-NEXT: ret i64 [[TMP1]] +// +// RV64-LABEL: define dso_local i64 @test_predsum_i32x2_i64( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32> +// RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v2i32(<2 x i32> [[TMP0]], i64 [[RS2]]) +// RV64-NEXT: ret i64 [[TMP1]] +// +int64_t test_predsum_i32x2_i64(int32x2_t rs1, int64_t rs2) { + return __riscv_predsum_i32x2_i64(rs1, rs2); +} + +// RV32-LABEL: define dso_local i64 @test_predsumu_u32x2_u64( +// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV32-NEXT: [[ENTRY:.*:]] +// RV32-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32> +// RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v2i32(<2 x i32> [[TMP0]], i64 [[RS2]]) +// RV32-NEXT: ret i64 [[TMP1]] +// +// RV64-LABEL: define dso_local i64 @test_predsumu_u32x2_u64( +// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] { +// RV64-NEXT: [[ENTRY:.*:]] +// RV64-NEXT: [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32> +// RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v2i32(<2 x i32> [[TMP0]], i64 [[RS2]]) +// RV64-NEXT: ret i64 [[TMP1]] +// +uint64_t test_predsumu_u32x2_u64(uint32x2_t rs1, uint64_t rs2) { + return __riscv_predsumu_u32x2_u64(rs1, rs2); +} diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c index 020a6be70aadb..4b939675cbeb5 100644 --- a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c +++ b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c @@ -1968,3 +1968,117 @@ uint8x8_t test_pabdu_u8x8(uint8x8_t a, uint8x8_t b) { uint16x4_t test_pabdu_u16x4(uint16x4_t a, uint16x4_t b) { return __riscv_pabdu_u16x4(a, b); } + +// CHECK-LABEL: test_predsum_i8x4_i32: +// RV32: predsum.bs +// RV64: zext.w +// RV64: predsum.bs +int32_t test_predsum_i8x4_i32(int8x4_t a, int32_t b) { + return __riscv_predsum_i8x4_i32(a, b); +} + +// CHECK-LABEL: test_predsumu_u8x4_u32: +// RV32: predsumu.bs +// RV64: zext.w +// RV64: predsumu.bs +uint32_t test_predsumu_u8x4_u32(uint8x4_t a, uint32_t b) { + return __riscv_predsumu_u8x4_u32(a, b); +} + +// CHECK-LABEL: test_predsum_i16x2_i32: +// RV32: predsum.hs +// RV64: zext.w +// RV64: predsum.hs +int32_t test_predsum_i16x2_i32(int16x2_t a, int32_t b) { + return __riscv_predsum_i16x2_i32(a, b); +} + +// CHECK-LABEL: test_predsumu_u16x2_u32: +// RV32: predsumu.hs +// RV64: zext.w +// RV64: predsumu.hs +uint32_t test_predsumu_u16x2_u32(uint16x2_t a, uint32_t b) { + return __riscv_predsumu_u16x2_u32(a, b); +} + +// CHECK-LABEL: test_predsum_i8x8_i32: +// RV32: predsum.dbs +// RV64: predsum.bs +int32_t test_predsum_i8x8_i32(int8x8_t a, int32_t b) { + return __riscv_predsum_i8x8_i32(a, b); +} + +// CHECK-LABEL: test_predsumu_u8x8_u32: +// RV32: predsumu.dbs +// RV64: predsumu.bs +uint32_t test_predsumu_u8x8_u32(uint8x8_t a, uint32_t b) { + return __riscv_predsumu_u8x8_u32(a, b); +} + +// CHECK-LABEL: test_predsum_i16x4_i32: +// RV32: predsum.dhs +// RV64: predsum.hs +int32_t test_predsum_i16x4_i32(int16x4_t a, int32_t b) { + return __riscv_predsum_i16x4_i32(a, b); +} + +// CHECK-LABEL: test_predsumu_u16x4_u32: +// RV32: predsumu.dhs +// RV64: predsumu.hs +uint32_t test_predsumu_u16x4_u32(uint16x4_t a, uint32_t b) { + return __riscv_predsumu_u16x4_u32(a, b); +} + +// TODO: The trailing "mvd" is a GPRPair copy inserted because wadda clobbers +// its rd; it may be avoidable (e.g. via convertToThreeAddress). +// CHECK-LABEL: test_predsum_i8x8_i64: +// RV32: predsum.dbs +// RV32: wadda{{[[:space:]]}} +// RV32: mvd +// RV64: predsum.bs +int64_t test_predsum_i8x8_i64(int8x8_t a, int64_t b) { + return __riscv_predsum_i8x8_i64(a, b); +} + +// CHECK-LABEL: test_predsumu_u8x8_u64: +// RV32: predsumu.dbs +// RV32: waddau{{[[:space:]]}} +// RV32: mvd +// RV64: predsumu.bs +uint64_t test_predsumu_u8x8_u64(uint8x8_t a, uint64_t b) { + return __riscv_predsumu_u8x8_u64(a, b); +} + +// CHECK-LABEL: test_predsum_i16x4_i64: +// RV32: predsum.dhs +// RV32: wadda{{[[:space:]]}} +// RV32: mvd +// RV64: predsum.hs +int64_t test_predsum_i16x4_i64(int16x4_t a, int64_t b) { + return __riscv_predsum_i16x4_i64(a, b); +} + +// CHECK-LABEL: test_predsumu_u16x4_u64: +// RV32: predsumu.dhs +// RV32: waddau{{[[:space:]]}} +// RV32: mvd +// RV64: predsumu.hs +uint64_t test_predsumu_u16x4_u64(uint16x4_t a, uint64_t b) { + return __riscv_predsumu_u16x4_u64(a, b); +} + +// CHECK-LABEL: test_predsum_i32x2_i64: +// RV32: wadda{{[[:space:]]}} +// RV32: mvd +// RV64: predsum.ws +int64_t test_predsum_i32x2_i64(int32x2_t a, int64_t b) { + return __riscv_predsum_i32x2_i64(a, b); +} + +// CHECK-LABEL: test_predsumu_u32x2_u64: +// RV32: waddau{{[[:space:]]}} +// RV32: mvd +// RV64: predsumu.ws +uint64_t test_predsumu_u32x2_u64(uint32x2_t a, uint64_t b) { + return __riscv_predsumu_u32x2_u64(a, b); +} _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
