[clang] [llvm] [Clang][RISCV] packed reduction sum intrinsics (PR #206441)

via cfe-commits Mon, 29 Jun 2026 02:29:16 -0700

https://github.com/sihuan created https://github.com/llvm/llvm-project/pull/206441


Add the __riscv_predsum/predsumu_* header wrappers over new
__builtin_riscv_* builtins, lowering to the llvm.riscv.predsum/predsumu
intrinsics.

Stacked on #206430 (its commit appears first here); please review/merge
that PR first. Once it lands I will rebase so only the Clang commit
remains.

>From ec949aea717dbf613d46176d09e4ca92d9019b68 Mon Sep 17 00:00:00 2001
From: SiHuaN <[email protected]>
Date: Mon, 29 Jun 2026 07:30:05 +0000
Subject: [PATCH 1/2] [RISCV][P-ext] Avoid redundant accumulator extend for
 reduction sum

For a reduction sum with an i32 accumulator on RV64, the result is
computed at i64 and truncated, so the accumulator's upper bits are
unused. Any-extend it instead of sign-/zero-extending, dropping a
redundant sext.w/zext.w. Follow-up to #206004.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +++++-
 llvm/test/CodeGen/RISCV/rvp-simd-32.ll      | 4 ----
 llvm/test/CodeGen/RISCV/rvp-simd-64.ll      | 4 ----
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ff14ecbd4917c..3a4ec65e4085f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -15861,7 +15861,11 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
         Vec = DAG.getBitcast(WideVT, Wide);
       }
 
-      SDValue Res = RedSum(MVT::i64, Vec, Ext(N->getOperand(2)));
+      // The result is truncated to i32, so the accumulator's upper bits are
+      // unused and need no sign/zero extension.
+      SDValue Acc =
+          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
+      SDValue Res = RedSum(MVT::i64, Vec, Acc);
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
       return;
     }
diff --git a/llvm/test/CodeGen/RISCV/rvp-simd-32.ll b/llvm/test/CodeGen/RISCV/rvp-simd-32.ll
index 36a3430be3a1b..2b62ce13c5003 100644
--- a/llvm/test/CodeGen/RISCV/rvp-simd-32.ll
+++ b/llvm/test/CodeGen/RISCV/rvp-simd-32.ll
@@ -2471,7 +2471,6 @@ define i32 @test_predsum_i8x4_i32(<4 x i8> %a, i32 %b) {
 ;
 ; RV64-LABEL: test_predsum_i8x4_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a1, a1
 ; RV64-NEXT:    zext.w a0, a0
 ; RV64-NEXT:    predsum.bs a0, a0, a1
 ; RV64-NEXT:    ret
@@ -2487,7 +2486,6 @@ define i32 @test_predsumu_u8x4_u32(<4 x i8> %a, i32 %b) {
 ;
 ; RV64-LABEL: test_predsumu_u8x4_u32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    zext.w a1, a1
 ; RV64-NEXT:    zext.w a0, a0
 ; RV64-NEXT:    predsumu.bs a0, a0, a1
 ; RV64-NEXT:    ret
@@ -2503,7 +2501,6 @@ define i32 @test_predsum_i16x2_i32(<2 x i16> %a, i32 %b) {
 ;
 ; RV64-LABEL: test_predsum_i16x2_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a1, a1
 ; RV64-NEXT:    zext.w a0, a0
 ; RV64-NEXT:    predsum.hs a0, a0, a1
 ; RV64-NEXT:    ret
@@ -2519,7 +2516,6 @@ define i32 @test_predsumu_u16x2_u32(<2 x i16> %a, i32 %b) {
 ;
 ; RV64-LABEL: test_predsumu_u16x2_u32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    zext.w a1, a1
 ; RV64-NEXT:    zext.w a0, a0
 ; RV64-NEXT:    predsumu.hs a0, a0, a1
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvp-simd-64.ll b/llvm/test/CodeGen/RISCV/rvp-simd-64.ll
index 76838b44a9827..5fefbb394404b 100644
--- a/llvm/test/CodeGen/RISCV/rvp-simd-64.ll
+++ b/llvm/test/CodeGen/RISCV/rvp-simd-64.ll
@@ -5342,7 +5342,6 @@ define i32 @test_predsum_i8x8_i32(<8 x i8> %a, i32 %b) {
 ;
 ; RV64-LABEL: test_predsum_i8x8_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a1, a1
 ; RV64-NEXT:    predsum.bs a0, a0, a1
 ; RV64-NEXT:    ret
   %res = call i32 @llvm.riscv.predsum.i32.v8i8(<8 x i8> %a, i32 %b)
@@ -5357,7 +5356,6 @@ define i32 @test_predsumu_u8x8_u32(<8 x i8> %a, i32 %b) {
 ;
 ; RV64-LABEL: test_predsumu_u8x8_u32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    zext.w a1, a1
 ; RV64-NEXT:    predsumu.bs a0, a0, a1
 ; RV64-NEXT:    ret
   %res = call i32 @llvm.riscv.predsumu.i32.v8i8(<8 x i8> %a, i32 %b)
@@ -5404,7 +5402,6 @@ define i32 @test_predsum_i16x4_i32(<4 x i16> %a, i32 %b) {
 ;
 ; RV64-LABEL: test_predsum_i16x4_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a1, a1
 ; RV64-NEXT:    predsum.hs a0, a0, a1
 ; RV64-NEXT:    ret
   %res = call i32 @llvm.riscv.predsum.i32.v4i16(<4 x i16> %a, i32 %b)
@@ -5419,7 +5416,6 @@ define i32 @test_predsumu_u16x4_u32(<4 x i16> %a, i32 %b) {
 ;
 ; RV64-LABEL: test_predsumu_u16x4_u32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    zext.w a1, a1
 ; RV64-NEXT:    predsumu.hs a0, a0, a1
 ; RV64-NEXT:    ret
   %res = call i32 @llvm.riscv.predsumu.i32.v4i16(<4 x i16> %a, i32 %b)

>From 2b0ce4a9fc8af5fd89ede7f343f79dd43cc25463 Mon Sep 17 00:00:00 2001
From: SiHuaN <[email protected]>
Date: Mon, 29 Jun 2026 07:39:36 +0000
Subject: [PATCH 2/2] [Clang][RISCV] packed reduction sum intrinsics

Add the __riscv_predsum/predsumu_* header wrappers over new
__builtin_riscv_* builtins, lowering to the llvm.riscv.predsum/predsumu
intrinsics.
---
 clang/include/clang/Basic/BuiltinsRISCV.td    |  18 ++
 clang/lib/CodeGen/TargetBuiltins/RISCV.cpp    |  42 +++
 clang/lib/Headers/riscv_packed_simd.h         |  25 ++
 clang/test/CodeGen/RISCV/rvp-intrinsics.c     | 254 ++++++++++++++++++
 .../riscv_packed_simd.c                       | 114 ++++++++
 5 files changed, 453 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td
index ee20fefadd7c3..3f84528fdca70 100644
--- a/clang/include/clang/Basic/BuiltinsRISCV.td
+++ b/clang/include/clang/Basic/BuiltinsRISCV.td
@@ -215,6 +215,24 @@ def pabd_i16x4 : RISCVBuiltin<"_Vector<4, unsigned short>(_Vector<4, short>, _Ve
 def pabdu_u8x8 : RISCVBuiltin<"_Vector<8, unsigned char>(_Vector<8, unsigned char>, _Vector<8, unsigned char>)">;
 def pabdu_u16x4 : RISCVBuiltin<"_Vector<4, unsigned short>(_Vector<4, unsigned short>, _Vector<4, unsigned short>)">;
 
+// Packed Reduction Sum (32-bit)
+def predsum_i8x4_i32 : RISCVBuiltin<"int(_Vector<4, signed char>, int)">;
+def predsumu_u8x4_u32 : RISCVBuiltin<"unsigned int(_Vector<4, unsigned char>, unsigned int)">;
+def predsum_i16x2_i32 : RISCVBuiltin<"int(_Vector<2, short>, int)">;
+def predsumu_u16x2_u32 : RISCVBuiltin<"unsigned int(_Vector<2, unsigned short>, unsigned int)">;
+
+// Packed Reduction Sum (64-bit)
+def predsum_i8x8_i32 : RISCVBuiltin<"int(_Vector<8, signed char>, int)">;
+def predsumu_u8x8_u32 : RISCVBuiltin<"unsigned int(_Vector<8, unsigned char>, unsigned int)">;
+def predsum_i16x4_i32 : RISCVBuiltin<"int(_Vector<4, short>, int)">;
+def predsumu_u16x4_u32 : RISCVBuiltin<"unsigned int(_Vector<4, unsigned short>, unsigned int)">;
+def predsum_i8x8_i64 : RISCVBuiltin<"int64_t(_Vector<8, signed char>, int64_t)">;
+def predsumu_u8x8_u64 : RISCVBuiltin<"uint64_t(_Vector<8, unsigned char>, uint64_t)">;
+def predsum_i16x4_i64 : RISCVBuiltin<"int64_t(_Vector<4, short>, int64_t)">;
+def predsumu_u16x4_u64 : RISCVBuiltin<"uint64_t(_Vector<4, unsigned short>, uint64_t)">;
+def predsum_i32x2_i64 : RISCVBuiltin<"int64_t(_Vector<2, int>, int64_t)">;
+def predsumu_u32x2_u64 : RISCVBuiltin<"uint64_t(_Vector<2, unsigned int>, uint64_t)">;
+
 } // Features = "experimental-p"
 
 
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
index d5b027fe5f8fe..bb8fa86e7a564 100644
--- a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp
@@ -1327,6 +1327,48 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
     break;
   }
 
+  // Packed Reduction Sum
+  case RISCV::BI__builtin_riscv_predsum_i8x4_i32:
+  case RISCV::BI__builtin_riscv_predsum_i16x2_i32:
+  case RISCV::BI__builtin_riscv_predsum_i8x8_i32:
+  case RISCV::BI__builtin_riscv_predsum_i16x4_i32:
+  case RISCV::BI__builtin_riscv_predsum_i8x8_i64:
+  case RISCV::BI__builtin_riscv_predsum_i16x4_i64:
+  case RISCV::BI__builtin_riscv_predsum_i32x2_i64:
+  case RISCV::BI__builtin_riscv_predsumu_u8x4_u32:
+  case RISCV::BI__builtin_riscv_predsumu_u16x2_u32:
+  case RISCV::BI__builtin_riscv_predsumu_u8x8_u32:
+  case RISCV::BI__builtin_riscv_predsumu_u16x4_u32:
+  case RISCV::BI__builtin_riscv_predsumu_u8x8_u64:
+  case RISCV::BI__builtin_riscv_predsumu_u16x4_u64:
+  case RISCV::BI__builtin_riscv_predsumu_u32x2_u64: {
+    switch (BuiltinID) {
+    default:
+      llvm_unreachable("unexpected builtin ID");
+    case RISCV::BI__builtin_riscv_predsum_i8x4_i32:
+    case RISCV::BI__builtin_riscv_predsum_i16x2_i32:
+    case RISCV::BI__builtin_riscv_predsum_i8x8_i32:
+    case RISCV::BI__builtin_riscv_predsum_i16x4_i32:
+    case RISCV::BI__builtin_riscv_predsum_i8x8_i64:
+    case RISCV::BI__builtin_riscv_predsum_i16x4_i64:
+    case RISCV::BI__builtin_riscv_predsum_i32x2_i64:
+      ID = Intrinsic::riscv_predsum;
+      break;
+    case RISCV::BI__builtin_riscv_predsumu_u8x4_u32:
+    case RISCV::BI__builtin_riscv_predsumu_u16x2_u32:
+    case RISCV::BI__builtin_riscv_predsumu_u8x8_u32:
+    case RISCV::BI__builtin_riscv_predsumu_u16x4_u32:
+    case RISCV::BI__builtin_riscv_predsumu_u8x8_u64:
+    case RISCV::BI__builtin_riscv_predsumu_u16x4_u64:
+    case RISCV::BI__builtin_riscv_predsumu_u32x2_u64:
+      ID = Intrinsic::riscv_predsumu;
+      break;
+    }
+
+    IntrinsicTypes = {ResultType, Ops[0]->getType()};
+    break;
+  }
+
   // Zk builtins
 
   // Zknh
diff --git a/clang/lib/Headers/riscv_packed_simd.h b/clang/lib/Headers/riscv_packed_simd.h
index 5aa00f1519671..c61e156ca6a7f 100644
--- a/clang/lib/Headers/riscv_packed_simd.h
+++ b/clang/lib/Headers/riscv_packed_simd.h
@@ -103,6 +103,12 @@ typedef uint32_t uint32x2_t __attribute__((__vector_size__(8)));
     return (rty)builtin(__rs1, __rs2);                                         \
   }
 
+#define __packed_reduction(name, rty, ty, builtin)                             \
+  static __inline__ rty __DEFAULT_FN_ATTRS __riscv_##name(ty __rs1,            \
+                                                          rty __rs2) {         \
+    return builtin(__rs1, __rs2);                                              \
+  }
+
 // clang-format off: macro call sites have no trailing semicolons, which
 // confuses clang-format into a deeply nested expression.
 
@@ -424,6 +430,24 @@ __packed_binary_builtin_cast(pabd_i16x4, int16x4_t, uint16x4_t, __builtin_riscv_
 __packed_binary_builtin_cast(pabdu_u8x8, uint8x8_t, uint8x8_t, __builtin_riscv_pabdu_u8x8)
 __packed_binary_builtin_cast(pabdu_u16x4, uint16x4_t, uint16x4_t, __builtin_riscv_pabdu_u16x4)
 
+/* Packed Reduction Sum (32-bit) */
+__packed_reduction(predsum_i8x4_i32, int32_t, int8x4_t, __builtin_riscv_predsum_i8x4_i32)
+__packed_reduction(predsumu_u8x4_u32, uint32_t, uint8x4_t, __builtin_riscv_predsumu_u8x4_u32)
+__packed_reduction(predsum_i16x2_i32, int32_t, int16x2_t, __builtin_riscv_predsum_i16x2_i32)
+__packed_reduction(predsumu_u16x2_u32, uint32_t, uint16x2_t, __builtin_riscv_predsumu_u16x2_u32)
+
+/* Packed Reduction Sum (64-bit) */
+__packed_reduction(predsum_i8x8_i32, int32_t, int8x8_t, __builtin_riscv_predsum_i8x8_i32)
+__packed_reduction(predsumu_u8x8_u32, uint32_t, uint8x8_t, __builtin_riscv_predsumu_u8x8_u32)
+__packed_reduction(predsum_i16x4_i32, int32_t, int16x4_t, __builtin_riscv_predsum_i16x4_i32)
+__packed_reduction(predsumu_u16x4_u32, uint32_t, uint16x4_t, __builtin_riscv_predsumu_u16x4_u32)
+__packed_reduction(predsum_i8x8_i64, int64_t, int8x8_t, __builtin_riscv_predsum_i8x8_i64)
+__packed_reduction(predsumu_u8x8_u64, uint64_t, uint8x8_t, __builtin_riscv_predsumu_u8x8_u64)
+__packed_reduction(predsum_i16x4_i64, int64_t, int16x4_t, __builtin_riscv_predsum_i16x4_i64)
+__packed_reduction(predsumu_u16x4_u64, uint64_t, uint16x4_t, __builtin_riscv_predsumu_u16x4_u64)
+__packed_reduction(predsum_i32x2_i64, int64_t, int32x2_t, __builtin_riscv_predsum_i32x2_i64)
+__packed_reduction(predsumu_u32x2_u64, uint64_t, uint32x2_t, __builtin_riscv_predsumu_u32x2_u64)
+
 // clang-format on
 
 #undef __packed_splat2
@@ -443,6 +467,7 @@ __packed_binary_builtin_cast(pabdu_u16x4, uint16x4_t, uint16x4_t, __builtin_risc
 #undef __packed_cmp
 #undef __packed_pabs
 #undef __packed_binary_builtin_cast
+#undef __packed_reduction
 #undef __DEFAULT_FN_ATTRS
 
 #if defined(__cplusplus)
diff --git a/clang/test/CodeGen/RISCV/rvp-intrinsics.c b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
index d3f153109b904..290f61787ceff 100644
--- a/clang/test/CodeGen/RISCV/rvp-intrinsics.c
+++ b/clang/test/CodeGen/RISCV/rvp-intrinsics.c
@@ -5889,3 +5889,257 @@ uint8x8_t test_pabdu_u8x8(uint8x8_t rs1, uint8x8_t rs2) {
 uint16x4_t test_pabdu_u16x4(uint16x4_t rs1, uint16x4_t rs2) {
   return __riscv_pabdu_u16x4(rs1, rs2);
 }
+
+/* Packed Reduction Sum (32-bit) */
+// RV32-LABEL: define dso_local i32 @test_predsum_i8x4_i32(
+// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i8(<4 x i8> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsum_i8x4_i32(
+// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i8(<4 x i8> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int32_t test_predsum_i8x4_i32(int8x4_t rs1, int32_t rs2) {
+  return __riscv_predsum_i8x4_i32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsumu_u8x4_u32(
+// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i8(<4 x i8> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsumu_u8x4_u32(
+// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <4 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i8(<4 x i8> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint32_t test_predsumu_u8x4_u32(uint8x4_t rs1, uint32_t rs2) {
+  return __riscv_predsumu_u8x4_u32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsum_i16x2_i32(
+// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v2i16(<2 x i16> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsum_i16x2_i32(
+// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v2i16(<2 x i16> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int32_t test_predsum_i16x2_i32(int16x2_t rs1, int32_t rs2) {
+  return __riscv_predsum_i16x2_i32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsumu_u16x2_u32(
+// RV32-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v2i16(<2 x i16> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsumu_u16x2_u32(
+// RV64-SAME: i32 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i32 [[RS1_COERCE]] to <2 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v2i16(<2 x i16> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint32_t test_predsumu_u16x2_u32(uint16x2_t rs1, uint32_t rs2) {
+  return __riscv_predsumu_u16x2_u32(rs1, rs2);
+}
+
+/* Packed Reduction Sum (64-bit) */
+// RV32-LABEL: define dso_local i32 @test_predsum_i8x8_i32(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v8i8(<8 x i8> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsum_i8x8_i32(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v8i8(<8 x i8> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int32_t test_predsum_i8x8_i32(int8x8_t rs1, int32_t rs2) {
+  return __riscv_predsum_i8x8_i32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsumu_u8x8_u32(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v8i8(<8 x i8> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsumu_u8x8_u32(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v8i8(<8 x i8> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint32_t test_predsumu_u8x8_u32(uint8x8_t rs1, uint32_t rs2) {
+  return __riscv_predsumu_u8x8_u32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsum_i16x4_i32(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i16(<4 x i16> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsum_i16x4_i32(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsum.i32.v4i16(<4 x i16> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+int32_t test_predsum_i16x4_i32(int16x4_t rs1, int32_t rs2) {
+  return __riscv_predsum_i16x4_i32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i32 @test_predsumu_u16x4_u32(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i16(<4 x i16> [[TMP0]], i32 [[RS2]])
+// RV32-NEXT:    ret i32 [[TMP1]]
+//
+// RV64-LABEL: define dso_local signext i32 @test_predsumu_u16x4_u32(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i32 noundef signext [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i32 @llvm.riscv.predsumu.i32.v4i16(<4 x i16> [[TMP0]], i32 [[RS2]])
+// RV64-NEXT:    ret i32 [[TMP1]]
+//
+uint32_t test_predsumu_u16x4_u32(uint16x4_t rs1, uint32_t rs2) {
+  return __riscv_predsumu_u16x4_u32(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsum_i8x8_i64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v8i8(<8 x i8> [[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsum_i8x8_i64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v8i8(<8 x i8> [[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int64_t test_predsum_i8x8_i64(int8x8_t rs1, int64_t rs2) {
+  return __riscv_predsum_i8x8_i64(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsumu_u8x8_u64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v8i8(<8 x i8> [[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsumu_u8x8_u64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <8 x i8>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v8i8(<8 x i8> [[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint64_t test_predsumu_u8x8_u64(uint8x8_t rs1, uint64_t rs2) {
+  return __riscv_predsumu_u8x8_u64(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsum_i16x4_i64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v4i16(<4 x i16> [[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsum_i16x4_i64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v4i16(<4 x i16> [[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int64_t test_predsum_i16x4_i64(int16x4_t rs1, int64_t rs2) {
+  return __riscv_predsum_i16x4_i64(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsumu_u16x4_u64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v4i16(<4 x i16> [[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsumu_u16x4_u64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <4 x i16>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v4i16(<4 x i16> [[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint64_t test_predsumu_u16x4_u64(uint16x4_t rs1, uint64_t rs2) {
+  return __riscv_predsumu_u16x4_u64(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsum_i32x2_i64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v2i32(<2 x i32> [[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsum_i32x2_i64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsum.i64.v2i32(<2 x i32> [[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+int64_t test_predsum_i32x2_i64(int32x2_t rs1, int64_t rs2) {
+  return __riscv_predsum_i32x2_i64(rs1, rs2);
+}
+
+// RV32-LABEL: define dso_local i64 @test_predsumu_u32x2_u64(
+// RV32-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV32-NEXT:  [[ENTRY:.*:]]
+// RV32-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32>
+// RV32-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v2i32(<2 x i32> [[TMP0]], i64 [[RS2]])
+// RV32-NEXT:    ret i64 [[TMP1]]
+//
+// RV64-LABEL: define dso_local i64 @test_predsumu_u32x2_u64(
+// RV64-SAME: i64 noundef [[RS1_COERCE:%.*]], i64 noundef [[RS2:%.*]]) #[[ATTR0]] {
+// RV64-NEXT:  [[ENTRY:.*:]]
+// RV64-NEXT:    [[TMP0:%.*]] = bitcast i64 [[RS1_COERCE]] to <2 x i32>
+// RV64-NEXT:    [[TMP1:%.*]] = call i64 @llvm.riscv.predsumu.i64.v2i32(<2 x i32> [[TMP0]], i64 [[RS2]])
+// RV64-NEXT:    ret i64 [[TMP1]]
+//
+uint64_t test_predsumu_u32x2_u64(uint32x2_t rs1, uint64_t rs2) {
+  return __riscv_predsumu_u32x2_u64(rs1, rs2);
+}
diff --git a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
index 020a6be70aadb..4b939675cbeb5 100644
--- a/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
+++ b/cross-project-tests/intrinsic-header-tests/riscv_packed_simd.c
@@ -1968,3 +1968,117 @@ uint8x8_t test_pabdu_u8x8(uint8x8_t a, uint8x8_t b) {
 uint16x4_t test_pabdu_u16x4(uint16x4_t a, uint16x4_t b) {
   return __riscv_pabdu_u16x4(a, b);
 }
+
+// CHECK-LABEL: test_predsum_i8x4_i32:
+// RV32:        predsum.bs
+// RV64:        zext.w
+// RV64:        predsum.bs
+int32_t test_predsum_i8x4_i32(int8x4_t a, int32_t b) {
+  return __riscv_predsum_i8x4_i32(a, b);
+}
+
+// CHECK-LABEL: test_predsumu_u8x4_u32:
+// RV32:        predsumu.bs
+// RV64:        zext.w
+// RV64:        predsumu.bs
+uint32_t test_predsumu_u8x4_u32(uint8x4_t a, uint32_t b) {
+  return __riscv_predsumu_u8x4_u32(a, b);
+}
+
+// CHECK-LABEL: test_predsum_i16x2_i32:
+// RV32:        predsum.hs
+// RV64:        zext.w
+// RV64:        predsum.hs
+int32_t test_predsum_i16x2_i32(int16x2_t a, int32_t b) {
+  return __riscv_predsum_i16x2_i32(a, b);
+}
+
+// CHECK-LABEL: test_predsumu_u16x2_u32:
+// RV32:        predsumu.hs
+// RV64:        zext.w
+// RV64:        predsumu.hs
+uint32_t test_predsumu_u16x2_u32(uint16x2_t a, uint32_t b) {
+  return __riscv_predsumu_u16x2_u32(a, b);
+}
+
+// CHECK-LABEL: test_predsum_i8x8_i32:
+// RV32:        predsum.dbs
+// RV64:        predsum.bs
+int32_t test_predsum_i8x8_i32(int8x8_t a, int32_t b) {
+  return __riscv_predsum_i8x8_i32(a, b);
+}
+
+// CHECK-LABEL: test_predsumu_u8x8_u32:
+// RV32:        predsumu.dbs
+// RV64:        predsumu.bs
+uint32_t test_predsumu_u8x8_u32(uint8x8_t a, uint32_t b) {
+  return __riscv_predsumu_u8x8_u32(a, b);
+}
+
+// CHECK-LABEL: test_predsum_i16x4_i32:
+// RV32:        predsum.dhs
+// RV64:        predsum.hs
+int32_t test_predsum_i16x4_i32(int16x4_t a, int32_t b) {
+  return __riscv_predsum_i16x4_i32(a, b);
+}
+
+// CHECK-LABEL: test_predsumu_u16x4_u32:
+// RV32:        predsumu.dhs
+// RV64:        predsumu.hs
+uint32_t test_predsumu_u16x4_u32(uint16x4_t a, uint32_t b) {
+  return __riscv_predsumu_u16x4_u32(a, b);
+}
+
+// TODO: The trailing "mvd" is a GPRPair copy inserted because wadda clobbers
+// its rd; it may be avoidable (e.g. via convertToThreeAddress).
+// CHECK-LABEL: test_predsum_i8x8_i64:
+// RV32:        predsum.dbs
+// RV32:        wadda{{[[:space:]]}}
+// RV32:        mvd
+// RV64:        predsum.bs
+int64_t test_predsum_i8x8_i64(int8x8_t a, int64_t b) {
+  return __riscv_predsum_i8x8_i64(a, b);
+}
+
+// CHECK-LABEL: test_predsumu_u8x8_u64:
+// RV32:        predsumu.dbs
+// RV32:        waddau{{[[:space:]]}}
+// RV32:        mvd
+// RV64:        predsumu.bs
+uint64_t test_predsumu_u8x8_u64(uint8x8_t a, uint64_t b) {
+  return __riscv_predsumu_u8x8_u64(a, b);
+}
+
+// CHECK-LABEL: test_predsum_i16x4_i64:
+// RV32:        predsum.dhs
+// RV32:        wadda{{[[:space:]]}}
+// RV32:        mvd
+// RV64:        predsum.hs
+int64_t test_predsum_i16x4_i64(int16x4_t a, int64_t b) {
+  return __riscv_predsum_i16x4_i64(a, b);
+}
+
+// CHECK-LABEL: test_predsumu_u16x4_u64:
+// RV32:        predsumu.dhs
+// RV32:        waddau{{[[:space:]]}}
+// RV32:        mvd
+// RV64:        predsumu.hs
+uint64_t test_predsumu_u16x4_u64(uint16x4_t a, uint64_t b) {
+  return __riscv_predsumu_u16x4_u64(a, b);
+}
+
+// CHECK-LABEL: test_predsum_i32x2_i64:
+// RV32:        wadda{{[[:space:]]}}
+// RV32:        mvd
+// RV64:        predsum.ws
+int64_t test_predsum_i32x2_i64(int32x2_t a, int64_t b) {
+  return __riscv_predsum_i32x2_i64(a, b);
+}
+
+// CHECK-LABEL: test_predsumu_u32x2_u64:
+// RV32:        waddau{{[[:space:]]}}
+// RV32:        mvd
+// RV64:        predsumu.ws
+uint64_t test_predsumu_u32x2_u64(uint32x2_t a, uint64_t b) {
+  return __riscv_predsumu_u32x2_u64(a, b);
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [Clang][RISCV] packed reduction sum intrinsics (PR #206441)

Reply via email to