https://github.com/16srivarshitha created
https://github.com/llvm/llvm-project/pull/188004
Related: #175871, #179278
When `-fgpu-default-stream=per-thread` is specified, CUDA and HIP kernels
should be launched using the per-thread stream variants of the launch API
instead of the default `cudaLaunchKernel`/`hipLaunchKernel`.
This PR implements that by selecting the correct launch function name in
`emitDeviceStubBodyNew`:
For CUDA: `cudaLaunchKernel_ptsz`
For HIP: `hipLaunchKernel_spt`
This matches the behavior of the OG CodeGen implementation in `CGCUDANV.cpp`
and resolves the `errorNYI("CUDA/HIP Stream per thread")` that was previously
hit when this stream mode was requested.
The existing kernel launch infrastructure (the `__cudaPushCallConfiguration` /
`__cudaPopCallConfiguration` flow and device stub calls) was already upstream -
this PR completes the kernel launch calls section of the CUDA/HIP tracking
issue by adding the missing stream-per-thread support.
Tested locally with FileCheck for CUDA-NEW, HIP-NEW, CUDA-PTH, HIP-PTH, and
DEVICE check prefixes.
>From 213a8c93c99572047e4c4c7f01eca376d326a939 Mon Sep 17 00:00:00 2001
From: 16srivarshitha <[email protected]>
Date: Mon, 23 Mar 2026 14:54:30 +0530
Subject: [PATCH] [CIR][CUDA][HIP] Support stream per thread kernel launch
---
clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp | 10 +++++++---
clang/test/CIR/CodeGenCUDA/kernel-call.cu | 12 ++++++++++++
2 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
index 8b8e99023eceb..1bae5e470aadd 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
@@ -150,10 +150,14 @@ void
CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
// The default stream is usually stream 0 (the legacy default stream).
// For per-thread default stream, we need a different LaunchKernel function.
- StringRef kernelLaunchAPI = "LaunchKernel";
+ std::string kernelLaunchAPI = "LaunchKernel";
if (cgm.getLangOpts().GPUDefaultStream ==
- LangOptions::GPUDefaultStreamKind::PerThread)
- cgm.errorNYI("CUDA/HIP Stream per thread");
+ LangOptions::GPUDefaultStreamKind::PerThread) {
+ if (cgm.getLangOpts().HIP)
+ kernelLaunchAPI += "_spt";
+ else if (cgm.getLangOpts().CUDA)
+ kernelLaunchAPI += "_ptsz";
+ }
std::string launchKernelName = addPrefixToName(kernelLaunchAPI);
const IdentifierInfo &launchII =
diff --git a/clang/test/CIR/CodeGenCUDA/kernel-call.cu
b/clang/test/CIR/CodeGenCUDA/kernel-call.cu
index 2d37b6eef73af..230bcdfe6e22c 100644
--- a/clang/test/CIR/CodeGenCUDA/kernel-call.cu
+++ b/clang/test/CIR/CodeGenCUDA/kernel-call.cu
@@ -14,6 +14,14 @@
// RUN: -emit-cir %s -x cuda -fcuda-is-device -o %t.device.cir
// RUN: FileCheck --input-file=%t.device.cir %s --check-prefix=DEVICE
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \
+// RUN: -fgpu-default-stream=per-thread -DCUDA_API_PER_THREAD_DEFAULT_STREAM
\
+// RUN: -emit-cir %s -x cuda -o - | FileCheck %s --check-prefix=CUDA-PTH
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fhip-new-launch-api \
+// RUN: -fgpu-default-stream=per-thread -DHIP_API_PER_THREAD_DEFAULT_STREAM \
+// RUN: -emit-cir %s -x hip -o - | FileCheck %s --check-prefix=HIP-PTH
+
#include "Inputs/cuda.h"
@@ -55,6 +63,8 @@
// Check cudaLaunchKernel is called with all 6 arguments:
// func ptr, gridDim, blockDim, args, sharedMem, stream
// CUDA-NEW: cir.call @cudaLaunchKernel({{.*}}) : (!cir.ptr<!void>{{.*}},
!rec_dim3, !rec_dim3, !cir.ptr<!cir.ptr<!void>>{{.*}}, !u64i{{.*}},
!cir.ptr<!rec_cudaStream>{{.*}}) -> (!u32i {llvm.noundef})
+// CUDA-PTH: cir.call @cudaLaunchKernel_ptsz
+
//
// HIP-NEW: cir.global constant external @_Z6kernelif =
#cir.global_view<@_Z21__device_stub__kernelif> : !cir.func<(!s32i, !cir.float)>
// HIP-NEW-LABEL: cir.func {{.*}} @_Z21__device_stub__kernelif
@@ -62,6 +72,8 @@
// HIP-NEW: cir.call @__hipPopCallConfiguration({{.*}}) :
(!cir.ptr<!rec_dim3>, !cir.ptr<!rec_dim3>, !cir.ptr<!u64i>,
!cir.ptr<!cir.ptr<!rec_hipStream>>) -> !s32i
// HIP-NEW: cir.get_global @_Z6kernelif : !cir.ptr<!cir.func<(!s32i,
!cir.float)>>
// HIP-NEW: cir.call @hipLaunchKernel({{.*}}) : (!cir.ptr<!void> {{.*}},
!rec_dim3, !rec_dim3, !cir.ptr<!cir.ptr<!void>>{{.*}}, !u64i{{.*}},
!cir.ptr<!rec_hipStream>{{.*}}) -> (!u32i {llvm.noundef})
+// HIP-PTH: cir.call @hipLaunchKernel_spt
+
__global__ void kernel(int x, float y) {}
// ===----------------------------------------------------------------------===
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits