https://github.com/Fznamznon created https://github.com/llvm/llvm-project/pull/186788
Prior to this patch, kernel generation copied kernel argument types exactly as they are written in the signature of the function attributed with the [[clang::sycl_kernel_entry_point]] attribute. This caused generation of kernels that have reference kernel arguments instead of byval kernel arguments. SYCL 2020 doesn't allow reference kernel arguments, and they don't seem to work with the backends. Arguments to a [[clang::sycl_kernel_entry_point]]-attributed function can be big, so we preserve references during host code generation to avoid performance issues in the SYCL Runtime library implementation, because the same function will be used for the actual kernel argument setting via the sycl_kernel_launch interface. Note that we still need to diagnose references in the user's kernel arguments since they are explicitly not allowed by the SYCL 2020 spec; this task is on the TODO list. This patch simply removes references from types written in the SYCL Runtime library. Assisted-by: claude in test case writing. >From c4cb1cd5b244000a78799d302edb47c41f5d1a99 Mon Sep 17 00:00:00 2001 From: "Podchishchaeva, Mariya" <[email protected]> Date: Mon, 16 Mar 2026 05:48:59 -0700 Subject: [PATCH] [clang][SYCL] Strip references from generated kernel arguments Prior to this patch, kernel generation copied kernel argument types exactly as they are written in the signature of the function attributed with the [[clang::sycl_kernel_entry_point]] attribute. This caused generation of kernels that have reference kernel arguments instead of byval kernel arguments. SYCL 2020 doesn't allow reference kernel arguments, and they don't seem to work with the backends. Arguments to a [[clang::sycl_kernel_entry_point]]-attributed function can be big, so we preserve references during host code generation to avoid performance issues in the SYCL Runtime library implementation, because the same function will be used for the actual kernel argument setting via the sycl_kernel_launch interface. 
Note that we still need to diagnose references in user's kernel arguments since they are explicitly not allowed by SYCL 2020 spec and this task is in TODO list. This patch simply removes references from types written in SYCL Runtime library. Assisted-by: claude in test case writing. --- clang/lib/Sema/SemaSYCL.cpp | 10 +- .../ast-dump-sycl-kernel-call-stmt.cpp | 4 +- .../CodeGenSYCL/kernel-caller-entry-point.cpp | 143 ++++++++++++++++++ 3 files changed, 150 insertions(+), 7 deletions(-) diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index ff8ad61aa3af2..112a6e4416df2 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -608,10 +608,10 @@ class OutlinedFunctionDeclBodyInstantiator ParmDeclMap::iterator I = MapRef.find(PVD); if (I != MapRef.end()) { VarDecl *VD = I->second; - assert(SemaRef.getASTContext().hasSameUnqualifiedType(PVD->getType(), - VD->getType())); - assert(!VD->getType().isMoreQualifiedThan(PVD->getType(), - SemaRef.getASTContext())); + assert(SemaRef.getASTContext().hasSameUnqualifiedType( + PVD->getType().getNonReferenceType(), VD->getType())); + assert(!VD->getType().isMoreQualifiedThan( + PVD->getType().getNonReferenceType(), SemaRef.getASTContext())); VD->setIsUsed(); return DeclRefExpr::Create( SemaRef.getASTContext(), DRE->getQualifierLoc(), @@ -650,7 +650,7 @@ OutlinedFunctionDecl *BuildSYCLKernelEntryPointOutline(Sema &SemaRef, for (ParmVarDecl *PVD : FD->parameters()) { ImplicitParamDecl *IPD = ImplicitParamDecl::Create( SemaRef.getASTContext(), OFD, SourceLocation(), PVD->getIdentifier(), - PVD->getType(), ImplicitParamKind::Other); + PVD->getType().getNonReferenceType(), ImplicitParamKind::Other); OFD->setParam(i, IPD); ParmMap[PVD] = IPD; ++i; diff --git a/clang/test/ASTSYCL/ast-dump-sycl-kernel-call-stmt.cpp b/clang/test/ASTSYCL/ast-dump-sycl-kernel-call-stmt.cpp index 0c9ee05ac7614..9563892398da9 100644 --- a/clang/test/ASTSYCL/ast-dump-sycl-kernel-call-stmt.cpp +++ 
b/clang/test/ASTSYCL/ast-dump-sycl-kernel-call-stmt.cpp @@ -308,12 +308,12 @@ void skep6(const S6 &k) { // CHECK-NEXT: | | | `-CXXConstructExpr {{.*}} 'S6' 'void (const S6 &) noexcept' // CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'const S6' lvalue ParmVar {{.*}} 'k' 'const S6 &' // CHECK-NEXT: | | `-OutlinedFunctionDecl {{.*}} -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit used k 'const S6 &' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} implicit used k 'const S6' // CHECK-NEXT: | | `-CompoundStmt {{.*}} // CHECK-NEXT: | | `-CXXOperatorCallExpr {{.*}} 'void' '()' // CHECK-NEXT: | | |-ImplicitCastExpr {{.*}} 'void (*)() const' <FunctionToPointerDecay> // CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'void () const' lvalue CXXMethod {{.*}} 'operator()' 'void () const' -// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'const S6' lvalue ImplicitParam {{.*}} 'k' 'const S6 &' +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'const S6' lvalue ImplicitParam {{.*}} 'k' 'const S6' // CHECK-NEXT: | `-SYCLKernelEntryPointAttr {{.*}} KN<6> // Parameter types are not required to be complete at the point of a diff --git a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp index 47c2c45ae7749..b44cc4c52588f 100644 --- a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp +++ b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp @@ -68,6 +68,22 @@ struct copyable { ~copyable(); }; +struct ref_arg_kernel_name; + +template <typename KernelName, typename KernelType> +[[clang::sycl_kernel_entry_point(KernelName)]] +void ref_arg_kernel(const KernelType &ref) { + ref(42); +} + +struct rvalue_ref_arg_kernel_name; + +template <typename KernelName, typename KernelType> +[[clang::sycl_kernel_entry_point(KernelName)]] +void rvalue_ref_arg_kernel(KernelType &&ref) { + ref(42); +} + int main() { single_purpose_kernel obj; single_purpose_kernel_task(obj); @@ -78,6 +94,8 @@ int main() { handler h; copyable c{42}; h.kernel_entry_point<struct KN>([=] (int 
a, int b) { return c.i + a + b; }, 1, 2); + ref_arg_kernel<ref_arg_kernel_name>(lambda); + rvalue_ref_arg_kernel<rvalue_ref_arg_kernel_name>(lambda); } // Verify that SYCL kernel caller functions are not emitted during host @@ -87,6 +105,8 @@ int main() { // CHECK-HOST-NOT: define {{.*}} @_ZTSZ4mainEUlT_E_ // CHECK-HOST-NOT: define {{.*}} @"_ZTS6\CE\B4\CF\84\CF\87" // CHECK-HOST-NOT: define {{.*}} @_ZTSZ4mainE2KN +// CHECK-HOST-NOT: define {{.*}} @_ZTS19ref_arg_kernel_name +// CHECK-HOST-NOT: define {{.*}} @_ZTS26rvalue_ref_arg_kernel_name // Verify that sycl_kernel_entry_point attributed functions are not emitted // during device compilation. @@ -94,6 +114,8 @@ int main() { // CHECK-DEVICE-NOT: single_purpose_kernel_task // CHECK-DEVICE-NOT: kernel_single_task // CHECK-DEVICE-NOT: kernel_entry_point +// CHECK-DEVICE-NOT: ref_arg_kernel +// CHECK-DEVICE-NOT: rvalue_ref_arg_kernel // Verify that kernel launch code is generated for sycl_kernel_entry_point // attributed functions during host compilation. 
@@ -101,6 +123,9 @@ int main() { // CHECK-HOST-LINUX: @.str = private unnamed_addr constant [33 x i8] c"_ZTS26single_purpose_kernel_name\00", align 1 // CHECK-HOST-LINUX: @.str.1 = private unnamed_addr constant [18 x i8] c"_ZTSZ4mainEUlT_E_\00", align 1 // CHECK-HOST-LINUX: @.str.2 = private unnamed_addr constant [12 x i8] c"_ZTS6\CE\B4\CF\84\CF\87\00", align 1 +// CHECK-HOST-LINUX: @.str.3 = private unnamed_addr constant [15 x i8] c"_ZTSZ4mainE2KN\00", align 1 +// CHECK-HOST-LINUX: @.str.4 = private unnamed_addr constant [26 x i8] c"_ZTS19ref_arg_kernel_name\00", align 1 +// CHECK-HOST-LINUX: @.str.5 = private unnamed_addr constant [33 x i8] c"_ZTS26rvalue_ref_arg_kernel_name\00", align 1 // // CHECK-HOST-LINUX: define dso_local void @_Z26single_purpose_kernel_task21single_purpose_kernel() #{{[0-9]+}} { // CHECK-HOST-LINUX-NEXT: entry: @@ -151,6 +176,32 @@ int main() { // CHECK-HOST-LINUX-NEXT: call void @_ZZ4mainENUliiE_D1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) %agg.tmp) #{{[0-9]+}} // CHECK-HOST-LINUX-NEXT: ret void // CHECK-HOST-LINUX-NEXT: } +// +// CHECK-HOST-LINUX: define internal void @_Z14ref_arg_kernelI19ref_arg_kernel_nameZ4mainEUlT_E_EvRKT0_(ptr noundef nonnull align 4 dereferenceable(4) %ref) #{{[0-9]+}} { +// CHECK-HOST-LINUX-NEXT: entry: +// CHECK-HOST-LINUX-NEXT: %ref.addr = alloca ptr, align 8 +// CHECK-HOST-LINUX-NEXT: %agg.tmp = alloca %class.anon, align 4 +// CHECK-HOST-LINUX-NEXT: store ptr %ref, ptr %ref.addr, align 8 +// CHECK-HOST-LINUX-NEXT: %0 = load ptr, ptr %ref.addr, align 8 +// CHECK-HOST-LINUX-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %agg.tmp, ptr align 4 %0, i64 4, i1 false) +// CHECK-HOST-LINUX-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %agg.tmp, i32 0, i32 0 +// CHECK-HOST-LINUX-NEXT: %1 = load i32, ptr %coerce.dive, align 4 +// CHECK-HOST-LINUX-NEXT: call void @_Z18sycl_kernel_launchI19ref_arg_kernel_nameJZ4mainEUlT_E_EEvPKcDpT0_(ptr noundef @.str.4, i32 %1) +// 
CHECK-HOST-LINUX-NEXT: ret void +// CHECK-HOST-LINUX-NEXT: } +// +// CHECK-HOST-LINUX: define internal void @_Z21rvalue_ref_arg_kernelI26rvalue_ref_arg_kernel_nameRZ4mainEUlT_E_EvOT0_(ptr noundef nonnull align 4 dereferenceable(4) %ref) #{{[0-9]+}} { +// CHECK-HOST-LINUX-NEXT: entry: +// CHECK-HOST-LINUX-NEXT: %ref.addr = alloca ptr, align 8 +// CHECK-HOST-LINUX-NEXT: %agg.tmp = alloca %class.anon, align 4 +// CHECK-HOST-LINUX-NEXT: store ptr %ref, ptr %ref.addr, align 8 +// CHECK-HOST-LINUX-NEXT: %0 = load ptr, ptr %ref.addr, align 8 +// CHECK-HOST-LINUX-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %agg.tmp, ptr align 4 %0, i64 4, i1 false) +// CHECK-HOST-LINUX-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %agg.tmp, i32 0, i32 0 +// CHECK-HOST-LINUX-NEXT: %1 = load i32, ptr %coerce.dive, align 4 +// CHECK-HOST-LINUX-NEXT: call void @_Z18sycl_kernel_launchI26rvalue_ref_arg_kernel_nameJZ4mainEUlT_E_EEvPKcDpT0_(ptr noundef @.str.5, i32 %1) +// CHECK-HOST-LINUX-NEXT: ret void +// CHECK-HOST-LINUX-NEXT: } // CHECK-HOST-WINDOWS: define dso_local void @"?single_purpose_kernel_task@@YAXUsingle_purpose_kernel@@@Z"(i8 %kernelFunc.coerce) #{{[0-9]+}} { // CHECK-HOST-WINDOWS-NEXT: entry: @@ -214,6 +265,32 @@ int main() { // CHECK-HOST-WINDOWS-NEXT: call void @"??1<lambda_3>@?0??main@@9@QEAA@XZ"(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) %k) #{{[0-9]+}} // CHECK-HOST-WINDOWS-NEXT: ret void // CHECK-HOST-WINDOWS-NEXT: } +// +// CHECK-HOST-WINDOWS: define internal void @"??$ref_arg_kernel@Uref_arg_kernel_name@@V<lambda_1>@?0??main@@9@@@YAXAEBV<lambda_1>@?0??main@@9@@Z"(ptr noundef nonnull align 4 dereferenceable(4) %ref) #{{[0-9]+}} { +// CHECK-HOST-WINDOWS-NEXT: entry: +// CHECK-HOST-WINDOWS-NEXT: %ref.addr = alloca ptr, align 8 +// CHECK-HOST-WINDOWS-NEXT: %agg.tmp = alloca %class.anon, align 4 +// CHECK-HOST-WINDOWS-NEXT: store ptr %ref, ptr %ref.addr, align 8 +// CHECK-HOST-WINDOWS-NEXT: %0 = load ptr, ptr %ref.addr, align 8 +// 
CHECK-HOST-WINDOWS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %agg.tmp, ptr align 4 %0, i64 4, i1 false) +// CHECK-HOST-WINDOWS-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %agg.tmp, i32 0, i32 0 +// CHECK-HOST-WINDOWS-NEXT: %1 = load i32, ptr %coerce.dive, align 4 +// CHECK-HOST-WINDOWS-NEXT: call void @"??$sycl_kernel_launch@Uref_arg_kernel_name@@V<lambda_1>@?0??main@@9@@@YAXPEBDV<lambda_1>@?0??main@@9@@Z"(ptr noundef @"??_C@_0BK@PPDJPOBM@_ZTS19ref_arg_kernel_name?$AA@", i32 %1) +// CHECK-HOST-WINDOWS-NEXT: ret void +// CHECK-HOST-WINDOWS-NEXT: } +// +// CHECK-HOST-WINDOWS: define internal void @"??$rvalue_ref_arg_kernel@Urvalue_ref_arg_kernel_name@@AEAV<lambda_1>@?0??main@@9@@@YAXAEAV<lambda_1>@?0??main@@9@@Z"(ptr noundef nonnull align 4 dereferenceable(4) %ref) #{{[0-9]+}} { +// CHECK-HOST-WINDOWS-NEXT: entry: +// CHECK-HOST-WINDOWS-NEXT: %ref.addr = alloca ptr, align 8 +// CHECK-HOST-WINDOWS-NEXT: %agg.tmp = alloca %class.anon, align 4 +// CHECK-HOST-WINDOWS-NEXT: store ptr %ref, ptr %ref.addr, align 8 +// CHECK-HOST-WINDOWS-NEXT: %0 = load ptr, ptr %ref.addr, align 8 +// CHECK-HOST-WINDOWS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %agg.tmp, ptr align 4 %0, i64 4, i1 false) +// CHECK-HOST-WINDOWS-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %agg.tmp, i32 0, i32 0 +// CHECK-HOST-WINDOWS-NEXT: %1 = load i32, ptr %coerce.dive, align 4 +// CHECK-HOST-WINDOWS-NEXT: call void @"??$sycl_kernel_launch@Urvalue_ref_arg_kernel_name@@V<lambda_1>@?0??main@@9@@@YAXPEBDV<lambda_1>@?0??main@@9@@Z"(ptr noundef @"??_C@_0CB@HCPMABHM@_ZTS26rvalue_ref_arg_kernel_name@", i32 %1) +// CHECK-HOST-WINDOWS-NEXT: ret void +// CHECK-HOST-WINDOWS-NEXT: } // Verify that SYCL kernel caller functions are emitted for each device target. 
// @@ -412,6 +489,72 @@ int main() { // CHECK-SPIRV-NEXT: ret void // CHECK-SPIRV-NEXT: } +// IR for the SYCL kernel caller function generated for ref_arg_kernel with +// ref_arg_kernel_name as the SYCL kernel name type. The reference parameter is +// lowered to a value parameter in the kernel entry point. +// +// CHECK-AMDGCN: define dso_local amdgpu_kernel void @_ZTS19ref_arg_kernel_name +// CHECK-AMDGCN-SAME: (i32 %ref.coerce) #[[AMDGCN_ATTR0]] { +// CHECK-AMDGCN-NEXT: entry: +// CHECK-AMDGCN-NEXT: %ref = alloca %class.anon, align 4, addrspace(5) +// CHECK-AMDGCN-NEXT: %ref1 = addrspacecast ptr addrspace(5) %ref to ptr +// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %ref1, i32 0, i32 0 +// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr %coerce.dive, align 4 +// CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_ +// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref1, i32 noundef 42) #[[AMDGCN_ATTR1]] +// CHECK-AMDGCN-NEXT: ret void +// CHECK-AMDGCN-NEXT: } +// +// CHECK-NVPTX: define dso_local ptx_kernel void @_ZTS19ref_arg_kernel_name +// CHECK-NVPTX-SAME: (ptr noundef byval(%class.anon) align 4 %ref) #[[NVPTX_ATTR0]] { +// CHECK-NVPTX-NEXT: entry: +// CHECK-NVPTX-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_ +// CHECK-NVPTX-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref, i32 noundef 42) #[[NVPTX_ATTR1]] +// CHECK-NVPTX-NEXT: ret void +// CHECK-NVPTX-NEXT: } +// +// CHECK-SPIR: define {{[a-z_ ]*}}spir_kernel void @_ZTS19ref_arg_kernel_name +// CHECK-SPIR-SAME: (ptr noundef byval(%class.anon) align 4 %ref) #[[SPIR_ATTR0]] { +// CHECK-SPIR-NEXT: entry: +// CHECK-SPIR-NEXT: %ref.ascast = addrspacecast ptr %ref to ptr addrspace(4) +// CHECK-SPIR-NEXT: call spir_func void @_ZZ4mainENKUlT_E_clIiEEDaS_ +// CHECK-SPIR-SAME: (ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) %ref.ascast, i32 noundef 42) #[[SPIR_ATTR1]] +// CHECK-SPIR-NEXT: ret void +// CHECK-SPIR-NEXT: } + +// IR for the 
SYCL kernel caller function generated for rvalue_ref_arg_kernel +// with rvalue_ref_arg_kernel_name as the SYCL kernel name type. The rvalue +// reference parameter is lowered to a value parameter in the kernel entry point. +// +// CHECK-AMDGCN: define dso_local amdgpu_kernel void @_ZTS26rvalue_ref_arg_kernel_name +// CHECK-AMDGCN-SAME: (i32 %ref.coerce) #[[AMDGCN_ATTR0]] { +// CHECK-AMDGCN-NEXT: entry: +// CHECK-AMDGCN-NEXT: %ref = alloca %class.anon, align 4, addrspace(5) +// CHECK-AMDGCN-NEXT: %ref1 = addrspacecast ptr addrspace(5) %ref to ptr +// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %ref1, i32 0, i32 0 +// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr %coerce.dive, align 4 +// CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_ +// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref1, i32 noundef 42) #[[AMDGCN_ATTR1]] +// CHECK-AMDGCN-NEXT: ret void +// CHECK-AMDGCN-NEXT: } +// +// CHECK-NVPTX: define dso_local ptx_kernel void @_ZTS26rvalue_ref_arg_kernel_name +// CHECK-NVPTX-SAME: (ptr noundef byval(%class.anon) align 4 %ref) #[[NVPTX_ATTR0]] { +// CHECK-NVPTX-NEXT: entry: +// CHECK-NVPTX-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_ +// CHECK-NVPTX-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref, i32 noundef 42) #[[NVPTX_ATTR1]] +// CHECK-NVPTX-NEXT: ret void +// CHECK-NVPTX-NEXT: } +// +// CHECK-SPIR: define {{[a-z_ ]*}}spir_kernel void @_ZTS26rvalue_ref_arg_kernel_name +// CHECK-SPIR-SAME: (ptr noundef byval(%class.anon) align 4 %ref) #[[SPIR_ATTR0]] { +// CHECK-SPIR-NEXT: entry: +// CHECK-SPIR-NEXT: %ref.ascast = addrspacecast ptr %ref to ptr addrspace(4) +// CHECK-SPIR-NEXT: call spir_func void @_ZZ4mainENKUlT_E_clIiEEDaS_ +// CHECK-SPIR-SAME: (ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) %ref.ascast, i32 noundef 42) #[[SPIR_ATTR1]] +// CHECK-SPIR-NEXT: ret void +// CHECK-SPIR-NEXT: } + // CHECK-AMDGCN: #[[AMDGCN_ATTR0]] = { convergent mustprogress noinline 
norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CHECK-AMDGCN: #[[AMDGCN_ATTR1]] = { convergent nounwind } // _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
