https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/134892
Due to a previous workaround allowing kernels to be called from other functions, Clang currently doesn't use the `byref` attribute for aggregate kernel arguments. The issue was recently resolved in https://github.com/llvm/llvm-project/pull/115821. With that fix, we can now enable the use of `byref` consistently across all languages. Co-authored-by: Matt Arsenault <matthew.arsena...@amd.com> >From 3fb5c3926514abc87b8c0208df1e35e1d1079752 Mon Sep 17 00:00:00 2001 From: Shilei Tian <i...@tianshilei.me> Date: Tue, 8 Apr 2025 13:39:58 -0400 Subject: [PATCH] [Clang][OpenCL][AMDGPU] Use `byref` for OpenCL kernel arguments Due to a previous workaround allowing kernels to be called from other functions, Clang currently doesn't use the `byref` attribute for aggregate kernel arguments. The issue was recently resolved in https://github.com/llvm/llvm-project/pull/115821. With that fix, we can now enable the use of `byref` consistently across all languages. Co-authored-by: Matt Arsenault <matthew.arsena...@amd.com> --- clang/lib/CodeGen/Targets/AMDGPU.cpp | 6 +- .../CodeGenOpenCL/addr-space-struct-arg.cl | 156 ++++++------------ .../amdgpu-abi-struct-arg-byref.cl | 56 +++---- .../CodeGenOpenCL/amdgpu-abi-struct-coerce.cl | 24 ++- .../test/CodeGenOpenCL/opencl-kernel-call.cl | 70 +++----- 5 files changed, 116 insertions(+), 196 deletions(-) diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index db2a2c5740646..bcf039d9f268a 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -198,14 +198,10 @@ ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const { /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device)); } - // FIXME: Should also use this for OpenCL, but it requires addressing the - // problem of kernels being called. - // // FIXME: This doesn't apply the optimization of coercing pointers in structs // to global address space when using byref. This would require implementing a // new kind of coercion of the in-memory type when for indirect arguments. - if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy && - isAggregateTypeForABI(Ty)) { + if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) { return ABIArgInfo::getIndirectAliased( getContext().getTypeAlignInChars(Ty), getContext().getTargetAddressSpace(LangAS::opencl_constant), diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl index 789aae7a5c34c..49604c6c5e61b 100644 --- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -546,12 +546,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false) // AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // @@ -596,20 +594,15 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 -// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // // @@ -630,15 +623,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false) // AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // @@ -868,15 +856,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN20-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) -// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]] +// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN20-NEXT: [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5) +// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR4]] // AMDGCN20-NEXT: ret void // // @@ -927,21 +913,16 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN20-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN20-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8 -// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 -// AMDGCN20-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] +// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) +// AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +// AMDGCN20-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// AMDGCN20-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[TMP3]], align 8 +// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]] // AMDGCN20-NEXT: ret void // // @@ -963,18 +944,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN20-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN20-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN20-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8 -// AMDGCN20-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) -// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]] +// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN20-NEXT: [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5) +// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR4]] // AMDGCN20-NEXT: ret void // // @@ -1408,12 +1384,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] // AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-GVAR-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false) // AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] // AMDGCN30-GVAR-NEXT: ret void // @@ -1458,20 +1432,15 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN30-GVAR-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] // AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]] // AMDGCN30-GVAR-NEXT: ret void // // @@ -1492,15 +1461,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] // AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-GVAR-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN30-GVAR-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false) // AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] // AMDGCN30-GVAR-NEXT: ret void // @@ -1699,12 +1663,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] // AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false) // AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // @@ -1749,20 +1711,15 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN30-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] // AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN30-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 -// AMDGCN30-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) +// AMDGCN30-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN30-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8 +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // // @@ -1783,15 +1740,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] // AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN30-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false) // AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl index 6dc488c40da7f..7d0a66bac1469 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 --include-generated-funcs +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5 // RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -check-prefix=AMDGCN %s typedef int int2 __attribute__((ext_vector_type(2))); @@ -330,15 +330,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]] +// AMDGCN-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5) +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // // @@ -389,21 +387,16 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8 -// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 -// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] +// AMDGCN-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[TMP3]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // // @@ -425,18 +418,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // // // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8 -// AMDGCN-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]] +// AMDGCN-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN-NEXT: [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr addrspace(5) +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U_ASCAST]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // // diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl index 4d09fc3ffb70b..06d3cdb01deb2 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl @@ -214,7 +214,8 @@ typedef struct struct_4regs int w; } struct_4regs; -// CHECK: void @kernel_empty_struct_arg(%struct.empty_struct %s.coerce) +// CHECK: void @kernel_empty_struct_arg(ptr addrspace(4) noundef readnone byref(%struct.empty_struct) align 1 captures(none) {{%.+}}) +// CHECK: void @__clang_ocl_kern_imp_kernel_empty_struct_arg() __kernel void kernel_empty_struct_arg(empty_struct s) { } // CHECK: void @kernel_single_element_struct_arg(i32 %arg1.coerce) @@ -223,28 +224,35 @@ __kernel void kernel_single_element_struct_arg(single_element_struct_arg_t arg1) // CHECK: void @kernel_nested_single_element_struct_arg(i32 %arg1.coerce) __kernel void kernel_nested_single_element_struct_arg(nested_single_element_struct_arg_t arg1) { } -// CHECK: void @kernel_struct_arg(%struct.struct_arg %arg1.coerce) +// CHECK: void @kernel_struct_arg(ptr addrspace(4) noundef readonly byref(%struct.struct_arg) align 4 captures(none) {{%.+}}) +// CHECK: void @__clang_ocl_kern_imp_kernel_struct_arg(i32 %arg1.coerce0, float %arg1.coerce1, i32 %arg1.coerce2) __kernel void kernel_struct_arg(struct_arg_t arg1) { } -// CHECK: void @kernel_struct_padding_arg(%struct.struct_padding_arg %arg1.coerce) +// CHECK: void @kernel_struct_padding_arg(ptr addrspace(4) noundef readonly byref(%struct.struct_padding_arg) align 8 captures(none) {{%.+}}) +// CHECK: void @__clang_ocl_kern_imp_kernel_struct_padding_arg(i8 %arg1.coerce0, i64 %arg1.coerce1) __kernel void kernel_struct_padding_arg(struct_padding_arg arg1) { } -// CHECK: void @kernel_test_struct_of_arrays_arg(%struct.struct_of_arrays_arg %arg1.coerce) +// CHECK: void @kernel_test_struct_of_arrays_arg(ptr addrspace(4) noundef readonly byref(%struct.struct_of_arrays_arg) align 4 captures(none) {{%.+}}) +// CHECK: void @__clang_ocl_kern_imp_kernel_test_struct_of_arrays_arg([2 x i32] %arg1.coerce0, float %arg1.coerce1, [4 x i32] %arg1.coerce2, [3 x float] %arg1.coerce3, i32 %arg1.coerce4) __kernel void kernel_test_struct_of_arrays_arg(struct_of_arrays_arg_t arg1) { } -// CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg %arg1.coerce) +// CHECK: void @kernel_struct_of_structs_arg(ptr addrspace(4) noundef readonly byref(%struct.struct_of_structs_arg) align 4 captures(none) {{%.+}}) +// CHECK: void @__clang_ocl_kern_imp_kernel_struct_of_structs_arg(i32 %arg1.coerce0, float %arg1.coerce1, %struct.struct_arg %arg1.coerce2, i32 %arg1.coerce3) __kernel void kernel_struct_of_structs_arg(struct_of_structs_arg_t arg1) { } // CHECK: void @test_kernel_transparent_union_arg(i32 %u.coerce) __kernel void test_kernel_transparent_union_arg(transparent_u u) { } -// CHECK: void @kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg %arg1.coerce) +// CHECK: void @kernel_single_array_element_struct_arg(ptr addrspace(4) noundef readonly byref(%struct.single_array_element_struct_arg) align 4 captures(none) {{%.+}}) +// CHECK: void @__clang_ocl_kern_imp_kernel_single_array_element_struct_arg([4 x i32] %arg1.coerce) __kernel void kernel_single_array_element_struct_arg(single_array_element_struct_arg_t arg1) { } -// CHECK: void @kernel_single_struct_element_struct_arg(%struct.single_struct_element_struct_arg %arg1.coerce) +// CHECK: void @kernel_single_struct_element_struct_arg(ptr addrspace(4) noundef readonly byref(%struct.single_struct_element_struct_arg) align 8 captures(none) {{%.+}}) +// CHECK: void @__clang_ocl_kern_imp_kernel_single_struct_element_struct_arg(%struct.inner %arg1.coerce) __kernel void kernel_single_struct_element_struct_arg(single_struct_element_struct_arg_t arg1) { } -// CHECK: void @kernel_different_size_type_pair_arg(%struct.different_size_type_pair %arg1.coerce) +// CHECK: void @kernel_different_size_type_pair_arg(ptr addrspace(4) noundef readonly byref(%struct.different_size_type_pair) align 8 captures(none) {{%.+}}) +// CHECK: void @__clang_ocl_kern_imp_kernel_different_size_type_pair_arg(i64 %arg1.coerce0, i32 %arg1.coerce1) __kernel void kernel_different_size_type_pair_arg(different_size_type_pair arg1) { } // CHECK: define{{.*}} void @func_f32_arg(float noundef %arg) diff --git a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl index cdbe510b723b9..a5b2bee127bd0 100644 --- a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl +++ b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl @@ -676,12 +676,10 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false) // AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // @@ -698,20 +696,15 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 -// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // @@ -734,15 +727,10 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: entry: // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false) // AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // @@ -815,28 +803,23 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern2( -// AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], [[STRUCT_STRUCTTWOMEMBER:%.*]] [[STRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { +// AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { // AMDGCN-NEXT: entry: // AMDGCN-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 // AMDGCN-NEXT: store <2 x i32> [[STRUCTONEMEM_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[STRUCTTWOMEM_COERCE]], 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[STRUCTTWOMEM_COERCE]], 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[STRUCTTWOMEM]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) // AMDGCN-NEXT: store ptr addrspace(1) [[GLOBAL_STRUCTONEMEM]], ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8 // AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP8]], align 8 -// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_caller_kern2(<2 x i32> [[TMP5]], ptr addrspace(1) noundef align 8 [[TMP4]], <2 x i32> [[TMP7]], <2 x i32> [[TMP9]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], align 8 +// AMDGCN-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_caller_kern2(<2 x i32> [[TMP2]], ptr addrspace(1) noundef align 8 [[TMP1]], <2 x i32> [[TMP4]], <2 x i32> [[TMP6]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // // @@ -875,19 +858,12 @@ kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct // // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone // AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern3( -// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[LARGESTRUCTONEMEM_COERCE:%.*]], [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[LARGESTRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { +// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]], ptr addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { // AMDGCN-NEXT: entry: // AMDGCN-NEXT: [[LARGESTRUCTONEMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[LARGESTRUCTONEMEM]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[LARGESTRUCTONEMEM_COERCE]], 0 -// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[LARGESTRUCTTWOMEM]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[LARGESTRUCTTWOMEM_COERCE]], 0 -// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[LARGESTRUCTTWOMEM]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[LARGESTRUCTTWOMEM_COERCE]], 1 -// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP5]], ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM]], ptr addrspace(4) align 8 [[TMP1]], i64 480, i1 false) // AMDGCN-NEXT: call void @__clang_ocl_kern_imp_caller_kern3(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]] // AMDGCN-NEXT: ret void // _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits