https://github.com/shiltian created 
https://github.com/llvm/llvm-project/pull/134892

Due to a previous workaround allowing kernels to be called from other functions,
Clang currently doesn't use the `byref` attribute for aggregate kernel
arguments. The issue was recently resolved in
https://github.com/llvm/llvm-project/pull/115821. With that fix, we can now
enable the use of `byref` consistently across all languages.

Co-authored-by: Matt Arsenault <matthew.arsena...@amd.com>

>From 3fb5c3926514abc87b8c0208df1e35e1d1079752 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i...@tianshilei.me>
Date: Tue, 8 Apr 2025 13:39:58 -0400
Subject: [PATCH] [Clang][OpenCL][AMDGPU] Use `byref` for OpenCL kernel
 arguments

Due to a previous workaround allowing kernels to be called from other functions,
Clang currently doesn't use the `byref` attribute for aggregate kernel
arguments. The issue was recently resolved in
https://github.com/llvm/llvm-project/pull/115821. With that fix, we can now
enable the use of `byref` consistently across all languages.

Co-authored-by: Matt Arsenault <matthew.arsena...@amd.com>
---
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |   6 +-
 .../CodeGenOpenCL/addr-space-struct-arg.cl    | 156 ++++++------------
 .../amdgpu-abi-struct-arg-byref.cl            |  56 +++----
 .../CodeGenOpenCL/amdgpu-abi-struct-coerce.cl |  24 ++-
 .../test/CodeGenOpenCL/opencl-kernel-call.cl  |  70 +++-----
 5 files changed, 116 insertions(+), 196 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index db2a2c5740646..bcf039d9f268a 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -198,14 +198,10 @@ ABIArgInfo 
AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
         /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
   }
 
-  // FIXME: Should also use this for OpenCL, but it requires addressing the
-  // problem of kernels being called.
-  //
   // FIXME: This doesn't apply the optimization of coercing pointers in structs
   // to global address space when using byref. This would require implementing 
a
   // new kind of coercion of the in-memory type when for indirect arguments.
-  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
-      isAggregateTypeForABI(Ty)) {
+  if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) {
     return ABIArgInfo::getIndirectAliased(
         getContext().getTypeAlignInChars(Ty),
         getContext().getTargetAddressSpace(LangAS::opencl_constant),
diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl 
b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
index 789aae7a5c34c..49604c6c5e61b 100644
--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
+++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
@@ -546,12 +546,10 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 
8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] 
[[U_COERCE]], 0
-// AMDGCN-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) 
[[TMP0]], align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) 
#[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
@@ -596,20 +594,15 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) 
align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] 
!kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] 
!kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, 
addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[U_COERCE]], 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[U_COERCE]], 1
-// AMDGCN-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], 
align 8
-// AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], 
align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> 
[[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], 
align 8
+// AMDGCN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], 
align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> 
[[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -630,15 +623,10 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 
8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] 
[[U_COERCE]], 0
-// AMDGCN-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], 
align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] 
[[U_COERCE]], 1
-// AMDGCN-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], 
align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) 
#[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
@@ -868,15 +856,13 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN20-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 
8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = extractvalue 
[[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
-// AMDGCN20-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr 
addrspace(5)
-// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 
[[U1_ASCAST]]) #[[ATTR4]]
+// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], 
align 8, addrspace(5)
+// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to 
ptr
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr 
addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN20-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr 
addrspace(5)
+// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 
[[U_ASCAST]]) #[[ATTR4]]
 // AMDGCN20-NEXT:    ret void
 //
 //
@@ -927,21 +913,16 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN20-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN20-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, 
addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[U_COERCE]], 0
-// AMDGCN20-NEXT:    store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN20-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[U_COERCE]], 1
-// AMDGCN20-NEXT:    store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8
-// AMDGCN20-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN20-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8
-// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> 
[[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 
8, addrspace(5)
+// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to 
ptr
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr 
addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN20-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 0
+// AMDGCN20-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+// AMDGCN20-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// AMDGCN20-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[TMP3]], align 8
+// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> 
[[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]]
 // AMDGCN20-NEXT:    ret void
 //
 //
@@ -963,18 +944,13 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN20-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN20-NEXT:  [[ENTRY:.*:]]
-// AMDGCN20-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 
8, addrspace(5)
-// AMDGCN20-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN20-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN20-NEXT:    [[TMP1:%.*]] = extractvalue 
[[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN20-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN20-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN20-NEXT:    [[TMP3:%.*]] = extractvalue 
[[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN20-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN20-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr 
addrspace(5)
-// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 
[[U1_ASCAST]]) #[[ATTR4]]
+// AMDGCN20-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], 
align 8, addrspace(5)
+// AMDGCN20-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to 
ptr
+// AMDGCN20-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr 
addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN20-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr 
addrspace(5)
+// AMDGCN20-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 
[[U_ASCAST]]) #[[ATTR4]]
 // AMDGCN20-NEXT:    ret void
 //
 //
@@ -1408,12 +1384,10 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void 
@KernelLargeOneMember(
-// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN30-GVAR-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-GVAR-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], 
align 8, addrspace(5)
-// AMDGCN30-GVAR-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-GVAR-NEXT:    [[TMP1:%.*]] = extractvalue 
[[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
-// AMDGCN30-GVAR-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) 
[[TMP0]], align 8
+// AMDGCN30-GVAR-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) 
align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN30-GVAR-NEXT:    call void 
@__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef 
byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]]
 // AMDGCN30-GVAR-NEXT:    ret void
 //
@@ -1458,20 +1432,15 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN30-GVAR-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN30-GVAR-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-GVAR-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 
8, addrspace(5)
-// AMDGCN30-GVAR-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-GVAR-NEXT:    [[TMP1:%.*]] = extractvalue 
[[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN30-GVAR-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], 
align 8
-// AMDGCN30-GVAR-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-GVAR-NEXT:    [[TMP3:%.*]] = extractvalue 
[[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN30-GVAR-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], 
align 8
-// AMDGCN30-GVAR-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-GVAR-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) 
[[TMP4]], align 8
-// AMDGCN30-GVAR-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-GVAR-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) 
[[TMP6]], align 8
-// AMDGCN30-GVAR-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x 
i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN30-GVAR-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) 
align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN30-GVAR-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) 
[[TMP1]], align 8
+// AMDGCN30-GVAR-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) 
[[TMP3]], align 8
+// AMDGCN30-GVAR-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x 
i32> [[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]]
 // AMDGCN30-GVAR-NEXT:    ret void
 //
 //
@@ -1492,15 +1461,10 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void 
@KernelLargeTwoMember(
-// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN30-GVAR-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-GVAR-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], 
align 8, addrspace(5)
-// AMDGCN30-GVAR-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-GVAR-NEXT:    [[TMP1:%.*]] = extractvalue 
[[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN30-GVAR-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) 
[[TMP0]], align 8
-// AMDGCN30-GVAR-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-GVAR-NEXT:    [[TMP3:%.*]] = extractvalue 
[[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN30-GVAR-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) 
[[TMP2]], align 8
+// AMDGCN30-GVAR-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) 
align 8 [[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
 // AMDGCN30-GVAR-NEXT:    call void 
@__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef 
byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]]
 // AMDGCN30-GVAR-NEXT:    ret void
 //
@@ -1699,12 +1663,10 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN30-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN30-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 
8, addrspace(5)
-// AMDGCN30-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-NEXT:    [[TMP1:%.*]] = extractvalue 
[[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
-// AMDGCN30-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) 
[[TMP0]], align 8
+// AMDGCN30-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN30-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) 
#[[ATTR4]]
 // AMDGCN30-NEXT:    ret void
 //
@@ -1749,20 +1711,15 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN30-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN30-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN30-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, 
addrspace(5)
-// AMDGCN30-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[U_COERCE]], 0
-// AMDGCN30-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], 
align 8
-// AMDGCN30-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[U_COERCE]], 1
-// AMDGCN30-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], 
align 8
-// AMDGCN30-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], 
align 8
-// AMDGCN30-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], 
align 8
-// AMDGCN30-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> 
[[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN30-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN30-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], 
align 8
+// AMDGCN30-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], 
align 8
+// AMDGCN30-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> 
[[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]]
 // AMDGCN30-NEXT:    ret void
 //
 //
@@ -1783,15 +1740,10 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN30-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN30-NEXT:  [[ENTRY:.*:]]
 // AMDGCN30-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 
8, addrspace(5)
-// AMDGCN30-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN30-NEXT:    [[TMP1:%.*]] = extractvalue 
[[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
-// AMDGCN30-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) 
[[TMP0]], align 8
-// AMDGCN30-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN30-NEXT:    [[TMP3:%.*]] = extractvalue 
[[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
-// AMDGCN30-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) 
[[TMP2]], align 8
+// AMDGCN30-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
 // AMDGCN30-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) 
#[[ATTR4]]
 // AMDGCN30-NEXT:    ret void
 //
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl 
b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
index 6dc488c40da7f..7d0a66bac1469 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl
@@ -1,4 +1,4 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5 --include-generated-funcs 
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --include-generated-funcs --version 5
 // RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | 
FileCheck -check-prefix=AMDGCN %s
 
 typedef int int2 __attribute__((ext_vector_type(2)));
@@ -330,15 +330,13 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 
8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] 
[[U_COERCE]], 0
-// AMDGCN-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr 
addrspace(5)
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 
[[U1_ASCAST]]) #[[ATTR4]]
+// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], 
align 8, addrspace(5)
+// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr 
addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr 
addrspace(5)
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 
[[U_ASCAST]]) #[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -389,21 +387,16 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) 
align 8 [[TMP0:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] 
!kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] 
!kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, 
addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[U_COERCE]], 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[U_COERCE]], 1
-// AMDGCN-NEXT:    store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8
-// AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> 
[[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 
8, addrspace(5)
+// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr 
addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+// AMDGCN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[TMP3]], align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> 
[[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -425,18 +418,13 @@ kernel void KernelLargeTwoMember(struct 
LargeStructTwoMember u) {
 //
 //
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] 
!kernel_arg_type_qual [[META13]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR1]] 
!kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] 
!kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] 
!kernel_arg_type_qual [[META13]] {
 // AMDGCN-NEXT:  [[ENTRY:.*:]]
-// AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 
8, addrspace(5)
-// AMDGCN-NEXT:    [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] 
[[U_COERCE]], 0
-// AMDGCN-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] 
[[U_COERCE]], 1
-// AMDGCN-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8
-// AMDGCN-NEXT:    [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr 
addrspace(5)
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 
[[U1_ASCAST]]) #[[ATTR4]]
+// AMDGCN-NEXT:    [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], 
align 8, addrspace(5)
+// AMDGCN-NEXT:    [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[U]], ptr 
addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN-NEXT:    [[U_ASCAST:%.*]] = addrspacecast ptr [[U]] to ptr 
addrspace(5)
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 
[[U_ASCAST]]) #[[ATTR4]]
 // AMDGCN-NEXT:    ret void
 //
 //
diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl 
b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
index 4d09fc3ffb70b..06d3cdb01deb2 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
@@ -214,7 +214,8 @@ typedef struct struct_4regs
   int w;
 } struct_4regs;
 
-// CHECK: void @kernel_empty_struct_arg(%struct.empty_struct %s.coerce)
+// CHECK: void @kernel_empty_struct_arg(ptr addrspace(4) noundef readnone 
byref(%struct.empty_struct) align 1 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_empty_struct_arg()
 __kernel void kernel_empty_struct_arg(empty_struct s) { }
 
 // CHECK: void @kernel_single_element_struct_arg(i32 %arg1.coerce)
@@ -223,28 +224,35 @@ __kernel void 
kernel_single_element_struct_arg(single_element_struct_arg_t arg1)
 // CHECK: void @kernel_nested_single_element_struct_arg(i32 %arg1.coerce)
 __kernel void 
kernel_nested_single_element_struct_arg(nested_single_element_struct_arg_t 
arg1) { }
 
-// CHECK: void @kernel_struct_arg(%struct.struct_arg %arg1.coerce)
+// CHECK: void @kernel_struct_arg(ptr addrspace(4) noundef readonly 
byref(%struct.struct_arg) align 4 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_struct_arg(i32 %arg1.coerce0, 
float %arg1.coerce1, i32 %arg1.coerce2)
 __kernel void kernel_struct_arg(struct_arg_t arg1) { }
 
-// CHECK: void @kernel_struct_padding_arg(%struct.struct_padding_arg 
%arg1.coerce)
+// CHECK: void @kernel_struct_padding_arg(ptr addrspace(4) noundef readonly 
byref(%struct.struct_padding_arg) align 8 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_struct_padding_arg(i8 
%arg1.coerce0, i64 %arg1.coerce1)
 __kernel void kernel_struct_padding_arg(struct_padding_arg arg1) { }
 
-// CHECK: void @kernel_test_struct_of_arrays_arg(%struct.struct_of_arrays_arg 
%arg1.coerce)
+// CHECK: void @kernel_test_struct_of_arrays_arg(ptr addrspace(4) noundef 
readonly byref(%struct.struct_of_arrays_arg) align 4 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_test_struct_of_arrays_arg([2 x 
i32] %arg1.coerce0, float %arg1.coerce1, [4 x i32] %arg1.coerce2, [3 x float] 
%arg1.coerce3, i32 %arg1.coerce4)
 __kernel void kernel_test_struct_of_arrays_arg(struct_of_arrays_arg_t arg1) { }
 
-// CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg 
%arg1.coerce)
+// CHECK: void @kernel_struct_of_structs_arg(ptr addrspace(4) noundef readonly 
byref(%struct.struct_of_structs_arg) align 4 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_struct_of_structs_arg(i32 
%arg1.coerce0, float %arg1.coerce1, %struct.struct_arg %arg1.coerce2, i32 
%arg1.coerce3)
 __kernel void kernel_struct_of_structs_arg(struct_of_structs_arg_t arg1) { }
 
 // CHECK: void @test_kernel_transparent_union_arg(i32 %u.coerce)
 __kernel void test_kernel_transparent_union_arg(transparent_u u) { }
 
-// CHECK: void 
@kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg 
%arg1.coerce)
+// CHECK: void @kernel_single_array_element_struct_arg(ptr addrspace(4) 
noundef readonly byref(%struct.single_array_element_struct_arg) align 4 
captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_single_array_element_struct_arg([4 
x i32] %arg1.coerce)
 __kernel void 
kernel_single_array_element_struct_arg(single_array_element_struct_arg_t arg1) 
{ }
 
-// CHECK: void 
@kernel_single_struct_element_struct_arg(%struct.single_struct_element_struct_arg
 %arg1.coerce)
+// CHECK: void @kernel_single_struct_element_struct_arg(ptr addrspace(4) 
noundef readonly byref(%struct.single_struct_element_struct_arg) align 8 
captures(none) {{%.+}})
+// CHECK: void 
@__clang_ocl_kern_imp_kernel_single_struct_element_struct_arg(%struct.inner 
%arg1.coerce)
 __kernel void 
kernel_single_struct_element_struct_arg(single_struct_element_struct_arg_t 
arg1) { }
 
-// CHECK: void 
@kernel_different_size_type_pair_arg(%struct.different_size_type_pair 
%arg1.coerce)
+// CHECK: void @kernel_different_size_type_pair_arg(ptr addrspace(4) noundef 
readonly byref(%struct.different_size_type_pair) align 8 captures(none) {{%.+}})
+// CHECK: void @__clang_ocl_kern_imp_kernel_different_size_type_pair_arg(i64 
%arg1.coerce0, i32 %arg1.coerce1)
 __kernel void kernel_different_size_type_pair_arg(different_size_type_pair 
arg1) { }
 
 // CHECK: define{{.*}} void @func_f32_arg(float noundef %arg)
diff --git a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl 
b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl
index cdbe510b723b9..a5b2bee127bd0 100644
--- a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl
+++ b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl
@@ -676,12 +676,10 @@ kernel void caller_kern3( struct LargeStructOneMember 
largeStructOneMem, struct
 //
 // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] 
!kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] 
!kernel_arg_type_qual [[META7]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] 
!kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] 
!kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] 
!kernel_arg_type_qual [[META7]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 
8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] 
[[U_COERCE]], 0
-// AMDGCN-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) 
[[TMP0]], align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[U]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) 
#[[ATTR5]]
 // AMDGCN-NEXT:    ret void
 //
@@ -698,20 +696,15 @@ kernel void caller_kern3( struct LargeStructOneMember 
largeStructOneMem, struct
 //
 // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
-// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] 
!kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] 
!kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] 
!kernel_arg_type_qual [[META7]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) 
align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] 
!kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] 
!kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, 
addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[U_COERCE]], 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[U_COERCE]], 1
-// AMDGCN-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], 
align 8
-// AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], 
align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> 
[[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR5]]
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[U]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
+// AMDGCN-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], 
align 8
+// AMDGCN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], 
align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> 
[[TMP2]], <2 x i32> [[TMP4]]) #[[ATTR5]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -734,15 +727,10 @@ kernel void caller_kern3( struct LargeStructOneMember 
largeStructOneMem, struct
 //
 // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) 
#[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] 
!kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] 
!kernel_arg_type_qual [[META7]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] 
!kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] 
!kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] 
!kernel_arg_type_qual [[META7]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 
8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] 
[[U_COERCE]], 0
-// AMDGCN-NEXT:    store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], 
align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] 
[[U_COERCE]], 1
-// AMDGCN-NEXT:    store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], 
align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[U]], ptr addrspace(4) align 8 [[TMP0]], i64 480, i1 false)
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) 
#[[ATTR5]]
 // AMDGCN-NEXT:    ret void
 //
@@ -815,28 +803,23 @@ kernel void caller_kern3( struct LargeStructOneMember 
largeStructOneMem, struct
 //
 // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern2(
-// AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) 
noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], [[STRUCT_STRUCTTWOMEMBER:%.*]] 
[[STRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space 
[[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type 
[[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual 
[[META25:![0-9]+]] {
+// AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) 
noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], ptr addrspace(4) noundef 
byref([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] 
!kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual 
[[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type 
[[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[STRUCTONEMEM:%.*]] = alloca 
[[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], 
align 8, addrspace(5)
 // AMDGCN-NEXT:    [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr addrspace(1), 
align 8, addrspace(5)
 // AMDGCN-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0
 // AMDGCN-NEXT:    store <2 x i32> [[STRUCTONEMEM_COERCE]], ptr addrspace(5) 
[[COERCE_DIVE]], align 8
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[STRUCTTWOMEM_COERCE]], 0
-// AMDGCN-NEXT:    store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] 
[[STRUCTTWOMEM_COERCE]], 1
-// AMDGCN-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[STRUCTTWOMEM]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false)
 // AMDGCN-NEXT:    store ptr addrspace(1) [[GLOBAL_STRUCTONEMEM]], ptr 
addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) 
[[GLOBAL_STRUCTONEMEM_ADDR]], align 8
+// AMDGCN-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) 
[[GLOBAL_STRUCTONEMEM_ADDR]], align 8
 // AMDGCN-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) 
[[COERCE_DIVE1]], align 8
-// AMDGCN-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0
-// AMDGCN-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], 
align 8
-// AMDGCN-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1
-// AMDGCN-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP8]], 
align 8
-// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern2(<2 x i32> 
[[TMP5]], ptr addrspace(1) noundef align 8 [[TMP4]], <2 x i32> [[TMP7]], <2 x 
i32> [[TMP9]]) #[[ATTR5]]
+// AMDGCN-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) 
[[COERCE_DIVE1]], align 8
+// AMDGCN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0
+// AMDGCN-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP3]], 
align 8
+// AMDGCN-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw 
[[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1
+// AMDGCN-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], 
align 8
+// AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern2(<2 x i32> 
[[TMP2]], ptr addrspace(1) noundef align 8 [[TMP1]], <2 x i32> [[TMP4]], <2 x 
i32> [[TMP6]]) #[[ATTR5]]
 // AMDGCN-NEXT:    ret void
 //
 //
@@ -875,19 +858,12 @@ kernel void caller_kern3( struct LargeStructOneMember 
largeStructOneMem, struct
 //
 // AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern3(
-// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] 
[[LARGESTRUCTONEMEM_COERCE:%.*]], [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] 
[[LARGESTRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space 
[[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type 
[[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual 
[[META11]] {
+// AMDGCN-SAME: ptr addrspace(4) noundef 
byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]], ptr 
addrspace(4) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 
[[TMP1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] 
!kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] 
!kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] {
 // AMDGCN-NEXT:  entry:
 // AMDGCN-NEXT:    [[LARGESTRUCTONEMEM:%.*]] = alloca 
[[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
 // AMDGCN-NEXT:    [[LARGESTRUCTTWOMEM:%.*]] = alloca 
[[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
-// AMDGCN-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[LARGESTRUCTONEMEM]], i32 0, 
i32 0
-// AMDGCN-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] 
[[LARGESTRUCTONEMEM_COERCE]], 0
-// AMDGCN-NEXT:    store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) 
[[TMP0]], align 8
-// AMDGCN-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[LARGESTRUCTTWOMEM]], i32 0, 
i32 0
-// AMDGCN-NEXT:    [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] 
[[LARGESTRUCTTWOMEM_COERCE]], 0
-// AMDGCN-NEXT:    store [40 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], 
align 8
-// AMDGCN-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[LARGESTRUCTTWOMEM]], i32 0, 
i32 1
-// AMDGCN-NEXT:    [[TMP5:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] 
[[LARGESTRUCTTWOMEM_COERCE]], 1
-// AMDGCN-NEXT:    store [20 x <2 x i32>] [[TMP5]], ptr addrspace(5) [[TMP4]], 
align 8
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[LARGESTRUCTONEMEM]], ptr addrspace(4) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 
[[LARGESTRUCTTWOMEM]], ptr addrspace(4) align 8 [[TMP1]], i64 480, i1 false)
 // AMDGCN-NEXT:    call void @__clang_ocl_kern_imp_caller_kern3(ptr 
addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 
[[LARGESTRUCTONEMEM]], ptr addrspace(5) noundef 
byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]]
 // AMDGCN-NEXT:    ret void
 //

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to