arsenm created this revision. arsenm added reviewers: yaxunl, hliao, jdoerfert, rjmccall, Anastasia, rampitec. Herald added subscribers: kerbowa, nhaehnle, wdng, jvesely. arsenm added parent revisions: D79732: AMDGPU/HIP: Don't replace pointer types in kernel argument structs, D79630: AMDGPU: Start interpreting byval on kernel arguments, D79593: Verifier: Check address space for byval on AMDGPU calling conventions.
Previously, indirect arguments assumed a stack passed object in the alloca address space. A stack pointer is unsuitable for kernel arguments, which are passed in a separate, constant buffer with a different address space. Start using byval for aggregate kernel arguments. Previously these were emitted as raw struct arguments, and turned into loads in the backend. These will lower identically, although with byval you now have the option of applying an explicit alignment. In the future, a reasonable implementation would use byval for all kernel arguments (this would be a practical problem at the moment due to losing things like noalias on pointer arguments). This is mostly to avoid fighting the optimizer's treatment of aggregate load/store. SROA and instcombine both turn aggregate loads and stores into a long sequence of element loads and stores, rather than the optimizable memcpy I would expect in this situation. Now an explicit memcpy will be introduced up-front which is better understood and helps eliminate the alloca in more situations. Most of the language surrounding byval involves the stack, however I can't find any real reason this needs to be limited to a stack address space. My main concern is that nothing seems to explicitly disallow writing to a byval address, but it is illegal to write to this read-only pointer. A theoretical stack space reusing pass might try to reuse some byval bytes. The only one we have now is the stack coloring machine pass, which would never see a frame index for these byval arguments. I could go a step further and add the readonly attribute to these to make sure writes should never be introduced, although this patch doesn't do that yet. There is code in clang's handling that explicitly removes readnone from byval arguments, for some reason. 
https://reviews.llvm.org/D79744 Files: clang/include/clang/CodeGen/CGFunctionInfo.h clang/lib/CodeGen/CGCall.cpp clang/lib/CodeGen/TargetInfo.cpp clang/test/CodeGenCUDA/kernel-args.cu clang/test/CodeGenOpenCL/addr-space-struct-arg.cl clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
Index: clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl =================================================================== --- clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl +++ clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl @@ -216,7 +216,7 @@ int w; } struct_4regs; -// CHECK: void @kernel_empty_struct_arg(%struct.empty_struct %s.coerce) +// CHECK: void @kernel_empty_struct_arg(%struct.empty_struct addrspace(4)* nocapture byval(%struct.empty_struct) align 1 {{%.+}}) __kernel void kernel_empty_struct_arg(empty_struct s) { } // CHECK: void @kernel_single_element_struct_arg(i32 %arg1.coerce) @@ -225,28 +225,28 @@ // CHECK: void @kernel_nested_single_element_struct_arg(i32 %arg1.coerce) __kernel void kernel_nested_single_element_struct_arg(nested_single_element_struct_arg_t arg1) { } -// CHECK: void @kernel_struct_arg(%struct.struct_arg %arg1.coerce) +// CHECK: void @kernel_struct_arg(%struct.struct_arg addrspace(4)* nocapture byval(%struct.struct_arg) align 4 {{%.+}}) __kernel void kernel_struct_arg(struct_arg_t arg1) { } -// CHECK: void @kernel_struct_padding_arg(%struct.struct_padding_arg %arg1.coerce) +// CHECK: void @kernel_struct_padding_arg(%struct.struct_padding_arg addrspace(4)* nocapture byval(%struct.struct_padding_arg) align 8 %{{.+}}) __kernel void kernel_struct_padding_arg(struct_padding_arg arg1) { } -// CHECK: void @kernel_test_struct_of_arrays_arg(%struct.struct_of_arrays_arg %arg1.coerce) +// CHECK: void @kernel_test_struct_of_arrays_arg(%struct.struct_of_arrays_arg addrspace(4)* nocapture byval(%struct.struct_of_arrays_arg) align 4 %{{.+}}) __kernel void kernel_test_struct_of_arrays_arg(struct_of_arrays_arg_t arg1) { } -// CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg %arg1.coerce) +// CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg addrspace(4)* nocapture byval(%struct.struct_of_structs_arg) align 4 %{{.+}}) __kernel void kernel_struct_of_structs_arg(struct_of_structs_arg_t arg1) { 
} // CHECK: void @test_kernel_transparent_union_arg(%union.transparent_u %u.coerce) __kernel void test_kernel_transparent_union_arg(transparent_u u) { } -// CHECK: void @kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg %arg1.coerce) +// CHECK: void @kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg addrspace(4)* nocapture byval(%struct.single_array_element_struct_arg) align 4 %{{.+}}) __kernel void kernel_single_array_element_struct_arg(single_array_element_struct_arg_t arg1) { } -// CHECK: void @kernel_single_struct_element_struct_arg(%struct.single_struct_element_struct_arg %arg1.coerce) +// CHECK: void @kernel_single_struct_element_struct_arg(%struct.single_struct_element_struct_arg addrspace(4)* nocapture byval(%struct.single_struct_element_struct_arg) align 8 %{{.+}}) __kernel void kernel_single_struct_element_struct_arg(single_struct_element_struct_arg_t arg1) { } -// CHECK: void @kernel_different_size_type_pair_arg(%struct.different_size_type_pair %arg1.coerce) +// CHECK: void @kernel_different_size_type_pair_arg(%struct.different_size_type_pair addrspace(4)* nocapture byval(%struct.different_size_type_pair) align 8 %{{.+}}) __kernel void kernel_different_size_type_pair_arg(different_size_type_pair arg1) { } // CHECK: define void @func_f32_arg(float %arg) Index: clang/test/CodeGenOpenCL/addr-space-struct-arg.cl =================================================================== --- clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -139,10 +139,13 @@ FuncOneMember(*u); } -// AMDGCN-LABEL: define amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) -// AMDGCN: store %struct.LargeStructOneMember %u.coerce, %struct.LargeStructOneMember addrspace(5)* %[[U]], align 8 -// AMDGCN: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* 
byval(%struct.LargeStructOneMember) align 8 %[[U]]) +// AMDGCN-LABEL: define amdgpu_kernel void @KernelLargeOneMember +// AMDGCN-SAME: (%struct.LargeStructOneMember addrspace(4)* byval(%struct.LargeStructOneMember) align 8 [[BYVAL_PTR:%.+]]) +// AMDGCN: [[U_ALLOCA:%.+]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMDGCN-NEXT: [[CAST_ALLOCA:%.+]] = bitcast %struct.LargeStructOneMember addrspace(5)* [[U_ALLOCA]] to i8 addrspace(5)* +// AMDGCN-NEXT: [[CAST_KERNARG:%.+]] = bitcast %struct.LargeStructOneMember addrspace(4)* [[BYVAL_PTR]] to i8 addrspace(4)* +// AMDGCN-NEXT: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 8 [[CAST_ALLOCA]], i8 addrspace(4)* align 8 [[CAST_KERNARG]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval(%struct.LargeStructOneMember) align 8 [[U_ALLOCA]]) kernel void KernelLargeOneMember(struct LargeStructOneMember u) { FuncOneLargeMember(u); } @@ -158,20 +161,25 @@ } // AMDGCN-LABEL: define amdgpu_kernel void @KernelTwoMember -// AMDGCN-SAME: (%struct.StructTwoMember %[[u_coerce:.*]]) -// AMDGCN: %[[u:.*]] = alloca %struct.StructTwoMember, align 8, addrspace(5) -// AMDGCN: %[[LD0:.*]] = load <2 x i32>, <2 x i32> addrspace(5)* -// AMDGCN: %[[LD1:.*]] = load <2 x i32>, <2 x i32> addrspace(5)* +// AMDGCN-SAME: (%struct.StructTwoMember addrspace(4)* byval(%struct.StructTwoMember) align 8 [[BYVAL_PTR:%.+]]) +// AMDGCN: [[U_ALLOCA:%.+]] = alloca %struct.StructTwoMember, align 8, addrspace(5) +// AMDGCN-NEXT: [[CAST_ALLOCA:%.+]] = bitcast %struct.StructTwoMember addrspace(5)* [[U_ALLOCA]] to i8 addrspace(5)* +// AMDGCN-NEXT: [[CAST_KERNARG:%.+]] = bitcast %struct.StructTwoMember addrspace(4)* [[BYVAL_PTR]] to i8 addrspace(4)* +// AMDGCN-NEXT: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 8 [[CAST_ALLOCA]], i8 addrspace(4)* align 8 [[CAST_KERNARG]], i64 16, i1 false) +// AMDGCN: %[[LD0:.+]] = load <2 x i32>, <2 x i32> addrspace(5)* +// 
AMDGCN: %[[LD1:.+]] = load <2 x i32>, <2 x i32> addrspace(5)* // AMDGCN: call void @FuncTwoMember(<2 x i32> %[[LD0]], <2 x i32> %[[LD1]]) kernel void KernelTwoMember(struct StructTwoMember u) { FuncTwoMember(u); } // AMDGCN-LABEL: define amdgpu_kernel void @KernelLargeTwoMember -// AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]]) -// AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) -// AMDGCN: store %struct.LargeStructTwoMember %[[u_coerce]], %struct.LargeStructTwoMember addrspace(5)* %[[u]] -// AMDGCN: call void @FuncLargeTwoMember(%struct.LargeStructTwoMember addrspace(5)* byval(%struct.LargeStructTwoMember) align 8 %[[u]]) +// AMDGCN-SAME: (%struct.LargeStructTwoMember addrspace(4)* byval(%struct.LargeStructTwoMember) align 8 [[BYVAL_PTR:%.+]]) +// AMDGCN: [[U_ALLOCA:%.+]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) +// AMDGCN-NEXT: [[CAST_ALLOCA:%.+]] = bitcast %struct.LargeStructTwoMember addrspace(5)* [[U_ALLOCA]] to i8 addrspace(5)* +// AMDGCN-NEXT: [[CAST_KERNARG:%.+]] = bitcast %struct.LargeStructTwoMember addrspace(4)* %{{.+}} to i8 addrspace(4)* +// AMDGCN-NEXT: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 8 [[CAST_ALLOCA]], i8 addrspace(4)* align 8 [[CAST_KERNARG]], i64 480, i1 false) +// AMDGCN: call void @FuncLargeTwoMember(%struct.LargeStructTwoMember addrspace(5)* byval(%struct.LargeStructTwoMember) align 8 [[U_ALLOCA]]) kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { FuncLargeTwoMember(u); } Index: clang/test/CodeGenCUDA/kernel-args.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-args.cu +++ clang/test/CodeGenCUDA/kernel-args.cu @@ -8,14 +8,14 @@ int a[32]; }; -// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A %x.coerce) +// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byval(%struct.A) align 4 %{{.+}}) // NVPTX: define void @_Z6kernel1A(%struct.A* byval(%struct.A) 
align 4 %x) __global__ void kernel(A x) { } class Kernel { public: - // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byval(%struct.A) align 4 %{{.+}}) // NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A* byval(%struct.A) align 4 %x) static __global__ void memberKernel(A x){} template<typename T> static __global__ void templateMemberKernel(T x) {} @@ -29,11 +29,11 @@ void test() { Kernel K; - // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byval(%struct.A) align 4 %{{.+}} // NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A* byval(%struct.A) align 4 %x) launch((void*)templateKernel<A>); - // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byval(%struct.A) align 4 %{{.+}} // NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* byval(%struct.A) align 4 %x) launch((void*)Kernel::templateMemberKernel<A>); } Index: clang/lib/CodeGen/TargetInfo.cpp =================================================================== --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -82,14 +82,16 @@ ABIArgInfo ABIInfo::getNaturalAlignIndirect(QualType Ty, bool ByRef, bool Realign, llvm::Type *Padding) const { - return ABIArgInfo::getIndirect(getContext().getTypeAlignInChars(Ty), - ByRef, Realign, Padding); + return ABIArgInfo::getIndirect(getContext().getTypeAlignInChars(Ty), ByRef, + Realign, Padding, + getDataLayout().getAllocaAddrSpace()); } ABIArgInfo ABIInfo::getNaturalAlignIndirectInReg(QualType Ty, bool Realign) const { return ABIArgInfo::getIndirectInReg(getContext().getTypeAlignInChars(Ty), - /*ByRef*/ false, 
Realign); + /*ByRef*/ false, Realign, + getDataLayout().getAllocaAddrSpace()); } Address ABIInfo::EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr, @@ -8486,6 +8488,13 @@ /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device)); } + if (!LTy && isAggregateTypeForABI(Ty)) { + return ABIArgInfo::getIndirect( + getContext().getTypeAlignInChars(Ty), true /*ByVal*/, false /*Realign*/, + nullptr /*Padding*/, + getContext().getTargetAddressSpace(LangAS::opencl_constant)); + } + // If we set CanBeFlattened to true, CodeGen will expand the struct to its // individual elements, which confuses the Clover OpenCL backend; therefore we // have to set it to false here. Other args of getDirect() are just defaults. Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -1632,10 +1632,8 @@ case ABIArgInfo::Indirect: { assert(NumIRArgs == 1); - // indirect arguments are always on the stack, which is alloca addr space. llvm::Type *LTy = ConvertTypeForMem(it->type); - ArgTypes[FirstIRArg] = LTy->getPointerTo( - CGM.getDataLayout().getAllocaAddrSpace()); + ArgTypes[FirstIRArg] = LTy->getPointerTo(ArgInfo.getIndirectAddrSpace()); break; } @@ -2388,10 +2386,13 @@ Address(Fn->getArg(FirstIRArg), ArgI.getIndirectAlign()); if (!hasScalarEvaluationKind(Ty)) { - // Aggregates and complex variables are accessed by reference. All we - // need to do is realign the value, if requested. + // Aggregates and complex variables are accessed by reference. All we + // need to do is realign the value, if requested. Also, if the address + // isn't on the stack, copy it there since we need a stack address. 
Address V = ParamAddr; - if (ArgI.getIndirectRealign()) { + if (ArgI.getIndirectRealign() || + ArgI.getIndirectAddrSpace() != + CGM.getDataLayout().getAllocaAddrSpace()) { Address AlignedTemp = CreateMemTemp(Ty, "coerce"); // Copy from the incoming argument pointer to the temporary with the Index: clang/include/clang/CodeGen/CGFunctionInfo.h =================================================================== --- clang/include/clang/CodeGen/CGFunctionInfo.h +++ clang/include/clang/CodeGen/CGFunctionInfo.h @@ -44,8 +44,8 @@ /// but also emit a zero/sign extension attribute. Extend, - /// Indirect - Pass the argument indirectly via a hidden pointer - /// with the specified alignment (0 indicates default alignment). + /// Indirect - Pass the argument indirectly via a hidden pointer with the + /// specified alignment (0 indicates default alignment) and address space. Indirect, /// Ignore - Ignore the argument (treat as void). Useful for void and @@ -86,6 +86,7 @@ unsigned AllocaFieldIndex; // isInAlloca() }; Kind TheKind; + unsigned IndirectAddrSpace : 24; // isIndirect() bool PaddingInReg : 1; bool InAllocaSRet : 1; // isInAlloca() bool InAllocaIndirect : 1;// isInAlloca() @@ -112,9 +113,10 @@ public: ABIArgInfo(Kind K = Direct) : TypeData(nullptr), PaddingType(nullptr), DirectOffset(0), TheKind(K), - PaddingInReg(false), InAllocaSRet(false), InAllocaIndirect(false), - IndirectByVal(false), IndirectRealign(false), SRetAfterThis(false), - InReg(false), CanBeFlattened(false), SignExt(false) {} + IndirectAddrSpace(0), PaddingInReg(false), InAllocaSRet(false), + InAllocaIndirect(false), IndirectByVal(false), IndirectRealign(false), + SRetAfterThis(false), InReg(false), CanBeFlattened(false), + SignExt(false) {} static ABIArgInfo getDirect(llvm::Type *T = nullptr, unsigned Offset = 0, llvm::Type *Padding = nullptr, @@ -171,19 +173,23 @@ } static ABIArgInfo getIndirect(CharUnits Alignment, bool ByVal = true, bool Realign = false, - llvm::Type *Padding = nullptr) { + 
llvm::Type *Padding = nullptr, + unsigned AddrSpace = 0) { auto AI = ABIArgInfo(Indirect); AI.setIndirectAlign(Alignment); AI.setIndirectByVal(ByVal); AI.setIndirectRealign(Realign); AI.setSRetAfterThis(false); AI.setPaddingType(Padding); + AI.setIndirectAddrSpace(AddrSpace); return AI; } static ABIArgInfo getIndirectInReg(CharUnits Alignment, bool ByVal = true, - bool Realign = false) { + bool Realign = false, + unsigned AddrSpace = 0) { auto AI = getIndirect(Alignment, ByVal, Realign); AI.setInReg(true); + AI.setIndirectAddrSpace(AddrSpace); return AI; } static ABIArgInfo getInAlloca(unsigned FieldIndex, bool Indirect = false) { @@ -355,6 +361,16 @@ IndirectByVal = IBV; } + unsigned getIndirectAddrSpace() const { + assert(isIndirect() && "Invalid kind!"); + return IndirectAddrSpace; + } + + void setIndirectAddrSpace(unsigned AddrSpace) { + assert(isIndirect() && "Invalid kind!"); + IndirectAddrSpace = AddrSpace; + } + bool getIndirectRealign() const { assert(isIndirect() && "Invalid kind!"); return IndirectRealign;
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits