yaxunl created this revision. yaxunl added reviewers: tra, arsenm, rjmccall. Herald added subscribers: t-tye, tpr, dstuttard, wdng, kzhuravl.
Clang emits call of hipSetupArgument(arg, size, offset) in host IR to set up arguments for a HIP kernel. The offset should meet the expection of the device backend. Currently clang uses AST alignment to calculate the offset. This works for nvptx backend and in most cases works for amdpu backend. However, this does not work when the kernel argument is a packed struct. In the device IR for amdgpu backend, a struct type kernel argument is passed directly, instead of by a pointer with byval attribute. The backend calculates the offset of the argument by ABI alignment of the arg in IR. For packed struct, this is always 1. However, its AST alignment is different. This discrepency causes incorrect offset value used in the emitted call of hipSetupArgument. This patch fixes the issue by using ABI alignment of kernel arg in IR to calculate its offset for amdgpu target. It does not affect other targets. https://reviews.llvm.org/D55067 Files: lib/CodeGen/CGCUDANV.cpp test/CodeGenCUDA/kernel-args-alignment.cu Index: test/CodeGenCUDA/kernel-args-alignment.cu =================================================================== --- test/CodeGenCUDA/kernel-args-alignment.cu +++ test/CodeGenCUDA/kernel-args-alignment.cu @@ -1,8 +1,15 @@ // RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \ -// RUN: FileCheck -check-prefix HOST -check-prefix CHECK %s +// RUN: FileCheck -check-prefixes=HOST,HOST-NV,CHECK %s // RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \ -// RUN: -emit-llvm -o - %s | FileCheck -check-prefix DEVICE -check-prefix CHECK %s +// RUN: -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,DEVICE-NV,CHECK %s + +// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -x hip \ +// RUN: -aux-triple amdgcn-amd-amdhsa -emit-llvm -o - %s | \ +// RUN: FileCheck -check-prefixes=HOST,HOST-AMD,CHECK %s + +// RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple amdgcn-amd-amdhsa \ +// RUN: -x hip -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,DEVICE-AMD,CHECK %s #include "Inputs/cuda.h" @@ -23,14 +30,25 @@ static_assert(alignof(S) == 8, "Unexpected alignment."); // HOST-LABEL: @_Z6kernelc1SPi -// Marshalled kernel args should be: +// For NVPTX backend, marshalled kernel args should be: // 1. offset 0, width 1 // 2. offset 8 (because alignof(S) == 8), width 16 // 3. offset 24, width 8 -// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0) -// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8) -// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24) +// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0) +// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8) +// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24) +// AMDGPU backend assumes struct type kernel arguments are passed directly, +// not byval. It lays out kernel arguments by size and alignment in IR. +// Packed struct type in IR always has ABI alignment of 1. +// For AMDGPU backend, marshalled kernel args should be: +// 1. offset 0, width 1 +// 2. offset 1 (because ABI alignment of S is 1), width 16 +// 3. offset 24, width 8 +// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 1, i64 0) +// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 16, i64 1) +// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 8, i64 24) // DEVICE-LABEL: @_Z6kernelc1SPi -// DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32* +// DEVICE-NV-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32* +// DEVICE-AMD-SAME: i8{{[^,]*}}, %struct.S{{[^,*]*}}, i32* __global__ void kernel(char a, S s, int *b) {} Index: lib/CodeGen/CGCUDANV.cpp =================================================================== --- lib/CodeGen/CGCUDANV.cpp +++ lib/CodeGen/CGCUDANV.cpp @@ -199,13 +199,21 @@ llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); CharUnits Offset = CharUnits::Zero(); for (const VarDecl *A : Args) { + auto *Arg = CGF.GetAddrOfLocalVar(A).getPointer(); CharUnits TyWidth, TyAlign; - std::tie(TyWidth, TyAlign) = - CGM.getContext().getTypeInfoInChars(A->getType()); + auto *Aux = CGM.getContext().getAuxTargetInfo(); + if (Aux && Aux->getTriple().getArch() == llvm::Triple::amdgcn) { + auto *ArgTy = Arg->getType()->getPointerElementType(); + auto &DL = CGM.getDataLayout(); + TyWidth = CharUnits::fromQuantity(DL.getTypeStoreSize(ArgTy)); + TyAlign = CharUnits::fromQuantity(DL.getABITypeAlignment(ArgTy)); + } else { + std::tie(TyWidth, TyAlign) = + CGM.getContext().getTypeInfoInChars(A->getType()); + } Offset = Offset.alignTo(TyAlign); llvm::Value *Args[] = { - CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), - VoidPtrTy), + CGF.Builder.CreatePointerCast(Arg, VoidPtrTy), llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), };
Index: test/CodeGenCUDA/kernel-args-alignment.cu =================================================================== --- test/CodeGenCUDA/kernel-args-alignment.cu +++ test/CodeGenCUDA/kernel-args-alignment.cu @@ -1,8 +1,15 @@ // RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \ -// RUN: FileCheck -check-prefix HOST -check-prefix CHECK %s +// RUN: FileCheck -check-prefixes=HOST,HOST-NV,CHECK %s // RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \ -// RUN: -emit-llvm -o - %s | FileCheck -check-prefix DEVICE -check-prefix CHECK %s +// RUN: -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,DEVICE-NV,CHECK %s + +// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -x hip \ +// RUN: -aux-triple amdgcn-amd-amdhsa -emit-llvm -o - %s | \ +// RUN: FileCheck -check-prefixes=HOST,HOST-AMD,CHECK %s + +// RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple amdgcn-amd-amdhsa \ +// RUN: -x hip -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,DEVICE-AMD,CHECK %s #include "Inputs/cuda.h" @@ -23,14 +30,25 @@ static_assert(alignof(S) == 8, "Unexpected alignment."); // HOST-LABEL: @_Z6kernelc1SPi -// Marshalled kernel args should be: +// For NVPTX backend, marshalled kernel args should be: // 1. offset 0, width 1 // 2. offset 8 (because alignof(S) == 8), width 16 // 3. offset 24, width 8 -// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0) -// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8) -// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24) +// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0) +// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8) +// HOST-NV: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24) +// AMDGPU backend assumes struct type kernel arguments are passed directly, +// not byval. It lays out kernel arguments by size and alignment in IR. +// Packed struct type in IR always has ABI alignment of 1. +// For AMDGPU backend, marshalled kernel args should be: +// 1. offset 0, width 1 +// 2. offset 1 (because ABI alignment of S is 1), width 16 +// 3. offset 24, width 8 +// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 1, i64 0) +// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 16, i64 1) +// HOST-AMD: call i32 @hipSetupArgument({{[^,]*}}, i64 8, i64 24) // DEVICE-LABEL: @_Z6kernelc1SPi -// DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32* +// DEVICE-NV-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32* +// DEVICE-AMD-SAME: i8{{[^,]*}}, %struct.S{{[^,*]*}}, i32* __global__ void kernel(char a, S s, int *b) {} Index: lib/CodeGen/CGCUDANV.cpp =================================================================== --- lib/CodeGen/CGCUDANV.cpp +++ lib/CodeGen/CGCUDANV.cpp @@ -199,13 +199,21 @@ llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); CharUnits Offset = CharUnits::Zero(); for (const VarDecl *A : Args) { + auto *Arg = CGF.GetAddrOfLocalVar(A).getPointer(); CharUnits TyWidth, TyAlign; - std::tie(TyWidth, TyAlign) = - CGM.getContext().getTypeInfoInChars(A->getType()); + auto *Aux = CGM.getContext().getAuxTargetInfo(); + if (Aux && Aux->getTriple().getArch() == llvm::Triple::amdgcn) { + auto *ArgTy = Arg->getType()->getPointerElementType(); + auto &DL = CGM.getDataLayout(); + TyWidth = CharUnits::fromQuantity(DL.getTypeStoreSize(ArgTy)); + TyAlign = CharUnits::fromQuantity(DL.getABITypeAlignment(ArgTy)); + } else { + std::tie(TyWidth, TyAlign) = + CGM.getContext().getTypeInfoInChars(A->getType()); + } Offset = Offset.alignTo(TyAlign); llvm::Value *Args[] = { - CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), - VoidPtrTy), + CGF.Builder.CreatePointerCast(Arg, VoidPtrTy), llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), };
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits