https://github.com/abidh updated https://github.com/llvm/llvm-project/pull/197004
>From 6ecc9ee71ee8b55a4c71c426e490b9b6d6dfa245 Mon Sep 17 00:00:00 2001 From: Abid Qadeer <[email protected]> Date: Wed, 29 Apr 2026 16:09:40 +0100 Subject: [PATCH 1/4] [AMDGPU] Synthetic return coerce for aggregates with empty-for-layout members. After llvm#96422, empty-for-layout members can show up as 4 x i8. This logically empty type ends up consuming 4 VGPRs and breaks the ABI. This PR teaches the AMDGPU ABI to use an explicit synthetic coerce struct when it is returning a struct that can transitively contain an empty-for-layout member. This coerce struct does not create array for padding bytes. As a result, the fields go in the registers as ABI expects. The numRegsForType has been fixed accordingly as well. --- clang/lib/CodeGen/Targets/AMDGPU.cpp | 111 +++++++++++++++++- .../amdgpu-aggregate-return-coerce.hip | 32 +++++ 2 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index a3a596bb9d822..06bd6076d4e50 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -9,7 +9,11 @@ #include "ABIInfoImpl.h" #include "TargetInfo.h" #include "clang/AST/DeclCXX.h" +#include "clang/AST/RecordLayout.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/Support/AMDGPUAddrSpace.h" using namespace clang; @@ -21,6 +25,94 @@ using namespace clang::CodeGen; namespace { +/// True if \p Ty is a record whose fields (or bases) include a field that +/// is empty for layout, or that contain such a field transitively through +/// member or base types. +static bool recordTypeHasEmptyFieldForLayout(ASTContext &Ctx, QualType Ty) { + const RecordDecl *RD = Ty->getAsRecordDecl(); + if (!RD) + return false; + + if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) { + for (const auto &B : CXXRD->bases()) { + if (recordTypeHasEmptyFieldForLayout(Ctx, B.getType())) + return true; + } + } + + for (const FieldDecl *FD : RD->fields()) { + if (isEmptyFieldForLayout(Ctx, FD)) + return true; + if (recordTypeHasEmptyFieldForLayout(Ctx, FD->getType())) + return true; + } + return false; +} + +/// Build a LLVM struct for AMDGPU aggregate return coercion: one element per +/// non-empty base subobject and per field, ordered by \c ASTRecordLayout +/// offsets (matching in-object layout). Nested records that also need this +/// coercion use a nested coerce type; otherwise \c ConvertType is used. +static llvm::Type *buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT, + ASTContext &Ctx, + QualType Ty) { + if (!recordTypeHasEmptyFieldForLayout(Ctx, Ty)) + return nullptr; + + const RecordDecl *RD = Ty->getAsRecordDecl(); + if (!RD || !RD->getDefinition() || RD->isUnion()) + return nullptr; + assert(!RD->hasFlexibleArrayMember()); + + // Vtable and dynamic-class layout are not represented here; use the normal + // LLVM record type as the coerce-to type. + if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) + if (CXXRD->isDynamicClass()) + return nullptr; + + const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD); + + struct CoerceMember { + CharUnits Offset; + QualType Ty; + }; + llvm::SmallVector<CoerceMember, 16> Members; + + if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) { + for (const CXXBaseSpecifier &B : CXXRD->bases()) { + const CXXRecordDecl *BaseDecl = B.getType()->getAsCXXRecordDecl(); + if (!BaseDecl || BaseDecl->isEmpty()) + continue; + BaseDecl = BaseDecl->getDefinition(); + CharUnits Off = B.isVirtual() ? Layout.getVBaseClassOffset(BaseDecl) + : Layout.getBaseClassOffset(BaseDecl); + Members.push_back({Off, B.getType()}); + } + } + + for (const FieldDecl *FD : RD->fields()) { + CharUnits Off = + Ctx.toCharUnitsFromBits(Layout.getFieldOffset(FD->getFieldIndex())); + Members.push_back({Off, FD->getType()}); + } + + llvm::stable_sort(Members, [](const CoerceMember &A, const CoerceMember &B) { + return A.Offset < B.Offset; + }); + + llvm::LLVMContext &VM = CGT.getLLVMContext(); + llvm::SmallVector<llvm::Type *, 16> Elts; + for (const CoerceMember &M : Members) { + if (llvm::Type *Nested = + buildAMDGPUAggregateReturnCoerceType(CGT, Ctx, M.Ty)) + Elts.push_back(Nested); + else + Elts.push_back(CGT.ConvertType(M.Ty)); + } + + return llvm::StructType::create(VM, Elts); +} + class AMDGPUABIInfo final : public DefaultABIInfo { private: static const unsigned MaxNumRegsForArgsRet = 16; @@ -99,7 +191,20 @@ uint64_t AMDGPUABIInfo::numRegsForType(QualType Ty) const { if (const auto *RD = Ty->getAsRecordDecl()) { assert(!RD->hasFlexibleArrayMember()); + if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) { + for (const CXXBaseSpecifier &B : CXXRD->bases()) { + const CXXRecordDecl *BD = B.getType()->getAsCXXRecordDecl(); + if (!BD || BD->isEmpty()) + continue; + NumRegs += numRegsForType(B.getType()); + } + } + for (const FieldDecl *Field : RD->fields()) { + if (isEmptyFieldForLayout(getContext(), Field)) { + NumRegs += 1; + continue; + } QualType FieldTy = Field->getType(); NumRegs += numRegsForType(FieldTy); } @@ -169,8 +274,12 @@ ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const { return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2)); } - if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet) + if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet) { + if (llvm::Type *CoerceTy = + buildAMDGPUAggregateReturnCoerceType(CGT, getContext(), RetTy)) + return ABIArgInfo::getDirect(CoerceTy); return ABIArgInfo::getDirect(); + } } } diff --git a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip new file mode 100644 index 0000000000000..61729285fac5f --- /dev/null +++ b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm -fcuda-is-device -o - %s \ +// RUN: | FileCheck %s + +#define __device__ __attribute__((device)) + +struct OnlyStatic { + __device__ static int something; +}; +struct WithStaticFields { + int a[2]; + OnlyStatic sub; + float b; + __device__ static int c; + double d; +}; +__device__ int OnlyStatic::something = 42; +__device__ int WithStaticFields::c = 12; + +__device__ WithStaticFields returnWithStatic() { + OnlyStatic::something = 12; + WithStaticFields::c = 42; + return {.a = {8, 16}, .b = 3.14f, .d = 1.60218e-19}; +} + +__device__ void caller() { + WithStaticFields r = returnWithStatic(); + (void)r.b; +} + +// CHECK-DAG: define dso_local %[[TY:.*]] @_Z16returnWithStaticv +// CHECK-DAG: call %[[TY]] @_Z16returnWithStaticv +// CHECK-DAG: %[[TY]] = type { [2 x i32], %struct.OnlyStatic, float, double } >From a0b1c0168bf29000c5a279a5c97aead2c69057af Mon Sep 17 00:00:00 2001 From: Abid Qadeer <[email protected]> Date: Mon, 11 May 2026 18:29:41 +0100 Subject: [PATCH 2/4] Add a base class test. --- .../amdgpu-aggregate-return-coerce.hip | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip index 61729285fac5f..3214279d748b3 100644 --- a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip +++ b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip @@ -27,6 +27,40 @@ __device__ void caller() { (void)r.b; } +// Base class with an empty-for-layout member: return coercion must include the +// non-empty base subobject as a nested coerce struct (not only direct fields). +struct BaseWithEmptyMember { + int a[2]; + OnlyStatic sub; + float b; + __device__ static int c; +}; +struct DerivedWithBase : BaseWithEmptyMember { + double d; +}; +__device__ int BaseWithEmptyMember::c = 12; + +__device__ DerivedWithBase returnDerivedWithBase() { + OnlyStatic::something = 12; + BaseWithEmptyMember::c = 42; + DerivedWithBase r{}; + r.a[0] = 8; + r.a[1] = 16; + r.b = 3.14f; + r.d = 1.60218e-19; + return r; +} + +__device__ void callerDerived() { + DerivedWithBase r = returnDerivedWithBase(); + (void)r.b; +} + // CHECK-DAG: define dso_local %[[TY:.*]] @_Z16returnWithStaticv // CHECK-DAG: call %[[TY]] @_Z16returnWithStaticv // CHECK-DAG: %[[TY]] = type { [2 x i32], %struct.OnlyStatic, float, double } + +// CHECK-DAG: define dso_local %[[DER:.*]] @_Z21returnDerivedWithBasev +// CHECK-DAG: call %[[DER]] @_Z21returnDerivedWithBasev +// CHECK-DAG: %[[BASE:[0-9]+]] = type { [2 x i32], %struct.OnlyStatic, float } +// CHECK-DAG: %[[DER]] = type { %[[BASE]], double } >From a72915a0a2c1fd9ce96827b3593f57cb3bf87b69 Mon Sep 17 00:00:00 2001 From: Abid Qadeer <[email protected]> Date: Thu, 14 May 2026 16:51:35 +0100 Subject: [PATCH 3/4] Use sort instead of stable_sort. --- clang/lib/CodeGen/Targets/AMDGPU.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 06bd6076d4e50..1ace0502aaec6 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -96,7 +96,7 @@ static llvm::Type *buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT, Members.push_back({Off, FD->getType()}); } - llvm::stable_sort(Members, [](const CoerceMember &A, const CoerceMember &B) { + llvm::sort(Members, [](const CoerceMember &A, const CoerceMember &B) { return A.Offset < B.Offset; }); >From 35e7295ecaf013eca80d62d0ad1403ee06541e0a Mon Sep 17 00:00:00 2001 From: Abid Qadeer <[email protected]> Date: Wed, 10 Jun 2026 15:44:38 +0100 Subject: [PATCH 4/4] Handle review comments. Add more tests. Also refactor some code so that we can assert on an unreachable code path. --- clang/lib/CodeGen/Targets/AMDGPU.cpp | 16 +++-- .../amdgpu-aggregate-return-coerce.hip | 58 ++++++++++++++++--- 2 files changed, 57 insertions(+), 17 deletions(-) diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 1ace0502aaec6..2399103f795fe 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -64,12 +64,6 @@ static llvm::Type *buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT, return nullptr; assert(!RD->hasFlexibleArrayMember()); - // Vtable and dynamic-class layout are not represented here; use the normal - // LLVM record type as the coerce-to type. - if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) - if (CXXRD->isDynamicClass()) - return nullptr; - const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD); struct CoerceMember { @@ -79,14 +73,18 @@ static llvm::Type *buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT, llvm::SmallVector<CoerceMember, 16> Members; if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) { + // Vtable and dynamic-class layout are not represented here; use the normal + // LLVM record type as the coerce-to type. + if (CXXRD->isDynamicClass()) + return nullptr; for (const CXXBaseSpecifier &B : CXXRD->bases()) { const CXXRecordDecl *BaseDecl = B.getType()->getAsCXXRecordDecl(); if (!BaseDecl || BaseDecl->isEmpty()) continue; BaseDecl = BaseDecl->getDefinition(); - CharUnits Off = B.isVirtual() ? Layout.getVBaseClassOffset(BaseDecl) - : Layout.getBaseClassOffset(BaseDecl); - Members.push_back({Off, B.getType()}); + // isDynamicClass() above guards against any class that has virtual bases + assert(!B.isVirtual() && "virtual base implies isDynamicClass"); + Members.push_back({Layout.getBaseClassOffset(BaseDecl), B.getType()}); } } diff --git a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip index 3214279d748b3..29b38b0e8cb2e 100644 --- a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip +++ b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip @@ -22,11 +22,6 @@ __device__ WithStaticFields returnWithStatic() { return {.a = {8, 16}, .b = 3.14f, .d = 1.60218e-19}; } -__device__ void caller() { - WithStaticFields r = returnWithStatic(); - (void)r.b; -} - // Base class with an empty-for-layout member: return coercion must include the // non-empty base subobject as a nested coerce struct (not only direct fields). struct BaseWithEmptyMember { @@ -51,9 +46,48 @@ __device__ DerivedWithBase returnDerivedWithBase() { return r; } -__device__ void callerDerived() { - DerivedWithBase r = returnDerivedWithBase(); - (void)r.b; +// Empty base class +struct EmptyBase {}; +struct WithEmptyBase : EmptyBase { + int a; + OnlyStatic sub; + float b; +}; + +__device__ WithEmptyBase returnWithEmptyBase() { + WithEmptyBase r{}; + r.a = 1; + r.b = 2.0f; + return r; +} + +// Derived class whose base has no empty-for-layout fields but the derived +// class itself does. The base does not need its own coerce type, so it +// appears in the coerce struct as the named LLVM type rather than +// an anonymous type generated by the recursive coerce path. +struct PlainBase { int x; float y; }; +struct DerivedWithOwnEmpty : PlainBase { + OnlyStatic sub; + double z; +}; + +__device__ DerivedWithOwnEmpty returnDerivedWithOwnEmpty() { + DerivedWithOwnEmpty r{}; + r.x = 1; + r.y = 2.0f; + r.z = 3.0; + return r; +} + +__device__ void caller() { + WithStaticFields r1 = returnWithStatic(); + (void)r1.b; + DerivedWithBase r2 = returnDerivedWithBase(); + (void)r2.b; + WithEmptyBase r3 = returnWithEmptyBase(); + (void)r3.a; + DerivedWithOwnEmpty r4 = returnDerivedWithOwnEmpty(); + (void)r4.z; } // CHECK-DAG: define dso_local %[[TY:.*]] @_Z16returnWithStaticv @@ -64,3 +98,11 @@ __device__ void callerDerived() { // CHECK-DAG: call %[[DER]] @_Z21returnDerivedWithBasev // CHECK-DAG: %[[BASE:[0-9]+]] = type { [2 x i32], %struct.OnlyStatic, float } // CHECK-DAG: %[[DER]] = type { %[[BASE]], double } + +// CHECK-DAG: define dso_local %[[EB:.*]] @_Z19returnWithEmptyBasev +// CHECK-DAG: call %[[EB]] @_Z19returnWithEmptyBasev +// CHECK-DAG: %[[EB]] = type { i32, %struct.OnlyStatic, float } + +// CHECK-DAG: define dso_local %[[PLAIN:.*]] @_Z25returnDerivedWithOwnEmptyv +// CHECK-DAG: call %[[PLAIN]] @_Z25returnDerivedWithOwnEmptyv +// CHECK-DAG: %[[PLAIN]] = type { %struct.PlainBase, %struct.OnlyStatic, double } _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
