https://github.com/abidh updated 
https://github.com/llvm/llvm-project/pull/197004

>From 6ecc9ee71ee8b55a4c71c426e490b9b6d6dfa245 Mon Sep 17 00:00:00 2001
From: Abid Qadeer <[email protected]>
Date: Wed, 29 Apr 2026 16:09:40 +0100
Subject: [PATCH 1/4] [AMDGPU] Synthetic return coerce for aggregates with
 empty-for-layout members.

After llvm#96422, empty-for-layout members can show up as 4 x i8. This
logically empty type ends up consuming 4 VGPRs and breaks the ABI.

This PR teaches the AMDGPU ABI to use an explicit synthetic coerce
struct when it is returning a struct that can transitively contain an
empty-for-layout member. This coerce struct does not create array for
padding bytes. As a result, the fields go in the registers as ABI
expects. The numRegsForType has been fixed accordingly as well.
---
 clang/lib/CodeGen/Targets/AMDGPU.cpp          | 111 +++++++++++++++++-
 .../amdgpu-aggregate-return-coerce.hip        |  32 +++++
 2 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index a3a596bb9d822..06bd6076d4e50 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -9,7 +9,11 @@
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
 #include "clang/AST/DeclCXX.h"
+#include "clang/AST/RecordLayout.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
 
 using namespace clang;
@@ -21,6 +25,94 @@ using namespace clang::CodeGen;
 
 namespace {
 
+/// True if \p Ty is a record whose fields (or bases) include a field that
+///  is empty for layout, or that contain such a field transitively through
+///  member or base types.
+static bool recordTypeHasEmptyFieldForLayout(ASTContext &Ctx, QualType Ty) {
+  const RecordDecl *RD = Ty->getAsRecordDecl();
+  if (!RD)
+    return false;
+
+  if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+    for (const auto &B : CXXRD->bases()) {
+      if (recordTypeHasEmptyFieldForLayout(Ctx, B.getType()))
+        return true;
+    }
+  }
+
+  for (const FieldDecl *FD : RD->fields()) {
+    if (isEmptyFieldForLayout(Ctx, FD))
+      return true;
+    if (recordTypeHasEmptyFieldForLayout(Ctx, FD->getType()))
+      return true;
+  }
+  return false;
+}
+
+/// Build a LLVM struct for AMDGPU aggregate return coercion: one element per
+/// non-empty base subobject and per field, ordered by \c ASTRecordLayout
+/// offsets (matching in-object layout). Nested records that also need this
+/// coercion use a nested coerce type; otherwise \c ConvertType is used.
+static llvm::Type *buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT,
+                                                        ASTContext &Ctx,
+                                                        QualType Ty) {
+  if (!recordTypeHasEmptyFieldForLayout(Ctx, Ty))
+    return nullptr;
+
+  const RecordDecl *RD = Ty->getAsRecordDecl();
+  if (!RD || !RD->getDefinition() || RD->isUnion())
+    return nullptr;
+  assert(!RD->hasFlexibleArrayMember());
+
+  // Vtable and dynamic-class layout are not represented here; use the normal
+  // LLVM record type as the coerce-to type.
+  if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
+    if (CXXRD->isDynamicClass())
+      return nullptr;
+
+  const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
+
+  struct CoerceMember {
+    CharUnits Offset;
+    QualType Ty;
+  };
+  llvm::SmallVector<CoerceMember, 16> Members;
+
+  if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+    for (const CXXBaseSpecifier &B : CXXRD->bases()) {
+      const CXXRecordDecl *BaseDecl = B.getType()->getAsCXXRecordDecl();
+      if (!BaseDecl || BaseDecl->isEmpty())
+        continue;
+      BaseDecl = BaseDecl->getDefinition();
+      CharUnits Off = B.isVirtual() ? Layout.getVBaseClassOffset(BaseDecl)
+                                    : Layout.getBaseClassOffset(BaseDecl);
+      Members.push_back({Off, B.getType()});
+    }
+  }
+
+  for (const FieldDecl *FD : RD->fields()) {
+    CharUnits Off =
+        Ctx.toCharUnitsFromBits(Layout.getFieldOffset(FD->getFieldIndex()));
+    Members.push_back({Off, FD->getType()});
+  }
+
+  llvm::stable_sort(Members, [](const CoerceMember &A, const CoerceMember &B) {
+    return A.Offset < B.Offset;
+  });
+
+  llvm::LLVMContext &VM = CGT.getLLVMContext();
+  llvm::SmallVector<llvm::Type *, 16> Elts;
+  for (const CoerceMember &M : Members) {
+    if (llvm::Type *Nested =
+            buildAMDGPUAggregateReturnCoerceType(CGT, Ctx, M.Ty))
+      Elts.push_back(Nested);
+    else
+      Elts.push_back(CGT.ConvertType(M.Ty));
+  }
+
+  return llvm::StructType::create(VM, Elts);
+}
+
 class AMDGPUABIInfo final : public DefaultABIInfo {
 private:
   static const unsigned MaxNumRegsForArgsRet = 16;
@@ -99,7 +191,20 @@ uint64_t AMDGPUABIInfo::numRegsForType(QualType Ty) const {
   if (const auto *RD = Ty->getAsRecordDecl()) {
     assert(!RD->hasFlexibleArrayMember());
 
+    if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+      for (const CXXBaseSpecifier &B : CXXRD->bases()) {
+        const CXXRecordDecl *BD = B.getType()->getAsCXXRecordDecl();
+        if (!BD || BD->isEmpty())
+          continue;
+        NumRegs += numRegsForType(B.getType());
+      }
+    }
+
     for (const FieldDecl *Field : RD->fields()) {
+      if (isEmptyFieldForLayout(getContext(), Field)) {
+        NumRegs += 1;
+        continue;
+      }
       QualType FieldTy = Field->getType();
       NumRegs += numRegsForType(FieldTy);
     }
@@ -169,8 +274,12 @@ ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType 
RetTy) const {
         return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
       }
 
-      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
+      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet) {
+        if (llvm::Type *CoerceTy =
+                buildAMDGPUAggregateReturnCoerceType(CGT, getContext(), RetTy))
+          return ABIArgInfo::getDirect(CoerceTy);
         return ABIArgInfo::getDirect();
+      }
     }
   }
 
diff --git a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip 
b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
new file mode 100644
index 0000000000000..61729285fac5f
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm 
-fcuda-is-device -o - %s \
+// RUN:   | FileCheck %s
+
+#define __device__ __attribute__((device))
+
+struct OnlyStatic {
+  __device__ static int something;
+};
+struct WithStaticFields {
+  int a[2];
+  OnlyStatic sub;
+  float b;
+  __device__ static int c;
+  double d;
+};
+__device__ int OnlyStatic::something = 42;
+__device__ int WithStaticFields::c = 12;
+
+__device__ WithStaticFields returnWithStatic() {
+  OnlyStatic::something = 12;
+  WithStaticFields::c = 42;
+  return {.a = {8, 16}, .b = 3.14f, .d = 1.60218e-19};
+}
+
+__device__ void caller() {
+  WithStaticFields r = returnWithStatic();
+  (void)r.b;
+}
+
+// CHECK-DAG: define dso_local %[[TY:.*]] @_Z16returnWithStaticv
+// CHECK-DAG: call %[[TY]] @_Z16returnWithStaticv
+// CHECK-DAG: %[[TY]] = type { [2 x i32], %struct.OnlyStatic, float, double }

>From a0b1c0168bf29000c5a279a5c97aead2c69057af Mon Sep 17 00:00:00 2001
From: Abid Qadeer <[email protected]>
Date: Mon, 11 May 2026 18:29:41 +0100
Subject: [PATCH 2/4] Add a base class test.

---
 .../amdgpu-aggregate-return-coerce.hip        | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip 
b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
index 61729285fac5f..3214279d748b3 100644
--- a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
+++ b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
@@ -27,6 +27,40 @@ __device__ void caller() {
   (void)r.b;
 }
 
+// Base class with an empty-for-layout member: return coercion must include the
+// non-empty base subobject as a nested coerce struct (not only direct fields).
+struct BaseWithEmptyMember {
+  int a[2];
+  OnlyStatic sub;
+  float b;
+  __device__ static int c;
+};
+struct DerivedWithBase : BaseWithEmptyMember {
+  double d;
+};
+__device__ int BaseWithEmptyMember::c = 12;
+
+__device__ DerivedWithBase returnDerivedWithBase() {
+  OnlyStatic::something = 12;
+  BaseWithEmptyMember::c = 42;
+  DerivedWithBase r{};
+  r.a[0] = 8;
+  r.a[1] = 16;
+  r.b = 3.14f;
+  r.d = 1.60218e-19;
+  return r;
+}
+
+__device__ void callerDerived() {
+  DerivedWithBase r = returnDerivedWithBase();
+  (void)r.b;
+}
+
 // CHECK-DAG: define dso_local %[[TY:.*]] @_Z16returnWithStaticv
 // CHECK-DAG: call %[[TY]] @_Z16returnWithStaticv
 // CHECK-DAG: %[[TY]] = type { [2 x i32], %struct.OnlyStatic, float, double }
+
+// CHECK-DAG: define dso_local %[[DER:.*]] @_Z21returnDerivedWithBasev
+// CHECK-DAG: call %[[DER]] @_Z21returnDerivedWithBasev
+// CHECK-DAG: %[[BASE:[0-9]+]] = type { [2 x i32], %struct.OnlyStatic, float }
+// CHECK-DAG: %[[DER]] = type { %[[BASE]], double }

>From a72915a0a2c1fd9ce96827b3593f57cb3bf87b69 Mon Sep 17 00:00:00 2001
From: Abid Qadeer <[email protected]>
Date: Thu, 14 May 2026 16:51:35 +0100
Subject: [PATCH 3/4] Use sort instead of stable_sort.

---
 clang/lib/CodeGen/Targets/AMDGPU.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 06bd6076d4e50..1ace0502aaec6 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -96,7 +96,7 @@ static llvm::Type 
*buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT,
     Members.push_back({Off, FD->getType()});
   }
 
-  llvm::stable_sort(Members, [](const CoerceMember &A, const CoerceMember &B) {
+  llvm::sort(Members, [](const CoerceMember &A, const CoerceMember &B) {
     return A.Offset < B.Offset;
   });
 

>From 35e7295ecaf013eca80d62d0ad1403ee06541e0a Mon Sep 17 00:00:00 2001
From: Abid Qadeer <[email protected]>
Date: Wed, 10 Jun 2026 15:44:38 +0100
Subject: [PATCH 4/4] Handle review comments.

Add more tests. Also refactor some code so that we can assert on an
unreachable code path.
---
 clang/lib/CodeGen/Targets/AMDGPU.cpp          | 16 +++--
 .../amdgpu-aggregate-return-coerce.hip        | 58 ++++++++++++++++---
 2 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 1ace0502aaec6..2399103f795fe 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -64,12 +64,6 @@ static llvm::Type 
*buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT,
     return nullptr;
   assert(!RD->hasFlexibleArrayMember());
 
-  // Vtable and dynamic-class layout are not represented here; use the normal
-  // LLVM record type as the coerce-to type.
-  if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
-    if (CXXRD->isDynamicClass())
-      return nullptr;
-
   const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
 
   struct CoerceMember {
@@ -79,14 +73,18 @@ static llvm::Type 
*buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT,
   llvm::SmallVector<CoerceMember, 16> Members;
 
   if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+    // Vtable and dynamic-class layout are not represented here; use the normal
+    // LLVM record type as the coerce-to type.
+    if (CXXRD->isDynamicClass())
+      return nullptr;
     for (const CXXBaseSpecifier &B : CXXRD->bases()) {
       const CXXRecordDecl *BaseDecl = B.getType()->getAsCXXRecordDecl();
       if (!BaseDecl || BaseDecl->isEmpty())
         continue;
       BaseDecl = BaseDecl->getDefinition();
-      CharUnits Off = B.isVirtual() ? Layout.getVBaseClassOffset(BaseDecl)
-                                    : Layout.getBaseClassOffset(BaseDecl);
-      Members.push_back({Off, B.getType()});
+      // isDynamicClass() above guards against any class that has virtual bases
+      assert(!B.isVirtual() && "virtual base implies isDynamicClass");
+      Members.push_back({Layout.getBaseClassOffset(BaseDecl), B.getType()});
     }
   }
 
diff --git a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip 
b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
index 3214279d748b3..29b38b0e8cb2e 100644
--- a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
+++ b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
@@ -22,11 +22,6 @@ __device__ WithStaticFields returnWithStatic() {
   return {.a = {8, 16}, .b = 3.14f, .d = 1.60218e-19};
 }
 
-__device__ void caller() {
-  WithStaticFields r = returnWithStatic();
-  (void)r.b;
-}
-
 // Base class with an empty-for-layout member: return coercion must include the
 // non-empty base subobject as a nested coerce struct (not only direct fields).
 struct BaseWithEmptyMember {
@@ -51,9 +46,48 @@ __device__ DerivedWithBase returnDerivedWithBase() {
   return r;
 }
 
-__device__ void callerDerived() {
-  DerivedWithBase r = returnDerivedWithBase();
-  (void)r.b;
+// Empty base class
+struct EmptyBase {};
+struct WithEmptyBase : EmptyBase {
+  int a;
+  OnlyStatic sub;
+  float b;
+};
+
+__device__ WithEmptyBase returnWithEmptyBase() {
+  WithEmptyBase r{};
+  r.a = 1;
+  r.b = 2.0f;
+  return r;
+}
+
+// Derived class whose base has no empty-for-layout fields but the derived
+// class itself does.  The base does not need its own coerce type, so it
+// appears in the coerce struct as the named LLVM type rather than
+// an anonymous type generated by the recursive coerce path.
+struct PlainBase { int x; float y; };
+struct DerivedWithOwnEmpty : PlainBase {
+  OnlyStatic sub;
+  double z;
+};
+
+__device__ DerivedWithOwnEmpty returnDerivedWithOwnEmpty() {
+  DerivedWithOwnEmpty r{};
+  r.x = 1;
+  r.y = 2.0f;
+  r.z = 3.0;
+  return r;
+}
+
+__device__ void caller() {
+  WithStaticFields r1 = returnWithStatic();
+  (void)r1.b;
+  DerivedWithBase r2 = returnDerivedWithBase();
+  (void)r2.b;
+  WithEmptyBase r3 = returnWithEmptyBase();
+  (void)r3.a;
+  DerivedWithOwnEmpty r4 = returnDerivedWithOwnEmpty();
+  (void)r4.z;
 }
 
 // CHECK-DAG: define dso_local %[[TY:.*]] @_Z16returnWithStaticv
@@ -64,3 +98,11 @@ __device__ void callerDerived() {
 // CHECK-DAG: call %[[DER]] @_Z21returnDerivedWithBasev
 // CHECK-DAG: %[[BASE:[0-9]+]] = type { [2 x i32], %struct.OnlyStatic, float }
 // CHECK-DAG: %[[DER]] = type { %[[BASE]], double }
+
+// CHECK-DAG: define dso_local %[[EB:.*]] @_Z19returnWithEmptyBasev
+// CHECK-DAG: call %[[EB]] @_Z19returnWithEmptyBasev
+// CHECK-DAG: %[[EB]] = type { i32, %struct.OnlyStatic, float }
+
+// CHECK-DAG: define dso_local %[[PLAIN:.*]] @_Z25returnDerivedWithOwnEmptyv
+// CHECK-DAG: call %[[PLAIN]] @_Z25returnDerivedWithOwnEmptyv
+// CHECK-DAG: %[[PLAIN]] = type { %struct.PlainBase, %struct.OnlyStatic, 
double }

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to