ychen updated this revision to Diff 352908.
ychen marked 2 inline comments as done.
ychen added a comment.

- Do not use `void *` in the EmitBuiltinAlignTo signature.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97915/new/

https://reviews.llvm.org/D97915

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/lib/CodeGen/CGCoroutine.cpp
  clang/lib/CodeGen/CodeGenFunction.h
  clang/test/CodeGenCoroutines/coro-alloc.cpp
  clang/test/CodeGenCoroutines/coro-cleanup.cpp
  clang/test/CodeGenCoroutines/coro-gro.cpp
  llvm/docs/Coroutines.rst
  llvm/include/llvm/IR/Intrinsics.td
  llvm/lib/Transforms/Coroutines/CoroFrame.cpp
  llvm/lib/Transforms/Coroutines/CoroInstr.h
  llvm/lib/Transforms/Coroutines/CoroInternal.h
  llvm/lib/Transforms/Coroutines/CoroSplit.cpp
  llvm/lib/Transforms/Coroutines/Coroutines.cpp
  llvm/test/Transforms/Coroutines/coro-frame-overalign.ll

Index: llvm/test/Transforms/Coroutines/coro-frame-overalign.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/Coroutines/coro-frame-overalign.ll
@@ -0,0 +1,78 @@
+; Check that `llvm.coro.align`, `llvm.coro.raw.frame.ptr.offset` and
+; `llvm.coro.raw.frame.ptr.addr` are lowered correctly.
+; RUN: opt < %s -passes=coro-split -S | FileCheck %s
+
+%PackedStruct = type <{ i64 }>
+
+declare void @consume(%PackedStruct*, i32, i32, i8**)
+declare void @consume2(i32, i32)
+
+define i8* @f() "coroutine.presplit"="1" {
+entry:
+  %data = alloca %PackedStruct, align 32
+  %id = call token @llvm.coro.id(i32 16, i8* null, i8* null, i8* null)
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+  %align = call i32 @llvm.coro.align.i32()
+  %offset = call i32 @llvm.coro.raw.frame.ptr.offset.i32()
+  %addr = call i8** @llvm.coro.raw.frame.ptr.addr()
+  call void @consume(%PackedStruct* %data, i32 %align, i32 %offset, i8** %addr)
+  %0 = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %0, label %suspend [i8 0, label %resume
+                                i8 1, label %cleanup]
+resume:
+  br label %cleanup
+
+cleanup:
+  %align2 = call i32 @llvm.coro.align.i32()
+  %offset2 = call i32 @llvm.coro.raw.frame.ptr.offset.i32()
+  call void @consume2(i32 %align2, i32 %offset2)
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  call void @free(i8* %mem)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
+  ret i8* %hdl
+}
+
+; See if the raw frame address was inserted into the frame.
+; CHECK-LABEL: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i8*, i1, [7 x i8], %PackedStruct }
+
+; See if we used the correct index to access the frame addr field (field 2).
+; CHECK-LABEL: @f(
+; CHECK: %alloc.frame.ptr = alloca i8*, align 8
+; CHECK: %[[FIELD:.+]] = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 2
+; CHECK: %[[ADDR:.+]] = load i8*, i8** %alloc.frame.ptr, align 8
+; CHECK: store i8* %[[ADDR]], i8** %[[FIELD]], align 8
+; CHECK: %[[DATA:.+]] = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 5
+; CHECK: call void @consume(%PackedStruct* %[[DATA]], i32 32, i32 16, i8** %[[FIELD]])
+; CHECK: ret i8*
+
+; See if `llvm.coro.align` and `llvm.coro.raw.frame.ptr.offset` are lowered
+; correctly during deallocation.
+; CHECK-LABEL: @f.destroy(
+; CHECK: call void @consume2(i32 32, i32 16)
+; CHECK: call void @free(i8* %{{.*}})
+
+; CHECK-LABEL: @f.cleanup(
+; CHECK: call void @consume2(i32 32, i32 16)
+; CHECK: call void @free(i8*
+
+declare i8* @llvm.coro.free(token, i8*)
+declare i32 @llvm.coro.size.i32()
+declare i32 @llvm.coro.align.i32()
+declare i32 @llvm.coro.raw.frame.ptr.offset.i32()
+declare i8** @llvm.coro.raw.frame.ptr.addr()
+declare i8  @llvm.coro.suspend(token, i1)
+declare void @llvm.coro.resume(i8*)
+declare void @llvm.coro.destroy(i8*)
+
+declare token @llvm.coro.id(i32, i8*, i8*, i8*)
+declare i1 @llvm.coro.alloc(token)
+declare i8* @llvm.coro.begin(token, i8*)
+declare i1 @llvm.coro.end(i8*, i1)
+
+declare noalias i8* @malloc(i32)
+declare double @print(double)
+declare void @free(i8*)
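
The CHECK lines above pin down the new frame layout. As a reading aid, here is a rough C++ mirror of %f.Frame; this is only a sketch with invented field names, the real layout is an LLVM struct produced by the coroutine frame pass:

    // Rough C++ mirror of the %f.Frame type checked above (illustration only).
    struct f_Frame {
      void (*resume_fn)(f_Frame *);    // field 0: resume function pointer
      void (*destroy_fn)(f_Frame *);   // field 1: destroy function pointer
      void *raw_frame_ptr;             // field 2: raw allocation address (new in this patch)
      bool suspend_index;              // field 3: the i1 suspend index
      char padding[7];                 // field 4: [7 x i8] padding before the aligned field
      alignas(32) long long data;      // field 5: the over-aligned %PackedStruct alloca
    };
    static_assert(alignof(f_Frame) == 32,
                  "frame alignment follows its most aligned field");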
Index: llvm/lib/Transforms/Coroutines/Coroutines.cpp
===================================================================
--- llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -234,6 +234,9 @@
   Shape.CoroBegin = nullptr;
   Shape.CoroEnds.clear();
   Shape.CoroSizes.clear();
+  Shape.CoroAligns.clear();
+  Shape.CoroRawFramePtrOffsets.clear();
+  Shape.CoroRawFramePtrAddrs.clear();
   Shape.CoroSuspends.clear();
 
   Shape.FrameTy = nullptr;
@@ -268,6 +271,15 @@
       case Intrinsic::coro_size:
         CoroSizes.push_back(cast<CoroSizeInst>(II));
         break;
+      case Intrinsic::coro_align:
+        CoroAligns.push_back(cast<CoroAlignInst>(II));
+        break;
+      case Intrinsic::coro_raw_frame_ptr_offset:
+        CoroRawFramePtrOffsets.push_back(cast<CoroRawFramePtrOffsetInst>(II));
+        break;
+      case Intrinsic::coro_raw_frame_ptr_addr:
+        CoroRawFramePtrAddrs.push_back(cast<CoroRawFramePtrAddrInst>(II));
+        break;
       case Intrinsic::coro_frame:
         CoroFrames.push_back(cast<CoroFrameInst>(II));
         break;
@@ -375,6 +387,7 @@
     this->SwitchLowering.ResumeSwitch = nullptr;
     this->SwitchLowering.PromiseAlloca = SwitchId->getPromise();
     this->SwitchLowering.ResumeEntryBlock = nullptr;
+    this->SwitchLowering.FramePtrOffset = 0;
 
     for (auto AnySuspend : CoroSuspends) {
       auto Suspend = dyn_cast<CoroSuspendInst>(AnySuspend);
Index: llvm/lib/Transforms/Coroutines/CoroSplit.cpp
===================================================================
--- llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -1046,23 +1046,44 @@
   Shape.AsyncLowering.AsyncFuncPointer->setInitializer(NewFuncPtrStruct);
 }
 
-static void replaceFrameSize(coro::Shape &Shape) {
+static void replaceFrameSizeAndAlign(coro::Shape &Shape) {
   if (Shape.ABI == coro::ABI::Async)
     updateAsyncFuncPointerContextSize(Shape);
 
-  if (Shape.CoroSizes.empty())
-    return;
+  if (!Shape.CoroSizes.empty()) {
+    // In the same function all coro.sizes should have the same result type.
+    auto *SizeIntrin = Shape.CoroSizes.back();
+    Module *M = SizeIntrin->getModule();
+    const DataLayout &DL = M->getDataLayout();
+    auto Size = DL.getTypeAllocSize(Shape.FrameTy);
+    auto *SizeConstant = ConstantInt::get(SizeIntrin->getType(), Size);
+
+    for (CoroSizeInst *CS : Shape.CoroSizes) {
+      CS->replaceAllUsesWith(SizeConstant);
+      CS->eraseFromParent();
+    }
+  }
 
-  // In the same function all coro.sizes should have the same result type.
-  auto *SizeIntrin = Shape.CoroSizes.back();
-  Module *M = SizeIntrin->getModule();
-  const DataLayout &DL = M->getDataLayout();
-  auto Size = DL.getTypeAllocSize(Shape.FrameTy);
-  auto *SizeConstant = ConstantInt::get(SizeIntrin->getType(), Size);
+  if (!Shape.CoroAligns.empty()) {
+    auto *Intrin = Shape.CoroAligns.back();
+    auto *AlignConstant =
+        ConstantInt::get(Intrin->getType(), Shape.FrameAlign.value());
 
-  for (CoroSizeInst *CS : Shape.CoroSizes) {
-    CS->replaceAllUsesWith(SizeConstant);
-    CS->eraseFromParent();
+    for (CoroAlignInst *CS : Shape.CoroAligns) {
+      CS->replaceAllUsesWith(AlignConstant);
+      CS->eraseFromParent();
+    }
+  }
+
+  if (!Shape.CoroRawFramePtrOffsets.empty()) {
+    auto *Intrin = Shape.CoroRawFramePtrOffsets.back();
+    auto *FramePtrOffset = ConstantInt::get(
+        Intrin->getType(), Shape.SwitchLowering.FramePtrOffset);
+
+    for (CoroRawFramePtrOffsetInst *CS : Shape.CoroRawFramePtrOffsets) {
+      CS->replaceAllUsesWith(FramePtrOffset);
+      CS->eraseFromParent();
+    }
   }
 }
 
@@ -1798,7 +1819,7 @@
 
   simplifySuspendPoints(Shape);
   buildCoroutineFrame(F, Shape);
-  replaceFrameSize(Shape);
+  replaceFrameSizeAndAlign(Shape);
 
   // If there are no suspend points, no split required, just remove
   // the allocation and deallocation blocks, they are not needed.
Index: llvm/lib/Transforms/Coroutines/CoroInternal.h
===================================================================
--- llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -99,6 +99,9 @@
   CoroBeginInst *CoroBegin;
   SmallVector<AnyCoroEndInst *, 4> CoroEnds;
   SmallVector<CoroSizeInst *, 2> CoroSizes;
+  SmallVector<CoroAlignInst *, 2> CoroAligns;
+  SmallVector<CoroRawFramePtrOffsetInst *, 2> CoroRawFramePtrOffsets;
+  SmallVector<CoroRawFramePtrAddrInst *, 2> CoroRawFramePtrAddrs;
   SmallVector<AnyCoroSuspendInst *, 4> CoroSuspends;
   SmallVector<CallInst*, 2> SwiftErrorOps;
 
@@ -135,6 +138,7 @@
     unsigned IndexField;
     unsigned IndexAlign;
     unsigned IndexOffset;
+    unsigned FramePtrOffset;
     bool HasFinalSuspend;
   };
 
Index: llvm/lib/Transforms/Coroutines/CoroInstr.h
===================================================================
--- llvm/lib/Transforms/Coroutines/CoroInstr.h
+++ llvm/lib/Transforms/Coroutines/CoroInstr.h
@@ -27,6 +27,7 @@
 
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
@@ -121,6 +122,10 @@
                : cast<AllocaInst>(Arg->stripPointerCasts());
   }
 
+  unsigned getAlignment() const {
+    return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue();
+  }
+
   void clearPromise() {
     Value *Arg = getArgOperand(PromiseArg);
     setArgOperand(PromiseArg,
@@ -599,6 +604,42 @@
   }
 };
 
+/// This represents the llvm.coro.align instruction.
+class LLVM_LIBRARY_VISIBILITY CoroAlignInst : public IntrinsicInst {
+public:
+  // Methods to support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::coro_align;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
+/// This represents the llvm.coro.raw.frame.ptr.offset instruction.
+class LLVM_LIBRARY_VISIBILITY CoroRawFramePtrOffsetInst : public IntrinsicInst {
+public:
+  // Methods to support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::coro_raw_frame_ptr_offset;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
+/// This represents the llvm.coro.raw.frame.ptr.addr instruction.
+class LLVM_LIBRARY_VISIBILITY CoroRawFramePtrAddrInst : public IntrinsicInst {
+public:
+  // Methods to support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::coro_raw_frame_ptr_addr;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
 class LLVM_LIBRARY_VISIBILITY AnyCoroEndInst : public IntrinsicInst {
   enum { FrameArg, UnwindArg };
 
Index: llvm/lib/Transforms/Coroutines/CoroFrame.cpp
===================================================================
--- llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/StackLifetime.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
@@ -531,6 +532,8 @@
     return StructAlign;
   }
 
+  SmallVector<Field, 8> &getFields() { return Fields; }
+
   FieldIDType getLayoutFieldIndex(FieldIDType Id) const {
     assert(IsFinished && "not yet finished!");
     return Fields[Id].LayoutFieldIndex;
@@ -1126,21 +1129,59 @@
   // Because multiple allocas may own the same field slot,
   // we add allocas to field here.
   B.addFieldForAllocas(F, FrameData, Shape);
-  // Add PromiseAlloca to Allocas list so that
-  // 1. updateLayoutIndex could update its index after
-  // `performOptimizedStructLayout`
-  // 2. it is processed in insertSpills.
-  if (Shape.ABI == coro::ABI::Switch && PromiseAlloca)
-    // We assume that the promise alloca won't be modified before
-    // CoroBegin and no alias will be create before CoroBegin.
-    FrameData.Allocas.emplace_back(
-        PromiseAlloca, DenseMap<Instruction *, llvm::Optional<APInt>>{}, false);
+
   // Create an entry for every spilled value.
   for (auto &S : FrameData.Spills) {
     FieldIDType Id = B.addField(S.first->getType(), None);
     FrameData.setFieldIndex(S.first, Id);
   }
 
+  Optional<FieldIDType> FramePtrField = None;
+  if (Shape.ABI == coro::ABI::Switch) {
+    // Add PromiseAlloca to Allocas list so that
+    // 1. updateLayoutIndex could update its index after
+    // `performOptimizedStructLayout`
+    // 2. it is processed in insertSpills.
+    if (PromiseAlloca)
+      // We assume that the promise alloca won't be modified before
+      // CoroBegin and no alias will be created before CoroBegin.
+      FrameData.Allocas.emplace_back(
+          PromiseAlloca, DenseMap<Instruction *, llvm::Optional<APInt>>{},
+          false);
+
+    Align FrameAlign =
+        std::max_element(
+            B.getFields().begin(), B.getFields().end(),
+            [](auto &F1, auto &F2) { return F1.Alignment < F2.Alignment; })
+            ->Alignment;
+
+    // Check for over-alignment.
+    Value *PtrAddr =
+        ConstantPointerNull::get(Type::getInt8PtrTy(C)->getPointerTo());
+    unsigned NewAlign = Shape.getSwitchCoroId()->getAlignment();
+    bool NeedFramePtrField = Shape.CoroRawFramePtrOffsets.size() > 0 ||
+                             Shape.CoroRawFramePtrAddrs.size() > 0;
+    if (NeedFramePtrField && NewAlign && FrameAlign > NewAlign) {
+      BasicBlock &Entry = F.getEntryBlock();
+      IRBuilder<> Builder(&Entry, Entry.getFirstInsertionPt());
+
+      // Reserve frame space for raw frame pointer.
+      Value *Mem = Shape.CoroBegin->getMem();
+      AllocaInst *FramePtrAddr =
+          Builder.CreateAlloca(Mem->getType(), nullptr, "alloc.frame.ptr");
+      PtrAddr = FramePtrAddr;
+      FramePtrField = B.addFieldForAlloca(FramePtrAddr);
+      FrameData.setFieldIndex(FramePtrAddr, *FramePtrField);
+      FrameData.Allocas.emplace_back(
+          FramePtrAddr, DenseMap<Instruction *, llvm::Optional<APInt>>{}, true);
+    }
+
+    for (CoroRawFramePtrAddrInst *C : Shape.CoroRawFramePtrAddrs) {
+      C->replaceAllUsesWith(PtrAddr);
+      C->eraseFromParent();
+    }
+  }
+
   B.finish(FrameTy);
   FrameData.updateLayoutIndex(B);
   Shape.FrameAlign = B.getStructAlign();
@@ -1154,6 +1195,12 @@
     Shape.SwitchLowering.IndexAlign = IndexField.Alignment.value();
     Shape.SwitchLowering.IndexOffset = IndexField.Offset;
 
+    if (FramePtrField) {
+      FieldIDType FieldIdx = B.getLayoutFieldIndex(*FramePtrField);
+      Shape.SwitchLowering.FramePtrOffset =
+          DL.getStructLayout(FrameTy)->getElementOffset(FieldIdx);
+    }
+
     // Also round the frame size up to a multiple of its alignment, as is
     // generally expected in C/C++.
     Shape.FrameSize = alignTo(Shape.FrameSize, Shape.FrameAlign);
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -1265,6 +1265,9 @@
 def int_coro_frame : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_coro_noop : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_coro_size : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;
+def int_coro_align : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;
+def int_coro_raw_frame_ptr_offset : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;
+def int_coro_raw_frame_ptr_addr : Intrinsic<[llvm_ptrptr_ty], [], [IntrNoMem]>;
 
 def int_coro_save : Intrinsic<[llvm_token_ty], [llvm_ptr_ty], []>;
 def int_coro_suspend : Intrinsic<[llvm_i8_ty], [llvm_token_ty, llvm_i1_ty], []>;
Index: llvm/docs/Coroutines.rst
===================================================================
--- llvm/docs/Coroutines.rst
+++ llvm/docs/Coroutines.rst
@@ -948,6 +948,88 @@
 The `coro.size` intrinsic is lowered to a constant representing the size of
 the coroutine frame. 
 
+.. _coro.align:
+
+'llvm.coro.align' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+::
+
+    declare i32 @llvm.coro.align.i32()
+    declare i64 @llvm.coro.align.i64()
+
+Overview:
+"""""""""
+
+The '``llvm.coro.align``' intrinsic returns the alignment of the coroutine frame
+in bytes.
+
+Arguments:
+""""""""""
+
+None
+
+Semantics:
+""""""""""
+
+The `coro.align` intrinsic is lowered to a constant representing the alignment
+of the coroutine frame.
+
+.. _coro.raw.frame.ptr.offset:
+
+'llvm.coro.raw.frame.ptr.offset' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+::
+
+    declare i32 @llvm.coro.raw.frame.ptr.offset.i32()
+    declare i64 @llvm.coro.raw.frame.ptr.offset.i64()
+
+Overview:
+"""""""""
+
+The '``llvm.coro.raw.frame.ptr.offset``' intrinsic returns the byte offset of
+the raw memory block address (returned by the allocator) in the coroutine
+frame. This is only supported for switched-resume coroutines. The return value
+is undefined when the coroutine frame is not overaligned.
+
+Arguments:
+""""""""""
+
+None
+
+Semantics:
+""""""""""
+
+The `coro.raw.frame.ptr.offset` intrinsic is lowered to a constant representing
+the byte offset of the raw memory block address in the coroutine frame.
+
+
+.. _coro.raw.frame.ptr.addr:
+
+'llvm.coro.raw.frame.ptr.addr' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+::
+
+    declare i8** @llvm.coro.raw.frame.ptr.addr()
+
+Overview:
+"""""""""
+
+The '``llvm.coro.raw.frame.ptr.addr``' intrinsic returns the address storing the raw
+frame address. The returned address is either an alloca or a coroutine frame
+field. This is only supported for switched-resume coroutines. The return value
+is undefined when the coroutine frame is not overaligned.
+
+Arguments:
+""""""""""
+
+None
+
+Semantics:
+""""""""""
+
+The `coro.raw.frame.ptr.addr` intrinsic is lowered to either an alloca or a
+coroutine frame field storing the raw frame address.
+
 .. _coro.begin:
 
 'llvm.coro.begin' Intrinsic
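
Taken together, the three intrinsics documented above describe how an over-aligned frame is carved out of a plain allocation and later freed. The following is a hedged C++ sketch of that bookkeeping, not the actual lowering; FrameInfo, allocate_frame and deallocate_frame are invented stand-ins for values the intrinsics provide at the IR level:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <new>

    // Stand-ins for llvm.coro.size, llvm.coro.align and
    // llvm.coro.raw.frame.ptr.offset.
    struct FrameInfo {
      std::size_t size;    // llvm.coro.size
      std::size_t align;   // llvm.coro.align
      std::size_t offset;  // llvm.coro.raw.frame.ptr.offset
    };

    // Allocation path when only a plain operator new is available and
    // fi.align exceeds the default new alignment: over-allocate, align the
    // frame start up, and stash the raw pointer inside the frame at `offset`
    // so the matching deallocation can find it.
    void *allocate_frame(const FrameInfo &fi) {
      void *raw =
          ::operator new(fi.size + fi.align - __STDCPP_DEFAULT_NEW_ALIGNMENT__);
      std::uintptr_t p = reinterpret_cast<std::uintptr_t>(raw);
      std::uintptr_t frame = (p + fi.align - 1) & ~(fi.align - 1);
      std::memcpy(reinterpret_cast<char *>(frame) + fi.offset, &raw,
                  sizeof(void *));
      return reinterpret_cast<void *>(frame);
    }

    // Deallocation path: recover the raw pointer from the frame and free that.
    void deallocate_frame(void *frame, const FrameInfo &fi) {
      void *raw = nullptr;
      std::memcpy(&raw, static_cast<char *>(frame) + fi.offset, sizeof(void *));
      ::operator delete(raw);
    }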
Index: clang/test/CodeGenCoroutines/coro-gro.cpp
===================================================================
--- clang/test/CodeGenCoroutines/coro-gro.cpp
+++ clang/test/CodeGenCoroutines/coro-gro.cpp
@@ -68,6 +68,7 @@
   // CHECK: call void @_ZNSt12experimental16coroutine_traitsIJiEE12promise_typeD1Ev(
   // CHECK: %[[Mem:.+]] = call i8* @llvm.coro.free(
   // CHECK: call void @_ZdlPv(i8* %[[Mem]])
+  // CHECK: call void @_ZdlPv(i8* %{{.*}})
 
   // Initialize retval from Gro and destroy Gro
 
Index: clang/test/CodeGenCoroutines/coro-cleanup.cpp
===================================================================
--- clang/test/CodeGenCoroutines/coro-cleanup.cpp
+++ clang/test/CodeGenCoroutines/coro-cleanup.cpp
@@ -78,12 +78,46 @@
 
   // CHECK: [[Cleanup]]:
   // CHECK: call void @_ZNSt12experimental16coroutine_traitsIJvEE12promise_typeD1Ev(
-  // CHECK: %[[Mem0:.+]] = call i8* @llvm.coro.free(
-  // CHECK: call void @_ZdlPv(i8* %[[Mem0]]
+  // CHECK: %[[MEM0:.+]] = call i8* @llvm.coro.free(
+  // CHECK: br i1 %{{.*}}, label %[[CheckAlignBB:.+]], label %[[Afterwards:.+]]
+
+  // CHECK: [[CheckAlignBB]]:
+  // CHECK: %[[ALIGN:.+]] = call i64 @llvm.coro.align.i64()
+  // CHECK: %[[CMP:.+]] = icmp ugt i64 %[[ALIGN]],
+  // CHECK: br i1 %[[CMP]], label %[[AlignedFreeBB:.+]], label %[[FreeBB:.+]]
+
+  // CHECK: [[FreeBB]]:
+  // CHECK: call void @_ZdlPv(i8* %[[MEM0]]
+  // CHECK: br label %[[Afterwards]]
+
+  // CHECK: [[AlignedFreeBB]]:
+  // CHECK-NEXT: %[[OFFSET:.+]] = call i32 @llvm.coro.raw.frame.ptr.offset.i32()
+  // CHECK-NEXT: %[[ADDR:.+]] = getelementptr inbounds i8, i8* %[[MEM0]], i32 %[[OFFSET]]
+  // CHECK-NEXT: %[[ADDR2:.+]] = bitcast i8* %[[ADDR]] to i8**
+  // CHECK-NEXT: %[[MEM:.+]] = load i8*, i8** %[[ADDR2]], align 8
+  // CHECK-NEXT: call void @_ZdlPv(i8* %[[MEM]])
+  // CHECK-NEXT: br label %[[Afterwards]]
 
   // CHECK: [[Dealloc]]:
-  // CHECK:   %[[Mem:.+]] = call i8* @llvm.coro.free(
-  // CHECK:   call void @_ZdlPv(i8* %[[Mem]])
+  // CHECK: %[[MEM0:.+]] = call i8* @llvm.coro.free(
+  // CHECK: br i1 %{{.*}}, label %[[CheckAlignBB:.+]], label %[[Afterwards:.+]]
+
+  // CHECK: [[CheckAlignBB]]:
+  // CHECK: %[[ALIGN:.+]] = call i64 @llvm.coro.align.i64()
+  // CHECK: %[[CMP:.+]] = icmp ugt i64 %[[ALIGN]],
+  // CHECK: br i1 %[[CMP]], label %[[AlignedFreeBB:.+]], label %[[FreeBB:.+]]
+
+  // CHECK: [[FreeBB]]:
+  // CHECK: call void @_ZdlPv(i8* %[[MEM0]]
+  // CHECK: br label %[[Afterwards]]
+
+  // CHECK: [[AlignedFreeBB]]:
+  // CHECK-NEXT: %[[OFFSET:.+]] = call i32 @llvm.coro.raw.frame.ptr.offset.i32()
+  // CHECK-NEXT: %[[ADDR:.+]] = getelementptr inbounds i8, i8* %[[MEM0]], i32 %[[OFFSET]]
+  // CHECK-NEXT: %[[ADDR2:.+]] = bitcast i8* %[[ADDR]] to i8**
+  // CHECK-NEXT: %[[MEM:.+]] = load i8*, i8** %[[ADDR2]], align 8
+  // CHECK-NEXT: call void @_ZdlPv(i8* %[[MEM]])
+  // CHECK-NEXT: br label %[[Afterwards]]
 
   co_return;
 }
Index: clang/test/CodeGenCoroutines/coro-alloc.cpp
===================================================================
--- clang/test/CodeGenCoroutines/coro-alloc.cpp
+++ clang/test/CodeGenCoroutines/coro-alloc.cpp
@@ -57,24 +57,55 @@
 extern "C" void f0(global_new_delete_tag) {
   // CHECK: %[[ID:.+]] = call token @llvm.coro.id(i32 16
   // CHECK: %[[NeedAlloc:.+]] = call i1 @llvm.coro.alloc(token %[[ID]])
-  // CHECK: br i1 %[[NeedAlloc]], label %[[AllocBB:.+]], label %[[InitBB:.+]]
+  // CHECK: br i1 %[[NeedAlloc]], label %[[CheckAlignBB:.+]], label %[[InitBB:.+]]
+
+  // CHECK: [[CheckAlignBB]]:
+  // CHECK: %[[ALIGN:.+]] = call i64 @llvm.coro.align.i64()
+  // CHECK: %[[CMP:.+]] = icmp ugt i64 %[[ALIGN]], 16
+  // CHECK: br i1 %[[CMP]], label %[[AlignAllocBB:.+]], label %[[AllocBB:.+]]
 
   // CHECK: [[AllocBB]]:
+  // CHECK-NEXT: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+  // CHECK-NEXT: %[[MEM:.+]] = call noalias nonnull i8* @_Znwm(i64 %[[SIZE]])
+  // CHECK-NEXT: br label %[[InitBB:.+]]
+
+  // CHECK: [[AlignAllocBB]]:
   // CHECK: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
-  // CHECK: %[[MEM:.+]] = call noalias nonnull i8* @_Znwm(i64 %[[SIZE]])
+  // CHECK: %[[ALIGN:.+]] = call i64 @llvm.coro.align.i64()
+  // CHECK: %[[PAD:.+]] = sub nsw i64 %[[ALIGN]], 16
+  // CHECK: %[[NEWSIZE:.+]] = add i64 %[[SIZE]], %[[PAD]]
+  // CHECK: %[[MEM2:.+]] = call noalias nonnull i8* @_Znwm(i64 %[[NEWSIZE]])
+  // CHECK: %[[ALIGN2:.+]] = call i64 @llvm.coro.align.i64()
+  // CHECK: %[[ALIGNED:.+]] = getelementptr inbounds i8, i8* %[[MEM2]],
+  // CHECK: call void @llvm.assume(i1 true) [ "align"(i8* %[[ALIGNED]], i64 %[[ALIGN2]]) ]
+  // CHECK: %[[ADDR:.+]] = call i8** @llvm.coro.raw.frame.ptr.addr()
+  // CHECK: store i8* %[[MEM2]], i8** %[[ADDR]], align 8
   // CHECK: br label %[[InitBB]]
 
   // CHECK: [[InitBB]]:
-  // CHECK: %[[PHI:.+]] = phi i8* [ null, %{{.+}} ], [ %call, %[[AllocBB]] ]
+  // CHECK: %[[PHI:.+]] = phi i8* [ null, %{{.+}} ], [ %[[MEM]], %[[AllocBB]] ], [ %[[ALIGNED]], %[[AlignAllocBB]] ]
   // CHECK: %[[FRAME:.+]] = call i8* @llvm.coro.begin(token %[[ID]], i8* %[[PHI]])
 
   // CHECK: %[[MEM:.+]] = call i8* @llvm.coro.free(token %[[ID]], i8* %[[FRAME]])
   // CHECK: %[[NeedDealloc:.+]] = icmp ne i8* %[[MEM]], null
-  // CHECK: br i1 %[[NeedDealloc]], label %[[FreeBB:.+]], label %[[Afterwards:.+]]
+  // CHECK: br i1 %[[NeedDealloc]], label %[[CheckAlignBB:.+]], label %[[Afterwards:.+]]
+
+  // CHECK: [[CheckAlignBB]]:
+  // CHECK: %[[ALIGN:.+]] = call i64 @llvm.coro.align.i64()
+  // CHECK: %[[CMP:.+]] = icmp ugt i64 %[[ALIGN]], 16
+  // CHECK: br i1 %[[CMP]], label %[[AlignedFreeBB:.+]], label %[[FreeBB:.+]]
 
   // CHECK: [[FreeBB]]:
-  // CHECK: call void @_ZdlPv(i8* %[[MEM]])
-  // CHECK: br label %[[Afterwards]]
+  // CHECK-NEXT: call void @_ZdlPv(i8* %[[MEM]])
+  // CHECK-NEXT: br label %[[Afterwards]]
+
+  // CHECK: [[AlignedFreeBB]]:
+  // CHECK-NEXT: %[[OFFSET:.+]] = call i32 @llvm.coro.raw.frame.ptr.offset.i32()
+  // CHECK-NEXT: %[[ADDR:.+]] = getelementptr inbounds i8, i8* %[[MEM]], i32 %[[OFFSET]]
+  // CHECK-NEXT: %[[ADDR2:.+]] = bitcast i8* %[[ADDR]] to i8**
+  // CHECK-NEXT: %[[MEM:.+]] = load i8*, i8** %[[ADDR2]], align 8
+  // CHECK-NEXT: call void @_ZdlPv(i8* %[[MEM]])
+  // CHECK-NEXT: br label %[[Afterwards]]
 
   // CHECK: [[Afterwards]]:
   // CHECK: ret void
@@ -157,6 +188,7 @@
 // CHECK-LABEL: f1b(
 extern "C" void f1b(promise_matching_global_placement_new_tag, dummy *) {
   // CHECK: call noalias nonnull i8* @_Znwm(i64
+  // CHECK-NOT: call noalias nonnull i8* @_ZnwmSt11align_val_t(i64
   co_return;
 }
 
@@ -182,6 +214,7 @@
   // CHECK: %[[FRAME:.+]] = call i8* @llvm.coro.begin(
   // CHECK: %[[MEM:.+]] = call i8* @llvm.coro.free(token %[[ID]], i8* %[[FRAME]])
   // CHECK: call void @_ZNSt12experimental16coroutine_traitsIJv18promise_delete_tagEE12promise_typedlEPv(i8* %[[MEM]])
+  // CHECK: call void @_ZNSt12experimental16coroutine_traitsIJv18promise_delete_tagEE12promise_typedlEPv(i8*
   co_return;
 }
 
@@ -229,16 +262,41 @@
   // CHECK: %[[RetVal:.+]] = alloca i32
   // CHECK: %[[Gro:.+]] = alloca i32
   // CHECK: %[[ID:.+]] = call token @llvm.coro.id(i32 16
+  // CHECK: br i1 %{{.*}}, label %[[CheckAlignBB:.+]], label %[[OKBB:.+]]
+
+  // CHECK: [[CheckAlignBB]]:
+  // CHECK: %[[ALIGN:.+]] = call i64 @llvm.coro.align.i64()
+  // CHECK: %[[CMP:.+]] = icmp ugt i64 %[[ALIGN]], 16
+  // CHECK: br i1 %[[CMP]], label %[[AlignAllocBB:.+]], label %[[AllocBB:.+]]
+
+  // CHECK: [[AllocBB]]:
   // CHECK: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
   // CHECK: %[[MEM:.+]] = call noalias i8* @_ZnwmRKSt9nothrow_t(i64 %[[SIZE]], %"struct.std::nothrow_t"* nonnull align 1 dereferenceable(1) @_ZStL7nothrow)
   // CHECK: %[[OK:.+]] = icmp ne i8* %[[MEM]], null
-  // CHECK: br i1 %[[OK]], label %[[OKBB:.+]], label %[[ERRBB:.+]]
+  // CHECK: br i1 %[[OK]], label %[[OKBB]], label %[[ERRBB:.+]]
 
   // CHECK: [[ERRBB]]:
   // CHECK:   %[[FailRet:.+]] = call i32 @_ZNSt12experimental16coroutine_traitsIJi28promise_on_alloc_failure_tagEE12promise_type39get_return_object_on_allocation_failureEv(
   // CHECK:   store i32 %[[FailRet]], i32* %[[RetVal]]
   // CHECK:   br label %[[RetBB:.+]]
 
+  // CHECK: [[AlignAllocBB]]:
+  // CHECK: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+  // CHECK: %[[ALIGN:.+]] = call i64 @llvm.coro.align.i64()
+  // CHECK: %[[PAD:.+]] = sub nsw i64 %[[ALIGN]], 16
+  // CHECK: %[[NEWSIZE:.+]] = add i64 %[[SIZE]], %[[PAD]]
+  // CHECK: %[[MEM2:.+]] = call noalias i8* @_ZnwmRKSt9nothrow_t(i64 %[[NEWSIZE]], %"struct.std::nothrow_t"* nonnull align 1 dereferenceable(1) @_ZStL7nothrow)
+  // CHECK: %[[OK:.+]] = icmp ne i8* %[[MEM2]], null
+  // CHECK: br i1 %[[OK]], label %[[AlignAllocBB2:.+]], label %[[ERRBB:.+]]
+
+  // CHECK: [[AlignAllocBB2]]:
+  // CHECK: %[[ALIGN2:.+]] = call i64 @llvm.coro.align.i64()
+  // CHECK: %[[ALIGNED:.+]] = getelementptr inbounds i8, i8* %[[MEM2]],
+  // CHECK: call void @llvm.assume(i1 true) [ "align"(i8* %[[ALIGNED]], i64 %[[ALIGN2]]) ]
+  // CHECK: %[[ADDR:.+]] = call i8** @llvm.coro.raw.frame.ptr.addr()
+  // CHECK: store i8* %[[MEM2]], i8** %[[ADDR]], align 8
+  // CHECK: br label %[[OKBB]]
+
   // CHECK: [[OKBB]]:
   // CHECK:   %[[OkRet:.+]] = call i32 @_ZNSt12experimental16coroutine_traitsIJi28promise_on_alloc_failure_tagEE12promise_type17get_return_objectEv(
   // CHECK:   store i32 %[[OkRet]], i32* %[[Gro]]
Index: clang/lib/CodeGen/CodeGenFunction.h
===================================================================
--- clang/lib/CodeGen/CodeGenFunction.h
+++ clang/lib/CodeGen/CodeGenFunction.h
@@ -1917,6 +1917,11 @@
   void EmitOpenCLKernelMetadata(const FunctionDecl *FD,
                                 llvm::Function *Fn);
 
+  llvm::Value *EmitBuiltinAlignTo(llvm::Value *Src, llvm::Type *SrcType,
+                                  llvm::Value *Alignment, llvm::Value *Mask,
+                                  llvm::IntegerType *IntType, const Expr *E,
+                                  bool AlignUp);
+
 public:
   CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext=false);
   ~CodeGenFunction();
@@ -4068,6 +4073,8 @@
   RValue EmitBuiltinIsAligned(const CallExpr *E);
   /// Emit IR for __builtin_align_up/__builtin_align_down.
   RValue EmitBuiltinAlignTo(const CallExpr *E, bool AlignUp);
+  llvm::Value *EmitBuiltinAlignTo(llvm::Value *Src, llvm::Value *Align,
+                                  const Expr *E, bool AlignUp);
 
   llvm::Function *generateBuiltinOSLogHelperFunction(
       const analyze_os_log::OSLogBufferLayout &Layout,
Index: clang/lib/CodeGen/CGCoroutine.cpp
===================================================================
--- clang/lib/CodeGen/CGCoroutine.cpp
+++ clang/lib/CodeGen/CGCoroutine.cpp
@@ -12,9 +12,12 @@
 
 #include "CGCleanup.h"
 #include "CodeGenFunction.h"
-#include "llvm/ADT/ScopeExit.h"
 #include "clang/AST/StmtCXX.h"
 #include "clang/AST/StmtVisitor.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include <cstdint>
 
 using namespace clang;
 using namespace CodeGen;
@@ -75,6 +78,7 @@
   // Stores the last emitted coro.free for the deallocate expressions, we use it
   // to wrap dealloc code with if(auto mem = coro.free) dealloc(mem).
   llvm::CallInst *LastCoroFree = nullptr;
+  bool LastCoroFreeUsedForDealloc = false;
 
   // If coro.id came from the builtin, remember the expression to give better
   // diagnostic. If CoroIdExpr is nullptr, the coro.id was created by
@@ -412,9 +416,81 @@
     }
   }
 };
+
+void OverAllocateFrame(CodeGenFunction &CGF, llvm::CallInst *CI, bool IsAlloc) {
+  unsigned CoroSizeIdx = IsAlloc ? 0 : 1;
+  CGBuilderTy &Builder = CGF.Builder;
+  auto OrigIP = Builder.saveIP();
+  Builder.SetInsertPoint(CI);
+  llvm::Function *CoroAlign =
+      CGF.CGM.getIntrinsic(llvm::Intrinsic::coro_align, CGF.SizeTy);
+  const auto &TI = CGF.CGM.getContext().getTargetInfo();
+  unsigned AlignOfNew = TI.getNewAlign() / TI.getCharWidth();
+  auto *AlignCall = Builder.CreateCall(CoroAlign);
+  auto *AlignOfNewInt = llvm::ConstantInt::get(CGF.SizeTy, AlignOfNew, true);
+  auto *Diff = Builder.CreateNSWSub(AlignCall, AlignOfNewInt);
+  auto *NewCoroSize = Builder.CreateAdd(CI->getArgOperand(CoroSizeIdx), Diff);
+  CI->setArgOperand(CoroSizeIdx, NewCoroSize);
+  Builder.restoreIP(OrigIP);
+}
+
+void EmitDynamicAlignedDealloc(CodeGenFunction &CGF,
+                               llvm::BasicBlock *AlignedFreeBB,
+                               llvm::CallInst *CoroFree) {
+  llvm::CallInst *Dealloc = nullptr;
+  for (llvm::User *U : CoroFree->users()) {
+    if (auto *CI = dyn_cast<llvm::CallInst>(U))
+      if (CI->getParent() == CGF.Builder.GetInsertBlock())
+        Dealloc = CI;
+  }
+  assert(Dealloc);
+
+  CGF.Builder.SetInsertPoint(AlignedFreeBB->getFirstNonPHI());
+
+  // Replace `coro.free` argument with the address from coroutine frame.
+
+  llvm::Function *RawFramePtrOffsetIntrin = CGF.CGM.getIntrinsic(
+      llvm::Intrinsic::coro_raw_frame_ptr_offset, CGF.Int32Ty);
+  auto *RawFramePtrOffset = CGF.Builder.CreateCall(RawFramePtrOffsetIntrin);
+  auto *FramePtrAddrStart =
+      CGF.Builder.CreateInBoundsGEP(CoroFree, {RawFramePtrOffset});
+  auto *FramePtrAddr = CGF.Builder.CreatePointerCast(
+      FramePtrAddrStart, CGF.Int8PtrTy->getPointerTo());
+  auto *FramePtr =
+      CGF.Builder.CreateLoad({FramePtrAddr, CGF.getPointerAlign()});
+  Dealloc->setArgOperand(0, FramePtr);
+
+  // Match size_t argument with the one used during allocation.
+
+  assert(Dealloc->getNumArgOperands() >= 1);
+  if (Dealloc->getNumArgOperands() > 1) {
+    // Size may only be the second argument of allocator call.
+    if (auto *CoroSize =
+            dyn_cast<llvm::IntrinsicInst>(Dealloc->getArgOperand(1)))
+      if (CoroSize->getIntrinsicID() == llvm::Intrinsic::coro_size)
+        OverAllocateFrame(CGF, Dealloc, /*IsAlloc*/ false);
+  }
+
+  CGF.Builder.SetInsertPoint(AlignedFreeBB);
+}
+
+void EmitCheckAlignBasicBlock(CodeGenFunction &CGF,
+                              llvm::BasicBlock *CheckAlignBB,
+                              llvm::BasicBlock *AlignBB,
+                              llvm::BasicBlock *NonAlignBB) {
+  CGF.EmitBlock(CheckAlignBB);
+
+  auto &Builder = CGF.Builder;
+  auto &TI = CGF.CGM.getContext().getTargetInfo();
+  unsigned NewAlign = TI.getNewAlign() / TI.getCharWidth();
+  auto *CoroAlign = Builder.CreateCall(
+      CGF.CGM.getIntrinsic(llvm::Intrinsic::coro_align, CGF.SizeTy));
+  auto *AlignOfNew = llvm::ConstantInt::get(CGF.SizeTy, NewAlign);
+  auto *Cmp =
+      Builder.CreateICmp(llvm::CmpInst::ICMP_UGT, CoroAlign, AlignOfNew);
+  Builder.CreateCondBr(Cmp, AlignBB, NonAlignBB);
 }
 
-namespace {
 // Make sure to call coro.delete on scope exit.
 struct CallCoroDelete final : public EHScopeStack::Cleanup {
   Stmt *Deallocate;
@@ -432,21 +508,33 @@
     // call.
     BasicBlock *SaveInsertBlock = CGF.Builder.GetInsertBlock();
 
+    auto *CheckAlignBB = CGF.createBasicBlock("coro.free.check.align");
+    auto *AlignedFreeBB = CGF.createBasicBlock("coro.free.align");
     auto *FreeBB = CGF.createBasicBlock("coro.free");
+    auto *AfterFreeBB = CGF.createBasicBlock("after.coro.free");
+
+    EmitCheckAlignBasicBlock(CGF, CheckAlignBB, AlignedFreeBB, FreeBB);
+
     CGF.EmitBlock(FreeBB);
     CGF.EmitStmt(Deallocate);
-
-    auto *AfterFreeBB = CGF.createBasicBlock("after.coro.free");
-    CGF.EmitBlock(AfterFreeBB);
+    CGF.Builder.CreateBr(AfterFreeBB);
 
     // We should have captured coro.free from the emission of deallocate.
     auto *CoroFree = CGF.CurCoro.Data->LastCoroFree;
+    CGF.CurCoro.Data->LastCoroFreeUsedForDealloc = true;
     if (!CoroFree) {
       CGF.CGM.Error(Deallocate->getBeginLoc(),
                     "Deallocation expressoin does not refer to coro.free");
       return;
     }
 
+    CGF.EmitBlock(AlignedFreeBB);
+    CGF.EmitStmt(Deallocate);
+    CGF.CurCoro.Data->LastCoroFreeUsedForDealloc = false;
+    EmitDynamicAlignedDealloc(CGF, AlignedFreeBB, CoroFree);
+
+    CGF.EmitBlock(AfterFreeBB);
+
     // Get back to the block we were originally and move coro.free there.
     auto *InsertPt = SaveInsertBlock->getTerminator();
     CoroFree->moveBefore(InsertPt);
@@ -455,7 +543,7 @@
     // Add if (auto *mem = coro.free) Deallocate;
     auto *NullPtr = llvm::ConstantPointerNull::get(CGF.Int8PtrTy);
     auto *Cond = CGF.Builder.CreateICmpNE(CoroFree, NullPtr);
-    CGF.Builder.CreateCondBr(Cond, FreeBB, AfterFreeBB);
+    CGF.Builder.CreateCondBr(Cond, CheckAlignBB, AfterFreeBB);
 
     // No longer need old terminator.
     InsertPt->eraseFromParent();
@@ -547,9 +635,13 @@
 
   auto *EntryBB = Builder.GetInsertBlock();
   auto *AllocBB = createBasicBlock("coro.alloc");
+  auto *AlignAllocBB = createBasicBlock("coro.alloc.align");
+  auto *CheckAlignBB = createBasicBlock("coro.alloc.check.align");
   auto *InitBB = createBasicBlock("coro.init");
   auto *FinalBB = createBasicBlock("coro.final");
   auto *RetBB = createBasicBlock("coro.ret");
+  llvm::BasicBlock *RetOnFailureBB = nullptr;
+  llvm::BasicBlock *AlignAllocBB2 = nullptr;
 
   auto *CoroId = Builder.CreateCall(
       CGM.getIntrinsic(llvm::Intrinsic::coro_id),
@@ -564,7 +656,9 @@
   auto *CoroAlloc = Builder.CreateCall(
       CGM.getIntrinsic(llvm::Intrinsic::coro_alloc), {CoroId});
 
-  Builder.CreateCondBr(CoroAlloc, AllocBB, InitBB);
+  Builder.CreateCondBr(CoroAlloc, CheckAlignBB, InitBB);
+
+  EmitCheckAlignBasicBlock(*this, CheckAlignBB, AlignAllocBB, AllocBB);
 
   EmitBlock(AllocBB);
   auto *AllocateCall = EmitScalarExpr(S.getAllocate());
@@ -572,10 +666,9 @@
 
   // Handle allocation failure if 'ReturnStmtOnAllocFailure' was provided.
   if (auto *RetOnAllocFailure = S.getReturnStmtOnAllocFailure()) {
-    auto *RetOnFailureBB = createBasicBlock("coro.ret.on.failure");
+    RetOnFailureBB = createBasicBlock("coro.ret.on.failure");
 
     // See if allocation was successful.
-    auto *NullPtr = llvm::ConstantPointerNull::get(Int8PtrTy);
     auto *Cond = Builder.CreateICmpNE(AllocateCall, NullPtr);
     Builder.CreateCondBr(Cond, InitBB, RetOnFailureBB);
 
@@ -587,12 +680,65 @@
     Builder.CreateBr(InitBB);
   }
 
+  EmitBlock(AlignAllocBB);
+
+  auto *AlignedAllocateCall = EmitScalarExpr(S.getAllocate());
+
+  // If the coroutine frame is overaligned and only an allocation function
+  // that does not take `std::align_val_t` is available, the proper alignment
+  // for the coroutine frame is achieved by allocating more memory than needed
+  // and dynamically adjusting the frame start address at runtime. The
+  // codegen'd IR looks like:
+  // void *rawFrame = nullptr;
+  // ...
+  // if (llvm.coro.alloc()) {
+  //   size_t size = llvm.coro.size(), align = llvm.coro.align();
+  //   if (align > NEW_ALIGN) {
+  // #if <operator new without std::align_val_t argument is selected by Sema>
+  //     size += align - NEW_ALIGN + sizeof(void*);
+  //     frame = operator new(size);
+  //     rawFrame = frame;
+  //     frame = (frame + align - 1) & ~(align - 1);
+  // #else
+  //     // If an aligned allocation function is selected.
+  //     frame = operator new(size, align);
+  // #endif
+  //   } else {
+  //     frame = operator new(size);
+  //   }
+  // }
+
+  // size += align - NEW_ALIGN + sizeof(void*);
+  OverAllocateFrame(*this, cast<llvm::CallInst>(AlignedAllocateCall),
+                    /*IsAlloc*/ true);
+  if (S.getReturnStmtOnAllocFailure()) {
+    auto *Cond = Builder.CreateICmpNE(AlignedAllocateCall, NullPtr);
+    AlignAllocBB2 = createBasicBlock("coro.alloc.align2");
+    Builder.CreateCondBr(Cond, AlignAllocBB2, RetOnFailureBB);
+    EmitBlock(AlignAllocBB2);
+  }
+  // frame = (frame + align - 1) & ~(align - 1);
+  auto *CoroAlign =
+      Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::coro_align, SizeTy));
+  auto *AlignedUpAddr =
+      EmitBuiltinAlignTo(AlignedAllocateCall, CoroAlign, S.getAllocate(), true);
+  // rawFrame = frame;
+  auto *RawFramePtrAddrIntrin =
+      CGM.getIntrinsic(llvm::Intrinsic::coro_raw_frame_ptr_addr);
+  auto *RawFramePtrAddr = Builder.CreateCall(RawFramePtrAddrIntrin);
+  Builder.CreateStore(AlignedAllocateCall,
+                      {RawFramePtrAddr, getPointerAlign()});
+  AlignedAllocateCall = AlignedUpAddr;
+
   EmitBlock(InitBB);
 
   // Pass the result of the allocation to coro.begin.
-  auto *Phi = Builder.CreatePHI(VoidPtrTy, 2);
+  auto *Phi = Builder.CreatePHI(VoidPtrTy, 3);
   Phi->addIncoming(NullPtr, EntryBB);
   Phi->addIncoming(AllocateCall, AllocOrInvokeContBB);
+  Phi->addIncoming(AlignedAllocateCall,
+                   AlignAllocBB2 ? AlignAllocBB2 : AlignAllocBB);
+
   auto *CoroBegin = Builder.CreateCall(
       CGM.getIntrinsic(llvm::Intrinsic::coro_begin), {CoroId, Phi});
   CurCoro.Data->CoroBegin = CoroBegin;
@@ -729,6 +875,10 @@
   case llvm::Intrinsic::coro_alloc:
   case llvm::Intrinsic::coro_begin:
   case llvm::Intrinsic::coro_free: {
+    // Make deallocation and aligned deallocation share one `coro.free`.
+    if (CurCoro.Data && CurCoro.Data->LastCoroFreeUsedForDealloc)
+      return RValue::get(CurCoro.Data->LastCoroFree);
+
     if (CurCoro.Data && CurCoro.Data->CoroId) {
       Args.push_back(CurCoro.Data->CoroId);
       break;
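
To see the new CheckAlignBB/AlignAllocBB path from the source level, here is a hypothetical C++20 coroutine whose frame ends up over-aligned; the task wrapper is invented for this example and is not part of this patch. Any local (or promise member) with alignment above __STDCPP_DEFAULT_NEW_ALIGNMENT__ that lives across a suspend point has the same effect:

    #include <coroutine>

    // Minimal task wrapper (invented for the example); it hands back the
    // coroutine handle so main() can resume and destroy the frame.
    struct task {
      struct promise_type {
        task get_return_object() {
          return task{std::coroutine_handle<promise_type>::from_promise(*this)};
        }
        std::suspend_always initial_suspend() { return {}; }
        std::suspend_always final_suspend() noexcept { return {}; }
        void return_void() {}
        void unhandled_exception() {}
      };
      std::coroutine_handle<promise_type> handle;
    };

    task over_aligned_coro() {
      // A local with alignment above the default new alignment that lives
      // across a suspend point, so it is placed in the coroutine frame and
      // makes the whole frame over-aligned.
      alignas(64) char buffer[64] = {};
      co_await std::suspend_always{};
      buffer[0] = 1;
    }

    int main() {
      task t = over_aligned_coro();  // suspended at initial_suspend
      t.handle.resume();             // runs up to the explicit co_await
      t.handle.resume();             // finishes, suspended at final_suspend
      t.handle.destroy();            // frees the over-aligned frame
    }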
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -16958,28 +16958,28 @@
       llvm::Constant::getNullValue(Args.IntType), "is_aligned"));
 }
 
-/// Generate (x & ~(y-1)) to align down or ((x+(y-1)) & ~(y-1)) to align up.
-/// Note: For pointer types we can avoid ptrtoint/inttoptr pairs by using the
-/// llvm.ptrmask instrinsic (with a GEP before in the align_up case).
-/// TODO: actually use ptrmask once most optimization passes know about it.
-RValue CodeGenFunction::EmitBuiltinAlignTo(const CallExpr *E, bool AlignUp) {
-  BuiltinAlignArgs Args(E, *this);
-  llvm::Value *SrcAddr = Args.Src;
-  if (Args.Src->getType()->isPointerTy())
-    SrcAddr = Builder.CreatePtrToInt(Args.Src, Args.IntType, "intptr");
+llvm::Value *CodeGenFunction::EmitBuiltinAlignTo(llvm::Value *Src,
+                                                 llvm::Type *SrcType,
+                                                 llvm::Value *Alignment,
+                                                 llvm::Value *Mask,
+                                                 llvm::IntegerType *IntType,
+                                                 const Expr *E, bool AlignUp) {
+  llvm::Value *SrcAddr = Src;
+  if (Src->getType()->isPointerTy())
+    SrcAddr = Builder.CreatePtrToInt(Src, IntType, "intptr");
   llvm::Value *SrcForMask = SrcAddr;
   if (AlignUp) {
     // When aligning up we have to first add the mask to ensure we go over the
     // next alignment value and then align down to the next valid multiple.
     // By adding the mask, we ensure that align_up on an already aligned
     // value will not change the value.
-    SrcForMask = Builder.CreateAdd(SrcForMask, Args.Mask, "over_boundary");
+    SrcForMask = Builder.CreateAdd(SrcForMask, Mask, "over_boundary");
   }
   // Invert the mask to only clear the lower bits.
-  llvm::Value *InvertedMask = Builder.CreateNot(Args.Mask, "inverted_mask");
+  llvm::Value *InvertedMask = Builder.CreateNot(Mask, "inverted_mask");
   llvm::Value *Result =
       Builder.CreateAnd(SrcForMask, InvertedMask, "aligned_result");
-  if (Args.Src->getType()->isPointerTy()) {
+  if (Src->getType()->isPointerTy()) {
     /// TODO: Use ptrmask instead of ptrtoint+gep once it is optimized well.
     // Result = Builder.CreateIntrinsic(
     //  Intrinsic::ptrmask, {Args.SrcType, SrcForMask->getType(), Args.IntType},
@@ -16988,7 +16988,7 @@
     llvm::Value *Difference = Builder.CreateSub(Result, SrcAddr, "diff");
     // The result must point to the same underlying allocation. This means we
     // can use an inbounds GEP to enable better optimization.
-    Value *Base = EmitCastToVoidPtr(Args.Src);
+    Value *Base = EmitCastToVoidPtr(Src);
     if (getLangOpts().isSignedOverflowDefined())
       Result = Builder.CreateGEP(Base, Difference, "aligned_result");
     else
@@ -16996,13 +16996,41 @@
                                       /*SignedIndices=*/true,
                                       /*isSubtraction=*/!AlignUp,
                                       E->getExprLoc(), "aligned_result");
-    Result = Builder.CreatePointerCast(Result, Args.SrcType);
+    Result = Builder.CreatePointerCast(Result, SrcType);
     // Emit an alignment assumption to ensure that the new alignment is
     // propagated to loads/stores, etc.
-    emitAlignmentAssumption(Result, E, E->getExprLoc(), Args.Alignment);
+    emitAlignmentAssumption(Result, E, E->getExprLoc(), Alignment);
   }
-  assert(Result->getType() == Args.SrcType);
-  return RValue::get(Result);
+  assert(Result->getType() == SrcType);
+  return Result;
+}
+
+/// Generate (x & ~(y-1)) to align down or ((x+(y-1)) & ~(y-1)) to align up.
+/// Note: For pointer types we can avoid ptrtoint/inttoptr pairs by using the
+/// llvm.ptrmask intrinsic (with a GEP before in the align_up case).
+/// TODO: actually use ptrmask once most optimization passes know about it.
+RValue CodeGenFunction::EmitBuiltinAlignTo(const CallExpr *E, bool AlignUp) {
+  BuiltinAlignArgs Args(E, *this);
+  return RValue::get(EmitBuiltinAlignTo(Args.Src, Args.SrcType, Args.Alignment,
+                                        Args.Mask, Args.IntType, E, AlignUp));
+}
+
+llvm::Value *CodeGenFunction::EmitBuiltinAlignTo(llvm::Value *Src,
+                                                 llvm::Value *Align,
+                                                 const Expr *E, bool AlignUp) {
+  llvm::Type *SrcType = Src->getType();
+  llvm::IntegerType *IntType = nullptr;
+  if (SrcType->isPointerTy()) {
+    IntType = IntegerType::get(
+        getLLVMContext(), CGM.getDataLayout().getIndexTypeSizeInBits(SrcType));
+  } else {
+    assert(SrcType->isIntegerTy());
+    IntType = cast<llvm::IntegerType>(SrcType);
+  }
+  llvm::Value *Alignment = Align;
+  auto *One = llvm::ConstantInt::get(IntType, 1);
+  llvm::Value *Mask = Builder.CreateSub(Alignment, One, "mask");
+  return EmitBuiltinAlignTo(Src, SrcType, Alignment, Mask, IntType, E, AlignUp);
 }
 
 Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
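
Finally, for the refactored EmitBuiltinAlignTo helper: the align-up expression it emits is the standard power-of-two mask trick. A tiny stand-alone illustration of the arithmetic (not clang code, just the formula):

    #include <cassert>
    #include <cstdint>

    // (x + (align - 1)) & ~(align - 1), for power-of-two `align`.
    std::uintptr_t align_up(std::uintptr_t x, std::uintptr_t align) {
      std::uintptr_t mask = align - 1;
      return (x + mask) & ~mask;
    }

    int main() {
      assert(align_up(40, 32) == 64);  // rounds up to the next 32-byte boundary
      assert(align_up(64, 32) == 64);  // already-aligned values are unchanged
      return 0;
    }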