https://github.com/vtjnash created 
https://github.com/llvm/llvm-project/pull/150808

When code has the pattern of an alloca written only by memcpy followed by just 
reads of that result, we can remove the memcpy if we can show it legal to just 
forward the users to the source directly. This is a generalization of some of 
the other passes in the file, since it can hoist even if it understands very 
little about the subsequent operations, while the existing passes require 
specific analysis of the subsequent instruction (such as memcpy-to-memcpy) to 
be able to do their designed replacements. This is a common pattern where a 
frontend does a large defensive copy from a mutable struct and then only 
actually uses the value from one field of it. Past optimizations in this file 
only supported the case where "uses the value from one field" was limited to 
another memcpy, while this permits any operation as long as it doesn't write 
back to the memory (or capture it). Opening as a draft: I think this works and 
it passes CI, but it would help to have a review confirming that there are no 
other semantic conditions that need to be checked for the optimization to be 
correct.

More precision will be achieved (for cond_nohoist and phi_nohoist tests) if 
https://reviews.llvm.org/D119929?id=409338 lands.

>From 13257aee6ea1bbe028ad033236b943964b9a8c80 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjn...@gmail.com>
Date: Thu, 22 May 2025 20:22:24 +0000
Subject: [PATCH] [MemCpyOpt] add memcpy removal for simple dest

When code has the pattern of an alloca written only by memcpy followed
by just reads of that result, we can remove the memcpy if we can show it
legal to just forward the users to the source directly. This is a
generalization of some of the other passes in the file, since it can
hoist even if it understands very little about the subsequent
operations, while the existing passes require specific analysis of the
subsequent instruction (such as memcpy-to-memcpy) to be able to do their
designed replacements.

More precision will be achieved (for cond_nohoist and phi_nohoist tests)
if https://reviews.llvm.org/D119929?id=409338 lands.

The following is an (exhaustive) list of all existing tests which
were noted to trigger the new optimization:
  Clang :: CodeGen/attr-counted-by.c
  LLVM :: Analysis/ScopedNoAliasAA/alias-scope-merging.ll
  LLVM :: Transforms/MemCpyOpt/callslot_badaa.ll
  LLVM :: Transforms/MemCpyOpt/lifetime.ll
  LLVM :: Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll
  LLVM :: Transforms/MemCpyOpt/memcpy.ll
---
 clang/test/CodeGen/attr-counted-by.c          |  76 +++--
 .../llvm/Transforms/Scalar/MemCpyOptimizer.h  |   4 +
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 284 +++++++++++++++++-
 .../ScopedNoAliasAA/alias-scope-merging.ll    |  30 +-
 .../Transforms/MemCpyOpt/callslot_badaa.ll    |  21 +-
 llvm/test/Transforms/MemCpyOpt/lifetime.ll    |   7 +-
 .../memcpy-byval-forwarding-clobbers.ll       |   2 -
 .../test/Transforms/MemCpyOpt/memcpy-hoist.ll | 240 +++++++++++++++
 .../Transforms/MemCpyOpt/memcpy-nohoist.ll    | 183 +++++++++++
 llvm/test/Transforms/MemCpyOpt/memcpy.ll      |  22 +-
 10 files changed, 786 insertions(+), 83 deletions(-)
 create mode 100644 llvm/test/Transforms/MemCpyOpt/memcpy-hoist.ll
 create mode 100644 llvm/test/Transforms/MemCpyOpt/memcpy-nohoist.ll

diff --git a/clang/test/CodeGen/attr-counted-by.c 
b/clang/test/CodeGen/attr-counted-by.c
index 101949af208e1..2353717344cb2 100644
--- a/clang/test/CodeGen/attr-counted-by.c
+++ b/clang/test/CodeGen/attr-counted-by.c
@@ -1234,12 +1234,9 @@ int test12_a, test12_b;
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12(
 // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr 
#[[ATTR4:[0-9]+]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[BAZ:%.*]] = alloca [[STRUCT_HANG:%.*]], 
align 4
-// NO-SANITIZE-WITH-ATTR-NEXT:    call void @llvm.lifetime.start.p0(i64 24, 
ptr nonnull [[BAZ]]) #[[ATTR12:[0-9]+]]
-// NO-SANITIZE-WITH-ATTR-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef 
nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 
dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct 
[[TBAA_STRUCT7:![0-9]+]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [6 
x i32], ptr [[BAZ]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], 
align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [6 
x i32], ptr @test12_bar, i64 0, i64 [[IDXPROM]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], 
align 4
 // NO-SANITIZE-WITH-ATTR-NEXT:    store i32 [[TMP0]], ptr @test12_b, align 4, 
!tbaa [[TBAA2]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = load i32, ptr getelementptr 
inbounds nuw (i8, ptr @test12_foo, i64 4), align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    store i32 [[TMP1]], ptr @test12_a, align 4, 
!tbaa [[TBAA2]]
@@ -1276,12 +1273,9 @@ int test12_a, test12_b;
 // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i32 @test12(
 // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[INDEX:%.*]]) 
local_unnamed_addr #[[ATTR2:[0-9]+]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[BAZ:%.*]] = alloca [[STRUCT_HANG:%.*]], 
align 4
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    call void @llvm.lifetime.start.p0(i64 24, 
ptr nonnull [[BAZ]]) #[[ATTR10:[0-9]+]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr 
noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 
4 dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct 
[[TBAA_STRUCT7:![0-9]+]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[INDEX]] to 
i64
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds 
[6 x i32], ptr [[BAZ]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr 
[[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds 
[6 x i32], ptr @test12_bar, i64 0, i64 [[IDXPROM]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr 
[[ARRAYIDX]], align 4
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    store i32 [[TMP0]], ptr @test12_b, align 
4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP1:%.*]] = load i32, ptr 
getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4), align 4, !tbaa 
[[TBAA2]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    store i32 [[TMP1]], ptr @test12_a, align 
4, !tbaa [[TBAA2]]
@@ -1327,12 +1321,12 @@ struct test13_bar {
 // SANITIZE-WITH-ATTR-NEXT:    ret i32 0
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test13(
-// NO-SANITIZE-WITH-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr 
#[[ATTR7:[0-9]+]] {
+// NO-SANITIZE-WITH-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr 
#[[ATTR5:[0-9]+]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr @test13_f, 
align 8, !tbaa [[TBAA8:![0-9]+]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr @test13_f, 
align 8, !tbaa [[TBAA7:![0-9]+]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[REVMAP:%.*]] = getelementptr inbounds nuw 
i8, ptr [[TMP0]], i64 16
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 
x ptr], ptr [[REVMAP]], i64 0, i64 [[INDEX]]
-// NO-SANITIZE-WITH-ATTR-NEXT:    store ptr null, ptr [[ARRAYIDX]], align 8, 
!tbaa [[TBAA12:![0-9]+]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    store ptr null, ptr [[ARRAYIDX]], align 8, 
!tbaa [[TBAA11:![0-9]+]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    ret i32 0
 //
 // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i32 @test13(
@@ -1354,12 +1348,12 @@ struct test13_bar {
 // SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 0
 //
 // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i32 @test13(
-// NO-SANITIZE-WITHOUT-ATTR-SAME: i64 noundef [[INDEX:%.*]]) 
local_unnamed_addr #[[ATTR5:[0-9]+]] {
+// NO-SANITIZE-WITHOUT-ATTR-SAME: i64 noundef [[INDEX:%.*]]) 
local_unnamed_addr #[[ATTR3:[0-9]+]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr @test13_f, 
align 8, !tbaa [[TBAA8:![0-9]+]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr @test13_f, 
align 8, !tbaa [[TBAA7:![0-9]+]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[REVMAP:%.*]] = getelementptr inbounds 
nuw i8, ptr [[TMP0]], i64 16
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds 
[0 x ptr], ptr [[REVMAP]], i64 0, i64 [[INDEX]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    store ptr null, ptr [[ARRAYIDX]], align 
8, !tbaa [[TBAA12:![0-9]+]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    store ptr null, ptr [[ARRAYIDX]], align 
8, !tbaa [[TBAA11:![0-9]+]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 0
 //
 int test13(long index) {
@@ -1643,7 +1637,7 @@ struct tests_foo {
 // SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 [[TMP0]]
 //
 // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test24(
-// NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly 
captures(none) [[VAR:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] {
+// NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly 
captures(none) [[VAR:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr 
inbounds nuw i8, ptr [[VAR]], i64 84
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr 
[[ARRAYIDX1]], align 4, !tbaa [[TBAA2]]
@@ -1670,9 +1664,9 @@ int test24(int c, struct tests_foo *var) {
 // SANITIZE-WITH-ATTR-NEXT:    ret i32 [[TMP2]]
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test25(
-// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly 
captures(none) [[VAR:%.*]]) local_unnamed_addr #[[ATTR8:[0-9]+]] {
+// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly 
captures(none) [[VAR:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 
8, !tbaa [[TBAA14:![0-9]+]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 
8, !tbaa [[TBAA13:![0-9]+]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds 
nuw i8, ptr [[TMP0]], i64 44
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], 
align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    ret i32 [[TMP1]]
@@ -1686,9 +1680,9 @@ int test24(int c, struct tests_foo *var) {
 // SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 [[TMP1]]
 //
 // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test25(
-// NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly 
captures(none) [[VAR:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] {
+// NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly 
captures(none) [[VAR:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[VAR]], 
align 8, !tbaa [[TBAA14:![0-9]+]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[VAR]], 
align 8, !tbaa [[TBAA13:![0-9]+]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds 
nuw i8, ptr [[TMP0]], i64 44
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP1:%.*]] = load i32, ptr 
[[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 [[TMP1]]
@@ -1741,7 +1735,7 @@ struct test26_foo {
 // SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 [[TMP0]]
 //
 // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test26(
-// NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly 
captures(none) [[FOO:%.*]]) local_unnamed_addr #[[ATTR6]] {
+// NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef readonly 
captures(none) [[FOO:%.*]]) local_unnamed_addr #[[ATTR4]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARR:%.*]] = getelementptr inbounds nuw 
i8, ptr [[FOO]], i64 8
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[C]] to i64
@@ -1801,7 +1795,7 @@ struct test27_foo {
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ENTRIES:%.*]] = getelementptr inbounds nuw 
i8, ptr [[P]], i64 24
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I]] to i64
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 
x ptr], ptr [[ENTRIES]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], 
align 8, !tbaa [[TBAA16:![0-9]+]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], 
align 8, !tbaa [[TBAA15:![0-9]+]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[J]] to i64
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds 
[[STRUCT_TEST27_BAR:%.*]], ptr [[TMP0]], i64 [[IDXPROM1]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    ret ptr [[ARRAYIDX2]]
@@ -1818,12 +1812,12 @@ struct test27_foo {
 // SANITIZE-WITHOUT-ATTR-NEXT:    ret ptr [[ARRAYIDX4]]
 //
 // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local ptr @test27(
-// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) 
[[P:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) local_unnamed_addr 
#[[ATTR6]] {
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) 
[[P:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) local_unnamed_addr 
#[[ATTR4]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ENTRIES:%.*]] = getelementptr inbounds 
nuw i8, ptr [[P]], i64 24
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I]] to i64
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds 
[0 x ptr], ptr [[ENTRIES]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr 
[[ARRAYIDX]], align 8, !tbaa [[TBAA16:![0-9]+]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr 
[[ARRAYIDX]], align 8, !tbaa [[TBAA15:![0-9]+]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[J]] to i64
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr 
inbounds [[STRUCT_TEST27_BAR:%.*]], ptr [[TMP0]], i64 [[IDXPROM1]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret ptr [[ARRAYIDX2]]
@@ -1860,11 +1854,11 @@ struct test28_foo {
 // SANITIZE-WITH-ATTR-NEXT:    ret i32 [[TMP5]]
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test28(
-// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], 
i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR8]] {
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], 
i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR6]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, 
!tbaa [[TBAA18:![0-9]+]]
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 
8, !tbaa [[TBAA18]]
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 
8, !tbaa [[TBAA18]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, 
!tbaa [[TBAA17:![0-9]+]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 
8, !tbaa [[TBAA17]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 
8, !tbaa [[TBAA17]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ARR:%.*]] = getelementptr inbounds nuw i8, 
ptr [[TMP2]], i64 12
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I]] to i64
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 
x i32], ptr [[ARR]], i64 0, i64 [[IDXPROM]]
@@ -1884,11 +1878,11 @@ struct test28_foo {
 // SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 [[TMP3]]
 //
 // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test28(
-// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) 
[[P:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) 
[[P:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR5]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P]], align 
8, !tbaa [[TBAA18:![0-9]+]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TMP0]], 
align 8, !tbaa [[TBAA18]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP1]], 
align 8, !tbaa [[TBAA18]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P]], align 
8, !tbaa [[TBAA17:![0-9]+]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TMP0]], 
align 8, !tbaa [[TBAA17]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP1]], 
align 8, !tbaa [[TBAA17]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARR:%.*]] = getelementptr inbounds nuw 
i8, ptr [[TMP2]], i64 12
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I]] to i64
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds 
[0 x i32], ptr [[ARR]], i64 0, i64 [[IDXPROM]]
@@ -1936,11 +1930,11 @@ struct annotated_struct_array {
 // SANITIZE-WITH-ATTR-NEXT:    ret void
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test29(
-// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) 
[[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) 
local_unnamed_addr #[[ATTR9:[0-9]+]] {
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) 
[[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) 
local_unnamed_addr #[[ATTR7:[0-9]+]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IDX1]] to i64
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds 
[10 x ptr], ptr [[ANN]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], 
align 8, !tbaa [[TBAA20:![0-9]+]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], 
align 8, !tbaa [[TBAA19:![0-9]+]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds nuw 
i8, ptr [[TMP0]], i64 12
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[COUNTED_BY_GEP:%.*]] = getelementptr 
inbounds nuw i8, ptr [[TMP0]], i64 8
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[COUNTED_BY_LOAD:%.*]] = load i32, ptr 
[[COUNTED_BY_GEP]], align 4
@@ -1970,11 +1964,11 @@ struct annotated_struct_array {
 // SANITIZE-WITHOUT-ATTR-NEXT:    ret void
 //
 // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test29(
-// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) 
[[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) 
local_unnamed_addr #[[ATTR8:[0-9]+]] {
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) 
[[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) 
local_unnamed_addr #[[ATTR6:[0-9]+]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IDX1]] to i64
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds 
[10 x ptr], ptr [[ANN]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr 
[[ARRAYIDX]], align 8, !tbaa [[TBAA20:![0-9]+]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load ptr, ptr 
[[ARRAYIDX]], align 8, !tbaa [[TBAA19:![0-9]+]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds 
nuw i8, ptr [[TMP0]], i64 12
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[IDX2]] to 
i64
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr 
inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM5]]
@@ -2119,7 +2113,7 @@ struct annotated_with_array {
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds nuw 
i8, ptr [[PTR]], i64 344
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[IDX1]] to i64
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds 
[0 x i64], ptr [[ARRAY]], i64 0, i64 [[IDXPROM1]]
-// NO-SANITIZE-WITH-ATTR-NEXT:    store i64 [[TMP4]], ptr [[ARRAYIDX2]], align 
8, !tbaa [[TBAA22:![0-9]+]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    store i64 [[TMP4]], ptr [[ARRAYIDX2]], align 
8, !tbaa [[TBAA21:![0-9]+]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    ret void
 //
 // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test32(
@@ -2144,7 +2138,7 @@ struct annotated_with_array {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAY:%.*]] = getelementptr inbounds 
nuw i8, ptr [[PTR]], i64 344
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[IDX1]] to 
i64
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr 
inbounds [0 x i64], ptr [[ARRAY]], i64 0, i64 [[IDXPROM1]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    store i64 -1, ptr [[ARRAYIDX2]], align 8, 
!tbaa [[TBAA22:![0-9]+]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    store i64 -1, ptr [[ARRAYIDX2]], align 8, 
!tbaa [[TBAA21:![0-9]+]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret void
 //
 void test32(struct annotated_with_array *ptr, int idx1, int idx2) {
@@ -2428,7 +2422,7 @@ struct {
 // SANITIZE-WITH-ATTR-NEXT:    ret i64 -1
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test36(
-// NO-SANITIZE-WITH-ATTR-SAME: ) local_unnamed_addr #[[ATTR10:[0-9]+]] {
+// NO-SANITIZE-WITH-ATTR-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITH-ATTR-NEXT:    ret i64 -1
 //
@@ -2438,7 +2432,7 @@ struct {
 // SANITIZE-WITHOUT-ATTR-NEXT:    ret i64 -1
 //
 // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test36(
-// NO-SANITIZE-WITHOUT-ATTR-SAME: ) local_unnamed_addr #[[ATTR9:[0-9]+]] {
+// NO-SANITIZE-WITHOUT-ATTR-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret i64 -1
 //
diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h 
b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index 496d2958fc2d0..1c723a87ea992 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -79,11 +79,15 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
                                      BatchAAResults &BAA);
   bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet,
                                   BatchAAResults &BAA);
+  bool performStackHoistOptzn(MemCpyInst *M, AllocaInst *DestAlloca,
+                              TypeSize Size, BatchAAResults &BAA);
   bool processByValArgument(CallBase &CB, unsigned ArgNo);
   bool processImmutArgument(CallBase &CB, unsigned ArgNo);
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
   bool moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI);
+  bool canMoveUp(Instruction *I, Instruction *P);
+  void moveUp(Instruction *I, Instruction *P);
   bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
                              AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
                              TypeSize Size, BatchAAResults &BAA);
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp 
b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 9220abb974d21..3b238859b32cb 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1533,6 +1533,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction 
*Load, Instruction *Store,
   }
 
   // Check that copy is full with static size.
+  // TODO: use coversInputFully instead here
   const DataLayout &DL = DestAlloca->getDataLayout();
   std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
   if (!SrcSize || Size != *SrcSize) {
@@ -1721,6 +1722,245 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction 
*Load, Instruction *Store,
   return true;
 }
 
+// Checks whether instruction I, together with every operand instruction that
+// does not already dominate P, could be lifted to the position right after P.
+// Nothing is moved here; moveUp performs the actual relocation.
+// Returns true if the lift would be legal.
+bool MemCpyOptPass::canMoveUp(Instruction *I, Instruction *P) {
+  // Already immediately after P: nothing would need to move.
+  if (I->getIterator() == std::next(P->getIterator()))
+    return true;
+  // TODO: check isGuaranteedToTransferExecutionToSuccessor on all instructions
+  // between I and P
+  SmallSet<const Instruction *, 20> Visited;
+  SmallVector<Instruction *, 8> Worklist;
+  Worklist.push_back(I);
+  while (!Worklist.empty()) {
+    Instruction *Cur = Worklist.pop_back_val();
+    for (Value *Op : Cur->operands()) {
+      Instruction *U = dyn_cast<Instruction>(Op);
+      // Operands that are not instructions, or that already dominate P, stay
+      // where they are.
+      // (U dom P) is sufficient, but is it the necessary condition?
+      if (!U || DT->dominates(U, P))
+        continue;
+      // Cannot hoist a PHINode (which would require converting to a
+      // SelectInst in moveUp), anything that touches memory, or anything with
+      // other visible side effects (may throw or may not return), since
+      // lifting would reorder it against the instructions it is moved over.
+      if (isa<PHINode>(U) || U->mayReadFromMemory() || U->mayHaveSideEffects())
+        return false;
+      if (!Visited.insert(U).second)
+        continue;                  // Already examined this in another branch
+      assert(DT->dominates(P, U)); // Ensure that SSA is correct
+      Worklist.push_back(U);
+    }
+  }
+  return true;
+}
+
+// Unconditionally move I to just after P, together with every operand
+// instruction that does not already dominate P. Operands are placed ahead of
+// their users, and the MemorySSA access of each moved instruction (if it has
+// one) is moved along with it. No legality checks are done here beyond
+// asserting that no PHINode needs to move; callers should consult canMoveUp
+// first.
+void MemCpyOptPass::moveUp(Instruction *I, Instruction *P) {
+  if (I->getIterator() == std::next(P->getIterator()))
+    return;
+  // Compute a valid instruction order (operands first) with an iterative DFS
+  // walk: PreOrder is the DFS stack, PostOrder the resulting schedule.
+  SmallSet<const Instruction *, 20> Lifted;
+  SmallVector<Instruction *, 8> PreOrder;
+  SmallVector<Instruction *, 8> PostOrder;
+  PreOrder.push_back(I);
+  while (!PreOrder.empty()) {
+    Instruction *Cur = PreOrder.back();
+    // The same instruction can be pushed more than once before it is
+    // scheduled (it may appear twice in one operand list, or be an operand of
+    // both an instruction and one of that instruction's other operands).
+    // Discard such stale entries; leaving one on top of the stack would make
+    // this loop spin forever.
+    if (Lifted.count(Cur)) {
+      PreOrder.pop_back();
+      continue;
+    }
+    for (Value *Op : Cur->operands()) {
+      Instruction *U = dyn_cast<Instruction>(Op);
+      if (U && !Lifted.count(U) && !DT->dominates(U, P)) {
+        assert(!isa<PHINode>(U));
+        PreOrder.push_back(U);
+      }
+    }
+    if (PreOrder.back() == Cur) {
+      // All ops scheduled, can now schedule this too
+      Lifted.insert(Cur);
+      PreOrder.pop_back();
+      PostOrder.push_back(Cur);
+    }
+  }
+
+  MemoryUseOrDef *MemInsertPoint = MSSA->getMemoryAccess(P);
+  assert(MemInsertPoint);
+
+  // Now move them, each one directly after the previously placed instruction
+  for (Instruction *ToMove : PostOrder) {
+    LLVM_DEBUG(dbgs() << "Lifting " << *ToMove << " after " << *P << "\n");
+    if (ToMove->getIterator() == std::next(P->getIterator())) {
+      // Already in the right place; just advance the insertion point
+      P = ToMove;
+      continue;
+    }
+    ToMove->moveAfter(P);
+    P = ToMove;
+    if (MemoryUseOrDef *MA = MSSA->getMemoryAccess(ToMove)) {
+      MSSAU->moveAfter(MA, MemInsertPoint);
+      MemInsertPoint = MA;
+    }
+  }
+}
+
+// Stack-hoist optimization: M is a memcpy whose destination is DestAlloca and
+// whose constant length is Size. If the copy fills the whole alloca, the
+// alloca is never captured or written other than by M, and every later read
+// of it can either stay in place or be lifted to just after M, then all uses
+// of DestAlloca are redirected to M's source and DestAlloca is erased.
+// Returns true if the rewrite was performed. M itself is not erased here; that
+// is left to the caller.
+bool MemCpyOptPass::performStackHoistOptzn(MemCpyInst *M,
+                                           AllocaInst *DestAlloca,
+                                           TypeSize Size,
+                                           BatchAAResults &BAA) {
+  LLVM_DEBUG(dbgs() << "Stack Hoist: Attempting to optimize:\n" << *M << "\n");
+
+  // TODO: this should be fine as long as there are only LoadInst and GEP, but
+  // we would need more effort to re-write the GEP addrspaces
+  if (DestAlloca->getType() != M->getSource()->getType())
+    return false;
+
+  // Check that copy is full with static size.
+  // TODO: use coversInputFully instead here
+  const DataLayout &DL = DestAlloca->getDataLayout();
+  std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
+  if (!DestSize || Size != *DestSize) {
+    LLVM_DEBUG(dbgs() << "Stack Hoist: Destination alloca size mismatch\n");
+    return false;
+  }
+
+  MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
+  if (!MA)
+    // Degenerate case: memcpy marked as not accessing memory.
+    return false;
+
+  // The source alignment must be greater than or equal to the alloca's
+  // alignment. If it is not, we check to see if we can force the source of the
+  // memcpy to the alignment we need. If we fail, we bail out.
+  // TODO: we don't have to bail out if we only have LoadInst or other
+  // intrinsics (e.g. memcpy) and can instead reduce the alignment of the use.
+  Align MAlign = M->getSourceAlign().valueOrOne();
+  Align AllocaAlign = DestAlloca->getAlign();
+  if (MAlign < AllocaAlign &&
+      getOrEnforceKnownAlignment(M->getSource(), AllocaAlign, DL, M, AC, DT) <
+          AllocaAlign)
+    return false;
+
+  // Check that dest is never captured, unescaped alloca.
+  MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
+  MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
+  SmallVector<Instruction *, 4> LifetimeMarkers;
+  // Instructions that read DestLoc after M, paired with whether they have to
+  // be lifted to just after M.
+  SmallVector<std::pair<Instruction *, bool>, 4> AAMetadataInstrs;
+
+  // Worklist of pointers derived from DestAlloca, paired with whether the
+  // users of that pointer are still candidates for being lifted.
+  SmallVector<std::pair<Instruction *, bool>, 8> Worklist;
+  Worklist.push_back({DestAlloca, true});
+  unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
+  Worklist.reserve(MaxUsesToExplore);
+  SmallSet<const Instruction *, 20> Visited;
+  while (!Worklist.empty()) {
+    Instruction *I;
+    bool mayMove;
+    std::tie(I, mayMove) = Worklist.pop_back_val();
+    for (const Use &U : I->uses()) {
+      auto *UI = cast<Instruction>(U.getUser());
+      // Movability is tracked per use. It starts from what was recorded for I
+      // and must not be affected by sibling users of I, otherwise the outcome
+      // would depend on use-list order.
+      bool UseMayMove = mayMove;
+      // We don't care about the store itself.
+      if (UI == M)
+        continue;
+      if (UI->isLifetimeStartOrEnd()) {
+        LifetimeMarkers.push_back(UI);
+        continue;
+      }
+      if (Visited.size() >= MaxUsesToExplore) {
+        LLVM_DEBUG(
+            dbgs()
+            << "Stack Hoist: Exceeded max uses to see ModRef, bailing\n");
+        return false;
+      }
+      if (!Visited.insert(UI).second)
+        continue;
+      // Accumulate the capture effects over all operands of UI.
+      UseCaptureInfo CI(CaptureComponents::None, CaptureComponents::None);
+      for (const Use &OpUse : UI->operands()) {
+        auto UCI = DetermineUseCaptureKind(OpUse, DestAlloca);
+        CI.UseCC |= UCI.UseCC;
+        CI.ResultCC |= UCI.ResultCC;
+      }
+      if (capturesAnything(CI.UseCC))
+        return false;
+      // Memory effects are only analysed for users that do not dominate M.
+      if (!DT->dominates(UI, M)) {
+        if (UI->mayWriteToMemory()) {
+          // M must remain the only writer of the alloca.
+          ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc);
+          if (isModSet(Res))
+            return false;
+          UseMayMove = false;
+        }
+        // TODO: (U dom P) is sufficient, but is it the necessary condition?
+        if (UI->mayReadFromMemory()) {
+          ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc);
+          if (isRefSet(Res)) {
+            if (!DT->dominates(M, UI))
+              // TODO: does writtenBetween care about dom, or just moveUp?
+              return false;
+            // If UI can modify SrcLoc, then we cannot alias it to DestLoc
+            ModRefInfo SrcRes = BAA.getModRefInfo(UI, SrcLoc);
+            if (isModSet(SrcRes))
+              return false;
+            // If the source may be written between M and UI, UI can only read
+            // the source directly when it is lifted to just after M.
+            bool NeedsLift = writtenBetween(MSSA, BAA, SrcLoc, MA,
+                                            MSSA->getMemoryAccess(UI));
+            if (NeedsLift) {
+              // It is safe to move this read up as long as the memory it reads
+              // doesn't change between UI and M (such as only reading DestLoc
+              // with a LoadInst, or argmemonly), and UI post-dominates M (so
+              // we are guaranteed to eventually execute UI)
+              if (!UseMayMove)
+                return false;
+              if (!isa<LoadInst>(UI))
+                return false;
+              if (!PDT->dominates(UI, M))
+                return false;
+            }
+            AAMetadataInstrs.push_back({UI, NeedsLift});
+          }
+        }
+      }
+      if (capturesAnything(CI.ResultCC)) {
+        if (UseMayMove)
+          // If this instruction may capture something other than UI (such as a
+          // SelectInst) or have other side-effects (such as a call), then we
+          // should not try to move this instruction, or instructions that use
+          // this. (canMoveUp will later check this for the other operands too)
+          UseMayMove = isa<GEPOperator>(UI);
+        Worklist.push_back({UI, UseMayMove});
+      }
+    }
+  }
+
+  // Nothing useful to do (the alloca is dead anyways and later passes will
+  // likely remove it). Exit early now to minimize spurious test changes.
+  // TODO: delete in a later PR
+  if (AAMetadataInstrs.empty())
+    return false;
+
+  // Check that all instructions we need to move will be able to move.
+  // Otherwise it may not be worthwhile to move any of them.
+  for (auto &Entry : AAMetadataInstrs) {
+    if (Entry.second && !canMoveUp(Entry.first, M))
+      return false;
+  }
+
+  // Remove all lifetime markers before RAUW since they no longer apply.
+  for (Instruction *Marker : LifetimeMarkers)
+    eraseInstruction(Marker);
+
+  // As this transformation can cause memory accesses that didn't previously
+  // alias to begin to alias one another, we remove !alias.scope, !noalias,
+  // !tbaa and !tbaa.struct metadata from the recorded reads of the alloca.
+  // This is conservative, but more precision (replacing with the source tags
+  // from M) doesn't seem worthwhile right now.
+  for (auto &Entry : AAMetadataInstrs) {
+    Instruction *Reader = Entry.first;
+    Reader->setMetadata(LLVMContext::MD_alias_scope, nullptr);
+    Reader->setMetadata(LLVMContext::MD_noalias, nullptr);
+    Reader->setMetadata(LLVMContext::MD_tbaa, nullptr);
+    Reader->setMetadata(LLVMContext::MD_tbaa_struct, nullptr);
+    if (Entry.second)
+      moveUp(Reader, M);
+  }
+  DestAlloca->replaceAllUsesWith(M->getSource());
+  eraseInstruction(DestAlloca);
+
+  return true;
+}
+
 static bool isZeroSize(Value *Size) {
   if (auto *I = dyn_cast<Instruction>(Size))
     if (auto *Res = simplifyInstruction(I, I->getDataLayout()))
@@ -1844,25 +2084,39 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, 
BasicBlock::iterator &BBI) {
     }
   }
 
-  // If the transfer is from a stack slot to a stack slot, then we may be able
-  // to perform the stack-move optimization. See the comments in
-  // performStackMoveOptzn() for more details.
   auto *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
-  if (!DestAlloca)
-    return false;
   auto *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
-  if (!SrcAlloca)
+  // The remaining transforms are only interesting if we can eliminate some
+  // stack data.
+  if (!DestAlloca && !SrcAlloca)
     return false;
+
   ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
-  if (Len == nullptr)
-    return false;
-  if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca,
-                            TypeSize::getFixed(Len->getZExtValue()), BAA)) {
-    // Avoid invalidating the iterator.
-    BBI = M->getNextNode()->getIterator();
-    eraseInstruction(M);
-    ++NumMemCpyInstr;
-    return true;
+  // If the transfer is from a stack slot to a stack slot, then we may be able
+  // to perform the stack-move optimization. See the comments in
+  // performStackMoveOptzn() for more details.
+  if (DestAlloca && SrcAlloca && Len) {
+    if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca,
+                              TypeSize::getFixed(Len->getZExtValue()), BAA)) {
+      // Avoid invalidating the iterator.
+      BBI = M->getNextNode()->getIterator();
+      eraseInstruction(M);
+      ++NumMemCpyInstr;
+      return true;
+    }
+  }
+
+  if (DestAlloca && Len) {
+    // If the transfer is to a stack slot, see if we can hoist all the uses of
+    // the stack slot to here instead
+    if (performStackHoistOptzn(M, DestAlloca,
+                               TypeSize::getFixed(Len->getZExtValue()), BAA)) {
+      // Avoid invalidating the iterator.
+      BBI = M->getNextNode()->getIterator();
+      eraseInstruction(M);
+      ++NumMemCpyInstr;
+      return true;
+    }
   }
 
   return false;
diff --git a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll 
b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
index 840a5172561dc..5381f8d155061 100644
--- a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
+++ b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
@@ -1,14 +1,32 @@
-; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=memcpyopt | FileCheck %s
 
 declare void @use(ptr)
 
 ; Alias scopes are merged by taking the intersection of domains, then the 
union of the scopes within those domains
 define i8 @test(i8 %input) {
+; CHECK-LABEL: define i8 @test(
+; CHECK-SAME: i8 [[INPUT:%.*]]) {
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull 
[[SRC]]), !noalias [[META0:![0-9]+]]
+; CHECK-NEXT:    store i8 [[INPUT]], ptr [[SRC]], align 1
+; CHECK-NEXT:    [[RET_VALUE:%.*]] = load i8, ptr [[SRC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[SRC]]), 
!noalias [[META0]]
+; CHECK-NEXT:    ret i8 [[RET_VALUE]]
+;
   %tmp = alloca i8
   %dst = alloca i8
   %src = alloca i8
+<<<<<<< HEAD
 ; CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 
%src, i64 1, i1 false), !alias.scope ![[SCOPE:[0-9]+]]
   call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !4
+||||||| parent of 827019c4bdfb (add memcpy removal for simple dest)
+; CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 
%src, i64 1, i1 false), !alias.scope ![[SCOPE:[0-9]+]]
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !4
+=======
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !4
+>>>>>>> 827019c4bdfb (add memcpy removal for simple dest)
   store i8 %input, ptr %src
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, 
i1 false), !alias.scope !0
   call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !4
@@ -19,9 +37,6 @@ define i8 @test(i8 %input) {
 }
 
 ; Merged scope contains "callee0: %a" and "callee0 : %b"
-; CHECK-DAG: ![[CALLEE0_A:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, 
!"callee0: %a"}
-; CHECK-DAG: ![[CALLEE0_B:[0-9]+]] = distinct !{!{{[0-9]+}}, !{{[0-9]+}}, 
!"callee0: %b"}
-; CHECK-DAG: ![[SCOPE]] = !{![[CALLEE0_A]], ![[CALLEE0_B]]}
 
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
@@ -38,3 +53,10 @@ declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
 
 !7 = distinct !{!7, !8, !"callee2: %a"}
 !8 = distinct !{!8, !"callee2"}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]], !"callee0: %b"}
+; CHECK: [[META2]] = distinct !{[[META2]], !"callee0"}
+; CHECK: [[META3]] = distinct !{[[META3]], [[META4:![0-9]+]], !"callee1: %a"}
+; CHECK: [[META4]] = distinct !{[[META4]], !"callee1"}
+;.
diff --git a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll 
b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
index 601498e36a7a3..151df35fbc950 100644
--- a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes=memcpyopt | FileCheck %s
 
 declare void @use(ptr)
 
@@ -15,11 +16,20 @@ declare void @use(ptr)
 ;  !5 = distinct !{!5, !"callee0"}
 ; Which is incorrect because the lifetime.end of %src will now "noalias" the 
above memcpy.
 define i8 @test(i8 %input) {
+; CHECK-LABEL: define i8 @test(
+; CHECK-SAME: i8 [[INPUT:%.*]]) {
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[SRC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 1, ptr nonnull 
[[SRC]]), !noalias [[META0:![0-9]+]]
+; CHECK-NEXT:    store i8 [[INPUT]], ptr [[SRC]], align 1
+; CHECK-NEXT:    [[RET_VALUE:%.*]] = load i8, ptr [[SRC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 1, ptr nonnull [[SRC]]), 
!noalias [[META0]]
+; CHECK-NEXT:    call void @use(ptr [[SRC]])
+; CHECK-NEXT:    ret i8 [[RET_VALUE]]
+;
   %tmp = alloca i8
   %dst = alloca i8
   %src = alloca i8
-; NOTE: we're matching the full line and looking for the lack of !alias.scope 
here
-; CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 
%src, i64 1, i1 false)
   call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !3
   store i8 %input, ptr %src
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, 
i1 false), !alias.scope !0
@@ -40,3 +50,8 @@ declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
 !3 = !{!4}
 !4 = distinct !{!4, !5, !"callee1: %a"}
 !5 = distinct !{!5, !"callee1"}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]], !"callee1: %a"}
+; CHECK: [[META2]] = distinct !{[[META2]], !"callee1"}
+;.
diff --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll 
b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
index e9fc06b1e1da9..48f9d972d3a66 100644
--- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll
+++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
@@ -74,13 +74,8 @@ define i32 @call_slot_move_lifetime_start() {
 define i32 @call_slot_two_lifetime_starts() {
 ; CHECK-LABEL: @call_slot_two_lifetime_starts(
 ; CHECK-NEXT:    [[TMP:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[DST:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    call void @call(ptr [[TMP]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DST]])
-; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DST]])
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DST]], ptr 
align 4 [[TMP]], i64 4, i1 false)
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[DST]])
-; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[DST]], align 4
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[TMP]], align 4
 ; CHECK-NEXT:    ret i32 [[V]]
 ;
   %tmp = alloca i32
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll 
b/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll
index 383040c6c89e2..8f10fa05957e1 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll
@@ -65,12 +65,10 @@ entry:
 define i1 @alloca_forwarding_call_clobber_after() {
 ; CHECK-LABEL: @alloca_forwarding_call_clobber_after(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_1:%.*]] = alloca i64, align 8
 ; CHECK-NEXT:    [[A_2:%.*]] = alloca i64, align 8
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[A_2]])
 ; CHECK-NEXT:    call void @init(ptr sret(i64) align 8 [[A_2]])
 ; CHECK-NEXT:    store i8 0, ptr [[A_2]], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[A_1]], ptr [[A_2]], 
i64 8, i1 false)
 ; CHECK-NEXT:    [[CALL:%.*]] = call i1 @check(ptr byval(i64) align 8 [[A_2]])
 ; CHECK-NEXT:    call void @clobber(ptr [[A_2]])
 ; CHECK-NEXT:    ret i1 [[CALL]]
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-hoist.ll 
b/llvm/test/Transforms/MemCpyOpt/memcpy-hoist.ll
new file mode 100644
index 0000000000000..7d90e222a12b0
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-hoist.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 5
+; RUN: opt < %s -passes=memcpyopt -S -verify-memoryssa | FileCheck %s
+
+declare void @clobber()
+declare ptr @newmem()
+declare i64 @copyout(ptr)
+
+define i64 @simple_hoist(ptr align(8) %0, i64 signext %i) {
+; CHECK-LABEL: define i64 @simple_hoist(
+; CHECK-SAME: ptr align 8 [[TMP0:%.*]], i64 signext [[I:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP0]], i64 [[I]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+top:
+  %1 = alloca [8 x i64], align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %0, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  %2 = getelementptr i64, ptr %1, i64 %i
+  %3 = getelementptr i8, ptr %2, i64 -8
+  call void @clobber()
+  %4 = load i64, ptr %3, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %4
+}
+
+define i64 @use_nohoist(i64 signext %i) {
+; CHECK-LABEL: define i64 @use_nohoist(
+; CHECK-SAME: i64 signext [[I:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias align 8 ptr @newmem()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[TMP0]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 -8
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @copyout(ptr readonly captures(none) 
[[TMP2]])
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+top:
+  %0 = alloca [8 x i64], align 8
+  %1 = call noalias align(8) ptr @newmem()
+  call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %1, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  %2 = getelementptr i64, ptr %0, i64 %i
+  %3 = getelementptr i8, ptr %2, i64 -8
+  call void @clobber()
+  %4 = call i64 @copyout(ptr readonly nocapture %3)
+  ret i64 %4
+}
+
+define i64 @use_hoist(i64 signext %i) {
+; CHECK-LABEL: define i64 @use_hoist(
+; CHECK-SAME: i64 signext [[I:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca [8 x i64], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call align 8 ptr @newmem()
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[TMP0]], ptr [[TMP1]], 
i64 64, i1 false), !tbaa [[TBAA0:![0-9]+]], !alias.scope [[META3:![0-9]+]], 
!noalias [[META7:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP0]], i64 [[I]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -8
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @copyout(ptr captures(none) [[TMP3]]) 
#[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+top:
+  %0 = alloca [8 x i64], align 8
+  %1 = call align(8) ptr @newmem()
+  call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %1, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  %2 = getelementptr i64, ptr %0, i64 %i
+  %3 = getelementptr i8, ptr %2, i64 -8
+  call void @clobber()
+  %4 = call i64 @copyout(ptr nocapture %3) readonly
+  ret i64 %4
+}
+
+define i64 @cond_nohoist(ptr align(8) %0, i64 signext %i, i1 %c) {
+; CHECK-LABEL: define i64 @cond_nohoist(
+; CHECK-SAME: ptr align 8 [[TMP0:%.*]], i64 signext [[I:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [8 x i64], align 8
+; CHECK-NEXT:    [[P:%.*]] = getelementptr i64, ptr [[TMP1]], i64 1
+; CHECK-NEXT:    [[UD:%.*]] = load i64, ptr [[P]], align 8, !tbaa 
[[TBAA11:![0-9]+]], !alias.scope [[META13:![0-9]+]], !noalias [[META14:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[TMP1]], ptr [[TMP0]], 
i64 64, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META7]]
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    br i1 [[C]], label %[[L1:.*]], label %[[L2:.*]]
+; CHECK:       [[L1]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i64 [[I]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8, !tbaa 
[[TBAA11]], !alias.scope [[META13]], !noalias [[META14]]
+; CHECK-NEXT:    ret i64 [[TMP6]]
+; CHECK:       [[L2]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP5]], align 8, !tbaa 
[[TBAA11]], !alias.scope [[META13]], !noalias [[META14]]
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+top:
+  %1 = alloca [8 x i64], align 8
+  %p = getelementptr i64, ptr %1, i64 1
+  %ud = load i64, ptr %p, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %0, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  call void @clobber()
+  br i1 %c, label %L1, label %L2
+
+L1:                                               ; preds = %top
+  %2 = getelementptr i64, ptr %1, i64 %i
+  %3 = getelementptr i8, ptr %2, i64 -8
+  %4 = load i64, ptr %3, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %4
+
+L2:                                               ; preds = %top
+  %5 = getelementptr i64, ptr %1, i64 2
+  %6 = load i64, ptr %5, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %6
+}
+
+define i64 @pdt_hoist(ptr align(8) %0, i64 signext %i, i1 %c) {
+; CHECK-LABEL: define i64 @pdt_hoist(
+; CHECK-SAME: ptr align 8 [[TMP0:%.*]], i64 signext [[I:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP1]], align 8
+; CHECK-NEXT:    br i1 [[C]], label %[[L1:.*]], label %[[L2:.*]]
+; CHECK:       [[L1]]:
+; CHECK-NEXT:    br label %[[L2]]
+; CHECK:       [[L2]]:
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+top:
+  %1 = alloca [8 x i64], align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %0, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  br i1 %c, label %L1, label %L2
+
+L1:                                               ; preds = %top
+  br label %L2
+
+L2:                                               ; preds = %L1, %top
+  %2 = getelementptr i64, ptr %1, i64 2
+  call void @clobber()
+  %3 = load i64, ptr %2, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %3
+}
+
+define i64 @phi_nohoist(ptr align(8) %0, i64 signext %i, i1 %c) {
+; CHECK-LABEL: define i64 @phi_nohoist(
+; CHECK-SAME: ptr align 8 [[TMP0:%.*]], i64 signext [[I:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [8 x i64], align 8
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[TMP1]], ptr [[TMP0]], 
i64 64, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META7]]
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    br i1 [[C]], label %[[L1:.*]], label %[[L2:.*]]
+; CHECK:       [[L1]]:
+; CHECK-NEXT:    br label %[[L2]]
+; CHECK:       [[L2]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 1, %[[L1]] ], [ 2, %[[TOP]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP1]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8, !tbaa 
[[TBAA11]], !alias.scope [[META13]], !noalias [[META14]]
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+top:
+  %1 = alloca [8 x i64], align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %0, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  call void @clobber()
+  br i1 %c, label %L1, label %L2
+
+L1:                                               ; preds = %top
+  br label %L2
+
+L2:                                               ; preds = %L1, %top
+  %2 = phi i64 [ 1, %L1 ], [ 2, %top ]
+  %3 = getelementptr i64, ptr %1, i64 %2
+  %4 = load i64, ptr %3, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %4
+}
+
+define i64 @vla_hoist(ptr align(8) %0, i64 signext %i, i1 %c) {
+; CHECK-LABEL: define i64 @vla_hoist(
+; CHECK-SAME: ptr align 8 [[TMP0:%.*]], i64 signext [[I:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[L1:.*]], label %[[L2:.*]]
+; CHECK:       [[L1]]:
+; CHECK-NEXT:    br label %[[L2]]
+; CHECK:       [[L2]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 1, %[[L1]] ], [ 2, %[[TOP]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP0]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+top:
+  br i1 %c, label %L1, label %L2
+
+L1:                                               ; preds = %top
+  br label %L2
+
+L2:                                               ; preds = %L1, %top
+  %1 = phi i64 [ 1, %L1 ], [ 2, %top ]
+  %2 = alloca [8 x i64], align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %0, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  %3 = getelementptr i64, ptr %2, i64 %1
+  call void @clobber()
+  %4 = load i64, ptr %3, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %4
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: 
readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly captures(none), ptr 
noalias readonly captures(none), i64, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: 
readwrite) }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"jtbaa", !2, i64 0}
+!2 = !{!"jtbaa"}
+!3 = !{!4, !6}
+!4 = !{!"jnoalias_data", !5}
+!5 = !{!"jnoalias"}
+!6 = !{!"jnoalias_stack", !5}
+!7 = !{!8, !9, !10}
+!8 = !{!"jnoalias_gcframe", !5}
+!9 = !{!"jnoalias_typemd", !5}
+!10 = !{!"jnoalias_const", !5}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"jtbaa_stack", !1, i64 0}
+!13 = !{!6}
+!14 = !{!8, !4, !9, !10}
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"jtbaa", [[META2:![0-9]+]], i64 0}
+; CHECK: [[META2]] = !{!"jtbaa"}
+; CHECK: [[META3]] = !{[[META4:![0-9]+]], [[META6:![0-9]+]]}
+; CHECK: [[META4]] = !{!"jnoalias_data", [[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"jnoalias"}
+; CHECK: [[META6]] = !{!"jnoalias_stack", [[META5]]}
+; CHECK: [[META7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], 
[[META10:![0-9]+]]}
+; CHECK: [[META8]] = !{!"jnoalias_gcframe", [[META5]]}
+; CHECK: [[META9]] = !{!"jnoalias_typemd", [[META5]]}
+; CHECK: [[META10]] = !{!"jnoalias_const", [[META5]]}
+; CHECK: [[TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0}
+; CHECK: [[META12]] = !{!"jtbaa_stack", [[META1]], i64 0}
+; CHECK: [[META13]] = !{[[META6]]}
+; CHECK: [[META14]] = !{[[META8]], [[META4]], [[META9]], [[META10]]}
+;.
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-nohoist.ll 
b/llvm/test/Transforms/MemCpyOpt/memcpy-nohoist.ll
new file mode 100644
index 0000000000000..cb26e8026147c
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-nohoist.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 5
+; RUN: opt < %s -passes=memcpyopt -S -verify-memoryssa | FileCheck %s
+
+declare ptr @newmem()
+declare i64 @copyout(ptr)
+
+define i64 @simple_hoist(ptr align(8) %0, i64 signext %i) {
+; CHECK-LABEL: define i64 @simple_hoist(
+; CHECK-SAME: ptr align 8 [[TMP0:%.*]], i64 signext [[I:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP0]], i64 [[I]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+top:
+  %1 = alloca [8 x i64], align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %0, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  %2 = getelementptr i64, ptr %1, i64 %i
+  %3 = getelementptr i8, ptr %2, i64 -8
+  %4 = load i64, ptr %3, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %4
+}
+
+define i64 @cond_nohoist(ptr align(8) %0, i64 signext %i, i1 %c) {
+; CHECK-LABEL: define i64 @cond_nohoist(
+; CHECK-SAME: ptr align 8 [[TMP0:%.*]], i64 signext [[I:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [8 x i64], align 8
+; CHECK-NEXT:    [[P:%.*]] = getelementptr i64, ptr [[TMP1]], i64 1
+; CHECK-NEXT:    [[UD:%.*]] = load i64, ptr [[P]], align 8, !tbaa 
[[TBAA0:![0-9]+]], !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[TMP1]], ptr [[TMP0]], 
i64 64, i1 false), !tbaa [[TBAA12:![0-9]+]], !alias.scope [[META13:![0-9]+]], 
!noalias [[META14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C]], label %[[L1:.*]], label %[[L2:.*]]
+; CHECK:       [[L1]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i64 [[I]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8, !tbaa 
[[TBAA0]], !alias.scope [[META4]], !noalias [[META7]]
+; CHECK-NEXT:    ret i64 [[TMP6]]
+; CHECK:       [[L2]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP5]], align 8, !tbaa 
[[TBAA0]], !alias.scope [[META4]], !noalias [[META7]]
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+top:
+  %1 = alloca [8 x i64], align 8
+  %p = getelementptr i64, ptr %1, i64 1
+  %ud = load i64, ptr %p, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %0, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  br i1 %c, label %L1, label %L2
+
+L1:                                               ; preds = %top
+  %2 = getelementptr i64, ptr %1, i64 %i
+  %3 = getelementptr i8, ptr %2, i64 -8
+  %4 = load i64, ptr %3, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %4
+
+L2:                                               ; preds = %top
+  %5 = getelementptr i64, ptr %1, i64 2
+  %6 = load i64, ptr %5, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %6
+}
+
+define i64 @pdt_hoist(ptr align(8) %0, i64 signext %i, i1 %c) {
+; CHECK-LABEL: define i64 @pdt_hoist(
+; CHECK-SAME: ptr align 8 [[TMP0:%.*]], i64 signext [[I:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT:    br i1 [[C]], label %[[L1:.*]], label %[[L2:.*]]
+; CHECK:       [[L1]]:
+; CHECK-NEXT:    br label %[[L2]]
+; CHECK:       [[L2]]:
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+top:
+  %1 = alloca [8 x i64], align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %0, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  br i1 %c, label %L1, label %L2
+
+L1:                                               ; preds = %top
+  br label %L2
+
+L2:                                               ; preds = %L1, %top
+  %2 = getelementptr i64, ptr %1, i64 2
+  %3 = load i64, ptr %2, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %3
+}
+
+define i64 @phi_nohoist(ptr align(8) %0, i64 signext %i, i1 %c) {
+; CHECK-LABEL: define i64 @phi_nohoist(
+; CHECK-SAME: ptr align 8 [[TMP0:%.*]], i64 signext [[I:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [8 x i64], align 8
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[TMP1]], ptr [[TMP0]], 
i64 64, i1 false), !tbaa [[TBAA12]], !alias.scope [[META13]], !noalias 
[[META14]]
+; CHECK-NEXT:    br i1 [[C]], label %[[L1:.*]], label %[[L2:.*]]
+; CHECK:       [[L1]]:
+; CHECK-NEXT:    br label %[[L2]]
+; CHECK:       [[L2]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 1, %[[L1]] ], [ 2, %[[TOP]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP1]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8, !tbaa 
[[TBAA0]], !alias.scope [[META4]], !noalias [[META7]]
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+top:
+  %1 = alloca [8 x i64], align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %0, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  br i1 %c, label %L1, label %L2
+
+L1:                                               ; preds = %top
+  br label %L2
+
+L2:                                               ; preds = %L1, %top
+  %2 = phi i64 [ 1, %L1 ], [ 2, %top ]
+  %3 = getelementptr i64, ptr %1, i64 %2
+  %4 = load i64, ptr %3, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %4
+}
+
+define i64 @vla_hoist(ptr align(8) %0, i64 signext %i, i1 %c) {
+; CHECK-LABEL: define i64 @vla_hoist(
+; CHECK-SAME: ptr align 8 [[TMP0:%.*]], i64 signext [[I:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  [[TOP:.*]]:
+; CHECK-NEXT:    br i1 [[C]], label %[[L1:.*]], label %[[L2:.*]]
+; CHECK:       [[L1]]:
+; CHECK-NEXT:    br label %[[L2]]
+; CHECK:       [[L2]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 1, %[[L1]] ], [ 2, %[[TOP]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP0]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+top:
+  br i1 %c, label %L1, label %L2
+
+L1:                                               ; preds = %top
+  br label %L2
+
+L2:                                               ; preds = %L1, %top
+  %1 = phi i64 [ 1, %L1 ], [ 2, %top ]
+  %2 = alloca [8 x i64], align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %0, i64 64, i1 false), !tbaa 
!0, !alias.scope !3, !noalias !7
+  %3 = getelementptr i64, ptr %2, i64 %1
+  %4 = load i64, ptr %3, align 8, !tbaa !11, !alias.scope !13, !noalias !14
+  ret i64 %4
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: 
readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly captures(none), ptr 
noalias readonly captures(none), i64, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: 
readwrite) }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"jtbaa", !2, i64 0}
+!2 = !{!"jtbaa"}
+!3 = !{!4, !6}
+!4 = !{!"jnoalias_data", !5}
+!5 = !{!"jnoalias"}
+!6 = !{!"jnoalias_stack", !5}
+!7 = !{!8, !9, !10}
+!8 = !{!"jnoalias_gcframe", !5}
+!9 = !{!"jnoalias_typemd", !5}
+!10 = !{!"jnoalias_const", !5}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"jtbaa_stack", !1, i64 0}
+!13 = !{!6}
+!14 = !{!8, !4, !9, !10}
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"jtbaa_stack", [[META2:![0-9]+]], i64 0}
+; CHECK: [[META2]] = !{!"jtbaa", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"jtbaa"}
+; CHECK: [[META4]] = !{[[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"jnoalias_stack", [[META6:![0-9]+]]}
+; CHECK: [[META6]] = !{!"jnoalias"}
+; CHECK: [[META7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], 
[[META10:![0-9]+]], [[META11:![0-9]+]]}
+; CHECK: [[META8]] = !{!"jnoalias_gcframe", [[META6]]}
+; CHECK: [[META9]] = !{!"jnoalias_data", [[META6]]}
+; CHECK: [[META10]] = !{!"jnoalias_typemd", [[META6]]}
+; CHECK: [[META11]] = !{!"jnoalias_const", [[META6]]}
+; CHECK: [[TBAA12]] = !{[[META2]], [[META2]], i64 0}
+; CHECK: [[META13]] = !{[[META9]], [[META5]]}
+; CHECK: [[META14]] = !{[[META8]], [[META10]], [[META11]]}
+;.
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll 
b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
index 89d8eb1ee6711..944518e9f3b3a 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -179,7 +179,7 @@ define void @test3(ptr noalias writable sret(%0) 
%agg.result) nounwind  {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 16 
[[AGG_RESULT:%.*]], ptr align 16 @x, i32 32, i1 false)
 ; CHECK-NEXT:    ret void
 ;
-  %x.0 = alloca %0
+  %x.0 = alloca %0, align 16
   call void @llvm.memcpy.p0.p0.i32(ptr align 16 %x.0, ptr align 16 @x, i32 32, 
i1 false)
   call void @llvm.memcpy.p0.p0.i32(ptr align 16 %agg.result, ptr align 16 
%x.0, i32 32, i1 false)
   ret void
@@ -192,7 +192,7 @@ define void @test4(ptr %P) {
 ; CHECK-NEXT:    call void @test4a(ptr byval(i8) align 1 [[P:%.*]])
 ; CHECK-NEXT:    ret void
 ;
-  %A = alloca %1
+  %A = alloca %1, align 4
   call void @llvm.memcpy.p0.p0.i64(ptr align 4 %A, ptr align 4 %P, i64 8, i1 
false)
   call void @test4a(ptr align 1 byval(i8) %A)
   ret void
@@ -201,12 +201,12 @@ define void @test4(ptr %P) {
 ; Make sure we don't remove the memcpy if the source address space doesn't 
match the byval argument
 define void @test4_addrspace(ptr addrspace(1) %P) {
 ; CHECK-LABEL: @test4_addrspace(
-; CHECK-NEXT:    [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
+; CHECK-NEXT:    [[A1:%.*]] = alloca [[TMP1:%.*]], align 4
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p1.i64(ptr align 4 [[A1]], ptr 
addrspace(1) align 4 [[P:%.*]], i64 8, i1 false)
 ; CHECK-NEXT:    call void @test4a(ptr byval(i8) align 1 [[A1]])
 ; CHECK-NEXT:    ret void
 ;
-  %a1 = alloca %1
+  %a1 = alloca %1, align 4
   call void @llvm.memcpy.p0.p1.i64(ptr align 4 %a1, ptr addrspace(1) align 4 
%P, i64 8, i1 false)
   call void @test4a(ptr align 1 byval(i8) %a1)
   ret void
@@ -214,13 +214,13 @@ define void @test4_addrspace(ptr addrspace(1) %P) {
 
 define void @test4_write_between(ptr %P) {
 ; CHECK-LABEL: @test4_write_between(
-; CHECK-NEXT:    [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
+; CHECK-NEXT:    [[A1:%.*]] = alloca [[TMP1:%.*]], align 4
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A1]], ptr 
align 4 [[P:%.*]], i64 8, i1 false)
 ; CHECK-NEXT:    store i8 0, ptr [[A1]], align 1
 ; CHECK-NEXT:    call void @test4a(ptr byval(i8) align 1 [[A1]])
 ; CHECK-NEXT:    ret void
 ;
-  %a1 = alloca %1
+  %a1 = alloca %1, align 4
   call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a1, ptr align 4 %P, i64 8, i1 
false)
   store i8 0, ptr %a1
   call void @test4a(ptr align 1 byval(i8) %a1)
@@ -229,15 +229,13 @@ define void @test4_write_between(ptr %P) {
 
 define i8 @test4_read_between(ptr %P) {
 ; CHECK-LABEL: @test4_read_between(
-; CHECK-NEXT:    [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A1]], ptr 
align 4 [[P:%.*]], i64 8, i1 false)
-; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[A1]], align 1
+; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[P:%.*]], align 1
 ; CHECK-NEXT:    call void @test4a(ptr byval(i8) align 1 [[P]])
 ; CHECK-NEXT:    ret i8 [[X]]
 ;
-  %a1 = alloca %1
+  %a1 = alloca %1, align 4
   call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a1, ptr align 4 %P, i64 8, i1 
false)
-  %x = load i8, ptr %a1
+  %x = load i8, ptr %a1, align 1
   call void @test4a(ptr align 1 byval(i8) %a1)
   ret i8 %x
 }
@@ -251,7 +249,7 @@ define void @test4_non_local(ptr %P, i1 %c) {
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
-  %a1 = alloca %1
+  %a1 = alloca %1, align 4
   call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a1, ptr align 4 %P, i64 8, i1 
false)
   br i1 %c, label %call, label %exit
 

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to