https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/165159
>From 1b871128e4aab8dc02a5a320492acaf2f96a0eba Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <[email protected]> Date: Mon, 30 Mar 2026 14:20:08 -0400 Subject: [PATCH 1/2] [SROA] Canonicalize homogeneous structs into fixed vectors When SROA selects a partition type and the type from getTypePartition is a homogeneous struct (all elements same type, no padding), canonicalize it to a fixed vector. For example, { i64, i64 } becomes <2 x i64>. This enables vector promotion of allocas that would otherwise remain as unpromoted struct types, particularly the std::function swap pattern where a { i64, i64 } temporary is used with three memcpy calls. Converting to <2 x i64> allows SROA to replace the alloca with a single vector load and store. Structs with padding, pointer elements, sub-byte elements (i1), or non-homogeneous field types are excluded by tryCanonicalizeStructToVector. --- clang/test/CodeGenOpenCL/nullptr.cl | 4 +- llvm/lib/Transforms/Scalar/SROA.cpp | 63 +++ llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 77 ++-- .../assignment-tracking/sroa/user-memcpy.ll | 8 +- .../DebugInfo/Generic/sroa-alloca-offset.ll | 6 +- llvm/test/DebugInfo/X86/sroasplit-4.ll | 12 +- .../SROA/struct-to-vector-subpartition.ll | 73 ++++ llvm/test/Transforms/SROA/struct-to-vector.ll | 392 ++++++++++++++++++ llvm/test/Transforms/SROA/tbaa-struct3.ll | 9 +- 9 files changed, 576 insertions(+), 68 deletions(-) create mode 100644 llvm/test/Transforms/SROA/struct-to-vector-subpartition.ll create mode 100644 llvm/test/Transforms/SROA/struct-to-vector.ll diff --git a/clang/test/CodeGenOpenCL/nullptr.cl b/clang/test/CodeGenOpenCL/nullptr.cl index 976e12c0bef47..f45df110ec243 100644 --- a/clang/test/CodeGenOpenCL/nullptr.cl +++ b/clang/test/CodeGenOpenCL/nullptr.cl @@ -597,10 +597,10 @@ typedef struct { } StructTy3; // CHECK-LABEL: test_memset_private -// SPIR64: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %ptr, i8 0, i64 32, i1 false) +// SPIR64: store <4 x i64> 
zeroinitializer, ptr %ptr, align 8 // SPIR64: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr %ptr, i64 32 // SPIR64: store ptr addrspacecast (ptr addrspace(4) null to ptr), ptr [[GEP]], align 8 -// AMDGCN: call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 8 {{.*}}, i8 0, i64 32, i1 false) +// AMDGCN: store <4 x i64> zeroinitializer, ptr addrspace(5) %ptr, align 8 // AMDGCN: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) %ptr, i32 32 // AMDGCN: store ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)), ptr addrspace(5) [[GEP]] // AMDGCN: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) {{.*}}, i32 36 diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 760b84000fe7b..bd1d6aa90ba74 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5121,6 +5121,64 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { return true; } +/// Try to canonicalize a homogeneous, tightly-packed struct to a vector type. +/// +/// For structs where all elements have the same type and are tightly packed +/// (no padding), we can represent them as a fixed vector which enables better +/// optimization (e.g., vector selects instead of memcpy). +/// +/// \param STy The struct type to try to canonicalize. +/// \param DL The DataLayout for size/alignment queries. +/// \returns The equivalent vector type, or nullptr if not applicable. +static FixedVectorType *tryCanonicalizeStructToVector(StructType *STy, + const DataLayout &DL) { + unsigned NumElts = STy->getNumElements(); + if (NumElts != 2 && NumElts != 4) + return nullptr; + + // All elements must be the same type. + Type *EltTy = STy->getElementType(0); + for (unsigned I = 1; I < NumElts; ++I) + if (STy->getElementType(I) != EltTy) + return nullptr; + + // Element type must be valid for vectors. 
+ if (!VectorType::isValidElementType(EltTy)) + return nullptr; + + // Only allow integer types >= 8 bits or floating point. + if (auto *IT = dyn_cast<IntegerType>(EltTy)) { + if (IT->getBitWidth() < 8) + return nullptr; + } else if (!EltTy->isFloatingPointTy()) { + return nullptr; + } + + // Element size must be fixed and non-zero. + TypeSize EltTS = DL.getTypeAllocSize(EltTy); + if (!EltTS.isFixed()) + return nullptr; + uint64_t EltSize = EltTS.getFixedValue(); + if (EltSize < 1) + return nullptr; + + const StructLayout *SL = DL.getStructLayout(STy); + uint64_t StructSize = SL->getSizeInBytes(); + if (StructSize == 0) + return nullptr; + + // Must be tightly packed: size == NumElts * EltSize. + if (StructSize != NumElts * EltSize) + return nullptr; + + // Verify each element is at the expected offset (no padding). + for (unsigned I = 0; I < NumElts; ++I) + if (SL->getElementOffset(I) != I * EltSize) + return nullptr; + + return FixedVectorType::get(EltTy, NumElts); +} + /// Select a partition type for an alloca partition. /// /// Try to compute a friendly type for this partition of the alloca. This @@ -5194,6 +5252,11 @@ selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI, isIntegerWideningViable(P, LargestIntTy, DL)) return {LargestIntTy, true, nullptr}; + // Try homogeneous struct to vector canonicalization. + if (auto *STy = dyn_cast<StructType>(TypePartitionTy)) + if (auto *VTy = tryCanonicalizeStructToVector(STy, DL)) + return {VTy, false, nullptr}; + // Fallback to TypePartitionTy and we probably won't promote. 
return {TypePartitionTy, false, nullptr}; } diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index 2b99a2af52719..a3144c5768431 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -455,64 +455,39 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly ; PTX-NEXT: .local .align 8 .b8 __local_depot9[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; -; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<47>; +; PTX-NEXT: .reg .b32 %r<23>; +; PTX-NEXT: .reg .b64 %rd<4>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %SPL, __local_depot9; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.b64 %rd1, [memcpy_to_param_param_0]; -; PTX-NEXT: add.u64 %rd2, %SPL, 0; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; ; PTX-NEXT: ld.param.b32 %r1, [memcpy_to_param_param_1+4]; -; PTX-NEXT: st.local.b32 [%rd2+4], %r1; ; PTX-NEXT: ld.param.b32 %r2, [memcpy_to_param_param_1]; -; PTX-NEXT: st.local.b32 [%rd2], %r2; -; PTX-NEXT: ld.volatile.b8 %rd3, [%rd1]; -; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1+1]; -; PTX-NEXT: shl.b64 %rd5, %rd4, 8; -; PTX-NEXT: or.b64 %rd6, %rd5, %rd3; -; PTX-NEXT: ld.volatile.b8 %rd7, [%rd1+2]; -; PTX-NEXT: shl.b64 %rd8, %rd7, 16; -; PTX-NEXT: ld.volatile.b8 %rd9, [%rd1+3]; -; PTX-NEXT: shl.b64 %rd10, %rd9, 24; -; PTX-NEXT: or.b64 %rd11, %rd10, %rd8; -; PTX-NEXT: or.b64 %rd12, %rd11, %rd6; -; PTX-NEXT: ld.volatile.b8 %rd13, [%rd1+4]; -; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+5]; -; PTX-NEXT: shl.b64 %rd15, %rd14, 8; -; PTX-NEXT: or.b64 %rd16, %rd15, %rd13; -; PTX-NEXT: ld.volatile.b8 %rd17, [%rd1+6]; -; PTX-NEXT: shl.b64 %rd18, %rd17, 16; -; PTX-NEXT: ld.volatile.b8 %rd19, [%rd1+7]; -; PTX-NEXT: shl.b64 %rd20, %rd19, 24; -; PTX-NEXT: or.b64 %rd21, %rd20, %rd18; -; PTX-NEXT: or.b64 %rd22, %rd21, %rd16; -; PTX-NEXT: shl.b64 %rd23, %rd22, 32; -; PTX-NEXT: or.b64 %rd24, %rd23, %rd12; -; PTX-NEXT: 
st.volatile.b64 [%SP], %rd24; -; PTX-NEXT: ld.volatile.b8 %rd25, [%rd1+8]; -; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+9]; -; PTX-NEXT: shl.b64 %rd27, %rd26, 8; -; PTX-NEXT: or.b64 %rd28, %rd27, %rd25; -; PTX-NEXT: ld.volatile.b8 %rd29, [%rd1+10]; -; PTX-NEXT: shl.b64 %rd30, %rd29, 16; -; PTX-NEXT: ld.volatile.b8 %rd31, [%rd1+11]; -; PTX-NEXT: shl.b64 %rd32, %rd31, 24; -; PTX-NEXT: or.b64 %rd33, %rd32, %rd30; -; PTX-NEXT: or.b64 %rd34, %rd33, %rd28; -; PTX-NEXT: ld.volatile.b8 %rd35, [%rd1+12]; -; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+13]; -; PTX-NEXT: shl.b64 %rd37, %rd36, 8; -; PTX-NEXT: or.b64 %rd38, %rd37, %rd35; -; PTX-NEXT: ld.volatile.b8 %rd39, [%rd1+14]; -; PTX-NEXT: shl.b64 %rd40, %rd39, 16; -; PTX-NEXT: ld.volatile.b8 %rd41, [%rd1+15]; -; PTX-NEXT: shl.b64 %rd42, %rd41, 24; -; PTX-NEXT: or.b64 %rd43, %rd42, %rd40; -; PTX-NEXT: or.b64 %rd44, %rd43, %rd38; -; PTX-NEXT: shl.b64 %rd45, %rd44, 32; -; PTX-NEXT: or.b64 %rd46, %rd45, %rd34; -; PTX-NEXT: st.volatile.b64 [%SP+8], %rd46; +; PTX-NEXT: st.v2.b32 [%SP], {%r2, %r1}; +; PTX-NEXT: ld.volatile.global.b8 %r3, [%rd2+4]; +; PTX-NEXT: ld.volatile.global.b8 %r4, [%rd2+5]; +; PTX-NEXT: shl.b32 %r5, %r4, 8; +; PTX-NEXT: or.b32 %r6, %r5, %r3; +; PTX-NEXT: ld.volatile.global.b8 %r7, [%rd2+6]; +; PTX-NEXT: shl.b32 %r8, %r7, 16; +; PTX-NEXT: ld.volatile.global.b8 %r9, [%rd2+7]; +; PTX-NEXT: shl.b32 %r10, %r9, 24; +; PTX-NEXT: or.b32 %r11, %r10, %r8; +; PTX-NEXT: or.b32 %r12, %r11, %r6; +; PTX-NEXT: ld.volatile.global.b8 %r13, [%rd2]; +; PTX-NEXT: ld.volatile.global.b8 %r14, [%rd2+1]; +; PTX-NEXT: shl.b32 %r15, %r14, 8; +; PTX-NEXT: or.b32 %r16, %r15, %r13; +; PTX-NEXT: ld.volatile.global.b8 %r17, [%rd2+2]; +; PTX-NEXT: shl.b32 %r18, %r17, 16; +; PTX-NEXT: ld.volatile.global.b8 %r19, [%rd2+3]; +; PTX-NEXT: shl.b32 %r20, %r19, 24; +; PTX-NEXT: or.b32 %r21, %r20, %r18; +; PTX-NEXT: or.b32 %r22, %r21, %r16; +; PTX-NEXT: add.u64 %rd3, %SPL, 0; +; PTX-NEXT: st.local.v2.b32 [%rd3], {%r22, %r12}; ; PTX-NEXT: ret; entry: tail 
call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll index ded78f4ff83f4..00127c85db1fb 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll @@ -21,8 +21,8 @@ ;; Allocas have been promoted - the linked dbg.assigns have been removed. ;; | V3i point = {0, 0, 0}; -; CHECK-NEXT: #dbg_value(i64 0, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 64), -; CHECK-NEXT: #dbg_value(i64 0, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64), +;; point.x and point.y are promoted together as <2 x i64> zeroinitializer. +; CHECK-NEXT: #dbg_value(<2 x i64> zeroinitializer, ![[point:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), ;; point.z = 5000; ; CHECK-NEXT: #dbg_value(i64 5000, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 128, 64), @@ -40,8 +40,8 @@ ; CHECK-NEXT: #dbg_value(i64 %other.sroa.3.0.copyload, ![[other]], !DIExpression(DW_OP_LLVM_fragment, 128, 64), ;; | std::memcpy(&point.y, &other.x, sizeof(long) * 2); -;; other is now 3 scalars: -;; point.y = other.x +;; point.x and point.y are a <2 x i64>, insertelement updates point.y: +; CHECK-NEXT: {{.*}} = insertelement <2 x i64> zeroinitializer, i64 %other.sroa.0.0.copyload, i32 1 ; CHECK-NEXT: #dbg_value(i64 %other.sroa.0.0.copyload, ![[point]], !DIExpression(DW_OP_LLVM_fragment, 64, 64), ;; ;; point.z = other.y diff --git a/llvm/test/DebugInfo/Generic/sroa-alloca-offset.ll b/llvm/test/DebugInfo/Generic/sroa-alloca-offset.ll index 6718711f83e04..30d8e80f8d73a 100644 --- a/llvm/test/DebugInfo/Generic/sroa-alloca-offset.ll +++ b/llvm/test/DebugInfo/Generic/sroa-alloca-offset.ll @@ -140,9 +140,7 @@ entry: ;; 16 bit variable f (!62): value vgf (lower bits) ;; 16 bit variable g (!63): value vgf (upper bits) ;; -;; 16 bit variable h (!64): deref 
dead_64_128 -; COMMON-NEXT: %[[dead_64_128:.*]] = alloca %struct.two -; COMMON-NEXT: #dbg_declare(ptr %[[dead_64_128]], ![[h:[0-9]+]], !DIExpression(), +;; 16 bit variable h (!64): value vh (vector) ; COMMON-NEXT: %[[ve:.*]] = load i32, ptr @gf ;; FIXME: mem2reg bug - offset is incorrect - see comment above. ; COMMON-NEXT: #dbg_value(i32 %[[ve]], ![[e:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 2), @@ -150,6 +148,8 @@ entry: ; COMMON-NEXT: #dbg_value(i32 %[[vfg]], ![[f:[0-9]+]], !DIExpression(), ;; FIXME: mem2reg bug - offset is incorrect - see comment above. ; COMMON-NEXT: #dbg_value(i32 %[[vfg]], ![[g:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 2), +; COMMON-NEXT: %[[vh:.*]] = load <2 x i32>, ptr getelementptr inbounds (i8, ptr @gf, i64 8) +; COMMON-NEXT: #dbg_value(<2 x i32> %[[vh]], ![[h:[0-9]+]], !DIExpression(), define dso_local noundef i32 @_Z4fun3v() #0 !dbg !55 { entry: %0 = alloca %struct.four, align 4 diff --git a/llvm/test/DebugInfo/X86/sroasplit-4.ll b/llvm/test/DebugInfo/X86/sroasplit-4.ll index d5ce348e9896e..97eba1d206eeb 100644 --- a/llvm/test/DebugInfo/X86/sroasplit-4.ll +++ b/llvm/test/DebugInfo/X86/sroasplit-4.ll @@ -2,11 +2,15 @@ ; ; Test that recursively splitting an alloca updates the debug info correctly. 
; CHECK: %[[T:.*]] = load i64, ptr @t, align 8 -; CHECK: #dbg_value(i64 %[[T]], ![[Y:.*]], !DIExpression(DW_OP_LLVM_fragment, 0, 64), +; CHECK: %[[VEC1:.*]] = insertelement <2 x i64> {{.*}}, i64 %[[T]], i32 0 +; CHECK: #dbg_value(<2 x i64> %[[VEC1]], ![[Y:.*]], !DIExpression(), ; CHECK: %[[T1:.*]] = load i64, ptr @t, align 8 -; CHECK: #dbg_value(i64 %[[T1]], ![[Y]], !DIExpression(DW_OP_LLVM_fragment, 64, 64), -; CHECK: #dbg_value(i64 %[[T]], ![[R:.*]], !DIExpression(DW_OP_LLVM_fragment, 192, 64), -; CHECK: #dbg_value(i64 %[[T1]], ![[R]], !DIExpression(DW_OP_LLVM_fragment, 256, 64), +; CHECK: %[[VEC2:.*]] = insertelement <2 x i64> %[[VEC1]], i64 %[[T1]], i32 1 +; CHECK: #dbg_value(<2 x i64> %[[VEC2]], ![[Y]], !DIExpression(), +; CHECK: #dbg_value(i32 0, ![[R:.*]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), +; CHECK: #dbg_value(i64 0, ![[R]], !DIExpression(DW_OP_LLVM_fragment, 64, 64), +; CHECK: #dbg_value(i64 0, ![[R]], !DIExpression(DW_OP_LLVM_fragment, 128, 64), +; CHECK: #dbg_value(<2 x i64> %[[VEC2]], ![[R]], !DIExpression(DW_OP_LLVM_fragment, 192, 128), ; ; struct p { ; __SIZE_TYPE__ s; diff --git a/llvm/test/Transforms/SROA/struct-to-vector-subpartition.ll b/llvm/test/Transforms/SROA/struct-to-vector-subpartition.ll new file mode 100644 index 0000000000000..58d2cfc69e052 --- /dev/null +++ b/llvm/test/Transforms/SROA/struct-to-vector-subpartition.ll @@ -0,0 +1,73 @@ +; RUN: opt -passes=sroa -S %s | FileCheck %s +; NOTE: Do not autogenerate. This test intentionally uses targeted CHECK +; patterns for clarity. + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; When SROA splits { ptr, i64, i64, i64 } into [0,8), [8,16), [16,32), +; the [16,32) partition type from getTypePartition is { i64, i64 }. +; tryCanonicalizeStructToVector converts this to <2 x i64>, making the +; sub-partition promotable and eliminating the alloca entirely. 
+ +; CHECK-LABEL: define void @test_subpartition_type( +; CHECK-NOT: alloca +; CHECK: load <2 x i64> +; CHECK: store <2 x i64> +define void @test_subpartition_type(ptr %src, ptr %dst) { +entry: + %a = alloca { ptr, i64, i64, i64 }, align 8 + call void @llvm.lifetime.start.p0(i64 32, ptr %a) + + ; Copy all 32 bytes from src into %a (splittable) + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a, ptr align 8 %src, i64 32, i1 false) + + ; Load ptr at [0,8) -- forces partition boundary at 8 + %p = load ptr, ptr %a, align 8 + + ; Load i64 at [8,16) -- forces partition boundary at 16 + %gep.a.8 = getelementptr inbounds i8, ptr %a, i64 8 + %v1 = load i64, ptr %gep.a.8, align 8 + + ; Only splittable memcpy uses touch [16,32), so SROA creates a single + ; [16,32) partition. getTypePartition returns { i64, i64 } for this, + ; which is canonicalized to <2 x i64>. + + ; Copy all 32 bytes from %a to dst (splittable) + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %a, i64 32, i1 false) + + call void @llvm.lifetime.end.p0(i64 32, ptr %a) + ret void +} + +; Element-wise { double, double } access through a phi. +; The phi between two allocas prevents SROA slice analysis +; ("A pointer to this alloca escaped"), so the allocas survive. 
+ +; CHECK-LABEL: define void @test_elementwise_phi( +; CHECK-NOT: <2 x double> +define void @test_elementwise_phi(ptr %src0, ptr %src1, i1 %cond, ptr %dst) { +entry: + %a = alloca { double, double }, align 8 + %b = alloca { double, double }, align 8 + %a.1 = getelementptr inbounds i8, ptr %a, i64 8 + %b.1 = getelementptr inbounds i8, ptr %b, i64 8 + %v0 = load double, ptr %src0, align 8 + %v1 = load double, ptr %src1, align 8 + store double %v0, ptr %a, align 8 + store double %v1, ptr %a.1, align 8 + store double 0.0, ptr %b, align 8 + store double 0.0, ptr %b.1, align 8 + br i1 %cond, label %if.then, label %if.else + +if.then: + br label %merge + +if.else: + br label %merge + +merge: + %sel = phi ptr [ %a, %if.then ], [ %b, %if.else ] + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %sel, i64 16, i1 false) + ret void +} diff --git a/llvm/test/Transforms/SROA/struct-to-vector.ll b/llvm/test/Transforms/SROA/struct-to-vector.ll new file mode 100644 index 0000000000000..a4f68c53952ab --- /dev/null +++ b/llvm/test/Transforms/SROA/struct-to-vector.ll @@ -0,0 +1,392 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes='sroa,gvn,instcombine,simplifycfg' -S %s | FileCheck %s +%struct.myint4 = type { i32, i32, i32, i32 } + +define dso_local void @foo_flat(ptr noundef %x, i64 %y.coerce0, i64 %y.coerce1, i32 noundef %cond) { +; CHECK-LABEL: define dso_local void @foo_flat( +; CHECK-SAME: ptr noundef [[X:%.*]], i64 [[Y_COERCE0:%.*]], i64 [[Y_COERCE1:%.*]], i32 noundef [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[COND]], 0 +; CHECK-NEXT: [[Y_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[Y_COERCE0]], i64 0 +; CHECK-NEXT: [[Y_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[Y_SROA_0_0_VEC_INSERT]], i64 [[Y_COERCE1]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[Y_SROA_0_8_VEC_INSERT]] to <4 x i32> +; CHECK-NEXT: 
[[COND1_SROA_SPECULATED:%.*]] = select i1 [[TOBOOL_NOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP0]] +; CHECK-NEXT: store <4 x i32> [[COND1_SROA_SPECULATED]], ptr [[X]], align 16 +; CHECK-NEXT: ret void +; +entry: + %y = alloca %struct.myint4, align 16 + %x.addr = alloca ptr, align 8 + %cond.addr = alloca i32, align 4 + %temp = alloca %struct.myint4, align 16 + %zero = alloca %struct.myint4, align 16 + %data = alloca %struct.myint4, align 16 + %0 = getelementptr inbounds nuw { i64, i64 }, ptr %y, i32 0, i32 0 + store i64 %y.coerce0, ptr %0, align 16 + %1 = getelementptr inbounds nuw { i64, i64 }, ptr %y, i32 0, i32 1 + store i64 %y.coerce1, ptr %1, align 8 + store ptr %x, ptr %x.addr, align 8 + store i32 %cond, ptr %cond.addr, align 4 + call void @llvm.lifetime.start.p0(ptr %temp) + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %temp, ptr align 16 %y, i64 16, i1 false) + call void @llvm.lifetime.start.p0(ptr %zero) + call void @llvm.memset.p0.i64(ptr align 16 %zero, i8 0, i64 16, i1 false) + call void @llvm.lifetime.start.p0(ptr %data) + %2 = load i32, ptr %cond.addr, align 4 + %tobool = icmp ne i32 %2, 0 + br i1 %tobool, label %cond.true, label %cond.false + +cond.true: + br label %cond.end + +cond.false: + br label %cond.end + +cond.end: + %cond1 = phi ptr [ %temp, %cond.true ], [ %zero, %cond.false ] + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %data, ptr align 16 %cond1, i64 16, i1 false) + %3 = load ptr, ptr %x.addr, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %3, ptr align 16 %data, i64 16, i1 false) + call void @llvm.lifetime.end.p0(ptr %data) + call void @llvm.lifetime.end.p0(ptr %zero) + call void @llvm.lifetime.end.p0(ptr %temp) + ret void +} +%struct.myint4_base_n = type { i32, i32, i32, i32 } +%struct.myint4_nested = type { %struct.myint4_base_n } + +define dso_local void @foo_nested(ptr noundef %x, i64 %y.coerce0, i64 %y.coerce1, i32 noundef %cond) { +; CHECK-LABEL: define dso_local void @foo_nested( +; CHECK-SAME: ptr noundef 
[[X:%.*]], i64 [[Y_COERCE0:%.*]], i64 [[Y_COERCE1:%.*]], i32 noundef [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[COND]], 0 +; CHECK-NEXT: [[Y_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[Y_COERCE0]], i64 0 +; CHECK-NEXT: [[Y_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[Y_SROA_0_0_VEC_INSERT]], i64 [[Y_COERCE1]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[Y_SROA_0_8_VEC_INSERT]] to <4 x i32> +; CHECK-NEXT: [[COND1_SROA_SPECULATED:%.*]] = select i1 [[TOBOOL_NOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP0]] +; CHECK-NEXT: store <4 x i32> [[COND1_SROA_SPECULATED]], ptr [[X]], align 16 +; CHECK-NEXT: ret void +; +entry: + %y = alloca %struct.myint4_nested, align 16 + %x.addr = alloca ptr, align 8 + %cond.addr = alloca i32, align 4 + %temp = alloca %struct.myint4_nested, align 16 + %zero = alloca %struct.myint4_nested, align 16 + %data = alloca %struct.myint4_nested, align 16 + %0 = getelementptr inbounds nuw { i64, i64 }, ptr %y, i32 0, i32 0 + store i64 %y.coerce0, ptr %0, align 16 + %1 = getelementptr inbounds nuw { i64, i64 }, ptr %y, i32 0, i32 1 + store i64 %y.coerce1, ptr %1, align 8 + store ptr %x, ptr %x.addr, align 8 + store i32 %cond, ptr %cond.addr, align 4 + call void @llvm.lifetime.start.p0(ptr %temp) + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %temp, ptr align 16 %y, i64 16, i1 false) + call void @llvm.lifetime.start.p0(ptr %zero) + call void @llvm.memset.p0.i64(ptr align 16 %zero, i8 0, i64 16, i1 false) + call void @llvm.lifetime.start.p0(ptr %data) + %2 = load i32, ptr %cond.addr, align 4 + %tobool = icmp ne i32 %2, 0 + br i1 %tobool, label %cond.true, label %cond.false + +cond.true: + br label %cond.end + +cond.false: + br label %cond.end + +cond.end: + %cond1 = phi ptr [ %temp, %cond.true ], [ %zero, %cond.false ] + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %data, ptr align 16 %cond1, i64 16, i1 false) + %3 = load ptr, ptr %x.addr, align 8 + call 
void @llvm.memcpy.p0.p0.i64(ptr align 16 %3, ptr align 16 %data, i64 16, i1 false) + call void @llvm.lifetime.end.p0(ptr %data) + call void @llvm.lifetime.end.p0(ptr %zero) + call void @llvm.lifetime.end.p0(ptr %temp) + ret void +} + +%struct.padded = type { i32, i8, i32, i8 } +define dso_local void @foo_padded(ptr noundef %x, i32 %a0, i8 %a1, +; CHECK-LABEL: define dso_local void @foo_padded( +; CHECK-SAME: ptr noundef [[X:%.*]], i32 [[A0:%.*]], i8 [[A1:%.*]], i32 [[A2:%.*]], i8 [[A3:%.*]], i32 noundef [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TEMP:%.*]] = alloca [[STRUCT_PADDED:%.*]], align 4 +; CHECK-NEXT: [[ZERO:%.*]] = alloca [[STRUCT_PADDED]], align 4 +; CHECK-NEXT: [[DATA:%.*]] = alloca [[STRUCT_PADDED]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TEMP]]) +; CHECK-NEXT: store i32 [[A0]], ptr [[TEMP]], align 4 +; CHECK-NEXT: [[Y_SROA_2_0_TEMP_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TEMP]], i64 4 +; CHECK-NEXT: store i8 [[A1]], ptr [[Y_SROA_2_0_TEMP_SROA_IDX]], align 4 +; CHECK-NEXT: [[Y_SROA_31_0_TEMP_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TEMP]], i64 8 +; CHECK-NEXT: store i32 [[A2]], ptr [[Y_SROA_31_0_TEMP_SROA_IDX]], align 4 +; CHECK-NEXT: [[Y_SROA_4_0_TEMP_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TEMP]], i64 12 +; CHECK-NEXT: store i8 [[A3]], ptr [[Y_SROA_4_0_TEMP_SROA_IDX]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[ZERO]]) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) [[ZERO]], i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[DATA]]) +; CHECK-NEXT: [[TOBOOL_PAD_NOT:%.*]] = icmp eq i32 [[COND]], 0 +; CHECK-NEXT: [[ZERO_TEMP:%.*]] = select i1 [[TOBOOL_PAD_NOT]], ptr [[ZERO]], ptr [[TEMP]] +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) [[DATA]], ptr noundef nonnull align 4 dereferenceable(16) [[ZERO_TEMP]], 
i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) [[X]], ptr noundef nonnull align 4 dereferenceable(16) [[DATA]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[DATA]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[ZERO]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TEMP]]) +; CHECK-NEXT: ret void +; + i32 %a2, i8 %a3, + i32 noundef %cond) { +entry: + %y = alloca %struct.padded, align 4 + %x.addr = alloca ptr, align 8 + %cond.addr = alloca i32, align 4 + %temp = alloca %struct.padded, align 4 + %zero = alloca %struct.padded, align 4 + %data = alloca %struct.padded, align 4 + %y_i32_0 = getelementptr inbounds %struct.padded, ptr %y, i32 0, i32 0 + store i32 %a0, ptr %y_i32_0, align 4 + %y_i8_1 = getelementptr inbounds %struct.padded, ptr %y, i32 0, i32 1 + store i8 %a1, ptr %y_i8_1, align 1 + %y_i32_2 = getelementptr inbounds %struct.padded, ptr %y, i32 0, i32 2 + store i32 %a2, ptr %y_i32_2, align 4 + %y_i8_3 = getelementptr inbounds %struct.padded, ptr %y, i32 0, i32 3 + store i8 %a3, ptr %y_i8_3, align 1 + store ptr %x, ptr %x.addr, align 8 + store i32 %cond, ptr %cond.addr, align 4 + call void @llvm.lifetime.start.p0(ptr %temp) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %temp, ptr align 4 %y, + i64 16, i1 false) + call void @llvm.lifetime.start.p0(ptr %zero) + call void @llvm.memset.p0.i64(ptr align 4 %zero, i8 0, i64 16, i1 false) + call void @llvm.lifetime.start.p0(ptr %data) + %c.pad = load i32, ptr %cond.addr, align 4 + %tobool.pad = icmp ne i32 %c.pad, 0 + br i1 %tobool.pad, label %cond.true.pad, label %cond.false.pad + +cond.true.pad: + br label %cond.end.pad + +cond.false.pad: + br label %cond.end.pad + +cond.end.pad: + %cond1.pad = phi ptr [ %temp, %cond.true.pad ], [ %zero, %cond.false.pad ] + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 %cond1.pad, + i64 16, i1 false) + %xv.pad = load ptr, ptr %x.addr, 
align 8 + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %xv.pad, ptr align 4 %data, + i64 16, i1 false) + call void @llvm.lifetime.end.p0(ptr %data) + call void @llvm.lifetime.end.p0(ptr %zero) + call void @llvm.lifetime.end.p0(ptr %temp) + ret void +} + +%struct.nonhomo = type { i32, i64, i32, i64 } +define dso_local void @foo_nonhomo(ptr noundef %x, i32 %a0, i64 %a1, +; CHECK-LABEL: define dso_local void @foo_nonhomo( +; CHECK-SAME: ptr noundef [[X:%.*]], i32 [[A0:%.*]], i64 [[A1:%.*]], i32 [[A2:%.*]], i64 [[A3:%.*]], i32 noundef [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TEMP:%.*]] = alloca [[STRUCT_NONHOMO:%.*]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca [[STRUCT_NONHOMO]], align 8 +; CHECK-NEXT: [[DATA:%.*]] = alloca [[STRUCT_NONHOMO]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TEMP]]) +; CHECK-NEXT: store i32 [[A0]], ptr [[TEMP]], align 8 +; CHECK-NEXT: [[Y_SROA_2_0_TEMP_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TEMP]], i64 4 +; CHECK-NEXT: store i64 [[A1]], ptr [[Y_SROA_2_0_TEMP_SROA_IDX]], align 4 +; CHECK-NEXT: [[Y_SROA_3_0_TEMP_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TEMP]], i64 12 +; CHECK-NEXT: store i32 [[A2]], ptr [[Y_SROA_3_0_TEMP_SROA_IDX]], align 4 +; CHECK-NEXT: [[Y_SROA_4_0_TEMP_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TEMP]], i64 16 +; CHECK-NEXT: store i64 [[A3]], ptr [[Y_SROA_4_0_TEMP_SROA_IDX]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[ZERO]]) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[ZERO]], i8 0, i64 32, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[DATA]]) +; CHECK-NEXT: [[TOBOOL_NH_NOT:%.*]] = icmp eq i32 [[COND]], 0 +; CHECK-NEXT: [[ZERO_TEMP:%.*]] = select i1 [[TOBOOL_NH_NOT]], ptr [[ZERO]], ptr [[TEMP]] +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[DATA]], ptr noundef nonnull align 8 
dereferenceable(32) [[ZERO_TEMP]], i64 32, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[X]], ptr noundef nonnull align 8 dereferenceable(32) [[DATA]], i64 32, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[DATA]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[ZERO]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TEMP]]) +; CHECK-NEXT: ret void +; + i32 %a2, i64 %a3, + i32 noundef %cond) { +entry: + %y = alloca %struct.nonhomo, align 8 + %x.addr = alloca ptr, align 8 + %cond.addr = alloca i32, align 4 + %temp = alloca %struct.nonhomo, align 8 + %zero = alloca %struct.nonhomo, align 8 + %data = alloca %struct.nonhomo, align 8 + %y_i32_0n = getelementptr inbounds %struct.nonhomo, ptr %y, i32 0, i32 0 + store i32 %a0, ptr %y_i32_0n, align 4 + %y_i64_1n = getelementptr inbounds %struct.nonhomo, ptr %y, i32 0, i32 1 + store i64 %a1, ptr %y_i64_1n, align 8 + %y_i32_2n = getelementptr inbounds %struct.nonhomo, ptr %y, i32 0, i32 2 + store i32 %a2, ptr %y_i32_2n, align 4 + %y_i64_3n = getelementptr inbounds %struct.nonhomo, ptr %y, i32 0, i32 3 + store i64 %a3, ptr %y_i64_3n, align 8 + store ptr %x, ptr %x.addr, align 8 + store i32 %cond, ptr %cond.addr, align 4 + call void @llvm.lifetime.start.p0(ptr %temp) + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %temp, ptr align 8 %y, + i64 32, i1 false) + call void @llvm.lifetime.start.p0(ptr %zero) + call void @llvm.memset.p0.i64(ptr align 8 %zero, i8 0, i64 32, i1 false) + call void @llvm.lifetime.start.p0(ptr %data) + %c.nh = load i32, ptr %cond.addr, align 4 + %tobool.nh = icmp ne i32 %c.nh, 0 + br i1 %tobool.nh, label %cond.true.nh, label %cond.false.nh + +cond.true.nh: + br label %cond.end.nh + +cond.false.nh: + br label %cond.end.nh + +cond.end.nh: + %cond1.nh = phi ptr [ %temp, %cond.true.nh ], [ %zero, %cond.false.nh ] + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %data, ptr align 8 %cond1.nh, + i64 32, i1 
false) + %xv.nh = load ptr, ptr %x.addr, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %xv.nh, ptr align 8 %data, + i64 32, i1 false) + call void @llvm.lifetime.end.p0(ptr %data) + call void @llvm.lifetime.end.p0(ptr %zero) + call void @llvm.lifetime.end.p0(ptr %temp) + ret void +} + +%struct.i1x4 = type { i1, i1, i1, i1 } +define dso_local void @foo_i1(ptr noundef %x, i64 %dummy0, i64 %dummy1, +; CHECK-LABEL: define dso_local void @foo_i1( +; CHECK-SAME: ptr noundef [[X:%.*]], i64 [[DUMMY0:%.*]], i64 [[DUMMY1:%.*]], i32 noundef [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TEMP:%.*]] = alloca [[STRUCT_I1X4:%.*]], align 1 +; CHECK-NEXT: [[ZERO:%.*]] = alloca [[STRUCT_I1X4]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TEMP]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[ZERO]]) +; CHECK-NEXT: store i32 0, ptr [[ZERO]], align 1 +; CHECK-NEXT: [[TOBOOL_I1_NOT:%.*]] = icmp eq i32 [[COND]], 0 +; CHECK-NEXT: [[ZERO_TEMP:%.*]] = select i1 [[TOBOOL_I1_NOT]], ptr [[ZERO]], ptr [[TEMP]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ZERO_TEMP]], align 1 +; CHECK-NEXT: store i32 [[TMP0]], ptr [[X]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[ZERO]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TEMP]]) +; CHECK-NEXT: ret void +; + i32 noundef %cond) { +entry: + %y = alloca %struct.i1x4, align 1 + %x.addr = alloca ptr, align 8 + %cond.addr = alloca i32, align 4 + %temp = alloca %struct.i1x4, align 1 + %zero = alloca %struct.i1x4, align 1 + %data = alloca %struct.i1x4, align 1 + store ptr %x, ptr %x.addr, align 8 + store i32 %cond, ptr %cond.addr, align 4 + call void @llvm.lifetime.start.p0(ptr %temp) + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %temp, ptr align 1 %y, + i64 4, i1 false) + call void @llvm.lifetime.start.p0(ptr %zero) + call void @llvm.memset.p0.i64(ptr align 1 %zero, i8 0, i64 4, i1 false) + call void @llvm.lifetime.start.p0(ptr %data) + %c.i1 = load 
i32, ptr %cond.addr, align 4 + %tobool.i1 = icmp ne i32 %c.i1, 0 + br i1 %tobool.i1, label %cond.true.i1, label %cond.false.i1 + +cond.true.i1: + br label %cond.end.i1 + +cond.false.i1: + br label %cond.end.i1 + +cond.end.i1: + %cond1.i1 = phi ptr [ %temp, %cond.true.i1 ], [ %zero, %cond.false.i1 ] + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %data, ptr align 1 %cond1.i1, + i64 4, i1 false) + %xv.i1 = load ptr, ptr %x.addr, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %xv.i1, ptr align 1 %data, + i64 4, i1 false) + call void @llvm.lifetime.end.p0(ptr %data) + call void @llvm.lifetime.end.p0(ptr %zero) + call void @llvm.lifetime.end.p0(ptr %temp) + ret void +} + +%struct.ptr4 = type { ptr, ptr, ptr, ptr } +define dso_local void @foo_ptr(ptr noundef %x, ptr %p0, ptr %p1, +; CHECK-LABEL: define dso_local void @foo_ptr( +; CHECK-SAME: ptr noundef [[X:%.*]], ptr [[P0:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]], ptr [[P3:%.*]], i32 noundef [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TEMP:%.*]] = alloca [[STRUCT_PTR4:%.*]], align 8 +; CHECK-NEXT: [[ZERO:%.*]] = alloca [[STRUCT_PTR4]], align 8 +; CHECK-NEXT: [[DATA:%.*]] = alloca [[STRUCT_PTR4]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TEMP]]) +; CHECK-NEXT: store ptr [[P0]], ptr [[TEMP]], align 8 +; CHECK-NEXT: [[Y_SROA_2_0_TEMP_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TEMP]], i64 8 +; CHECK-NEXT: store ptr [[P1]], ptr [[Y_SROA_2_0_TEMP_SROA_IDX]], align 8 +; CHECK-NEXT: [[Y_SROA_3_0_TEMP_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TEMP]], i64 16 +; CHECK-NEXT: store ptr [[P2]], ptr [[Y_SROA_3_0_TEMP_SROA_IDX]], align 8 +; CHECK-NEXT: [[Y_SROA_4_0_TEMP_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TEMP]], i64 24 +; CHECK-NEXT: store ptr [[P3]], ptr [[Y_SROA_4_0_TEMP_SROA_IDX]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[ZERO]]) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 
dereferenceable(32) [[ZERO]], i8 0, i64 32, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[DATA]]) +; CHECK-NEXT: [[TOBOOL_PTR_NOT:%.*]] = icmp eq i32 [[COND]], 0 +; CHECK-NEXT: [[ZERO_TEMP:%.*]] = select i1 [[TOBOOL_PTR_NOT]], ptr [[ZERO]], ptr [[TEMP]] +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[DATA]], ptr noundef nonnull align 8 dereferenceable(32) [[ZERO_TEMP]], i64 32, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[X]], ptr noundef nonnull align 8 dereferenceable(32) [[DATA]], i64 32, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[DATA]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[ZERO]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TEMP]]) +; CHECK-NEXT: ret void +; + ptr %p2, ptr %p3, + i32 noundef %cond) { +entry: + %y = alloca %struct.ptr4, align 8 + %x.addr = alloca ptr, align 8 + %cond.addr = alloca i32, align 4 + %temp = alloca %struct.ptr4, align 8 + %zero = alloca %struct.ptr4, align 8 + %data = alloca %struct.ptr4, align 8 + %y_p0 = getelementptr inbounds %struct.ptr4, ptr %y, i32 0, i32 0 + store ptr %p0, ptr %y_p0, align 8 + %y_p1 = getelementptr inbounds %struct.ptr4, ptr %y, i32 0, i32 1 + store ptr %p1, ptr %y_p1, align 8 + %y_p2 = getelementptr inbounds %struct.ptr4, ptr %y, i32 0, i32 2 + store ptr %p2, ptr %y_p2, align 8 + %y_p3 = getelementptr inbounds %struct.ptr4, ptr %y, i32 0, i32 3 + store ptr %p3, ptr %y_p3, align 8 + store ptr %x, ptr %x.addr, align 8 + store i32 %cond, ptr %cond.addr, align 4 + call void @llvm.lifetime.start.p0(ptr %temp) + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %temp, ptr align 8 %y, + i64 32, i1 false) + call void @llvm.lifetime.start.p0(ptr %zero) + call void @llvm.memset.p0.i64(ptr align 8 %zero, i8 0, i64 32, i1 false) + call void @llvm.lifetime.start.p0(ptr %data) + %c.ptr = load i32, ptr %cond.addr, align 4 
+ %tobool.ptr = icmp ne i32 %c.ptr, 0 + br i1 %tobool.ptr, label %cond.true.ptr, label %cond.false.ptr + +cond.true.ptr: + br label %cond.end.ptr + +cond.false.ptr: + br label %cond.end.ptr + +cond.end.ptr: + %cond1.ptr = phi ptr [ %temp, %cond.true.ptr ], [ %zero, %cond.false.ptr ] + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %data, ptr align 8 %cond1.ptr, + i64 32, i1 false) + %xv.ptr = load ptr, ptr %x.addr, align 8 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %xv.ptr, ptr align 8 %data, + i64 32, i1 false) + call void @llvm.lifetime.end.p0(ptr %data) + call void @llvm.lifetime.end.p0(ptr %zero) + call void @llvm.lifetime.end.p0(ptr %temp) + ret void +} diff --git a/llvm/test/Transforms/SROA/tbaa-struct3.ll b/llvm/test/Transforms/SROA/tbaa-struct3.ll index 6a0cacc7016f7..97e82db27c378 100644 --- a/llvm/test/Transforms/SROA/tbaa-struct3.ll +++ b/llvm/test/Transforms/SROA/tbaa-struct3.ll @@ -73,12 +73,13 @@ define void @load_store_transfer_split_struct_tbaa_2_i31(ptr dereferenceable(24) ; CHECK-LABEL: define void @load_store_transfer_split_struct_tbaa_2_i31( ; CHECK-SAME: ptr dereferenceable(24) [[RES:%.*]], i31 [[A:%.*]], i31 [[B:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP:%.*]] = alloca { i31, i31 }, align 4 -; CHECK-NEXT: store i31 [[A]], ptr [[TMP]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x i31>, align 8 +; CHECK-NEXT: store i31 [[A]], ptr [[TMP]], align 8 ; CHECK-NEXT: [[TMP_4_TMP_4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 4 ; CHECK-NEXT: store i31 [[B]], ptr [[TMP_4_TMP_4_SROA_IDX]], align 4 -; CHECK-NEXT: [[TMP_0_L1:%.*]] = load i62, ptr [[TMP]], align 4, !tbaa.struct [[TBAA_STRUCT4:![0-9]+]] -; CHECK-NEXT: store i62 [[TMP_0_L1]], ptr [[RES]], align 4, !tbaa.struct [[TBAA_STRUCT4]] +; CHECK-NEXT: [[TMP_SROA_0_0_TMP_SROA_0_0_L1:%.*]] = load <2 x i31>, ptr [[TMP]], align 8, !tbaa.struct [[TBAA_STRUCT4:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i31> [[TMP_SROA_0_0_TMP_SROA_0_0_L1]] to i62 +; CHECK-NEXT: 
store i62 [[TMP0]], ptr [[RES]], align 4, !tbaa.struct [[TBAA_STRUCT4]] ; CHECK-NEXT: ret void ; entry: >From fd53d1ee9d7a73774dc9b779676a17f3a50da1d7 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <[email protected]> Date: Sun, 12 Apr 2026 23:14:34 -0400 Subject: [PATCH 2/2] [SROA] Refine struct-to-vector fallback recovery Track local alloca provenance and use it to recover the safe integer memcpy-based fallback cases without reopening the broader regressions from the aggressive homogeneous-struct canonicalization, and document the provenance-based fallback rationale in the implementation. --- llvm/lib/Transforms/Scalar/SROA.cpp | 236 ++++++++++++++++-- .../struct-to-vector-fp-store-only-tail.ll | 39 +++ .../struct-to-vector-mapok-extra-alloca.ll | 35 +++ .../SROA/struct-to-vector-subpartition.ll | 17 +- llvm/test/Transforms/SROA/struct-to-vector.ll | 32 +-- 5 files changed, 323 insertions(+), 36 deletions(-) create mode 100644 llvm/test/Transforms/SROA/struct-to-vector-fp-store-only-tail.ll create mode 100644 llvm/test/Transforms/SROA/struct-to-vector-mapok-extra-alloca.ll diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index bd1d6aa90ba74..f0383b77f3cfc 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -156,6 +156,23 @@ using RewriteableMemOp = std::variant<PossiblySpeculatableLoad, UnspeculatableStore>; using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>; +/// Provenance bits for allocas rewritten by SROA. +/// +/// These flags describe how the current alloca relates to the original +/// aggregate so the struct-to-vector fallback can distinguish original +/// full-record allocas from smaller pieces created by earlier SROA rewrites. +/// They are tracked only inside this pass; they are not IR metadata. +enum AllocaProvenanceFlag : unsigned { + APF_None = 0, + /// The alloca is only a subaggregate of the original aggregate. 
+ APF_Subaggregate = 1u << 0, + /// The subaggregate starts at a non-zero offset within the original. + APF_NonPrefix = 1u << 1, + /// The subaggregate is strictly interior: it starts after offset 0 and ends + /// before the end of the original aggregate. + APF_Interior = 1u << 2, +}; + /// An optimization pass providing Scalar Replacement of Aggregates. /// /// This pass takes allocations which can be completely analyzed (that is, they @@ -180,6 +197,14 @@ class SROA { AssumptionCache *const AC; const bool PreserveCFG; + /// Side table for the provenance bits above. + /// + /// New allocas created by rewritePartition() inherit and refine the + /// provenance of the source alloca. Keeping this as pass-local state lets the + /// heuristic reason about original-vs-derived allocas without changing the + /// emitted IR. + DenseMap<const AllocaInst *, unsigned> AllocaProvenance; + /// Worklist of alloca instructions to simplify. /// /// Each alloca in the function is added to this. Each new alloca formed gets @@ -260,6 +285,18 @@ class SROA { void clobberUse(Use &U); bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas); bool promoteAllocas(); + + unsigned getAllocaProvenance(const AllocaInst &AI) const { + auto It = AllocaProvenance.find(&AI); + return It == AllocaProvenance.end() ? APF_None : It->second; + } + + void setAllocaProvenance(const AllocaInst &AI, unsigned Flags) { + if (Flags == APF_None) + AllocaProvenance.erase(&AI); + else + AllocaProvenance[&AI] = Flags; + } }; } // end anonymous namespace @@ -5179,6 +5216,101 @@ static FixedVectorType *tryCanonicalizeStructToVector(StructType *STy, return FixedVectorType::get(EltTy, NumElts); } +/// Decide whether it is profitable to canonicalize a homogeneous struct +/// partition to a vector after the usual promotion choices have already failed. 
+/// +/// This helper consolidates the local heuristics from the follow-up tuning work: +/// we only canonicalize when the partition has a non-splittable whole-partition +/// use and does not have any non-splittable sub-element loads. +/// +/// The current heuristic is intentionally conservative: +/// - Default allow case: a real whole-partition use, because that is the +/// clearest signal that a vector type can carry a whole value profitably. +/// - Default reject case: sub-element loads, because they usually turn the +/// vector back into lane extraction traffic. +/// +/// We also recover a narrow class of memcpy-only integer cases, even though +/// they are classified as splittable rather than whole-partition uses: +/// - interior i64 subaggregates produced by earlier SROA rewrites +/// - original full-record integer aggregates of size >= 32 bytes +/// +/// Rationale: +/// - Some memcpy-only integer cases are real wins and stay in whole-value form +/// after canonicalization. +/// - But recovering all full-record memcpy-only cases is too broad: in +/// benchmark cases like arrow/interfaces, an original full-record 16-byte +/// {i64, i64} helper can look locally profitable while still making the +/// enclosing caller worse after inlining and backend lowering. +/// - So the fallback stays narrow and uses provenance to distinguish the safer +/// recovered cases from the broader risky bucket. +/// +/// Intuition: +/// - Good: whole-value traffic can benefit from a vector type. +/// %tmp = load { i64, i64 }, ptr %src +/// store { i64, i64 } %tmp, ptr %dst +/// Canonicalizing to <2 x i64> exposes a single whole-value load/store. +/// +/// - Bad: field-by-field reads become extractelement traffic. +/// %x = load i32, ptr %p +/// %y = load i32, ptr (gep %p, 4) +/// Canonicalizing { i32, i32 } to <2 x i32> only adds lane extraction. +/// +/// - Bad: a store-only FP tail can seed later SLP divergence without a clear +/// SROA win. 
+/// store float %a, ptr %p0 +/// store float %b, ptr %p1 +/// store float %c, ptr %p2 +/// store float %d, ptr %p3 +/// Canonicalizing this to a temporary <4 x float> store was enough to change +/// later vectorization in benchmark cases like glTFImporter. +static bool +shouldCanonicalizeHomogeneousStructToVector(Partition &P, const DataLayout &DL, + AllocaInst &AI, + bool IsI64Candidate, + unsigned ProvenanceFlags) { + bool HasWholePartitionUse = false; + bool HasSubElementLoad = false; + bool HasRecoverableSplittableTransfer = false; + bool IsInteriorSubaggregate = (ProvenanceFlags & APF_Interior) != 0; + bool IsOriginalFullRecord = + (ProvenanceFlags & APF_Subaggregate) == 0 && P.beginOffset() == 0 && + DL.getTypeAllocSize(AI.getAllocatedType()).isFixed() && + DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue() == P.size(); + + for (const Slice &S : P) { + if (S.isDead()) + continue; + + auto *U = S.getUse(); + if (!U) + continue; + + if (S.isSplittable()) { + if (IsI64Candidate && IsInteriorSubaggregate && + S.beginOffset() == P.beginOffset() && + S.endOffset() == P.endOffset() && isa<MemIntrinsic>(U->getUser())) + HasRecoverableSplittableTransfer = true; + if (IsI64Candidate && IsOriginalFullRecord && P.size() >= 32 && + S.beginOffset() == P.beginOffset() && + S.endOffset() == P.endOffset() && isa<MemIntrinsic>(U->getUser())) + HasRecoverableSplittableTransfer = true; + continue; + } + + uint64_t SliceSize = S.endOffset() - S.beginOffset(); + if (SliceSize < P.size()) { + if (isa<LoadInst>(U->getUser())) + HasSubElementLoad = true; + continue; + } + + HasWholePartitionUse = true; + } + + return (HasWholePartitionUse || HasRecoverableSplittableTransfer) && + !HasSubElementLoad; +} + /// Select a partition type for an alloca partition. /// /// Try to compute a friendly type for this partition of the alloca. This @@ -5192,7 +5324,25 @@ static FixedVectorType *tryCanonicalizeStructToVector(StructType *STy, /// nullptr. 
static std::tuple<Type *, bool, VectorType *> selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI, - LLVMContext &C) { + LLVMContext &C, unsigned ProvenanceFlags) { + auto LogChoice = [&](StringRef Path, Type *ChosenTy, VectorType *ChosenVecTy, + bool ChosenIntWidening) { + LLVM_DEBUG({ + dbgs() << "selectPartitionType path=" << Path + << " func=" << AI.getFunction()->getName() << " alloca="; + if (AI.hasName()) + dbgs() << AI.getName(); + else + dbgs() << "<unnamed>"; + dbgs() << " partition=[" << P.beginOffset() << "," << P.endOffset() + << ") size=" << P.size() << " allocated=" << *AI.getAllocatedType(); + if (ChosenTy) + dbgs() << " chosen=" << *ChosenTy; + if (ChosenVecTy) + dbgs() << " vec=" << *ChosenVecTy; + dbgs() << " intwiden=" << ChosenIntWidening << "\n"; + }); + }; // First check if the partition is viable for vector promotion. // // We prefer vector promotion over integer widening promotion when: @@ -5209,8 +5359,10 @@ selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI, // promotion. If the vector has one element, let the below code select // whether we promote with the vector or scalar. if (VecTy && VecTy->getElementType()->isFloatingPointTy() && - VecTy->getElementCount().getFixedValue() > 1) + VecTy->getElementCount().getFixedValue() > 1) { + LogChoice("direct-fp-vecty", VecTy, VecTy, false); return {VecTy, false, VecTy}; + } // Check if there is a common type that all slices of the partition use that // spans the partition. @@ -5222,10 +5374,13 @@ selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI, // We prefer vector promotion here because if vector promotion is viable // and there is a common type used, then it implies the second listed // condition for preferring vector promotion is true. 
- if (VecTy) + if (VecTy) { + LogChoice("common-type-vecty", VecTy, VecTy, false); return {VecTy, false, VecTy}; - return {CommonUseTy, isIntegerWideningViable(P, CommonUseTy, DL), - nullptr}; + } + bool IntWiden = isIntegerWideningViable(P, CommonUseTy, DL); + LogChoice("common-type", CommonUseTy, nullptr, IntWiden); + return {CommonUseTy, IntWiden, nullptr}; } } @@ -5241,37 +5396,76 @@ selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI, DL.isLegalInteger(P.size() * 8)) TypePartitionTy = Type::getIntNTy(C, P.size() * 8); // There was no common type used, so we prefer integer widening promotion. - if (isIntegerWideningViable(P, TypePartitionTy, DL)) + if (isIntegerWideningViable(P, TypePartitionTy, DL)) { + LogChoice("type-partition-intwiden", TypePartitionTy, nullptr, true); return {TypePartitionTy, true, nullptr}; - if (VecTy) + } + if (VecTy) { + LogChoice("type-partition-vecty", VecTy, VecTy, false); return {VecTy, false, VecTy}; + } // If we couldn't promote with TypePartitionTy, try with the largest // integer type used. if (LargestIntTy && DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size() && - isIntegerWideningViable(P, LargestIntTy, DL)) + isIntegerWideningViable(P, LargestIntTy, DL)) { + LogChoice("largest-int-intwiden", LargestIntTy, nullptr, true); return {LargestIntTy, true, nullptr}; + } // Try homogeneous struct to vector canonicalization. + // + // This is intentionally more conservative than + // tryCanonicalizeStructToVector alone: after the normal promotion paths + // above fail, we only want the struct-to-vector fallback when it can + // expose a real whole-value use, not when it would merely create vector + // lanes that later passes have to unpack again. 
if (auto *STy = dyn_cast<StructType>(TypePartitionTy)) - if (auto *VTy = tryCanonicalizeStructToVector(STy, DL)) - return {VTy, false, nullptr}; + if (auto *VTy = tryCanonicalizeStructToVector(STy, DL)) { + bool AllowStructFallback = shouldCanonicalizeHomogeneousStructToVector( + P, DL, AI, VTy->getElementType()->isIntegerTy(64), + ProvenanceFlags); + LLVM_DEBUG({ + dbgs() << "selectPartitionType struct-fallback-candidate" + << " func=" << AI.getFunction()->getName() << " alloca="; + if (AI.hasName()) + dbgs() << AI.getName(); + else + dbgs() << "<unnamed>"; + dbgs() << " partition=[" << P.beginOffset() << "," << P.endOffset() + << ") size=" << P.size() << " type-partition=" << *STy + << " candidate-vec=" << *VTy << " allow=" << AllowStructFallback + << "\n"; + }); + if (AllowStructFallback) { + LogChoice("struct-fallback-vecty", VTy, nullptr, false); + return {VTy, false, nullptr}; + } + } // Fallback to TypePartitionTy and we probably won't promote. + LogChoice("type-partition-fallback", TypePartitionTy, nullptr, false); return {TypePartitionTy, false, nullptr}; } // Select the largest integer type used if it spans the partition. if (LargestIntTy && - DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size()) + DL.getTypeAllocSize(LargestIntTy).getFixedValue() >= P.size()) { + LogChoice("largest-int-fallback", LargestIntTy, nullptr, false); return {LargestIntTy, false, nullptr}; + } // Select a legal integer type if it spans the partition. - if (DL.isLegalInteger(P.size() * 8)) - return {Type::getIntNTy(C, P.size() * 8), false, nullptr}; + if (DL.isLegalInteger(P.size() * 8)) { + Type *IntTy = Type::getIntNTy(C, P.size() * 8); + LogChoice("legal-int-fallback", IntTy, nullptr, false); + return {IntTy, false, nullptr}; + } // Fallback to an i8 array. 
- return {ArrayType::get(Type::getInt8Ty(C), P.size()), false, nullptr}; + Type *ArrayTy = ArrayType::get(Type::getInt8Ty(C), P.size()); + LogChoice("byte-array-fallback", ArrayTy, nullptr, false); + return {ArrayTy, false, nullptr}; } /// Rewrite an alloca partition's users. @@ -5288,8 +5482,9 @@ std::pair<AllocaInst *, uint64_t> SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) { const DataLayout &DL = AI.getDataLayout(); // Select the type for the new alloca that spans the partition. + unsigned ProvenanceFlags = getAllocaProvenance(AI); auto [PartitionTy, IsIntegerWideningViable, VecTy] = - selectPartitionType(P, DL, AI, *C); + selectPartitionType(P, DL, AI, *C, ProvenanceFlags); // Check for the case where we're going to rewrite to a new alloca of the // exact same type as the original, and with the same access offsets. In that @@ -5316,6 +5511,17 @@ SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) { AI.getIterator()); // Copy the old AI debug location over to the new one. NewAI->setDebugLoc(AI.getDebugLoc()); + unsigned NewProvenanceFlags = ProvenanceFlags; + if (P.beginOffset() != 0 || + P.size() != DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()) + NewProvenanceFlags |= APF_Subaggregate; + if (P.beginOffset() != 0) + NewProvenanceFlags |= APF_NonPrefix; + if (P.beginOffset() != 0 && + P.endOffset() < + DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()) + NewProvenanceFlags |= APF_Interior; + setAllocaProvenance(*NewAI, NewProvenanceFlags); ++NumNewAllocas; } diff --git a/llvm/test/Transforms/SROA/struct-to-vector-fp-store-only-tail.ll b/llvm/test/Transforms/SROA/struct-to-vector-fp-store-only-tail.ll new file mode 100644 index 0000000000000..f223c2e8efa70 --- /dev/null +++ b/llvm/test/Transforms/SROA/struct-to-vector-fp-store-only-tail.ll @@ -0,0 +1,39 @@ +; RUN: opt -passes=sroa -S %s | FileCheck %s +; NOTE: Do not autogenerate. 
This regression test checks a specific store-only +; homogeneous float slice pattern that current SROA can over-vectorize. + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +%class.aiMatrix4x4t = type { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly captures(none), ptr noalias readonly captures(none), i64, i1 immarg) + +; This reduced case exercises a homogeneous FP tail slice that is only stored. +; The fixed behavior keeps the scalar/memcpy shape; the buggy behavior deletes +; the 3-float temporary and replaces it with a non-const vector store +; (`store <4 x float> %...`) seeded by FP struct-to-vector canonicalization. +; +; CHECK-LABEL: define ptr @store_only_fp_tail() +; CHECK: %.sroa.3 = alloca { float, float, float }, align 8 +; CHECK: %.sroa.4 = alloca { float, float, float, float, float, float, float, float, float, float, float }, align 8 +; CHECK: %.sroa.0.sroa.1 = alloca { float, float, float }, align 8 +; CHECK: %.sroa.2 = alloca { float, float, float, float, float, float, float, float, float, float, float }, align 8 +; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %.sroa.3, ptr align 8 %.sroa.0.sroa.1, i64 12, i1 false) +; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %.sroa.4, ptr align 8 %.sroa.2, i64 44, i1 false) +; CHECK: store float 0.000000e+00, ptr null, align 1 +; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 getelementptr inbounds (i8, ptr null, i64 4), ptr align 8 %.sroa.3, i64 12, i1 false) +; CHECK: store float 0.000000e+00, ptr getelementptr inbounds (i8, ptr null, i64 16), align 1 +; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 getelementptr inbounds (i8, ptr null, i64 20), ptr align 8 %.sroa.4, i64 44, i1 false) 
+; CHECK-NOT: store <4 x float> % +define ptr @store_only_fp_tail() { + %1 = alloca %class.aiMatrix4x4t, align 4 + %2 = alloca %class.aiMatrix4x4t, align 4 + %3 = getelementptr i8, ptr %2, i64 16 + store float 0.000000e+00, ptr %3, align 4 + call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %2, i64 64, i1 false) + store float 0.000000e+00, ptr %1, align 4 + call void @llvm.memcpy.p0.p0.i64(ptr null, ptr %1, i64 64, i1 false) + ret ptr null +} diff --git a/llvm/test/Transforms/SROA/struct-to-vector-mapok-extra-alloca.ll b/llvm/test/Transforms/SROA/struct-to-vector-mapok-extra-alloca.ll new file mode 100644 index 0000000000000..f9ce644156b31 --- /dev/null +++ b/llvm/test/Transforms/SROA/struct-to-vector-mapok-extra-alloca.ll @@ -0,0 +1,35 @@ +; RUN: opt -passes='default<O3>' -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: define void @"_ZN102_$LT$futures_util..stream..try_stream..MapOk$LT$St$C$F$GT$$u20$as$u20$futures_core..stream..Stream$GT$9poll_next17h555df33481d9c33cE" +; CHECK: [[TMP:%.*]] = alloca [11 x i64], align 8 +; CHECK: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP]]) +; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(88) [[TMP]], ptr noundef nonnull align 1 dereferenceable(88) %0, i64 88, i1 false) +; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(88) %0, ptr noundef nonnull align 8 dereferenceable(88) [[TMP]], i64 88, i1 false) +; CHECK: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP]]) +define void @"_ZN102_$LT$futures_util..stream..try_stream..MapOk$LT$St$C$F$GT$$u20$as$u20$futures_core..stream..Stream$GT$9poll_next17h555df33481d9c33cE"(ptr %0) { + call void @"_ZN101_$LT$futures_util..stream..stream..map..Map$LT$St$C$F$GT$$u20$as$u20$futures_core..stream..Stream$GT$9poll_next17h5aa844c062b2077eE"(ptr %0) + ret void +} + +; Function 
Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly captures(none), ptr noalias readonly captures(none), i64, i1 immarg) #0 + +define void @"_ZN101_$LT$futures_util..stream..stream..map..Map$LT$St$C$F$GT$$u20$as$u20$futures_core..stream..Stream$GT$9poll_next17h5aa844c062b2077eE"(ptr %0) { + %.sroa.5 = alloca [11 x i64], align 8 + %2 = load i64, ptr %0, align 8 + %3 = icmp eq i64 %2, 0 + br i1 %3, label %"_ZN122_$LT$futures_util..fns..MapOkFn$LT$F$GT$$u20$as$u20$futures_util..fns..FnMut1$LT$core..result..Result$LT$T$C$E$GT$$GT$$GT$8call_mut17h252763b5559d12fbE.exit", label %4 + +4: ; preds = %1 + call void @llvm.memcpy.p0.p0.i64(ptr %.sroa.5, ptr %0, i64 88, i1 false) + br label %"_ZN122_$LT$futures_util..fns..MapOkFn$LT$F$GT$$u20$as$u20$futures_util..fns..FnMut1$LT$core..result..Result$LT$T$C$E$GT$$GT$$GT$8call_mut17h252763b5559d12fbE.exit" + +"_ZN122_$LT$futures_util..fns..MapOkFn$LT$F$GT$$u20$as$u20$futures_util..fns..FnMut1$LT$core..result..Result$LT$T$C$E$GT$$GT$$GT$8call_mut17h252763b5559d12fbE.exit": ; preds = %4, %1 + call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %.sroa.5, i64 88, i1 false) + ret void +} + +attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } diff --git a/llvm/test/Transforms/SROA/struct-to-vector-subpartition.ll b/llvm/test/Transforms/SROA/struct-to-vector-subpartition.ll index 58d2cfc69e052..fafded012ae4a 100644 --- a/llvm/test/Transforms/SROA/struct-to-vector-subpartition.ll +++ b/llvm/test/Transforms/SROA/struct-to-vector-subpartition.ll @@ -7,13 +7,16 @@ target triple = "x86_64-unknown-linux-gnu" ; When SROA splits { ptr, i64, i64, i64 } into [0,8), [8,16), [16,32), ; the [16,32) partition type from getTypePartition is { i64, i64 }. -; tryCanonicalizeStructToVector converts this to <2 x i64>, making the -; sub-partition promotable and eliminating the alloca entirely. 
+; The current whole-use heuristic intentionally does NOT canonicalize this +; sub-partition to <2 x i64>, because the [16,32) slice is only touched by +; splittable memcpy traffic and has no non-splittable whole-partition use. +; Keeping it scalar avoids creating a vectorized temporary that later passes +; may not be able to promote away. ; CHECK-LABEL: define void @test_subpartition_type( -; CHECK-NOT: alloca -; CHECK: load <2 x i64> -; CHECK: store <2 x i64> +; CHECK: %a.sroa.6 = alloca { i64, i64 }, align 8 +; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a.sroa.6, ptr align 8 %a.sroa.6.0.src.sroa_idx, i64 16, i1 false) +; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %a.sroa.6.0.dst.sroa_idx, ptr align 8 %a.sroa.6, i64 16, i1 false) define void @test_subpartition_type(ptr %src, ptr %dst) { entry: %a = alloca { ptr, i64, i64, i64 }, align 8 @@ -30,8 +33,8 @@ entry: %v1 = load i64, ptr %gep.a.8, align 8 ; Only splittable memcpy uses touch [16,32), so SROA creates a single - ; [16,32) partition. getTypePartition returns { i64, i64 } for this, - ; which is canonicalized to <2 x i64>. + ; [16,32) partition. getTypePartition returns { i64, i64 } for this, but + ; the whole-use heuristic keeps it in scalar/memcpy form. 
; Copy all 32 bytes from %a to dst (splittable) call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %a, i64 32, i1 false) diff --git a/llvm/test/Transforms/SROA/struct-to-vector.ll b/llvm/test/Transforms/SROA/struct-to-vector.ll index a4f68c53952ab..58945334b7961 100644 --- a/llvm/test/Transforms/SROA/struct-to-vector.ll +++ b/llvm/test/Transforms/SROA/struct-to-vector.ll @@ -7,11 +7,11 @@ define dso_local void @foo_flat(ptr noundef %x, i64 %y.coerce0, i64 %y.coerce1, ; CHECK-SAME: ptr noundef [[X:%.*]], i64 [[Y_COERCE0:%.*]], i64 [[Y_COERCE1:%.*]], i32 noundef [[COND:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[COND]], 0 -; CHECK-NEXT: [[Y_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[Y_COERCE0]], i64 0 -; CHECK-NEXT: [[Y_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[Y_SROA_0_0_VEC_INSERT]], i64 [[Y_COERCE1]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[Y_SROA_0_8_VEC_INSERT]] to <4 x i32> -; CHECK-NEXT: [[COND1_SROA_SPECULATED:%.*]] = select i1 [[TOBOOL_NOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP0]] -; CHECK-NEXT: store <4 x i32> [[COND1_SROA_SPECULATED]], ptr [[X]], align 16 +; CHECK-NEXT: [[DOTY_COERCE0:%.*]] = select i1 [[TOBOOL_NOT]], i64 0, i64 [[Y_COERCE0]] +; CHECK-NEXT: [[DOTY_COERCE1:%.*]] = select i1 [[TOBOOL_NOT]], i64 0, i64 [[Y_COERCE1]] +; CHECK-NEXT: store i64 [[DOTY_COERCE0]], ptr [[X]], align 16 +; CHECK-NEXT: [[X_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 8 +; CHECK-NEXT: store i64 [[DOTY_COERCE1]], ptr [[X_REPACK7]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -44,9 +44,11 @@ cond.false: cond.end: %cond1 = phi ptr [ %temp, %cond.true ], [ %zero, %cond.false ] - call void @llvm.memcpy.p0.p0.i64(ptr align 16 %data, ptr align 16 %cond1, i64 16, i1 false) + %whole = load { i64, i64 }, ptr %cond1, align 16 + store { i64, i64 } %whole, ptr %data, align 16 %3 = load ptr, ptr %x.addr, align 8 - call void @llvm.memcpy.p0.p0.i64(ptr align 
16 %3, ptr align 16 %data, i64 16, i1 false) + %whole2 = load { i64, i64 }, ptr %data, align 16 + store { i64, i64 } %whole2, ptr %3, align 16 call void @llvm.lifetime.end.p0(ptr %data) call void @llvm.lifetime.end.p0(ptr %zero) call void @llvm.lifetime.end.p0(ptr %temp) @@ -60,11 +62,11 @@ define dso_local void @foo_nested(ptr noundef %x, i64 %y.coerce0, i64 %y.coerce1 ; CHECK-SAME: ptr noundef [[X:%.*]], i64 [[Y_COERCE0:%.*]], i64 [[Y_COERCE1:%.*]], i32 noundef [[COND:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[COND]], 0 -; CHECK-NEXT: [[Y_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[Y_COERCE0]], i64 0 -; CHECK-NEXT: [[Y_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[Y_SROA_0_0_VEC_INSERT]], i64 [[Y_COERCE1]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[Y_SROA_0_8_VEC_INSERT]] to <4 x i32> -; CHECK-NEXT: [[COND1_SROA_SPECULATED:%.*]] = select i1 [[TOBOOL_NOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP0]] -; CHECK-NEXT: store <4 x i32> [[COND1_SROA_SPECULATED]], ptr [[X]], align 16 +; CHECK-NEXT: [[DOTY_COERCE0:%.*]] = select i1 [[TOBOOL_NOT]], i64 0, i64 [[Y_COERCE0]] +; CHECK-NEXT: [[DOTY_COERCE1:%.*]] = select i1 [[TOBOOL_NOT]], i64 0, i64 [[Y_COERCE1]] +; CHECK-NEXT: store i64 [[DOTY_COERCE0]], ptr [[X]], align 16 +; CHECK-NEXT: [[X_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 8 +; CHECK-NEXT: store i64 [[DOTY_COERCE1]], ptr [[X_REPACK7]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -97,9 +99,11 @@ cond.false: cond.end: %cond1 = phi ptr [ %temp, %cond.true ], [ %zero, %cond.false ] - call void @llvm.memcpy.p0.p0.i64(ptr align 16 %data, ptr align 16 %cond1, i64 16, i1 false) + %whole = load { i64, i64 }, ptr %cond1, align 16 + store { i64, i64 } %whole, ptr %data, align 16 %3 = load ptr, ptr %x.addr, align 8 - call void @llvm.memcpy.p0.p0.i64(ptr align 16 %3, ptr align 16 %data, i64 16, i1 false) + %whole2 = load { i64, i64 }, ptr %data, align 16 + store { i64, 
i64 } %whole2, ptr %3, align 16 call void @llvm.lifetime.end.p0(ptr %data) call void @llvm.lifetime.end.p0(ptr %zero) call void @llvm.lifetime.end.p0(ptr %temp) _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
