hliao created this revision.
hliao added reviewers: arsenm, chandlerc.
Herald added subscribers: llvm-commits, cfe-commits, kerbowa, dexonsmith,
steven_wu, hiraditya, nhaehnle, wdng, jvesely.
Herald added projects: clang, LLVM.
So far, SROA only treats a pair of pointer types as convertible if they are in
the same address space. However, just like a no-op bitcast, a no-op
`addrspacecast` changes no bits, so it can also be used to convert pointers
between different address spaces. This helps the `infer-address-spaces` pass
propagate address spaces.
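For illustration, consider the pattern the new test below exercises: on
amdgcn, casting a pointer from the global address space (1) to the flat
address space (0) is a no-op, so the slot's stored and loaded pointer types
become convertible. The following is a sketch condensed from the added test
and its CHECK lines; the value names in the "after" snippet are illustrative.

  ; before: SROA previously gave up because the stored type
  ; (i32 addrspace(1)*) and the loaded type (i32*) differ in address space
  %x = alloca i32*, align 8, addrspace(5)
  %x1 = addrspacecast i32* addrspace(5)* %x to i32**
  %x2 = bitcast i32** %x1 to i32 addrspace(1)**
  store i32 addrspace(1)* %x.coerce, i32 addrspace(1)** %x2
  %x3 = load i32*, i32** %x1
  store i32 0, i32* %x3

  ; after: the alloca is promoted and only a no-op cast of the incoming
  ; argument remains
  %x3 = addrspacecast i32 addrspace(1)* %x.coerce to i32*
  store i32 0, i32* %x3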
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D81670
Files:
clang/test/CodeGen/thinlto-distributed-newpm.ll
clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/include/llvm/CodeGen/BasicTTIImpl.h
llvm/include/llvm/Transforms/Scalar/SROA.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/Transforms/Scalar/SROA.cpp
llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
llvm/test/Transforms/SROA/noop-addrspacecast.ll
Index: llvm/test/Transforms/SROA/noop-addrspacecast.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SROA/noop-addrspacecast.ll
@@ -0,0 +1,19 @@
+; RUN: opt -S -o - -sroa %s | FileCheck %s
+; RUN: opt -S -o - -passes=sroa %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+; CHECK-LABEL: @noop_addrspacecast(
+; CHECK-NEXT: = addrspacecast i32 addrspace(1)* %{{.*}} to i32*
+; CHECK-NEXT: store i32 0, i32* %{{.*}}
+; CHECK-NEXT: ret void
+define void @noop_addrspacecast(i32 addrspace(1)* %x.coerce) {
+ %x = alloca i32*, align 8, addrspace(5)
+ %x1 = addrspacecast i32* addrspace(5)* %x to i32**
+ %x2 = bitcast i32** %x1 to i32 addrspace(1)**
+ store i32 addrspace(1)* %x.coerce, i32 addrspace(1)** %x2
+ %x3 = load i32*, i32** %x1
+ store i32 0, i32* %x3
+ ret void
+}
Index: llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -123,15 +123,15 @@
; CHECK-O-NEXT: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
; CHECK-O-NEXT: Starting {{.*}}Function pass manager run.
; CHECK-O-NEXT: Running pass: SROA
-; These next two can appear in any order since they are accessed as parameters
+; These next three can appear in any order since they are accessed as parameters
; on the same call to SROA::runImpl
+; CHECK-O1-DAG: Running analysis: TargetIRAnalysis on foo
+; CHECK-O2-DAG: Running analysis: TargetIRAnalysis on foo
+; CHECK-Os-DAG: Running analysis: TargetIRAnalysis on foo
+; CHECK-Oz-DAG: Running analysis: TargetIRAnalysis on foo
; CHECK-O-DAG: Running analysis: DominatorTreeAnalysis on foo
; CHECK-O-DAG: Running analysis: AssumptionAnalysis on foo
; CHECK-O-NEXT: Running pass: EarlyCSEPass
-; CHECK-O1-NEXT: Running analysis: TargetIRAnalysis on foo
-; CHECK-O2-NEXT: Running analysis: TargetIRAnalysis on foo
-; CHECK-Os-NEXT: Running analysis: TargetIRAnalysis on foo
-; CHECK-Oz-NEXT: Running analysis: TargetIRAnalysis on foo
; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
Index: llvm/lib/Transforms/Scalar/SROA.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/SROA.cpp
+++ llvm/lib/Transforms/Scalar/SROA.cpp
@@ -41,6 +41,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -1677,7 +1678,9 @@
/// ensure that we only try to convert viable values. The strategy is that we
/// will peel off single element struct and array wrappings to get to an
/// underlying value, and convert that value.
-static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+static bool canConvertValue(const DataLayout &DL,
+ const TargetTransformInfo &TTI, Type *OldTy,
+ Type *NewTy) {
if (OldTy == NewTy)
return true;
@@ -1703,8 +1706,11 @@
NewTy = NewTy->getScalarType();
if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
- return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
- cast<PointerType>(OldTy)->getPointerAddressSpace();
+ // Pointers are convertible if they have the same address space or if the
+ // address space cast between them is a no-op.
+ unsigned OldAS = cast<PointerType>(OldTy)->getPointerAddressSpace();
+ unsigned NewAS = cast<PointerType>(NewTy)->getPointerAddressSpace();
+ return OldAS == NewAS || TTI.isNoopAddrSpaceCast(OldAS, NewAS);
}
// We can convert integers to integral pointers, but not to non-integral
@@ -1729,10 +1735,11 @@
/// This will try various different casting techniques, such as bitcasts,
/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
/// two types for viability with this routine.
-static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
- Type *NewTy) {
+static Value *convertValue(const DataLayout &DL, const TargetTransformInfo &TTI,
+ IRBuilderTy &IRB, Value *V, Type *NewTy) {
Type *OldTy = V->getType();
- assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+ assert(canConvertValue(DL, TTI, OldTy, NewTy) &&
+ "Value not convertable to type");
if (OldTy == NewTy)
return V;
@@ -1772,6 +1779,9 @@
return IRB.CreatePtrToInt(V, NewTy);
}
+ if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy())
+ return IRB.CreatePointerBitCastOrAddrSpaceCast(V, NewTy);
+
return IRB.CreateBitCast(V, NewTy);
}
@@ -1782,7 +1792,8 @@
static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
VectorType *Ty,
uint64_t ElementSize,
- const DataLayout &DL) {
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
// First validate the slice offsets.
uint64_t BeginOffset =
std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
@@ -1826,7 +1837,7 @@
assert(LTy->isIntegerTy());
LTy = SplitIntTy;
}
- if (!canConvertValue(DL, SliceTy, LTy))
+ if (!canConvertValue(DL, TTI, SliceTy, LTy))
return false;
} else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
if (SI->isVolatile())
@@ -1836,7 +1847,7 @@
assert(STy->isIntegerTy());
STy = SplitIntTy;
}
- if (!canConvertValue(DL, STy, SliceTy))
+ if (!canConvertValue(DL, TTI, STy, SliceTy))
return false;
} else {
return false;
@@ -1854,7 +1865,8 @@
/// SSA value. We only can ensure this for a limited set of operations, and we
/// don't want to do the rewrites unless we are confident that the result will
/// be promotable, so we have an early test here.
-static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
+static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
// Collect the candidate types for vector-based promotion. Also track whether
// we have different element types.
SmallVector<VectorType *, 4> CandidateTys;
@@ -1953,11 +1965,11 @@
ElementSize /= 8;
for (const Slice &S : P)
- if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+ if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, TTI))
return false;
for (const Slice *S : P.splitSliceTails())
- if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+ if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, TTI))
return false;
return true;
@@ -1977,6 +1989,7 @@
uint64_t AllocBeginOffset,
Type *AllocaTy,
const DataLayout &DL,
+ const TargetTransformInfo &TTI,
bool &WholeAllocaOp) {
uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
@@ -2009,7 +2022,7 @@
if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
return false;
} else if (RelBegin != 0 || RelEnd != Size ||
- !canConvertValue(DL, AllocaTy, LI->getType())) {
+ !canConvertValue(DL, TTI, AllocaTy, LI->getType())) {
// Non-integer loads need to be convertible from the alloca type so that
// they are promotable.
return false;
@@ -2034,7 +2047,7 @@
if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
return false;
} else if (RelBegin != 0 || RelEnd != Size ||
- !canConvertValue(DL, ValueTy, AllocaTy)) {
+ !canConvertValue(DL, TTI, ValueTy, AllocaTy)) {
// Non-integer stores need to be convertible to the alloca type so that
// they are promotable.
return false;
@@ -2061,7 +2074,8 @@
/// stores to a particular alloca into wider loads and stores and be able to
/// promote the resulting alloca.
static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
- const DataLayout &DL) {
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
// Don't create integer types larger than the maximum bitwidth.
if (SizeInBits > IntegerType::MAX_INT_BITS)
@@ -2075,8 +2089,8 @@
// be converted to the alloca type, whatever that is. We don't want to force
// the alloca itself to have an integer type if there is a more suitable one.
Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
- if (!canConvertValue(DL, AllocaTy, IntTy) ||
- !canConvertValue(DL, IntTy, AllocaTy))
+ if (!canConvertValue(DL, TTI, AllocaTy, IntTy) ||
+ !canConvertValue(DL, TTI, IntTy, AllocaTy))
return false;
// While examining uses, we ensure that the alloca has a covering load or
@@ -2090,12 +2104,12 @@
P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);
for (const Slice &S : P)
- if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+ if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL, TTI,
WholeAllocaOp))
return false;
for (const Slice *S : P.splitSliceTails())
- if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
+ if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL, TTI,
WholeAllocaOp))
return false;
@@ -2247,6 +2261,7 @@
using Base = InstVisitor<AllocaSliceRewriter, bool>;
const DataLayout &DL;
+ const TargetTransformInfo &TTI;
AllocaSlices &AS;
SROA &Pass;
AllocaInst &OldAI, &NewAI;
@@ -2296,14 +2311,14 @@
IRBuilderTy IRB;
public:
- AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
- AllocaInst &OldAI, AllocaInst &NewAI,
- uint64_t NewAllocaBeginOffset,
+ AllocaSliceRewriter(const DataLayout &DL, const TargetTransformInfo &TTI,
+ AllocaSlices &AS, SROA &Pass, AllocaInst &OldAI,
+ AllocaInst &NewAI, uint64_t NewAllocaBeginOffset,
uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
VectorType *PromotableVecTy,
SmallSetVector<PHINode *, 8> &PHIUsers,
SmallSetVector<SelectInst *, 8> &SelectUsers)
- : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+ : DL(DL), TTI(TTI), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
NewAllocaBeginOffset(NewAllocaBeginOffset),
NewAllocaEndOffset(NewAllocaEndOffset),
NewAllocaTy(NewAI.getAllocatedType()),
@@ -2449,7 +2464,7 @@
assert(!LI.isVolatile());
Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
NewAI.getAlign(), "load");
- V = convertValue(DL, IRB, V, IntTy);
+ V = convertValue(DL, TTI, IRB, V, IntTy);
assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
@@ -2490,7 +2505,7 @@
V = rewriteIntegerLoad(LI);
} else if (NewBeginOffset == NewAllocaBeginOffset &&
NewEndOffset == NewAllocaEndOffset &&
- (canConvertValue(DL, NewAllocaTy, TargetTy) ||
+ (canConvertValue(DL, TTI, NewAllocaTy, TargetTy) ||
(IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
TargetTy->isIntegerTy()))) {
LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
@@ -2543,7 +2558,7 @@
V = NewLI;
IsPtrAdjusted = true;
}
- V = convertValue(DL, IRB, V, TargetTy);
+ V = convertValue(DL, TTI, IRB, V, TargetTy);
if (IsSplit) {
assert(!LI.isVolatile());
@@ -2589,7 +2604,7 @@
? ElementTy
: FixedVectorType::get(ElementTy, NumElements);
if (V->getType() != SliceTy)
- V = convertValue(DL, IRB, V, SliceTy);
+ V = convertValue(DL, TTI, IRB, V, SliceTy);
// Mix in the existing elements.
Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
@@ -2612,12 +2627,12 @@
IntTy->getBitWidth()) {
Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
NewAI.getAlign(), "oldload");
- Old = convertValue(DL, IRB, Old, IntTy);
+ Old = convertValue(DL, TTI, IRB, Old, IntTy);
assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
}
- V = convertValue(DL, IRB, V, NewAllocaTy);
+ V = convertValue(DL, TTI, IRB, V, NewAllocaTy);
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
LLVMContext::MD_access_group});
@@ -2665,7 +2680,7 @@
StoreInst *NewSI;
if (NewBeginOffset == NewAllocaBeginOffset &&
NewEndOffset == NewAllocaEndOffset &&
- (canConvertValue(DL, V->getType(), NewAllocaTy) ||
+ (canConvertValue(DL, TTI, V->getType(), NewAllocaTy) ||
(IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
V->getType()->isIntegerTy()))) {
// If this is an integer store past the end of slice (and thus the bytes
@@ -2680,7 +2695,7 @@
V = IRB.CreateTrunc(V, AITy, "load.trunc");
}
- V = convertValue(DL, IRB, V, NewAllocaTy);
+ V = convertValue(DL, TTI, IRB, V, NewAllocaTy);
NewSI =
IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
} else {
@@ -2775,7 +2790,7 @@
const auto Len = C->getZExtValue();
auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
- return canConvertValue(DL, SrcTy, AllocaTy) &&
+ return canConvertValue(DL, TTI, SrcTy, AllocaTy) &&
DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
}();
@@ -2812,7 +2827,7 @@
Value *Splat = getIntegerSplat(
II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
- Splat = convertValue(DL, IRB, Splat, ElementTy);
+ Splat = convertValue(DL, TTI, IRB, Splat, ElementTy);
if (NumElements > 1)
Splat = getVectorSplat(Splat, NumElements);
@@ -2831,14 +2846,14 @@
EndOffset != NewAllocaBeginOffset)) {
Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
NewAI.getAlign(), "oldload");
- Old = convertValue(DL, IRB, Old, IntTy);
+ Old = convertValue(DL, TTI, IRB, Old, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
V = insertInteger(DL, IRB, Old, V, Offset, "insert");
} else {
assert(V->getType() == IntTy &&
"Wrong type for an alloca wide integer!");
}
- V = convertValue(DL, IRB, V, AllocaTy);
+ V = convertValue(DL, TTI, IRB, V, AllocaTy);
} else {
// Established these invariants above.
assert(NewBeginOffset == NewAllocaBeginOffset);
@@ -2849,7 +2864,7 @@
if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
V = getVectorSplat(V, AllocaVecTy->getNumElements());
- V = convertValue(DL, IRB, V, AllocaTy);
+ V = convertValue(DL, TTI, IRB, V, AllocaTy);
}
StoreInst *New =
@@ -3023,7 +3038,7 @@
} else if (IntTy && !IsWholeAlloca && !IsDest) {
Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
NewAI.getAlign(), "load");
- Src = convertValue(DL, IRB, Src, IntTy);
+ Src = convertValue(DL, TTI, IRB, Src, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
} else {
@@ -3041,10 +3056,10 @@
} else if (IntTy && !IsWholeAlloca && IsDest) {
Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
NewAI.getAlign(), "oldload");
- Old = convertValue(DL, IRB, Old, IntTy);
+ Old = convertValue(DL, TTI, IRB, Old, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
- Src = convertValue(DL, IRB, Src, NewAllocaTy);
+ Src = convertValue(DL, TTI, IRB, Src, NewAllocaTy);
}
StoreInst *Store = cast<StoreInst>(
@@ -4231,10 +4246,10 @@
SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
- bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+ bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL, *TTI);
VectorType *VecTy =
- IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
+ IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, *TTI);
if (VecTy)
SliceTy = VecTy;
@@ -4277,7 +4292,7 @@
SmallSetVector<PHINode *, 8> PHIUsers;
SmallSetVector<SelectInst *, 8> SelectUsers;
- AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
+ AllocaSliceRewriter Rewriter(DL, *TTI, AS, *this, AI, *NewAI, P.beginOffset(),
P.endOffset(), IsIntegerPromotable, VecTy,
PHIUsers, SelectUsers);
bool Promotable = true;
@@ -4653,11 +4668,13 @@
}
PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
- AssumptionCache &RunAC) {
+ AssumptionCache &RunAC,
+ const TargetTransformInfo &RunTTI) {
LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
C = &F.getContext();
DT = &RunDT;
AC = &RunAC;
+ TTI = &RunTTI;
BasicBlock &EntryBB = F.getEntryBlock();
for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
@@ -4711,7 +4728,8 @@
PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
- AM.getResult<AssumptionAnalysis>(F));
+ AM.getResult<AssumptionAnalysis>(F),
+ AM.getResult<TargetIRAnalysis>(F));
}
/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
@@ -4735,13 +4753,15 @@
auto PA = Impl.runImpl(
F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F));
return !PA.areAllPreserved();
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
@@ -4757,5 +4777,6 @@
"Scalar Replacement Of Aggregates", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
false, false)
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -291,6 +291,11 @@
return TTIImpl->collectFlatAddressOperands(OpIndexes, IID);
}
+bool TargetTransformInfo::isNoopAddrSpaceCast(unsigned FromAS,
+ unsigned ToAS) const {
+ return TTIImpl->isNoopAddrSpaceCast(FromAS, ToAS);
+}
+
Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
IntrinsicInst *II, Value *OldV, Value *NewV) const {
return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
Index: llvm/include/llvm/Transforms/Scalar/SROA.h
===================================================================
--- llvm/include/llvm/Transforms/Scalar/SROA.h
+++ llvm/include/llvm/Transforms/Scalar/SROA.h
@@ -30,6 +30,7 @@
class LLVMContext;
class PHINode;
class SelectInst;
+class TargetTransformInfo;
class Use;
/// A private "module" namespace for types and utilities used by SROA. These
@@ -65,6 +66,7 @@
LLVMContext *C = nullptr;
DominatorTree *DT = nullptr;
AssumptionCache *AC = nullptr;
+ const TargetTransformInfo *TTI = nullptr;
/// Worklist of alloca instructions to simplify.
///
@@ -120,7 +122,8 @@
/// Helper used by both the public run method and by the legacy pass.
PreservedAnalyses runImpl(Function &F, DominatorTree &RunDT,
- AssumptionCache &RunAC);
+ AssumptionCache &RunAC,
+ const TargetTransformInfo &RunTTI);
bool presplitLoadsAndStores(AllocaInst &AI, sroa::AllocaSlices &AS);
AllocaInst *rewritePartition(AllocaInst &AI, sroa::AllocaSlices &AS,
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -222,6 +222,10 @@
return false;
}
+ bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
+ return getTLI()->isNoopAddrSpaceCast(FromAS, ToAS);
+ }
+
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const {
return nullptr;
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -87,6 +87,10 @@
return false;
}
+ bool isNoopAddrSpaceCast(unsigned, unsigned) const {
+ return false;
+ }
+
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const {
return nullptr;
Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -376,6 +376,8 @@
bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const;
+ bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
+
/// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
/// NewV, which has a different address space. This should happen for every
/// operand index that collectFlatAddressOperands returned for the intrinsic.
@@ -1245,6 +1247,7 @@
virtual unsigned getFlatAddressSpace() = 0;
virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const = 0;
+ virtual bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const = 0;
virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *OldV,
Value *NewV) const = 0;
@@ -1517,6 +1520,10 @@
return Impl.collectFlatAddressOperands(OpIndexes, IID);
}
+ bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
+ return Impl.isNoopAddrSpaceCast(FromAS, ToAS);
+ }
+
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const override {
return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
Index: clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
===================================================================
--- clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -1,37 +1,52 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=COMMON,CHECK %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=COMMON,OPT
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - | FileCheck -check-prefix=HOST %s
#include "Inputs/cuda.h"
// Coerced struct from `struct S` without all generic pointers lowered into
// global ones.
-// CHECK: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
-// CHECK: %struct.T.coerce = type { [2 x float addrspace(1)*] }
+// COMMON: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
+// COMMON: %struct.T.coerce = type { [2 x float addrspace(1)*] }
// On the host-side compilation, generic pointer won't be coerced.
// HOST-NOT: %struct.S.coerce
// HOST-NOT: %struct.T.coerce
-// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce)
// HOST: define void @_Z22__device_stub__kernel1Pi(i32* %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)*{{.*}} %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: inttoptr
__global__ void kernel1(int *x) {
x[0]++;
}
-// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* nonnull align 4 dereferenceable(4) %x.coerce)
// HOST: define void @_Z22__device_stub__kernel2Ri(i32* nonnull align 4 dereferenceable(4) %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
__global__ void kernel2(int &x) {
x++;
}
-// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
// HOST: define void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
+// CHECK-LABEL: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)*{{.*}} %x, i32 addrspace(1)*{{.*}} %y)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
__global__ void kernel3(__attribute__((address_space(2))) int *x,
__attribute__((address_space(1))) int *y) {
y[0] = x[0];
}
-// CHECK: define void @_Z4funcPi(i32* %x)
+// COMMON-LABEL: define void @_Z4funcPi(i32*{{.*}} %x)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
__device__ void func(int *x) {
x[0]++;
}
@@ -42,16 +57,22 @@
};
// `by-val` struct will be coerced into a similar struct with all generic
// pointers lowerd into global ones.
-// CHECK: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
// HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
__global__ void kernel4(struct S s) {
s.x[0]++;
s.y[0] += 1.f;
}
// If a pointer to struct is passed, only the pointer itself is coerced into the global one.
-// CHECK: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)* %s.coerce)
// HOST: define void @_Z22__device_stub__kernel5P1S(%struct.S* %s)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)*{{.*}} %s.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
__global__ void kernel5(struct S *s) {
s->x[0]++;
s->y[0] += 1.f;
@@ -61,16 +82,22 @@
float *x[2];
};
// `by-val` array is also coerced.
-// CHECK: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
// HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
__global__ void kernel6(struct T t) {
t.x[0][0] += 1.f;
t.x[1][0] += 2.f;
}
// Check that coerced pointers retain the noalias attribute when qualified with __restrict.
-// CHECK: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias %x.coerce)
// HOST: define void @_Z22__device_stub__kernel7Pi(i32* noalias %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias{{.*}} %x.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
__global__ void kernel7(int *__restrict x) {
x[0]++;
}
Index: clang/test/CodeGen/thinlto-distributed-newpm.ll
===================================================================
--- clang/test/CodeGen/thinlto-distributed-newpm.ll
+++ clang/test/CodeGen/thinlto-distributed-newpm.ll
@@ -91,13 +91,13 @@
; CHECK-O3: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
; CHECK-O: Starting {{.*}}Function pass manager run.
; CHECK-O: Running pass: SROA on main
-; These next two can appear in any order since they are accessed as parameters
+; These next three can appear in any order since they are accessed as parameters
; on the same call to SROA::runImpl
+; CHECK-O2-DAG: Running analysis: TargetIRAnalysis on main
; CHECK-O-DAG: Running analysis: DominatorTreeAnalysis on main
; CHECK-O-DAG: Running analysis: AssumptionAnalysis on main
; CHECK-O: Running pass: EarlyCSEPass on main
; CHECK-O: Running analysis: TargetLibraryAnalysis on main
-; CHECK-O2: Running analysis: TargetIRAnalysis on main
; CHECK-O: Running analysis: MemorySSAAnalysis on main
; CHECK-O: Running analysis: AAManager on main
; CHECK-O: Running analysis: BasicAA on main