[memcpyopt] Teach memcpyopt to optimize across basic blocks
This teaches memcpyopt to make a non-local memdep query when a local query
indicates that the dependency is non-local. This notably allows it to
eliminate many more llvm.memcpy calls in common Rust code, often reducing
their number by 20-30%.
This is r319482 and r319483, along with fixes for PR35519: fix the
optimization that merges stores into memsets to preserve cached memdep
info, and fix memdep's non-local caching strategy to not assume that larger
queries are always more conservative than smaller ones.
Fixes PR28958 and PR35519.
Differential Revision: https://reviews.llvm.org/D40802
llvm-svn: 321138
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index a6c5901..d5538f5 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -919,6 +919,14 @@
Instruction *QueryInst, SmallVectorImpl<NonLocalDepResult> &Result) {
const MemoryLocation Loc = MemoryLocation::get(QueryInst);
bool isLoad = isa<LoadInst>(QueryInst);
+ return getNonLocalPointerDependencyFrom(QueryInst, Loc, isLoad, Result);
+}
+
+void MemoryDependenceResults::getNonLocalPointerDependencyFrom(
+ Instruction *QueryInst,
+ const MemoryLocation &Loc,
+ bool isLoad,
+ SmallVectorImpl<NonLocalDepResult> &Result) {
BasicBlock *FromBB = QueryInst->getParent();
assert(FromBB);
@@ -1118,21 +1126,15 @@
// If we already have a cache entry for this CacheKey, we may need to do some
// work to reconcile the cache entry and the current query.
if (!Pair.second) {
- if (CacheInfo->Size < Loc.Size) {
- // The query's Size is greater than the cached one. Throw out the
- // cached data and proceed with the query at the greater size.
+ if (CacheInfo->Size != Loc.Size) {
+ // The query's Size differs from the cached one. Throw out the
+ // cached data and proceed with the query at the new size.
CacheInfo->Pair = BBSkipFirstBlockPair();
CacheInfo->Size = Loc.Size;
for (auto &Entry : CacheInfo->NonLocalDeps)
if (Instruction *Inst = Entry.getResult().getInst())
RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey);
CacheInfo->NonLocalDeps.clear();
- } else if (CacheInfo->Size > Loc.Size) {
- // This query's Size is less than the cached one. Conservatively restart
- // the query using the greater size.
- return getNonLocalPointerDepFromBB(
- QueryInst, Pointer, Loc.getWithNewSize(CacheInfo->Size), isLoad,
- StartBB, Result, Visited, SkipFirstBlock);
}
// If the query's AATags are inconsistent with the cached one,
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 9c870b4..6af3fef9 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -476,22 +476,33 @@
Alignment = DL.getABITypeAlignment(EltType);
}
- AMemSet =
- Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+ // Remember the debug location.
+ DebugLoc Loc;
+ if (!Range.TheStores.empty())
+ Loc = Range.TheStores[0]->getDebugLoc();
DEBUG(dbgs() << "Replace stores:\n";
for (Instruction *SI : Range.TheStores)
- dbgs() << *SI << '\n';
- dbgs() << "With: " << *AMemSet << '\n');
-
- if (!Range.TheStores.empty())
- AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+ dbgs() << *SI << '\n');
// Zap all the stores.
for (Instruction *SI : Range.TheStores) {
MD->removeInstruction(SI);
SI->eraseFromParent();
}
+
+ // Create the memset after removing the stores, so that if there are any
+ // cached non-local dependencies on the removed instructions in
+ // MemoryDependenceAnalysis, the cache entries are updated to "dirty"
+ // entries pointing below the memset, so subsequent queries include the
+ // memset.
+ AMemSet =
+ Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+ if (!Range.TheStores.empty())
+ AMemSet->setDebugLoc(Loc);
+
+ DEBUG(dbgs() << "With: " << *AMemSet << '\n');
+
++NumMemSetInfer;
}
@@ -1031,9 +1042,22 @@
//
// NOTE: This is conservative, it will stop on any read from the source loc,
// not just the defining memcpy.
- MemDepResult SourceDep =
- MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
- M->getIterator(), M->getParent());
+ MemoryLocation SourceLoc = MemoryLocation::getForSource(MDep);
+ MemDepResult SourceDep = MD->getPointerDependencyFrom(SourceLoc, false,
+ M->getIterator(), M->getParent());
+
+ if (SourceDep.isNonLocal()) {
+ SmallVector<NonLocalDepResult, 2> NonLocalDepResults;
+ MD->getNonLocalPointerDependencyFrom(M, SourceLoc, /*isLoad=*/false,
+ NonLocalDepResults);
+ if (NonLocalDepResults.size() == 1) {
+ SourceDep = NonLocalDepResults[0].getResult();
+ assert((!SourceDep.getInst() ||
+ LookupDomTree().dominates(SourceDep.getInst(), M)) &&
+ "when memdep returns exactly one result, it should dominate");
+ }
+ }
+
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
@@ -1235,6 +1259,18 @@
MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
SrcLoc, true, M->getIterator(), M->getParent());
+ if (SrcDepInfo.isNonLocal()) {
+ SmallVector<NonLocalDepResult, 2> NonLocalDepResults;
+ MD->getNonLocalPointerDependencyFrom(M, SrcLoc, /*isLoad=*/true,
+ NonLocalDepResults);
+ if (NonLocalDepResults.size() == 1) {
+ SrcDepInfo = NonLocalDepResults[0].getResult();
+ assert((!SrcDepInfo.getInst() ||
+ LookupDomTree().dominates(SrcDepInfo.getInst(), M)) &&
+ "when memdep returns exactly one result, it should dominate");
+ }
+ }
+
if (SrcDepInfo.isClobber()) {
if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
return processMemCpyMemCpyDependence(M, MDep);