[InstCombineCalls] Unfold element atomic memcpy instruction
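
Unfold an element atomic memcpy whose element count is a small
compile-time constant into a sequence of explicit unordered atomic
loads and stores. The unfolding is limited to at most
-unfold-element-atomic-memcpy-max-elements elements (default 16) and
to element types that are legal integers for the target.

For example, a two-element copy with 4-byte elements (schematic IR;
the intrinsic signature and the alignments here are illustrative):

  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* %dst, i8* %src,
                                                  i64 2, i32 4)

becomes, roughly:

  %dst.cast = bitcast i8* %dst to i32*
  %src.cast = bitcast i8* %src to i32*
  %val.0 = load atomic i32, i32* %src.cast unordered, align 4
  store atomic i32 %val.0, i32* %dst.cast unordered, align 4
  %src.1 = getelementptr i32, i32* %src.cast, i64 1
  %dst.1 = getelementptr i32, i32* %dst.cast, i64 1
  %val.1 = load atomic i32, i32* %src.1 unordered, align 4
  store atomic i32 %val.1, i32* %dst.1 unordered, align 4
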
Differential Revision: https://reviews.llvm.org/D28909
llvm-svn: 294453
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 13f2f9e..23f8035 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -60,6 +60,12 @@
STATISTIC(NumSimplified, "Number of library calls simplified");
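+// Example of overriding the default cap below (hypothetical invocation; the
+// flag is exposed through opt's command line like any other cl::opt):
+//   opt -instcombine -unfold-element-atomic-memcpy-max-elements=4 -S in.ll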
+static cl::opt<uint64_t> UnfoldElementAtomicMemcpyMaxElements(
+    "unfold-element-atomic-memcpy-max-elements", cl::init(16),
+    cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
+             "allowed to unfold"));
+
/// Return the specified type promoted as it would be to pass though a va_arg
/// area.
static Type *getPromotedType(Type *Ty) {
@@ -108,6 +114,78 @@
  return ConstantVector::get(BoolVec);
}
+Instruction *
+InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
+  // Try to unfold this intrinsic into a sequence of explicit atomic loads
+  // and stores.
+  // First, check that the number of elements is a compile-time constant.
+  auto *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
+  if (!NumElementsCI)
+    return nullptr;
+
+  // Check that there are not too many elements.
+  uint64_t NumElements = NumElementsCI->getZExtValue();
+  if (NumElements > UnfoldElementAtomicMemcpyMaxElements)
+    return nullptr;
+
+  // Don't unfold if the element type is not a legal integer for the target.
+  uint64_t ElementSizeInBits = AMI->getElementSizeInBytes() * 8;
+  if (!getDataLayout().isLegalInteger(ElementSizeInBits))
+    return nullptr;
+
+  // Cast the source and destination to the correct type. Intrinsic input
+  // arguments are usually represented as i8*.
+  // Often the operands will have been explicitly cast to i8*, and we could
+  // simply strip those casts instead of inserting new ones. However, it is
+  // easier to rely on other InstCombine rules, which cover the trivial cases
+  // anyway.
+  Value *Src = AMI->getRawSource();
+  Value *Dst = AMI->getRawDest();
+  Type *ElementPointerType =
+      Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
+                         Src->getType()->getPointerAddressSpace());
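+  // For example, with 4-byte elements in address space 0 this is i32*.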
+
+  Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
+                                                "memcpy_unfold.src_casted");
+  Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
+                                                "memcpy_unfold.dst_casted");
+
+  for (uint64_t i = 0; i < NumElements; ++i) {
+    // Get the addresses of the current element.
+    ConstantInt *ElementIdxCI =
+        ConstantInt::get(AMI->getContext(), APInt(64, i));
+    Value *SrcElementAddr =
+        Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+    Value *DstElementAddr =
+        Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+    // Load from the source. Transfer alignment information and mark the load
+    // as unordered atomic.
+    LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+    Load->setOrdering(AtomicOrdering::Unordered);
+    // We know the alignment of the first element. The verifier also
+    // guarantees that the element size is less than or equal to the first
+    // element's alignment and that both of these values are powers of two.
+    // This means that all subsequent accesses are at least element-size
+    // aligned.
+    // TODO: We can infer better alignment, but there is no evidence that
+    // this will matter.
+    Load->setAlignment(i == 0 ? AMI->getSrcAlignment()
+                              : AMI->getElementSizeInBytes());
+    Load->setDebugLoc(AMI->getDebugLoc());
+
+    // Store the loaded value via an unordered atomic store.
+    StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
+    Store->setOrdering(AtomicOrdering::Unordered);
+    Store->setAlignment(i == 0 ? AMI->getDstAlignment()
+                               : AMI->getElementSizeInBytes());
+    Store->setDebugLoc(AMI->getDebugLoc());
+  }
+
+  // Set the number of elements of the copy to 0; the now zero-length
+  // intrinsic will then be erased on the next iteration.
+  AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType()));
+  return AMI;
+}
+
Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
@@ -1839,6 +1917,9 @@
    if (Constant *C = dyn_cast<Constant>(AMI->getNumElements()))
      if (C->isNullValue())
        return eraseInstFromFunction(*AMI);
+
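+    // Otherwise, try to unfold the intrinsic into explicit per-element
+    // atomic loads and stores.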
+    if (Instruction *I = SimplifyElementAtomicMemCpy(AMI))
+      return I;
  }

  if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 9e04f5d..68bae0e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -650,6 +650,8 @@
  Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
  Instruction *MatchBSwap(BinaryOperator &I);
  bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+
+  Instruction *SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI);
  Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
  Instruction *SimplifyMemSet(MemSetInst *MI);