Blame - lib/Transforms/Scalar/MemCpyOptimizer.cpp - platform/external/llvm

blob: f990ba870ec162645e3ff8ac10ea02eaefdf5e30 [file] [log] [blame]

Owen Anderson	a723d1e	2008-04-09 08:23:16 +0000	[diff] [blame^]	1	//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// This pass performs various transformations related to eliminating memcpy
				11	// calls, or transforming sets of stores into memset's.
				12	//
				13	//===----------------------------------------------------------------------===//
				14
				15	#define DEBUG_TYPE "memcpyopt"
				16	#include "llvm/Transforms/Scalar.h"
				17	#include "llvm/BasicBlock.h"
				18	#include "llvm/Constants.h"
				19	#include "llvm/DerivedTypes.h"
				20	#include "llvm/Function.h"
				21	#include "llvm/IntrinsicInst.h"
				22	#include "llvm/Instructions.h"
				23	#include "llvm/ParameterAttributes.h"
				24	#include "llvm/Value.h"
				25	#include "llvm/ADT/DepthFirstIterator.h"
				26	#include "llvm/ADT/SmallVector.h"
				27	#include "llvm/ADT/Statistic.h"
				28	#include "llvm/Analysis/Dominators.h"
				29	#include "llvm/Analysis/AliasAnalysis.h"
				30	#include "llvm/Analysis/MemoryDependenceAnalysis.h"
				31	#include "llvm/Support/CFG.h"
				32	#include "llvm/Support/CommandLine.h"
				33	#include "llvm/Support/Compiler.h"
				34	#include "llvm/Support/Debug.h"
				35	#include "llvm/Support/GetElementPtrTypeIterator.h"
				36	#include "llvm/Target/TargetData.h"
				37	#include <list>
				38	using namespace llvm;
				39
				40	STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
				41	STATISTIC(NumMemSetInfer, "Number of memsets inferred");
				42
				43	namespace {
				44	cl::opt<bool>
				45	FormMemSet("form-memset-from-stores",
				46	cl::desc("Transform straight-line stores to memsets"),
				47	cl::init(true), cl::Hidden);
				48	}
				49
				50	/// isBytewiseValue - If the specified value can be set by repeating the same
				51	/// byte in memory, return the i8 value that it is represented with. This is
				52	/// true for all i8 values obviously, but is also true for i32 0, i32 -1,
				53	/// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated
				54	/// byte store (e.g. i16 0x1234), return null.
				55	static Value isBytewiseValue(Value V) {
				56	// All byte-wide stores are splatable, even of arbitrary variables.
				57	if (V->getType() == Type::Int8Ty) return V;
				58
				59	// Constant float and double values can be handled as integer values if the
				60	// corresponding integer value is "byteable". An important case is 0.0.
				61	if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
				62	if (CFP->getType() == Type::FloatTy)
				63	V = ConstantExpr::getBitCast(CFP, Type::Int32Ty);
				64	if (CFP->getType() == Type::DoubleTy)
				65	V = ConstantExpr::getBitCast(CFP, Type::Int64Ty);
				66	// Don't handle long double formats, which have strange constraints.
				67	}
				68
				69	// We can handle constant integers that are power of two in size and a
				70	// multiple of 8 bits.
				71	if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
				72	unsigned Width = CI->getBitWidth();
				73	if (isPowerOf2_32(Width) && Width > 8) {
				74	// We can handle this value if the recursive binary decomposition is the
				75	// same at all levels.
				76	APInt Val = CI->getValue();
				77	APInt Val2;
				78	while (Val.getBitWidth() != 8) {
				79	unsigned NextWidth = Val.getBitWidth()/2;
				80	Val2 = Val.lshr(NextWidth);
				81	Val2.trunc(Val.getBitWidth()/2);
				82	Val.trunc(Val.getBitWidth()/2);
				83
				84	// If the top/bottom halves aren't the same, reject it.
				85	if (Val != Val2)
				86	return 0;
				87	}
				88	return ConstantInt::get(Val);
				89	}
				90	}
				91
				92	// Conceptually, we could handle things like:
				93	// %a = zext i8 %X to i16
				94	// %b = shl i16 %a, 8
				95	// %c = or i16 %a, %b
				96	// but until there is an example that actually needs this, it doesn't seem
				97	// worth worrying about.
				98	return 0;
				99	}
				100
				101	static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
				102	bool &VariableIdxFound, TargetData &TD) {
				103	// Skip over the first indices.
				104	gep_type_iterator GTI = gep_type_begin(GEP);
				105	for (unsigned i = 1; i != Idx; ++i, ++GTI)
				106	/skip along/;
				107
				108	// Compute the offset implied by the rest of the indices.
				109	int64_t Offset = 0;
				110	for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
				111	ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
				112	if (OpC == 0)
				113	return VariableIdxFound = true;
				114	if (OpC->isZero()) continue; // No offset.
				115
				116	// Handle struct indices, which add their field offset to the pointer.
				117	if (const StructType STy = dyn_cast<StructType>(GTI)) {
				118	Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
				119	continue;
				120	}
				121
				122	// Otherwise, we have a sequential type like an array or vector. Multiply
				123	// the index by the ElementSize.
				124	uint64_t Size = TD.getABITypeSize(GTI.getIndexedType());
				125	Offset += Size*OpC->getSExtValue();
				126	}
				127
				128	return Offset;
				129	}
				130
				131	/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a
				132	/// constant offset, and return that constant offset. For example, Ptr1 might
				133	/// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8.
				134	static bool IsPointerOffset(Value Ptr1, Value Ptr2, int64_t &Offset,
				135	TargetData &TD) {
				136	// Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
				137	// base. After that base, they may have some number of common (and
				138	// potentially variable) indices. After that they handle some constant
				139	// offset, which determines their offset from each other. At this point, we
				140	// handle no other case.
				141	GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
				142	GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
				143	if (!GEP1 \|\| !GEP2 \|\| GEP1->getOperand(0) != GEP2->getOperand(0))
				144	return false;
				145
				146	// Skip any common indices and track the GEP types.
				147	unsigned Idx = 1;
				148	for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
				149	if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
				150	break;
				151
				152	bool VariableIdxFound = false;
				153	int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD);
				154	int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD);
				155	if (VariableIdxFound) return false;
				156
				157	Offset = Offset2-Offset1;
				158	return true;
				159	}
				160
				161
				162	/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value.
				163	/// This allows us to analyze stores like:
				164	/// store 0 -> P+1
				165	/// store 0 -> P+0
				166	/// store 0 -> P+3
				167	/// store 0 -> P+2
				168	/// which sometimes happens with stores to arrays of structs etc. When we see
				169	/// the first store, we make a range [1, 2). The second store extends the range
				170	/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
				171	/// two ranges into [0, 3) which is memset'able.
				172	namespace {
				173	struct MemsetRange {
				174	// Start/End - A semi range that describes the span that this range covers.
				175	// The range is closed at the start and open at the end: [Start, End).
				176	int64_t Start, End;
				177
				178	/// StartPtr - The getelementptr instruction that points to the start of the
				179	/// range.
				180	Value *StartPtr;
				181
				182	/// Alignment - The known alignment of the first store.
				183	unsigned Alignment;
				184
				185	/// TheStores - The actual stores that make up this range.
				186	SmallVector<StoreInst*, 16> TheStores;
				187
				188	bool isProfitableToUseMemset(const TargetData &TD) const;
				189
				190	};
				191	} // end anon namespace
				192
				193	bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const {
				194	// If we found more than 8 stores to merge or 64 bytes, use memset.
				195	if (TheStores.size() >= 8 \|\| End-Start >= 64) return true;
				196
				197	// Assume that the code generator is capable of merging pairs of stores
				198	// together if it wants to.
				199	if (TheStores.size() <= 2) return false;
				200
				201	// If we have fewer than 8 stores, it can still be worthwhile to do this.
				202	// For example, merging 4 i8 stores into an i32 store is useful almost always.
				203	// However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
				204	// memset will be split into 2 32-bit stores anyway) and doing so can
				205	// pessimize the llvm optimizer.
				206	//
				207	// Since we don't have perfect knowledge here, make some assumptions: assume
				208	// the maximum GPR width is the same size as the pointer size and assume that
				209	// this width can be stored. If so, check to see whether we will end up
				210	// actually reducing the number of stores used.
				211	unsigned Bytes = unsigned(End-Start);
				212	unsigned NumPointerStores = Bytes/TD.getPointerSize();
				213
				214	// Assume the remaining bytes if any are done a byte at a time.
				215	unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
				216
				217	// If we will reduce the # stores (according to this heuristic), do the
				218	// transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
				219	// etc.
				220	return TheStores.size() > NumPointerStores+NumByteStores;
				221	}
				222
				223
				224	namespace {
				225	class MemsetRanges {
				226	/// Ranges - A sorted list of the memset ranges. We use std::list here
				227	/// because each element is relatively large and expensive to copy.
				228	std::list<MemsetRange> Ranges;
				229	typedef std::list<MemsetRange>::iterator range_iterator;
				230	TargetData &TD;
				231	public:
				232	MemsetRanges(TargetData &td) : TD(td) {}
				233
				234	typedef std::list<MemsetRange>::const_iterator const_iterator;
				235	const_iterator begin() const { return Ranges.begin(); }
				236	const_iterator end() const { return Ranges.end(); }
				237	bool empty() const { return Ranges.empty(); }
				238
				239	void addStore(int64_t OffsetFromFirst, StoreInst *SI);
				240	};
				241
				242	} // end anon namespace
				243
				244
				245	/// addStore - Add a new store to the MemsetRanges data structure. This adds a
				246	/// new range for the specified store at the specified offset, merging into
				247	/// existing ranges as appropriate.
				248	void MemsetRanges::addStore(int64_t Start, StoreInst *SI) {
				249	int64_t End = Start+TD.getTypeStoreSize(SI->getOperand(0)->getType());
				250
				251	// Do a linear search of the ranges to see if this can be joined and/or to
				252	// find the insertion point in the list. We keep the ranges sorted for
				253	// simplicity here. This is a linear search of a linked list, which is ugly,
				254	// however the number of ranges is limited, so this won't get crazy slow.
				255	range_iterator I = Ranges.begin(), E = Ranges.end();
				256
				257	while (I != E && Start > I->End)
				258	++I;
				259
				260	// We now know that I == E, in which case we didn't find anything to merge
				261	// with, or that Start <= I->End. If End < I->Start or I == E, then we need
				262	// to insert a new range. Handle this now.
				263	if (I == E \|\| End < I->Start) {
				264	MemsetRange &R = *Ranges.insert(I, MemsetRange());
				265	R.Start = Start;
				266	R.End = End;
				267	R.StartPtr = SI->getPointerOperand();
				268	R.Alignment = SI->getAlignment();
				269	R.TheStores.push_back(SI);
				270	return;
				271	}
				272
				273	// This store overlaps with I, add it.
				274	I->TheStores.push_back(SI);
				275
				276	// At this point, we may have an interval that completely contains our store.
				277	// If so, just add it to the interval and return.
				278	if (I->Start <= Start && I->End >= End)
				279	return;
				280
				281	// Now we know that Start <= I->End and End >= I->Start so the range overlaps
				282	// but is not entirely contained within the range.
				283
				284	// See if the range extends the start of the range. In this case, it couldn't
				285	// possibly cause it to join the prior range, because otherwise we would have
				286	// stopped on it.
				287	if (Start < I->Start) {
				288	I->Start = Start;
				289	I->StartPtr = SI->getPointerOperand();
				290	}
				291
				292	// Now we know that Start <= I->End and Start >= I->Start (so the startpoint
				293	// is in or right at the end of I), and that End >= I->Start. Extend I out to
				294	// End.
				295	if (End > I->End) {
				296	I->End = End;
				297	range_iterator NextI = I;;
				298	while (++NextI != E && End >= NextI->Start) {
				299	// Merge the range in.
				300	I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
				301	if (NextI->End > I->End)
				302	I->End = NextI->End;
				303	Ranges.erase(NextI);
				304	NextI = I;
				305	}
				306	}
				307	}
				308
				309	//===----------------------------------------------------------------------===//
				310	// MemCpyOpt Pass
				311	//===----------------------------------------------------------------------===//
				312
				313	namespace {
				314
				315	class VISIBILITY_HIDDEN MemCpyOpt : public FunctionPass {
				316	bool runOnFunction(Function &F);
				317	public:
				318	static char ID; // Pass identification, replacement for typeid
				319	MemCpyOpt() : FunctionPass((intptr_t)&ID) { }
				320
				321	private:
				322	// This transformation requires dominator postdominator info
				323	virtual void getAnalysisUsage(AnalysisUsage &AU) const {
				324	AU.setPreservesCFG();
				325	AU.addRequired<DominatorTree>();
				326	AU.addRequired<MemoryDependenceAnalysis>();
				327	AU.addRequired<AliasAnalysis>();
				328	AU.addRequired<TargetData>();
				329	AU.addPreserved<AliasAnalysis>();
				330	AU.addPreserved<MemoryDependenceAnalysis>();
				331	AU.addPreserved<TargetData>();
				332	}
				333
				334	// Helper fuctions
				335	bool processInstruction(Instruction* I,
				336	SmallVectorImpl<Instruction*> &toErase);
				337	bool processStore(StoreInst SI, SmallVectorImpl<Instruction> &toErase);
				338	bool processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
				339	SmallVectorImpl<Instruction*> &toErase);
				340	bool performCallSlotOptzn(MemCpyInst* cpy, CallInst* C,
				341	SmallVectorImpl<Instruction*> &toErase);
				342	bool iterateOnFunction(Function &F);
				343	};
				344
				345	char MemCpyOpt::ID = 0;
				346	}
				347
				348	// createMemCpyOptPass - The public interface to this file...
				349	FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
				350
				351	static RegisterPass<MemCpyOpt> X("memcpyopt",
				352	"MemCpy Optimization");
				353
				354
				355
				356	/// processStore - When GVN is scanning forward over instructions, we look for
				357	/// some other patterns to fold away. In particular, this looks for stores to
				358	/// neighboring locations of memory. If it sees enough consequtive ones
				359	/// (currently 4) it attempts to merge them together into a memcpy/memset.
				360	bool MemCpyOpt::processStore(StoreInst SI, SmallVectorImpl<Instruction> &toErase) {
				361	if (!FormMemSet) return false;
				362	if (SI->isVolatile()) return false;
				363
				364	// There are two cases that are interesting for this code to handle: memcpy
				365	// and memset. Right now we only handle memset.
				366
				367	// Ensure that the value being stored is something that can be memset'able a
				368	// byte at a time like "0" or "-1" or any width, as well as things like
				369	// 0xA0A0A0A0 and 0.0.
				370	Value *ByteVal = isBytewiseValue(SI->getOperand(0));
				371	if (!ByteVal)
				372	return false;
				373
				374	TargetData &TD = getAnalysis<TargetData>();
				375	AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
				376
				377	// Okay, so we now have a single store that can be splatable. Scan to find
				378	// all subsequent stores of the same value to offset from the same pointer.
				379	// Join these together into ranges, so we can decide whether contiguous blocks
				380	// are stored.
				381	MemsetRanges Ranges(TD);
				382
				383	Value *StartPtr = SI->getPointerOperand();
				384
				385	BasicBlock::iterator BI = SI;
				386	for (++BI; !isa<TerminatorInst>(BI); ++BI) {
				387	if (isa<CallInst>(BI) \|\| isa<InvokeInst>(BI)) {
				388	// If the call is readnone, ignore it, otherwise bail out. We don't even
				389	// allow readonly here because we don't want something like:
				390	// A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
				391	if (AA.getModRefBehavior(CallSite::get(BI)) ==
				392	AliasAnalysis::DoesNotAccessMemory)
				393	continue;
				394
				395	// TODO: If this is a memset, try to join it in.
				396
				397	break;
				398	} else if (isa<VAArgInst>(BI) \|\| isa<LoadInst>(BI))
				399	break;
				400
				401	// If this is a non-store instruction it is fine, ignore it.
				402	StoreInst *NextStore = dyn_cast<StoreInst>(BI);
				403	if (NextStore == 0) continue;
				404
				405	// If this is a store, see if we can merge it in.
				406	if (NextStore->isVolatile()) break;
				407
				408	// Check to see if this stored value is of the same byte-splattable value.
				409	if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
				410	break;
				411
				412	// Check to see if this store is to a constant offset from the start ptr.
				413	int64_t Offset;
				414	if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, TD))
				415	break;
				416
				417	Ranges.addStore(Offset, NextStore);
				418	}
				419
				420	// If we have no ranges, then we just had a single store with nothing that
				421	// could be merged in. This is a very common case of course.
				422	if (Ranges.empty())
				423	return false;
				424
				425	// If we had at least one store that could be merged in, add the starting
				426	// store as well. We try to avoid this unless there is at least something
				427	// interesting as a small compile-time optimization.
				428	Ranges.addStore(0, SI);
				429
				430
				431	Function *MemSetF = 0;
				432
				433	// Now that we have full information about ranges, loop over the ranges and
				434	// emit memset's for anything big enough to be worthwhile.
				435	bool MadeChange = false;
				436	for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
				437	I != E; ++I) {
				438	const MemsetRange &Range = *I;
				439
				440	if (Range.TheStores.size() == 1) continue;
				441
				442	// If it is profitable to lower this range to memset, do so now.
				443	if (!Range.isProfitableToUseMemset(TD))
				444	continue;
				445
				446	// Otherwise, we do want to transform this! Create a new memset. We put
				447	// the memset right before the first instruction that isn't part of this
				448	// memset block. This ensure that the memset is dominated by any addressing
				449	// instruction needed by the start of the block.
				450	BasicBlock::iterator InsertPt = BI;
				451
				452	if (MemSetF == 0)
				453	MemSetF = Intrinsic::getDeclaration(SI->getParent()->getParent()
				454	->getParent(), Intrinsic::memset_i64);
				455
				456	// Get the starting pointer of the block.
				457	StartPtr = Range.StartPtr;
				458
				459	// Cast the start ptr to be i8* as memset requires.
				460	const Type *i8Ptr = PointerType::getUnqual(Type::Int8Ty);
				461	if (StartPtr->getType() != i8Ptr)
				462	StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getNameStart(),
				463	InsertPt);
				464
				465	Value *Ops[] = {
				466	StartPtr, ByteVal, // Start, value
				467	ConstantInt::get(Type::Int64Ty, Range.End-Range.Start), // size
				468	ConstantInt::get(Type::Int32Ty, Range.Alignment) // align
				469	};
				470	Value *C = CallInst::Create(MemSetF, Ops, Ops+4, "", InsertPt);
				471	DEBUG(cerr << "Replace stores:\n";
				472	for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
				473	cerr << *Range.TheStores[i];
				474	cerr << "With: " << *C); C=C;
				475
				476	// Zap all the stores.
				477	toErase.append(Range.TheStores.begin(), Range.TheStores.end());
				478	++NumMemSetInfer;
				479	MadeChange = true;
				480	}
				481
				482	return MadeChange;
				483	}
				484
				485
				486	/// performCallSlotOptzn - takes a memcpy and a call that it depends on,
				487	/// and checks for the possibility of a call slot optimization by having
				488	/// the call write its result directly into the destination of the memcpy.
				489	bool MemCpyOpt::performCallSlotOptzn(MemCpyInst cpy, CallInst C,
				490	SmallVectorImpl<Instruction*> &toErase) {
				491	// The general transformation to keep in mind is
				492	//
				493	// call @func(..., src, ...)
				494	// memcpy(dest, src, ...)
				495	//
				496	// ->
				497	//
				498	// memcpy(dest, src, ...)
				499	// call @func(..., dest, ...)
				500	//
				501	// Since moving the memcpy is technically awkward, we additionally check that
				502	// src only holds uninitialized values at the moment of the call, meaning that
				503	// the memcpy can be discarded rather than moved.
				504
				505	// Deliberately get the source and destination with bitcasts stripped away,
				506	// because we'll need to do type comparisons based on the underlying type.
				507	Value* cpyDest = cpy->getDest();
				508	Value* cpySrc = cpy->getSource();
				509	CallSite CS = CallSite::get(C);
				510
				511	// We need to be able to reason about the size of the memcpy, so we require
				512	// that it be a constant.
				513	ConstantInt* cpyLength = dyn_cast<ConstantInt>(cpy->getLength());
				514	if (!cpyLength)
				515	return false;
				516
				517	// Require that src be an alloca. This simplifies the reasoning considerably.
				518	AllocaInst* srcAlloca = dyn_cast<AllocaInst>(cpySrc);
				519	if (!srcAlloca)
				520	return false;
				521
				522	// Check that all of src is copied to dest.
				523	TargetData& TD = getAnalysis<TargetData>();
				524
				525	ConstantInt* srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
				526	if (!srcArraySize)
				527	return false;
				528
				529	uint64_t srcSize = TD.getABITypeSize(srcAlloca->getAllocatedType()) *
				530	srcArraySize->getZExtValue();
				531
				532	if (cpyLength->getZExtValue() < srcSize)
				533	return false;
				534
				535	// Check that accessing the first srcSize bytes of dest will not cause a
				536	// trap. Otherwise the transform is invalid since it might cause a trap
				537	// to occur earlier than it otherwise would.
				538	if (AllocaInst* A = dyn_cast<AllocaInst>(cpyDest)) {
				539	// The destination is an alloca. Check it is larger than srcSize.
				540	ConstantInt* destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
				541	if (!destArraySize)
				542	return false;
				543
				544	uint64_t destSize = TD.getABITypeSize(A->getAllocatedType()) *
				545	destArraySize->getZExtValue();
				546
				547	if (destSize < srcSize)
				548	return false;
				549	} else if (Argument* A = dyn_cast<Argument>(cpyDest)) {
				550	// If the destination is an sret parameter then only accesses that are
				551	// outside of the returned struct type can trap.
				552	if (!A->hasStructRetAttr())
				553	return false;
				554
				555	const Type* StructTy = cast<PointerType>(A->getType())->getElementType();
				556	uint64_t destSize = TD.getABITypeSize(StructTy);
				557
				558	if (destSize < srcSize)
				559	return false;
				560	} else {
				561	return false;
				562	}
				563
				564	// Check that src is not accessed except via the call and the memcpy. This
				565	// guarantees that it holds only undefined values when passed in (so the final
				566	// memcpy can be dropped), that it is not read or written between the call and
				567	// the memcpy, and that writing beyond the end of it is undefined.
				568	SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(),
				569	srcAlloca->use_end());
				570	while (!srcUseList.empty()) {
				571	User* UI = srcUseList.back();
				572	srcUseList.pop_back();
				573
				574	if (isa<GetElementPtrInst>(UI) \|\| isa<BitCastInst>(UI)) {
				575	for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
				576	I != E; ++I)
				577	srcUseList.push_back(*I);
				578	} else if (UI != C && UI != cpy) {
				579	return false;
				580	}
				581	}
				582
				583	// Since we're changing the parameter to the callsite, we need to make sure
				584	// that what would be the new parameter dominates the callsite.
				585	DominatorTree& DT = getAnalysis<DominatorTree>();
				586	if (Instruction* cpyDestInst = dyn_cast<Instruction>(cpyDest))
				587	if (!DT.dominates(cpyDestInst, C))
				588	return false;
				589
				590	// In addition to knowing that the call does not access src in some
				591	// unexpected manner, for example via a global, which we deduce from
				592	// the use analysis, we also need to know that it does not sneakily
				593	// access dest. We rely on AA to figure this out for us.
				594	AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
				595	if (AA.getModRefInfo(C, cpy->getRawDest(), srcSize) !=
				596	AliasAnalysis::NoModRef)
				597	return false;
				598
				599	// All the checks have passed, so do the transformation.
				600	for (unsigned i = 0; i < CS.arg_size(); ++i)
				601	if (CS.getArgument(i) == cpySrc) {
				602	if (cpySrc->getType() != cpyDest->getType())
				603	cpyDest = CastInst::createPointerCast(cpyDest, cpySrc->getType(),
				604	cpyDest->getName(), C);
				605	CS.setArgument(i, cpyDest);
				606	}
				607
				608	// Drop any cached information about the call, because we may have changed
				609	// its dependence information by changing its parameter.
				610	MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
				611	MD.dropInstruction(C);
				612
				613	// Remove the memcpy
				614	MD.removeInstruction(cpy);
				615	toErase.push_back(cpy);
				616
				617	return true;
				618	}
				619
				620	/// processMemCpy - perform simplication of memcpy's. If we have memcpy A which
				621	/// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
				622	/// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
				623	/// This allows later passes to remove the first memcpy altogether.
				624	bool MemCpyOpt::processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
				625	SmallVectorImpl<Instruction*> &toErase) {
				626	// We can only transforms memcpy's where the dest of one is the source of the
				627	// other
				628	if (M->getSource() != MDep->getDest())
				629	return false;
				630
				631	// Second, the length of the memcpy's must be the same, or the preceeding one
				632	// must be larger than the following one.
				633	ConstantInt* C1 = dyn_cast<ConstantInt>(MDep->getLength());
				634	ConstantInt* C2 = dyn_cast<ConstantInt>(M->getLength());
				635	if (!C1 \|\| !C2)
				636	return false;
				637
				638	uint64_t DepSize = C1->getValue().getZExtValue();
				639	uint64_t CpySize = C2->getValue().getZExtValue();
				640
				641	if (DepSize < CpySize)
				642	return false;
				643
				644	// Finally, we have to make sure that the dest of the second does not
				645	// alias the source of the first
				646	AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
				647	if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) !=
				648	AliasAnalysis::NoAlias)
				649	return false;
				650	else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) !=
				651	AliasAnalysis::NoAlias)
				652	return false;
				653	else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize)
				654	!= AliasAnalysis::NoAlias)
				655	return false;
				656
				657	// If all checks passed, then we can transform these memcpy's
				658	Function* MemCpyFun = Intrinsic::getDeclaration(
				659	M->getParent()->getParent()->getParent(),
				660	M->getIntrinsicID());
				661
				662	std::vector<Value*> args;
				663	args.push_back(M->getRawDest());
				664	args.push_back(MDep->getRawSource());
				665	args.push_back(M->getLength());
				666	args.push_back(M->getAlignment());
				667
				668	CallInst* C = CallInst::Create(MemCpyFun, args.begin(), args.end(), "", M);
				669
				670	MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
				671	if (MD.getDependency(C) == MDep) {
				672	MD.dropInstruction(M);
				673	toErase.push_back(M);
				674	return true;
				675	}
				676
				677	MD.removeInstruction(C);
				678	toErase.push_back(C);
				679	return false;
				680	}
				681
				682	/// processInstruction - When calculating availability, handle an instruction
				683	/// by inserting it into the appropriate sets
				684	bool MemCpyOpt::processInstruction(Instruction *I,
				685	SmallVectorImpl<Instruction*> &toErase) {
				686	if (StoreInst *SI = dyn_cast<StoreInst>(I))
				687	return processStore(SI, toErase);
				688
				689	if (MemCpyInst* M = dyn_cast<MemCpyInst>(I)) {
				690	MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
				691
				692	// The are two possible optimizations we can do for memcpy:
				693	// a) memcpy-memcpy xform which exposes redundance for DSE
				694	// b) call-memcpy xform for return slot optimization
				695	Instruction* dep = MD.getDependency(M);
				696	if (dep == MemoryDependenceAnalysis::None \|\|
				697	dep == MemoryDependenceAnalysis::NonLocal)
				698	return false;
				699	if (MemCpyInst *MemCpy = dyn_cast<MemCpyInst>(dep))
				700	return processMemCpy(M, MemCpy, toErase);
				701	if (CallInst* C = dyn_cast<CallInst>(dep))
				702	return performCallSlotOptzn(M, C, toErase);
				703	return false;
				704	}
				705
				706	return false;
				707	}
				708
				709	// MemCpyOpt::runOnFunction - This is the main transformation entry point for a
				710	// function.
				711	//
				712	bool MemCpyOpt::runOnFunction(Function& F) {
				713
				714	bool changed = false;
				715	bool shouldContinue = true;
				716
				717	while (shouldContinue) {
				718	shouldContinue = iterateOnFunction(F);
				719	changed \|= shouldContinue;
				720	}
				721
				722	return changed;
				723	}
				724
				725
				726	// MemCpyOpt::iterateOnFunction - Executes one iteration of GVN
				727	bool MemCpyOpt::iterateOnFunction(Function &F) {
				728	bool changed_function = false;
				729
				730	DominatorTree &DT = getAnalysis<DominatorTree>();
				731
				732	SmallVector<Instruction*, 8> toErase;
				733
				734	// Top-down walk of the dominator tree
				735	for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
				736	E = df_end(DT.getRootNode()); DI != E; ++DI) {
				737
				738	BasicBlock* BB = DI->getBlock();
				739	for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
				740	BI != BE;) {
				741	changed_function \|= processInstruction(BI, toErase);
				742	if (toErase.empty()) {
				743	++BI;
				744	continue;
				745	}
				746
				747	// If we need some instructions deleted, do it now.
				748	NumMemCpyInstr += toErase.size();
				749
				750	// Avoid iterator invalidation.
				751	bool AtStart = BI == BB->begin();
				752	if (!AtStart)
				753	--BI;
				754
				755	for (SmallVector<Instruction*, 4>::iterator I = toErase.begin(),
				756	E = toErase.end(); I != E; ++I)
				757	(*I)->eraseFromParent();
				758
				759	if (AtStart)
				760	BI = BB->begin();
				761	else
				762	++BI;
				763
				764	toErase.clear();
				765	}
				766	}
				767
				768	return changed_function;
				769	}