Blame - llvm/lib/Analysis/LoopAccessAnalysis.cpp - toolchain/llvm-project

blob: 2f4a9402167e1442618aeb5536658aacd67f484e [file] [log] [blame]

Adam Nemet	0456327	2015-02-01 16:56:15 +0000	[diff] [blame^]	1	//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// The implementation for the loop memory dependence that was originally
				11	// developed for the loop vectorizer.
				12	//
				13	//===----------------------------------------------------------------------===//
				14
				15	#include "llvm/Analysis/LoopAccessAnalysis.h"
				16	#include "llvm/Analysis/LoopInfo.h"
				17	#include "llvm/Analysis/ValueTracking.h"
				18	#include "llvm/IR/DiagnosticInfo.h"
				19	#include "llvm/IR/Dominators.h"
				20	#include "llvm/Support/Debug.h"
				21	#include "llvm/Transforms/Utils/VectorUtils.h"
				22	using namespace llvm;
				23
				24	#define DEBUG_TYPE "loop-vectorize"
				25
				26	void VectorizationReport::emitAnalysis(VectorizationReport &Message,
				27	const Function *TheFunction,
				28	const Loop *TheLoop) {
				29	DebugLoc DL = TheLoop->getStartLoc();
				30	if (Instruction *I = Message.getInstr())
				31	DL = I->getDebugLoc();
				32	emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
				33	*TheFunction, DL, Message.str());
				34	}
				35
				36	Value llvm::stripIntegerCast(Value V) {
				37	if (CastInst *CI = dyn_cast<CastInst>(V))
				38	if (CI->getOperand(0)->getType()->isIntegerTy())
				39	return CI->getOperand(0);
				40	return V;
				41	}
				42
				43	const SCEV llvm::replaceSymbolicStrideSCEV(ScalarEvolution SE,
				44	ValueToValueMap &PtrToStride,
				45	Value Ptr, Value OrigPtr) {
				46
				47	const SCEV *OrigSCEV = SE->getSCEV(Ptr);
				48
				49	// If there is an entry in the map return the SCEV of the pointer with the
				50	// symbolic stride replaced by one.
				51	ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
				52	if (SI != PtrToStride.end()) {
				53	Value *StrideVal = SI->second;
				54
				55	// Strip casts.
				56	StrideVal = stripIntegerCast(StrideVal);
				57
				58	// Replace symbolic stride by one.
				59	Value *One = ConstantInt::get(StrideVal->getType(), 1);
				60	ValueToValueMap RewriteMap;
				61	RewriteMap[StrideVal] = One;
				62
				63	const SCEV *ByOne =
				64	SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true);
				65	DEBUG(dbgs() << "LV: Replacing SCEV: " << OrigSCEV << " by: " << ByOne
				66	<< "\n");
				67	return ByOne;
				68	}
				69
				70	// Otherwise, just return the SCEV of the original pointer.
				71	return SE->getSCEV(Ptr);
				72	}
				73
				74	void LoopAccessAnalysis::RuntimePointerCheck::insert(ScalarEvolution *SE,
				75	Loop Lp, Value Ptr,
				76	bool WritePtr,
				77	unsigned DepSetId,
				78	unsigned ASId,
				79	ValueToValueMap &Strides) {
				80	// Get the stride replaced scev.
				81	const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
				82	const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
				83	assert(AR && "Invalid addrec expression");
				84	const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
				85	const SCEV ScEnd = AR->evaluateAtIteration(Ex, SE);
				86	Pointers.push_back(Ptr);
				87	Starts.push_back(AR->getStart());
				88	Ends.push_back(ScEnd);
				89	IsWritePtr.push_back(WritePtr);
				90	DependencySetId.push_back(DepSetId);
				91	AliasSetId.push_back(ASId);
				92	}
				93
				94	namespace {
				95	/// \brief Analyses memory accesses in a loop.
				96	///
				97	/// Checks whether run time pointer checks are needed and builds sets for data
				98	/// dependence checking.
				99	class AccessAnalysis {
				100	public:
				101	/// \brief Read or write access location.
				102	typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
				103	typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
				104
				105	/// \brief Set of potential dependent memory accesses.
				106	typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
				107
				108	AccessAnalysis(const DataLayout Dl, AliasAnalysis AA, DepCandidates &DA) :
				109	DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
				110
				111	/// \brief Register a load and whether it is only read from.
				112	void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
				113	Value Ptr = const_cast<Value>(Loc.Ptr);
				114	AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
				115	Accesses.insert(MemAccessInfo(Ptr, false));
				116	if (IsReadOnly)
				117	ReadOnlyPtr.insert(Ptr);
				118	}
				119
				120	/// \brief Register a store.
				121	void addStore(AliasAnalysis::Location &Loc) {
				122	Value Ptr = const_cast<Value>(Loc.Ptr);
				123	AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
				124	Accesses.insert(MemAccessInfo(Ptr, true));
				125	}
				126
				127	/// \brief Check whether we can check the pointers at runtime for
				128	/// non-intersection.
				129	bool canCheckPtrAtRT(LoopAccessAnalysis::RuntimePointerCheck &RtCheck,
				130	unsigned &NumComparisons,
				131	ScalarEvolution SE, Loop TheLoop,
				132	ValueToValueMap &Strides,
				133	bool ShouldCheckStride = false);
				134
				135	/// \brief Goes over all memory accesses, checks whether a RT check is needed
				136	/// and builds sets of dependent accesses.
				137	void buildDependenceSets() {
				138	processMemAccesses();
				139	}
				140
				141	bool isRTCheckNeeded() { return IsRTCheckNeeded; }
				142
				143	bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
				144	void resetDepChecks() { CheckDeps.clear(); }
				145
				146	MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
				147
				148	private:
				149	typedef SetVector<MemAccessInfo> PtrAccessSet;
				150
				151	/// \brief Go over all memory access and check whether runtime pointer checks
				152	/// are needed /// and build sets of dependency check candidates.
				153	void processMemAccesses();
				154
				155	/// Set of all accesses.
				156	PtrAccessSet Accesses;
				157
				158	/// Set of accesses that need a further dependence check.
				159	MemAccessInfoSet CheckDeps;
				160
				161	/// Set of pointers that are read only.
				162	SmallPtrSet<Value*, 16> ReadOnlyPtr;
				163
				164	const DataLayout *DL;
				165
				166	/// An alias set tracker to partition the access set by underlying object and
				167	//intrinsic property (such as TBAA metadata).
				168	AliasSetTracker AST;
				169
				170	/// Sets of potentially dependent accesses - members of one set share an
				171	/// underlying pointer. The set "CheckDeps" identfies which sets really need a
				172	/// dependence check.
				173	DepCandidates &DepCands;
				174
				175	bool IsRTCheckNeeded;
				176	};
				177
				178	} // end anonymous namespace
				179
				180	/// \brief Check whether a pointer can participate in a runtime bounds check.
				181	static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
				182	Value *Ptr) {
				183	const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
				184	const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
				185	if (!AR)
				186	return false;
				187
				188	return AR->isAffine();
				189	}
				190
				191	/// \brief Check the stride of the pointer and ensure that it does not wrap in
				192	/// the address space.
				193	static int isStridedPtr(ScalarEvolution SE, const DataLayout DL, Value *Ptr,
				194	const Loop *Lp, ValueToValueMap &StridesMap);
				195
				196	bool AccessAnalysis::canCheckPtrAtRT(
				197	LoopAccessAnalysis::RuntimePointerCheck &RtCheck,
				198	unsigned &NumComparisons, ScalarEvolution SE, Loop TheLoop,
				199	ValueToValueMap &StridesMap, bool ShouldCheckStride) {
				200	// Find pointers with computable bounds. We are going to use this information
				201	// to place a runtime bound check.
				202	bool CanDoRT = true;
				203
				204	bool IsDepCheckNeeded = isDependencyCheckNeeded();
				205	NumComparisons = 0;
				206
				207	// We assign a consecutive id to access from different alias sets.
				208	// Accesses between different groups doesn't need to be checked.
				209	unsigned ASId = 1;
				210	for (auto &AS : AST) {
				211	unsigned NumReadPtrChecks = 0;
				212	unsigned NumWritePtrChecks = 0;
				213
				214	// We assign consecutive id to access from different dependence sets.
				215	// Accesses within the same set don't need a runtime check.
				216	unsigned RunningDepId = 1;
				217	DenseMap<Value *, unsigned> DepSetId;
				218
				219	for (auto A : AS) {
				220	Value *Ptr = A.getValue();
				221	bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
				222	MemAccessInfo Access(Ptr, IsWrite);
				223
				224	if (IsWrite)
				225	++NumWritePtrChecks;
				226	else
				227	++NumReadPtrChecks;
				228
				229	if (hasComputableBounds(SE, StridesMap, Ptr) &&
				230	// When we run after a failing dependency check we have to make sure we
				231	// don't have wrapping pointers.
				232	(!ShouldCheckStride \|\|
				233	isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
				234	// The id of the dependence set.
				235	unsigned DepId;
				236
				237	if (IsDepCheckNeeded) {
				238	Value *Leader = DepCands.getLeaderValue(Access).getPointer();
				239	unsigned &LeaderId = DepSetId[Leader];
				240	if (!LeaderId)
				241	LeaderId = RunningDepId++;
				242	DepId = LeaderId;
				243	} else
				244	// Each access has its own dependence set.
				245	DepId = RunningDepId++;
				246
				247	RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
				248
				249	DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
				250	} else {
				251	CanDoRT = false;
				252	}
				253	}
				254
				255	if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
				256	NumComparisons += 0; // Only one dependence set.
				257	else {
				258	NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
				259	NumWritePtrChecks - 1));
				260	}
				261
				262	++ASId;
				263	}
				264
				265	// If the pointers that we would use for the bounds comparison have different
				266	// address spaces, assume the values aren't directly comparable, so we can't
				267	// use them for the runtime check. We also have to assume they could
				268	// overlap. In the future there should be metadata for whether address spaces
				269	// are disjoint.
				270	unsigned NumPointers = RtCheck.Pointers.size();
				271	for (unsigned i = 0; i < NumPointers; ++i) {
				272	for (unsigned j = i + 1; j < NumPointers; ++j) {
				273	// Only need to check pointers between two different dependency sets.
				274	if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
				275	continue;
				276	// Only need to check pointers in the same alias set.
				277	if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
				278	continue;
				279
				280	Value *PtrI = RtCheck.Pointers[i];
				281	Value *PtrJ = RtCheck.Pointers[j];
				282
				283	unsigned ASi = PtrI->getType()->getPointerAddressSpace();
				284	unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
				285	if (ASi != ASj) {
				286	DEBUG(dbgs() << "LV: Runtime check would require comparison between"
				287	" different address spaces\n");
				288	return false;
				289	}
				290	}
				291	}
				292
				293	return CanDoRT;
				294	}
				295
				296	void AccessAnalysis::processMemAccesses() {
				297	// We process the set twice: first we process read-write pointers, last we
				298	// process read-only pointers. This allows us to skip dependence tests for
				299	// read-only pointers.
				300
				301	DEBUG(dbgs() << "LV: Processing memory accesses...\n");
				302	DEBUG(dbgs() << " AST: "; AST.dump());
				303	DEBUG(dbgs() << "LV: Accesses:\n");
				304	DEBUG({
				305	for (auto A : Accesses)
				306	dbgs() << "\t" << *A.getPointer() << " (" <<
				307	(A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
				308	"read-only" : "read")) << ")\n";
				309	});
				310
				311	// The AliasSetTracker has nicely partitioned our pointers by metadata
				312	// compatibility and potential for underlying-object overlap. As a result, we
				313	// only need to check for potential pointer dependencies within each alias
				314	// set.
				315	for (auto &AS : AST) {
				316	// Note that both the alias-set tracker and the alias sets themselves used
				317	// linked lists internally and so the iteration order here is deterministic
				318	// (matching the original instruction order within each set).
				319
				320	bool SetHasWrite = false;
				321
				322	// Map of pointers to last access encountered.
				323	typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
				324	UnderlyingObjToAccessMap ObjToLastAccess;
				325
				326	// Set of access to check after all writes have been processed.
				327	PtrAccessSet DeferredAccesses;
				328
				329	// Iterate over each alias set twice, once to process read/write pointers,
				330	// and then to process read-only pointers.
				331	for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
				332	bool UseDeferred = SetIteration > 0;
				333	PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
				334
				335	for (auto AV : AS) {
				336	Value *Ptr = AV.getValue();
				337
				338	// For a single memory access in AliasSetTracker, Accesses may contain
				339	// both read and write, and they both need to be handled for CheckDeps.
				340	for (auto AC : S) {
				341	if (AC.getPointer() != Ptr)
				342	continue;
				343
				344	bool IsWrite = AC.getInt();
				345
				346	// If we're using the deferred access set, then it contains only
				347	// reads.
				348	bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
				349	if (UseDeferred && !IsReadOnlyPtr)
				350	continue;
				351	// Otherwise, the pointer must be in the PtrAccessSet, either as a
				352	// read or a write.
				353	assert(((IsReadOnlyPtr && UseDeferred) \|\| IsWrite \|\|
				354	S.count(MemAccessInfo(Ptr, false))) &&
				355	"Alias-set pointer not in the access set?");
				356
				357	MemAccessInfo Access(Ptr, IsWrite);
				358	DepCands.insert(Access);
				359
				360	// Memorize read-only pointers for later processing and skip them in
				361	// the first round (they need to be checked after we have seen all
				362	// write pointers). Note: we also mark pointer that are not
				363	// consecutive as "read-only" pointers (so that we check
				364	// "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
				365	if (!UseDeferred && IsReadOnlyPtr) {
				366	DeferredAccesses.insert(Access);
				367	continue;
				368	}
				369
				370	// If this is a write - check other reads and writes for conflicts. If
				371	// this is a read only check other writes for conflicts (but only if
				372	// there is no other write to the ptr - this is an optimization to
				373	// catch "a[i] = a[i] + " without having to do a dependence check).
				374	if ((IsWrite \|\| IsReadOnlyPtr) && SetHasWrite) {
				375	CheckDeps.insert(Access);
				376	IsRTCheckNeeded = true;
				377	}
				378
				379	if (IsWrite)
				380	SetHasWrite = true;
				381
				382	// Create sets of pointers connected by a shared alias set and
				383	// underlying object.
				384	typedef SmallVector<Value *, 16> ValueVector;
				385	ValueVector TempObjects;
				386	GetUnderlyingObjects(Ptr, TempObjects, DL);
				387	for (Value *UnderlyingObj : TempObjects) {
				388	UnderlyingObjToAccessMap::iterator Prev =
				389	ObjToLastAccess.find(UnderlyingObj);
				390	if (Prev != ObjToLastAccess.end())
				391	DepCands.unionSets(Access, Prev->second);
				392
				393	ObjToLastAccess[UnderlyingObj] = Access;
				394	}
				395	}
				396	}
				397	}
				398	}
				399	}
				400
				401	namespace {
				402	/// \brief Checks memory dependences among accesses to the same underlying
				403	/// object to determine whether there vectorization is legal or not (and at
				404	/// which vectorization factor).
				405	///
				406	/// This class works under the assumption that we already checked that memory
				407	/// locations with different underlying pointers are "must-not alias".
				408	/// We use the ScalarEvolution framework to symbolically evalutate access
				409	/// functions pairs. Since we currently don't restructure the loop we can rely
				410	/// on the program order of memory accesses to determine their safety.
				411	/// At the moment we will only deem accesses as safe for:
				412	/// * A negative constant distance assuming program order.
				413	///
				414	/// Safe: tmp = a[i + 1]; OR a[i + 1] = x;
				415	/// a[i] = tmp; y = a[i];
				416	///
				417	/// The latter case is safe because later checks guarantuee that there can't
				418	/// be a cycle through a phi node (that is, we check that "x" and "y" is not
				419	/// the same variable: a header phi can only be an induction or a reduction, a
				420	/// reduction can't have a memory sink, an induction can't have a memory
				421	/// source). This is important and must not be violated (or we have to
				422	/// resort to checking for cycles through memory).
				423	///
				424	/// * A positive constant distance assuming program order that is bigger
				425	/// than the biggest memory access.
				426	///
				427	/// tmp = a[i] OR b[i] = x
				428	/// a[i+2] = tmp y = b[i+2];
				429	///
				430	/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
				431	///
				432	/// * Zero distances and all accesses have the same size.
				433	///
				434	class MemoryDepChecker {
				435	public:
				436	typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
				437	typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
				438
				439	MemoryDepChecker(ScalarEvolution Se, const DataLayout Dl, const Loop *L,
				440	const LoopAccessAnalysis::VectorizerParams &VectParams)
				441	: SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
				442	ShouldRetryWithRuntimeCheck(false), VectParams(VectParams) {}
				443
				444	/// \brief Register the location (instructions are given increasing numbers)
				445	/// of a write access.
				446	void addAccess(StoreInst *SI) {
				447	Value *Ptr = SI->getPointerOperand();
				448	Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
				449	InstMap.push_back(SI);
				450	++AccessIdx;
				451	}
				452
				453	/// \brief Register the location (instructions are given increasing numbers)
				454	/// of a write access.
				455	void addAccess(LoadInst *LI) {
				456	Value *Ptr = LI->getPointerOperand();
				457	Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
				458	InstMap.push_back(LI);
				459	++AccessIdx;
				460	}
				461
				462	/// \brief Check whether the dependencies between the accesses are safe.
				463	///
				464	/// Only checks sets with elements in \p CheckDeps.
				465	bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
				466	MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides);
				467
				468	/// \brief The maximum number of bytes of a vector register we can vectorize
				469	/// the accesses safely with.
				470	unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
				471
				472	/// \brief In same cases when the dependency check fails we can still
				473	/// vectorize the loop with a dynamic array access check.
				474	bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
				475
				476	private:
				477	ScalarEvolution *SE;
				478	const DataLayout *DL;
				479	const Loop *InnermostLoop;
				480
				481	/// \brief Maps access locations (ptr, read/write) to program order.
				482	DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
				483
				484	/// \brief Memory access instructions in program order.
				485	SmallVector<Instruction *, 16> InstMap;
				486
				487	/// \brief The program order index to be used for the next instruction.
				488	unsigned AccessIdx;
				489
				490	// We can access this many bytes in parallel safely.
				491	unsigned MaxSafeDepDistBytes;
				492
				493	/// \brief If we see a non-constant dependence distance we can still try to
				494	/// vectorize this loop with runtime checks.
				495	bool ShouldRetryWithRuntimeCheck;
				496
				497	/// \brief Vectorizer parameters used by the analysis.
				498	LoopAccessAnalysis::VectorizerParams VectParams;
				499
				500	/// \brief Check whether there is a plausible dependence between the two
				501	/// accesses.
				502	///
				503	/// Access \p A must happen before \p B in program order. The two indices
				504	/// identify the index into the program order map.
				505	///
				506	/// This function checks whether there is a plausible dependence (or the
				507	/// absence of such can't be proved) between the two accesses. If there is a
				508	/// plausible dependence but the dependence distance is bigger than one
				509	/// element access it records this distance in \p MaxSafeDepDistBytes (if this
				510	/// distance is smaller than any other distance encountered so far).
				511	/// Otherwise, this function returns true signaling a possible dependence.
				512	bool isDependent(const MemAccessInfo &A, unsigned AIdx,
				513	const MemAccessInfo &B, unsigned BIdx,
				514	ValueToValueMap &Strides);
				515
				516	/// \brief Check whether the data dependence could prevent store-load
				517	/// forwarding.
				518	bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
				519	};
				520
				521	} // end anonymous namespace
				522
				523	static bool isInBoundsGep(Value *Ptr) {
				524	if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
				525	return GEP->isInBounds();
				526	return false;
				527	}
				528
				529	/// \brief Check whether the access through \p Ptr has a constant stride.
				530	static int isStridedPtr(ScalarEvolution SE, const DataLayout DL, Value *Ptr,
				531	const Loop *Lp, ValueToValueMap &StridesMap) {
				532	const Type *Ty = Ptr->getType();
				533	assert(Ty->isPointerTy() && "Unexpected non-ptr");
				534
				535	// Make sure that the pointer does not point to aggregate types.
				536	const PointerType *PtrTy = cast<PointerType>(Ty);
				537	if (PtrTy->getElementType()->isAggregateType()) {
				538	DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr <<
				539	"\n");
				540	return 0;
				541	}
				542
				543	const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr);
				544
				545	const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
				546	if (!AR) {
				547	DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "
				548	<< Ptr << " SCEV: " << PtrScev << "\n");
				549	return 0;
				550	}
				551
				552	// The accesss function must stride over the innermost loop.
				553	if (Lp != AR->getLoop()) {
				554	DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " <<
				555	Ptr << " SCEV: " << PtrScev << "\n");
				556	}
				557
				558	// The address calculation must not wrap. Otherwise, a dependence could be
				559	// inverted.
				560	// An inbounds getelementptr that is a AddRec with a unit stride
				561	// cannot wrap per definition. The unit stride requirement is checked later.
				562	// An getelementptr without an inbounds attribute and unit stride would have
				563	// to access the pointer value "0" which is undefined behavior in address
				564	// space 0, therefore we can also vectorize this case.
				565	bool IsInBoundsGEP = isInBoundsGep(Ptr);
				566	bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
				567	bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
				568	if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
				569	DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
				570	<< Ptr << " SCEV: " << PtrScev << "\n");
				571	return 0;
				572	}
				573
				574	// Check the step is constant.
				575	const SCEV Step = AR->getStepRecurrence(SE);
				576
				577	// Calculate the pointer stride and check if it is consecutive.
				578	const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
				579	if (!C) {
				580	DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr <<
				581	" SCEV: " << *PtrScev << "\n");
				582	return 0;
				583	}
				584
				585	int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
				586	const APInt &APStepVal = C->getValue()->getValue();
				587
				588	// Huge step value - give up.
				589	if (APStepVal.getBitWidth() > 64)
				590	return 0;
				591
				592	int64_t StepVal = APStepVal.getSExtValue();
				593
				594	// Strided access.
				595	int64_t Stride = StepVal / Size;
				596	int64_t Rem = StepVal % Size;
				597	if (Rem)
				598	return 0;
				599
				600	// If the SCEV could wrap but we have an inbounds gep with a unit stride we
				601	// know we can't "wrap around the address space". In case of address space
				602	// zero we know that this won't happen without triggering undefined behavior.
				603	if (!IsNoWrapAddRec && (IsInBoundsGEP \|\| IsInAddressSpaceZero) &&
				604	Stride != 1 && Stride != -1)
				605	return 0;
				606
				607	return Stride;
				608	}
				609
				610	bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
				611	unsigned TypeByteSize) {
				612	// If loads occur at a distance that is not a multiple of a feasible vector
				613	// factor store-load forwarding does not take place.
				614	// Positive dependences might cause troubles because vectorizing them might
				615	// prevent store-load forwarding making vectorized code run a lot slower.
				616	// a[i] = a[i-3] ^ a[i-8];
				617	// The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
				618	// hence on your typical architecture store-load forwarding does not take
				619	// place. Vectorizing in such cases does not make sense.
				620	// Store-load forwarding distance.
				621	const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
				622	// Maximum vector factor.
				623	unsigned MaxVFWithoutSLForwardIssues = VectParams.MaxVectorWidth*TypeByteSize;
				624	if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
				625	MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
				626
				627	for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
				628	vf *= 2) {
				629	if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
				630	MaxVFWithoutSLForwardIssues = (vf >>=1);
				631	break;
				632	}
				633	}
				634
				635	if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) {
				636	DEBUG(dbgs() << "LV: Distance " << Distance <<
				637	" that could cause a store-load forwarding conflict\n");
				638	return true;
				639	}
				640
				641	if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
				642	MaxVFWithoutSLForwardIssues != VectParams.MaxVectorWidth*TypeByteSize)
				643	MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
				644	return false;
				645	}
				646
				647	bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
				648	const MemAccessInfo &B, unsigned BIdx,
				649	ValueToValueMap &Strides) {
				650	assert (AIdx < BIdx && "Must pass arguments in program order");
				651
				652	Value *APtr = A.getPointer();
				653	Value *BPtr = B.getPointer();
				654	bool AIsWrite = A.getInt();
				655	bool BIsWrite = B.getInt();
				656
				657	// Two reads are independent.
				658	if (!AIsWrite && !BIsWrite)
				659	return false;
				660
				661	// We cannot check pointers in different address spaces.
				662	if (APtr->getType()->getPointerAddressSpace() !=
				663	BPtr->getType()->getPointerAddressSpace())
				664	return true;
				665
				666	const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
				667	const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);
				668
				669	int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides);
				670	int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides);
				671
				672	const SCEV *Src = AScev;
				673	const SCEV *Sink = BScev;
				674
				675	// If the induction step is negative we have to invert source and sink of the
				676	// dependence.
				677	if (StrideAPtr < 0) {
				678	//Src = BScev;
				679	//Sink = AScev;
				680	std::swap(APtr, BPtr);
				681	std::swap(Src, Sink);
				682	std::swap(AIsWrite, BIsWrite);
				683	std::swap(AIdx, BIdx);
				684	std::swap(StrideAPtr, StrideBPtr);
				685	}
				686
				687	const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
				688
				689	DEBUG(dbgs() << "LV: Src Scev: " << Src << "Sink Scev: " << Sink
				690	<< "(Induction step: " << StrideAPtr << ")\n");
				691	DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
				692	<< InstMap[BIdx] << ": " << Dist << "\n");
				693
				694	// Need consecutive accesses. We don't want to vectorize
				695	// "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
				696	// the address space.
				697	if (!StrideAPtr \|\| !StrideBPtr \|\| StrideAPtr != StrideBPtr){
				698	DEBUG(dbgs() << "Non-consecutive pointer access\n");
				699	return true;
				700	}
				701
				702	const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
				703	if (!C) {
				704	DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n");
				705	ShouldRetryWithRuntimeCheck = true;
				706	return true;
				707	}
				708
				709	Type *ATy = APtr->getType()->getPointerElementType();
				710	Type *BTy = BPtr->getType()->getPointerElementType();
				711	unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
				712
				713	// Negative distances are not plausible dependencies.
				714	const APInt &Val = C->getValue()->getValue();
				715	if (Val.isNegative()) {
				716	bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
				717	if (IsTrueDataDependence &&
				718	(couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) \|\|
				719	ATy != BTy))
				720	return true;
				721
				722	DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
				723	return false;
				724	}
				725
				726	// Write to the same location with the same size.
				727	// Could be improved to assert type sizes are the same (i32 == float, etc).
				728	if (Val == 0) {
				729	if (ATy == BTy)
				730	return false;
				731	DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
				732	return true;
				733	}
				734
				735	assert(Val.isStrictlyPositive() && "Expect a positive value");
				736
				737	// Positive distance bigger than max vectorization factor.
				738	if (ATy != BTy) {
				739	DEBUG(dbgs() <<
				740	"LV: ReadWrite-Write positive dependency with different types\n");
				741	return false;
				742	}
				743
				744	unsigned Distance = (unsigned) Val.getZExtValue();
				745
				746	// Bail out early if passed-in parameters make vectorization not feasible.
				747	unsigned ForcedFactor = (VectParams.VectorizationFactor ?
				748	VectParams.VectorizationFactor : 1);
				749	unsigned ForcedUnroll = (VectParams.VectorizationInterleave ?
				750	VectParams.VectorizationInterleave : 1);
				751
				752	// The distance must be bigger than the size needed for a vectorized version
				753	// of the operation and the size of the vectorized operation must not be
				754	// bigger than the currrent maximum size.
				755	if (Distance < 2*TypeByteSize \|\|
				756	2*TypeByteSize > MaxSafeDepDistBytes \|\|
				757	Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
				758	DEBUG(dbgs() << "LV: Failure because of Positive distance "
				759	<< Val.getSExtValue() << '\n');
				760	return true;
				761	}
				762
				763	MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
				764	Distance : MaxSafeDepDistBytes;
				765
				766	bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
				767	if (IsTrueDataDependence &&
				768	couldPreventStoreLoadForward(Distance, TypeByteSize))
				769	return true;
				770
				771	DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
				772	" with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
				773
				774	return false;
				775	}
				776
				777	bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
				778	MemAccessInfoSet &CheckDeps,
				779	ValueToValueMap &Strides) {
				780
				781	MaxSafeDepDistBytes = -1U;
				782	while (!CheckDeps.empty()) {
				783	MemAccessInfo CurAccess = *CheckDeps.begin();
				784
				785	// Get the relevant memory access set.
				786	EquivalenceClasses<MemAccessInfo>::iterator I =
				787	AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
				788
				789	// Check accesses within this set.
				790	EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
				791	AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
				792
				793	// Check every access pair.
				794	while (AI != AE) {
				795	CheckDeps.erase(*AI);
				796	EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
				797	while (OI != AE) {
				798	// Check every accessing instruction pair in program order.
				799	for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
				800	I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
				801	for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
				802	I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
				803	if (I1 < I2 && isDependent(AI, I1, OI, I2, Strides))
				804	return false;
				805	if (I2 < I1 && isDependent(OI, I2, AI, I1, Strides))
				806	return false;
				807	}
				808	++OI;
				809	}
				810	AI++;
				811	}
				812	}
				813	return true;
				814	}
				815
				816	bool LoopAccessAnalysis::canVectorizeMemory(ValueToValueMap &Strides) {
				817
				818	typedef SmallVector<Value*, 16> ValueVector;
				819	typedef SmallPtrSet<Value*, 16> ValueSet;
				820
				821	// Holds the Load and Store instructions.
				822	ValueVector Loads;
				823	ValueVector Stores;
				824
				825	// Holds all the different accesses in the loop.
				826	unsigned NumReads = 0;
				827	unsigned NumReadWrites = 0;
				828
				829	PtrRtCheck.Pointers.clear();
				830	PtrRtCheck.Need = false;
				831
				832	const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
				833	MemoryDepChecker DepChecker(SE, DL, TheLoop, VectParams);
				834
				835	// For each block.
				836	for (Loop::block_iterator bb = TheLoop->block_begin(),
				837	be = TheLoop->block_end(); bb != be; ++bb) {
				838
				839	// Scan the BB and collect legal loads and stores.
				840	for (BasicBlock::iterator it = (bb)->begin(), e = (bb)->end(); it != e;
				841	++it) {
				842
				843	// If this is a load, save it. If this instruction can read from memory
				844	// but is not a load, then we quit. Notice that we don't handle function
				845	// calls that read or write.
				846	if (it->mayReadFromMemory()) {
				847	// Many math library functions read the rounding mode. We will only
				848	// vectorize a loop if it contains known function calls that don't set
				849	// the flag. Therefore, it is safe to ignore this read from memory.
				850	CallInst *Call = dyn_cast<CallInst>(it);
				851	if (Call && getIntrinsicIDForCall(Call, TLI))
				852	continue;
				853
				854	LoadInst *Ld = dyn_cast<LoadInst>(it);
				855	if (!Ld \|\| (!Ld->isSimple() && !IsAnnotatedParallel)) {
				856	emitAnalysis(VectorizationReport(Ld)
				857	<< "read with atomic ordering or volatile read");
				858	DEBUG(dbgs() << "LV: Found a non-simple load.\n");
				859	return false;
				860	}
				861	NumLoads++;
				862	Loads.push_back(Ld);
				863	DepChecker.addAccess(Ld);
				864	continue;
				865	}
				866
				867	// Save 'store' instructions. Abort if other instructions write to memory.
				868	if (it->mayWriteToMemory()) {
				869	StoreInst *St = dyn_cast<StoreInst>(it);
				870	if (!St) {
				871	emitAnalysis(VectorizationReport(it) <<
				872	"instruction cannot be vectorized");
				873	return false;
				874	}
				875	if (!St->isSimple() && !IsAnnotatedParallel) {
				876	emitAnalysis(VectorizationReport(St)
				877	<< "write with atomic ordering or volatile write");
				878	DEBUG(dbgs() << "LV: Found a non-simple store.\n");
				879	return false;
				880	}
				881	NumStores++;
				882	Stores.push_back(St);
				883	DepChecker.addAccess(St);
				884	}
				885	} // Next instr.
				886	} // Next block.
				887
				888	// Now we have two lists that hold the loads and the stores.
				889	// Next, we find the pointers that they use.
				890
				891	// Check if we see any stores. If there are no stores, then we don't
				892	// care if the pointers are restrict.
				893	if (!Stores.size()) {
				894	DEBUG(dbgs() << "LV: Found a read-only loop!\n");
				895	return true;
				896	}
				897
				898	AccessAnalysis::DepCandidates DependentAccesses;
				899	AccessAnalysis Accesses(DL, AA, DependentAccesses);
				900
				901	// Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
				902	// multiple times on the same object. If the ptr is accessed twice, once
				903	// for read and once for write, it will only appear once (on the write
				904	// list). This is okay, since we are going to check for conflicts between
				905	// writes and between reads and writes, but not between reads and reads.
				906	ValueSet Seen;
				907
				908	ValueVector::iterator I, IE;
				909	for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
				910	StoreInst ST = cast<StoreInst>(I);
				911	Value* Ptr = ST->getPointerOperand();
				912
				913	if (isUniform(Ptr)) {
				914	emitAnalysis(
				915	VectorizationReport(ST)
				916	<< "write to a loop invariant address could not be vectorized");
				917	DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
				918	return false;
				919	}
				920
				921	// If we did not see this pointer before, insert it to the read-write
				922	// list. At this phase it is only a 'write' list.
				923	if (Seen.insert(Ptr).second) {
				924	++NumReadWrites;
				925
				926	AliasAnalysis::Location Loc = AA->getLocation(ST);
				927	// The TBAA metadata could have a control dependency on the predication
				928	// condition, so we cannot rely on it when determining whether or not we
				929	// need runtime pointer checks.
				930	if (blockNeedsPredication(ST->getParent()))
				931	Loc.AATags.TBAA = nullptr;
				932
				933	Accesses.addStore(Loc);
				934	}
				935	}
				936
				937	if (IsAnnotatedParallel) {
				938	DEBUG(dbgs()
				939	<< "LV: A loop annotated parallel, ignore memory dependency "
				940	<< "checks.\n");
				941	return true;
				942	}
				943
				944	for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
				945	LoadInst LD = cast<LoadInst>(I);
				946	Value* Ptr = LD->getPointerOperand();
				947	// If we did not see this pointer before, insert it to the
				948	// read list. If we did see it before, then it is already in
				949	// the read-write list. This allows us to vectorize expressions
				950	// such as A[i] += x; Because the address of A[i] is a read-write
				951	// pointer. This only works if the index of A[i] is consecutive.
				952	// If the address of i is unknown (for example A[B[i]]) then we may
				953	// read a few words, modify, and write a few words, and some of the
				954	// words may be written to the same address.
				955	bool IsReadOnlyPtr = false;
				956	if (Seen.insert(Ptr).second \|\|
				957	!isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
				958	++NumReads;
				959	IsReadOnlyPtr = true;
				960	}
				961
				962	AliasAnalysis::Location Loc = AA->getLocation(LD);
				963	// The TBAA metadata could have a control dependency on the predication
				964	// condition, so we cannot rely on it when determining whether or not we
				965	// need runtime pointer checks.
				966	if (blockNeedsPredication(LD->getParent()))
				967	Loc.AATags.TBAA = nullptr;
				968
				969	Accesses.addLoad(Loc, IsReadOnlyPtr);
				970	}
				971
				972	// If we write (or read-write) to a single destination and there are no
				973	// other reads in this loop then is it safe to vectorize.
				974	if (NumReadWrites == 1 && NumReads == 0) {
				975	DEBUG(dbgs() << "LV: Found a write-only loop!\n");
				976	return true;
				977	}
				978
				979	// Build dependence sets and check whether we need a runtime pointer bounds
				980	// check.
				981	Accesses.buildDependenceSets();
				982	bool NeedRTCheck = Accesses.isRTCheckNeeded();
				983
				984	// Find pointers with computable bounds. We are going to use this information
				985	// to place a runtime bound check.
				986	unsigned NumComparisons = 0;
				987	bool CanDoRT = false;
				988	if (NeedRTCheck)
				989	CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop,
				990	Strides);
				991
				992	DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
				993	" pointer comparisons.\n");
				994
				995	// If we only have one set of dependences to check pointers among we don't
				996	// need a runtime check.
				997	if (NumComparisons == 0 && NeedRTCheck)
				998	NeedRTCheck = false;
				999
				1000	// Check that we did not collect too many pointers or found an unsizeable
				1001	// pointer.
				1002	if (!CanDoRT \|\| NumComparisons > VectParams.RuntimeMemoryCheckThreshold) {
				1003	PtrRtCheck.reset();
				1004	CanDoRT = false;
				1005	}
				1006
				1007	if (CanDoRT) {
				1008	DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
				1009	}
				1010
				1011	if (NeedRTCheck && !CanDoRT) {
				1012	emitAnalysis(VectorizationReport() << "cannot identify array bounds");
				1013	DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
				1014	"the array bounds.\n");
				1015	PtrRtCheck.reset();
				1016	return false;
				1017	}
				1018
				1019	PtrRtCheck.Need = NeedRTCheck;
				1020
				1021	bool CanVecMem = true;
				1022	if (Accesses.isDependencyCheckNeeded()) {
				1023	DEBUG(dbgs() << "LV: Checking memory dependencies\n");
				1024	CanVecMem = DepChecker.areDepsSafe(
				1025	DependentAccesses, Accesses.getDependenciesToCheck(), Strides);
				1026	MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
				1027
				1028	if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
				1029	DEBUG(dbgs() << "LV: Retrying with memory checks\n");
				1030	NeedRTCheck = true;
				1031
				1032	// Clear the dependency checks. We assume they are not needed.
				1033	Accesses.resetDepChecks();
				1034
				1035	PtrRtCheck.reset();
				1036	PtrRtCheck.Need = true;
				1037
				1038	CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
				1039	TheLoop, Strides, true);
				1040	// Check that we did not collect too many pointers or found an unsizeable
				1041	// pointer.
				1042	if (!CanDoRT \|\| NumComparisons > VectParams.RuntimeMemoryCheckThreshold) {
				1043	if (!CanDoRT && NumComparisons > 0)
				1044	emitAnalysis(VectorizationReport()
				1045	<< "cannot check memory dependencies at runtime");
				1046	else
				1047	emitAnalysis(VectorizationReport()
				1048	<< NumComparisons << " exceeds limit of "
				1049	<< VectParams.RuntimeMemoryCheckThreshold
				1050	<< " dependent memory operations checked at runtime");
				1051	DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
				1052	PtrRtCheck.reset();
				1053	return false;
				1054	}
				1055
				1056	CanVecMem = true;
				1057	}
				1058	}
				1059
				1060	if (!CanVecMem)
				1061	emitAnalysis(VectorizationReport() <<
				1062	"unsafe dependent memory operations in loop");
				1063
				1064	DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
				1065	" need a runtime memory check.\n");
				1066
				1067	return CanVecMem;
				1068	}
				1069
				1070	bool LoopAccessAnalysis::blockNeedsPredication(BasicBlock *BB) {
				1071	assert(TheLoop->contains(BB) && "Unknown block used");
				1072
				1073	// Blocks that do not dominate the latch need predication.
				1074	BasicBlock* Latch = TheLoop->getLoopLatch();
				1075	return !DT->dominates(BB, Latch);
				1076	}
				1077
				1078	void LoopAccessAnalysis::emitAnalysis(VectorizationReport &Message) {
				1079	VectorizationReport::emitAnalysis(Message, TheFunction, TheLoop);
				1080	}
				1081
				1082	bool LoopAccessAnalysis::isUniform(Value *V) {
				1083	return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
				1084	}