Blame - llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp - toolchain/llvm-project

blob: b5948475e1f76449dfd174cb95d6eb4f621c4542 [file] [log] [blame]

Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1	//===--- HexagonLoopIdiomRecognition.cpp ----------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9
				10	#define DEBUG_TYPE "hexagon-lir"
				11
				12	#include "llvm/ADT/SetVector.h"
				13	#include "llvm/ADT/SmallSet.h"
				14	#include "llvm/Analysis/AliasAnalysis.h"
				15	#include "llvm/Analysis/InstructionSimplify.h"
				16	#include "llvm/Analysis/LoopPass.h"
				17	#include "llvm/Analysis/ScalarEvolution.h"
				18	#include "llvm/Analysis/ScalarEvolutionExpander.h"
				19	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
				20	#include "llvm/Analysis/TargetLibraryInfo.h"
				21	#include "llvm/Analysis/ValueTracking.h"
				22	#include "llvm/IR/DataLayout.h"
				23	#include "llvm/IR/Dominators.h"
				24	#include "llvm/IR/IRBuilder.h"
				25	#include "llvm/IR/PatternMatch.h"
				26	#include "llvm/Transforms/Scalar.h"
				27	#include "llvm/Transforms/Utils/Local.h"
				28	#include "llvm/Support/Debug.h"
				29	#include "llvm/Support/raw_ostream.h"
				30
				31	#include <algorithm>
				32	#include <array>
				33
				34	using namespace llvm;
				35
				36	static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
				37	cl::Hidden, cl::init(false),
				38	cl::desc("Disable generation of memcpy in loop idiom recognition"));
				39
				40	static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
				41	cl::Hidden, cl::init(false),
				42	cl::desc("Disable generation of memmove in loop idiom recognition"));
				43
				44	static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
				45	cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
				46	"check guarding the memmove."));
				47
				48	static cl::opt<unsigned> CompileTimeMemSizeThreshold(
				49	"compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
				50	cl::desc("Threshold (in bytes) to perform the transformation, if the "
				51	"runtime loop count (mem transfer size) is known at compile-time."));
				52
				53	static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
				54	cl::Hidden, cl::init(true),
				55	cl::desc("Only enable generating memmove in non-nested loops"));
				56
				57	cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy",
				58	cl::Hidden, cl::init(false),
				59	cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
				60
				61	static const char *HexagonVolatileMemcpyName
				62	= "hexagon_memcpy_forward_vp4cp4n2";
				63
				64
				65	namespace llvm {
				66	void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
				67	Pass *createHexagonLoopIdiomPass();
				68	}
				69
				70	namespace {
				71	class HexagonLoopIdiomRecognize : public LoopPass {
				72	public:
				73	static char ID;
				74	explicit HexagonLoopIdiomRecognize() : LoopPass(ID) {
				75	initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
				76	}
				77	StringRef getPassName() const override {
				78	return "Recognize Hexagon-specific loop idioms";
				79	}
				80
				81	void getAnalysisUsage(AnalysisUsage &AU) const override {
				82	AU.addRequired<LoopInfoWrapperPass>();
				83	AU.addRequiredID(LoopSimplifyID);
				84	AU.addRequiredID(LCSSAID);
				85	AU.addRequired<AAResultsWrapperPass>();
				86	AU.addPreserved<AAResultsWrapperPass>();
				87	AU.addRequired<ScalarEvolutionWrapperPass>();
				88	AU.addRequired<DominatorTreeWrapperPass>();
				89	AU.addRequired<TargetLibraryInfoWrapperPass>();
				90	AU.addPreserved<TargetLibraryInfoWrapperPass>();
				91	}
				92
				93	bool runOnLoop(Loop *L, LPPassManager &LPM) override;
				94
				95	private:
				96	unsigned getStoreSizeInBytes(StoreInst *SI);
				97	int getSCEVStride(const SCEVAddRecExpr *StoreEv);
				98	bool isLegalStore(Loop CurLoop, StoreInst SI);
				99	void collectStores(Loop CurLoop, BasicBlock BB,
				100	SmallVectorImpl<StoreInst*> &Stores);
				101	bool processCopyingStore(Loop CurLoop, StoreInst SI, const SCEV *BECount);
				102	bool coverLoop(Loop L, SmallVectorImpl<Instruction> &Insts) const;
				103	bool runOnLoopBlock(Loop CurLoop, BasicBlock BB, const SCEV *BECount,
				104	SmallVectorImpl<BasicBlock*> &ExitBlocks);
				105	bool runOnCountableLoop(Loop *L);
				106
				107	AliasAnalysis *AA;
				108	const DataLayout *DL;
				109	DominatorTree *DT;
				110	LoopInfo *LF;
				111	const TargetLibraryInfo *TLI;
				112	ScalarEvolution *SE;
				113	bool HasMemcpy, HasMemmove;
				114	};
				115	}
				116
				117	char HexagonLoopIdiomRecognize::ID = 0;
				118
				119	INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
				120	"Recognize Hexagon-specific loop idioms", false, false)
				121	INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
				122	INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
				123	INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
				124	INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
				125	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
				126	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
				127	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
				128	INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
				129	"Recognize Hexagon-specific loop idioms", false, false)
				130
				131
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	132	namespace {
				133	struct Simplifier {
				134	typedef std::function<Value* (Instruction*, LLVMContext&)> Rule;
				135
				136	void addRule(const Rule &R) { Rules.push_back(R); }
				137
				138	private:
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	139	struct WorkListType {
				140	WorkListType() = default;
				141
				142	void push_back(Value* V) {
				143	// Do not push back duplicates.
				144	if (!S.count(V)) { Q.push_back(V); S.insert(V); }
				145	}
				146	Value *pop_front_val() {
				147	Value *V = Q.front(); Q.pop_front(); S.erase(V);
				148	return V;
				149	}
				150	bool empty() const { return Q.empty(); }
				151
				152	private:
				153	std::deque<Value*> Q;
				154	std::set<Value*> S;
				155	};
				156
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	157	typedef std::set<Value*> ValueSetType;
				158	std::vector<Rule> Rules;
				159
				160	public:
				161	struct Context {
				162	typedef DenseMap<Value,Value> ValueMapType;
				163
				164	Value *Root;
				165	ValueSetType Used; // The set of all cloned values used by Root.
				166	ValueSetType Clones; // The set of all cloned values.
				167	LLVMContext &Ctx;
				168
				169	Context(Instruction *Exp)
				170	: Ctx(Exp->getParent()->getParent()->getContext()) {
				171	initialize(Exp);
				172	}
				173	~Context() { cleanup(); }
				174	void print(raw_ostream &OS, const Value *V) const;
				175
				176	Value materialize(BasicBlock B, BasicBlock::iterator At);
				177
				178	private:
				179	void initialize(Instruction *Exp);
				180	void cleanup();
				181
				182	template <typename FuncT> void traverse(Value *V, FuncT F);
				183	void record(Value *V);
				184	void use(Value *V);
				185	void unuse(Value *V);
				186
				187	bool equal(const Instruction I, const Instruction J) const;
				188	Value find(Value Tree, Value *Sub) const;
				189	Value subst(Value Tree, Value OldV, Value NewV);
				190	void replace(Value OldV, Value NewV);
				191	void link(Instruction I, BasicBlock B, BasicBlock::iterator At);
				192
				193	friend struct Simplifier;
				194	};
				195
				196	Value *simplify(Context &C);
				197	};
				198
				199	struct PE {
				200	PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
				201	const Simplifier::Context &C;
				202	const Value *V;
				203	};
				204
				205	raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED;
				206	raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
				207	P.C.print(OS, P.V ? P.V : P.C.Root);
				208	return OS;
				209	}
				210	}
				211
				212
				213	template <typename FuncT>
				214	void Simplifier::Context::traverse(Value *V, FuncT F) {
				215	WorkListType Q;
				216	Q.push_back(V);
				217
				218	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	219	Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	220	if (!U \|\| U->getParent())
				221	continue;
				222	if (!F(U))
				223	continue;
				224	for (Value *Op : U->operands())
				225	Q.push_back(Op);
				226	}
				227	}
				228
				229
				230	void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
				231	const auto *U = dyn_cast<const Instruction>(V);
				232	if (!U) {
				233	OS << V << '(' << *V << ')';
				234	return;
				235	}
				236
				237	if (U->getParent()) {
				238	OS << U << '(';
				239	U->printAsOperand(OS, true);
				240	OS << ')';
				241	return;
				242	}
				243
				244	unsigned N = U->getNumOperands();
				245	if (N != 0)
				246	OS << U << '(';
				247	OS << U->getOpcodeName();
				248	for (const Value *Op : U->operands()) {
				249	OS << ' ';
				250	print(OS, Op);
				251	}
				252	if (N != 0)
				253	OS << ')';
				254	}
				255
				256
				257	void Simplifier::Context::initialize(Instruction *Exp) {
				258	// Perform a deep clone of the expression, set Root to the root
				259	// of the clone, and build a map from the cloned values to the
				260	// original ones.
				261	ValueMapType M;
				262	BasicBlock *Block = Exp->getParent();
				263	WorkListType Q;
				264	Q.push_back(Exp);
				265
				266	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	267	Value *V = Q.pop_front_val();
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	268	if (M.find(V) != M.end())
				269	continue;
				270	if (Instruction *U = dyn_cast<Instruction>(V)) {
				271	if (isa<PHINode>(U) \|\| U->getParent() != Block)
				272	continue;
				273	for (Value *Op : U->operands())
				274	Q.push_back(Op);
				275	M.insert({U, U->clone()});
				276	}
				277	}
				278
				279	for (std::pair<Value,Value> P : M) {
				280	Instruction *U = cast<Instruction>(P.second);
				281	for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
				282	auto F = M.find(U->getOperand(i));
				283	if (F != M.end())
				284	U->setOperand(i, F->second);
				285	}
				286	}
				287
				288	auto R = M.find(Exp);
				289	assert(R != M.end());
				290	Root = R->second;
				291
				292	record(Root);
				293	use(Root);
				294	}
				295
				296
				297	void Simplifier::Context::record(Value *V) {
				298	auto Record = [this](Instruction *U) -> bool {
				299	Clones.insert(U);
				300	return true;
				301	};
				302	traverse(V, Record);
				303	}
				304
				305
				306	void Simplifier::Context::use(Value *V) {
				307	auto Use = [this](Instruction *U) -> bool {
				308	Used.insert(U);
				309	return true;
				310	};
				311	traverse(V, Use);
				312	}
				313
				314
				315	void Simplifier::Context::unuse(Value *V) {
				316	if (!isa<Instruction>(V) \|\| cast<Instruction>(V)->getParent() != nullptr)
				317	return;
				318
				319	auto Unuse = [this](Instruction *U) -> bool {
				320	if (!U->use_empty())
				321	return false;
				322	Used.erase(U);
				323	return true;
				324	};
				325	traverse(V, Unuse);
				326	}
				327
				328
				329	Value Simplifier::Context::subst(Value Tree, Value OldV, Value NewV) {
				330	if (Tree == OldV)
				331	return NewV;
				332	if (OldV == NewV)
				333	return Tree;
				334
				335	WorkListType Q;
				336	Q.push_back(Tree);
				337	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	338	Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	339	// If U is not an instruction, or it's not a clone, skip it.
				340	if (!U \|\| U->getParent())
				341	continue;
				342	for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
				343	Value *Op = U->getOperand(i);
				344	if (Op == OldV) {
				345	U->setOperand(i, NewV);
				346	unuse(OldV);
				347	} else {
				348	Q.push_back(Op);
				349	}
				350	}
				351	}
				352	return Tree;
				353	}
				354
				355
				356	void Simplifier::Context::replace(Value OldV, Value NewV) {
				357	if (Root == OldV) {
				358	Root = NewV;
				359	use(Root);
				360	return;
				361	}
				362
				363	// NewV may be a complex tree that has just been created by one of the
				364	// transformation rules. We need to make sure that it is commoned with
				365	// the existing Root to the maximum extent possible.
				366	// Identify all subtrees of NewV (including NewV itself) that have
				367	// equivalent counterparts in Root, and replace those subtrees with
				368	// these counterparts.
				369	WorkListType Q;
				370	Q.push_back(NewV);
				371	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	372	Value *V = Q.pop_front_val();
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	373	Instruction *U = dyn_cast<Instruction>(V);
				374	if (!U \|\| U->getParent())
				375	continue;
				376	if (Value *DupV = find(Root, V)) {
				377	if (DupV != V)
				378	NewV = subst(NewV, V, DupV);
				379	} else {
				380	for (Value *Op : U->operands())
				381	Q.push_back(Op);
				382	}
				383	}
				384
				385	// Now, simply replace OldV with NewV in Root.
				386	Root = subst(Root, OldV, NewV);
				387	use(Root);
				388	}
				389
				390
				391	void Simplifier::Context::cleanup() {
				392	for (Value *V : Clones) {
				393	Instruction *U = cast<Instruction>(V);
				394	if (!U->getParent())
				395	U->dropAllReferences();
				396	}
				397
				398	for (Value *V : Clones) {
				399	Instruction *U = cast<Instruction>(V);
				400	if (!U->getParent())
				401	delete U;
				402	}
				403	}
				404
				405
				406	bool Simplifier::Context::equal(const Instruction *I,
				407	const Instruction *J) const {
				408	if (I == J)
				409	return true;
				410	if (!I->isSameOperationAs(J))
				411	return false;
				412	if (isa<PHINode>(I))
				413	return I->isIdenticalTo(J);
				414
				415	for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
				416	Value OpI = I->getOperand(i), OpJ = J->getOperand(i);
				417	if (OpI == OpJ)
				418	continue;
				419	auto *InI = dyn_cast<const Instruction>(OpI);
				420	auto *InJ = dyn_cast<const Instruction>(OpJ);
				421	if (InI && InJ) {
				422	if (!equal(InI, InJ))
				423	return false;
				424	} else if (InI != InJ \|\| !InI)
				425	return false;
				426	}
				427	return true;
				428	}
				429
				430
				431	Value Simplifier::Context::find(Value Tree, Value *Sub) const {
				432	Instruction *SubI = dyn_cast<Instruction>(Sub);
				433	WorkListType Q;
				434	Q.push_back(Tree);
				435
				436	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	437	Value *V = Q.pop_front_val();
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	438	if (V == Sub)
				439	return V;
				440	Instruction *U = dyn_cast<Instruction>(V);
				441	if (!U \|\| U->getParent())
				442	continue;
				443	if (SubI && equal(SubI, U))
				444	return U;
				445	assert(!isa<PHINode>(U));
				446	for (Value *Op : U->operands())
				447	Q.push_back(Op);
				448	}
				449	return nullptr;
				450	}
				451
				452
				453	void Simplifier::Context::link(Instruction I, BasicBlock B,
				454	BasicBlock::iterator At) {
				455	if (I->getParent())
				456	return;
				457
				458	for (Value *Op : I->operands()) {
				459	if (Instruction *OpI = dyn_cast<Instruction>(Op))
				460	link(OpI, B, At);
				461	}
				462
				463	B->getInstList().insert(At, I);
				464	}
				465
				466
				467	Value Simplifier::Context::materialize(BasicBlock B,
				468	BasicBlock::iterator At) {
				469	if (Instruction *RootI = dyn_cast<Instruction>(Root))
				470	link(RootI, B, At);
				471	return Root;
				472	}
				473
				474
				475	Value *Simplifier::simplify(Context &C) {
				476	WorkListType Q;
				477	Q.push_back(C.Root);
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	478	unsigned Count = 0;
				479	const unsigned Limit = 100000;
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	480
				481	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	482	if (Count++ >= Limit)
				483	break;
				484	Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	485	if (!U \|\| U->getParent() \|\| !C.Used.count(U))
				486	continue;
				487	bool Changed = false;
				488	for (Rule &R : Rules) {
				489	Value *W = R(U, C.Ctx);
				490	if (!W)
				491	continue;
				492	Changed = true;
				493	C.record(W);
				494	C.replace(U, W);
				495	Q.push_back(C.Root);
				496	break;
				497	}
				498	if (!Changed) {
				499	for (Value *Op : U->operands())
				500	Q.push_back(Op);
				501	}
				502	}
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	503	assert(Count < Limit && "Infinite loop in HLIR/simplify?");
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	504	return C.Root;
				505	}
				506
				507
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	508	//===----------------------------------------------------------------------===//
				509	//
				510	// Implementation of PolynomialMultiplyRecognize
				511	//
				512	//===----------------------------------------------------------------------===//
				513
				514	namespace {
				515	class PolynomialMultiplyRecognize {
				516	public:
				517	explicit PolynomialMultiplyRecognize(Loop *loop, const DataLayout &dl,
				518	const DominatorTree &dt, const TargetLibraryInfo &tli,
				519	ScalarEvolution &se)
				520	: CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {}
				521
				522	bool recognize();
				523	private:
				524	typedef SetVector<Value*> ValueSeq;
				525
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	526	IntegerType *getPmpyType() const {
				527	LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
				528	return IntegerType::get(Ctx, 32);
				529	}
				530	bool isPromotableTo(Value V, IntegerType Ty);
				531	void promoteTo(Instruction In, IntegerType DestTy, BasicBlock *LoopB);
				532	bool promoteTypes(BasicBlock LoopB, BasicBlock ExitB);
				533
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	534	Value getCountIV(BasicBlock BB);
				535	bool findCycle(Value Out, Value In, ValueSeq &Cycle);
				536	void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
				537	ValueSeq &Late);
				538	bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late);
				539	bool commutesWithShift(Instruction *I);
				540	bool highBitsAreZero(Value *V, unsigned IterCount);
				541	bool keepsHighBitsZero(Value *V, unsigned IterCount);
				542	bool isOperandShifted(Instruction I, Value Op);
				543	bool convertShiftsToLeft(BasicBlock LoopB, BasicBlock ExitB,
				544	unsigned IterCount);
				545	void cleanupLoopBody(BasicBlock *LoopB);
				546
				547	struct ParsedValues {
				548	ParsedValues() : M(nullptr), P(nullptr), Q(nullptr), R(nullptr),
				549	X(nullptr), Res(nullptr), IterCount(0), Left(false), Inv(false) {}
				550	Value M, P, Q, R, *X;
				551	Instruction *Res;
				552	unsigned IterCount;
				553	bool Left, Inv;
				554	};
				555
				556	bool matchLeftShift(SelectInst SelI, Value CIV, ParsedValues &PV);
				557	bool matchRightShift(SelectInst *SelI, ParsedValues &PV);
				558	bool scanSelect(SelectInst SI, BasicBlock LoopB, BasicBlock *PrehB,
				559	Value *CIV, ParsedValues &PV, bool PreScan);
				560	unsigned getInverseMxN(unsigned QP);
				561	Value *generate(BasicBlock::iterator At, ParsedValues &PV);
				562
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	563	void setupSimplifier();
				564
				565	Simplifier Simp;
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	566	Loop *CurLoop;
				567	const DataLayout &DL;
				568	const DominatorTree &DT;
				569	const TargetLibraryInfo &TLI;
				570	ScalarEvolution &SE;
				571	};
				572	}
				573
				574
				575	Value PolynomialMultiplyRecognize::getCountIV(BasicBlock BB) {
				576	pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
				577	if (std::distance(PI, PE) != 2)
				578	return nullptr;
				579	BasicBlock PB = (PI == BB) ? std::next(PI) : PI;
				580
				581	for (auto I = BB->begin(), E = BB->end(); I != E && isa<PHINode>(I); ++I) {
				582	auto *PN = cast<PHINode>(I);
				583	Value *InitV = PN->getIncomingValueForBlock(PB);
				584	if (!isa<ConstantInt>(InitV) \|\| !cast<ConstantInt>(InitV)->isZero())
				585	continue;
				586	Value *IterV = PN->getIncomingValueForBlock(BB);
				587	if (!isa<BinaryOperator>(IterV))
				588	continue;
				589	auto *BO = dyn_cast<BinaryOperator>(IterV);
				590	if (BO->getOpcode() != Instruction::Add)
				591	continue;
				592	Value *IncV = nullptr;
				593	if (BO->getOperand(0) == PN)
				594	IncV = BO->getOperand(1);
				595	else if (BO->getOperand(1) == PN)
				596	IncV = BO->getOperand(0);
				597	if (IncV == nullptr)
				598	continue;
				599
				600	if (auto *T = dyn_cast<ConstantInt>(IncV))
				601	if (T->getZExtValue() == 1)
				602	return PN;
				603	}
				604	return nullptr;
				605	}
				606
				607
				608	static void replaceAllUsesOfWithIn(Value I, Value J, BasicBlock *BB) {
				609	for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) {
				610	Use &TheUse = UI.getUse();
				611	++UI;
				612	if (auto *II = dyn_cast<Instruction>(TheUse.getUser()))
				613	if (BB == II->getParent())
				614	II->replaceUsesOfWith(I, J);
				615	}
				616	}
				617
				618
				619	bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
				620	Value *CIV, ParsedValues &PV) {
				621	// Match the following:
				622	// select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
				623	// select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
				624	// The condition may also check for equality with the masked value, i.e
				625	// select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
				626	// select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
				627
				628	Value *CondV = SelI->getCondition();
				629	Value *TrueV = SelI->getTrueValue();
				630	Value *FalseV = SelI->getFalseValue();
				631
				632	using namespace PatternMatch;
				633
				634	CmpInst::Predicate P;
				635	Value A = nullptr, B = nullptr, *C = nullptr;
				636
				637	if (!match(CondV, m_ICmp(P, m_And(m_Value(A), m_Value(B)), m_Value(C))) &&
				638	!match(CondV, m_ICmp(P, m_Value(C), m_And(m_Value(A), m_Value(B)))))
				639	return false;
				640	if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
				641	return false;
				642	// Matched: select (A & B) == C ? ... : ...
				643	// select (A & B) != C ? ... : ...
				644
				645	Value X = nullptr, Sh1 = nullptr;
				646	// Check (A & B) for (X & (1 << i)):
				647	if (match(A, m_Shl(m_One(), m_Specific(CIV)))) {
				648	Sh1 = A;
				649	X = B;
				650	} else if (match(B, m_Shl(m_One(), m_Specific(CIV)))) {
				651	Sh1 = B;
				652	X = A;
				653	} else {
				654	// TODO: Could also check for an induction variable containing single
				655	// bit shifted left by 1 in each iteration.
				656	return false;
				657	}
				658
				659	bool TrueIfZero;
				660
				661	// Check C against the possible values for comparison: 0 and (1 << i):
				662	if (match(C, m_Zero()))
				663	TrueIfZero = (P == CmpInst::ICMP_EQ);
				664	else if (C == Sh1)
				665	TrueIfZero = (P == CmpInst::ICMP_NE);
				666	else
				667	return false;
				668
				669	// So far, matched:
				670	// select (X & (1 << i)) ? ... : ...
				671	// including variations of the check against zero/non-zero value.
				672
				673	Value ShouldSameV = nullptr, ShouldXoredV = nullptr;
				674	if (TrueIfZero) {
				675	ShouldSameV = TrueV;
				676	ShouldXoredV = FalseV;
				677	} else {
				678	ShouldSameV = FalseV;
				679	ShouldXoredV = TrueV;
				680	}
				681
				682	Value Q = nullptr, R = nullptr, Y = nullptr, Z = nullptr;
				683	Value *T = nullptr;
				684	if (match(ShouldXoredV, m_Xor(m_Value(Y), m_Value(Z)))) {
				685	// Matched: select +++ ? ... : Y ^ Z
				686	// select +++ ? Y ^ Z : ...
				687	// where +++ denotes previously checked matches.
				688	if (ShouldSameV == Y)
				689	T = Z;
				690	else if (ShouldSameV == Z)
				691	T = Y;
				692	else
				693	return false;
				694	R = ShouldSameV;
				695	// Matched: select +++ ? R : R ^ T
				696	// select +++ ? R ^ T : R
				697	// depending on TrueIfZero.
				698
				699	} else if (match(ShouldSameV, m_Zero())) {
				700	// Matched: select +++ ? 0 : ...
				701	// select +++ ? ... : 0
				702	if (!SelI->hasOneUse())
				703	return false;
				704	T = ShouldXoredV;
				705	// Matched: select +++ ? 0 : T
				706	// select +++ ? T : 0
				707
				708	Value U = SelI->user_begin();
				709	if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) &&
				710	!match(U, m_Xor(m_Value(R), m_Specific(SelI))))
				711	return false;
				712	// Matched: xor (select +++ ? 0 : T), R
				713	// xor (select +++ ? T : 0), R
				714	} else
				715	return false;
				716
				717	// The xor input value T is isolated into its own match so that it could
				718	// be checked against an induction variable containing a shifted bit
				719	// (todo).
				720	// For now, check against (Q << i).
				721	if (!match(T, m_Shl(m_Value(Q), m_Specific(CIV))) &&
				722	!match(T, m_Shl(m_ZExt(m_Value(Q)), m_ZExt(m_Specific(CIV)))))
				723	return false;
				724	// Matched: select +++ ? R : R ^ (Q << i)
				725	// select +++ ? R ^ (Q << i) : R
				726
				727	PV.X = X;
				728	PV.Q = Q;
				729	PV.R = R;
				730	PV.Left = true;
				731	return true;
				732	}
				733
				734
				735	bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
				736	ParsedValues &PV) {
				737	// Match the following:
				738	// select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
				739	// select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
				740	// The condition may also check for equality with the masked value, i.e
				741	// select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
				742	// select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
				743
				744	Value *CondV = SelI->getCondition();
				745	Value *TrueV = SelI->getTrueValue();
				746	Value *FalseV = SelI->getFalseValue();
				747
				748	using namespace PatternMatch;
				749
				750	Value *C = nullptr;
				751	CmpInst::Predicate P;
				752	bool TrueIfZero;
				753
				754	if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) \|\|
				755	match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))) {
				756	if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
				757	return false;
				758	// Matched: select C == 0 ? ... : ...
				759	// select C != 0 ? ... : ...
				760	TrueIfZero = (P == CmpInst::ICMP_EQ);
				761	} else if (match(CondV, m_ICmp(P, m_Value(C), m_One())) \|\|
				762	match(CondV, m_ICmp(P, m_One(), m_Value(C)))) {
				763	if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
				764	return false;
				765	// Matched: select C == 1 ? ... : ...
				766	// select C != 1 ? ... : ...
				767	TrueIfZero = (P == CmpInst::ICMP_NE);
				768	} else
				769	return false;
				770
				771	Value *X = nullptr;
				772	if (!match(C, m_And(m_Value(X), m_One())) &&
				773	!match(C, m_And(m_One(), m_Value(X))))
				774	return false;
				775	// Matched: select (X & 1) == +++ ? ... : ...
				776	// select (X & 1) != +++ ? ... : ...
				777
				778	Value R = nullptr, Q = nullptr;
				779	if (TrueIfZero) {
				780	// The select's condition is true if the tested bit is 0.
				781	// TrueV must be the shift, FalseV must be the xor.
				782	if (!match(TrueV, m_LShr(m_Value(R), m_One())))
				783	return false;
				784	// Matched: select +++ ? (R >> 1) : ...
				785	if (!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) &&
				786	!match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV))))
				787	return false;
				788	// Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
				789	// with commuting ^.
				790	} else {
				791	// The select's condition is true if the tested bit is 1.
				792	// TrueV must be the xor, FalseV must be the shift.
				793	if (!match(FalseV, m_LShr(m_Value(R), m_One())))
				794	return false;
				795	// Matched: select +++ ? ... : (R >> 1)
				796	if (!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) &&
				797	!match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV))))
				798	return false;
				799	// Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
				800	// with commuting ^.
				801	}
				802
				803	PV.X = X;
				804	PV.Q = Q;
				805	PV.R = R;
				806	PV.Left = false;
				807	return true;
				808	}
				809
				810
				811	bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
				812	BasicBlock LoopB, BasicBlock PrehB, Value *CIV, ParsedValues &PV,
				813	bool PreScan) {
				814	using namespace PatternMatch;
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	815	// The basic pattern for R = P.Q is:
				816	// for i = 0..31
				817	// R = phi (0, R')
				818	// if (P & (1 << i)) ; test-bit(P, i)
				819	// R' = R ^ (Q << i)
				820	//
				821	// Similarly, the basic pattern for R = (P/Q).Q - P
				822	// for i = 0..31
				823	// R = phi(P, R')
				824	// if (R & (1 << i))
				825	// R' = R ^ (Q << i)
				826
				827	// There exist idioms, where instead of Q being shifted left, P is shifted
				828	// right. This produces a result that is shifted right by 32 bits (the
				829	// non-shifted result is 64-bit).
				830	//
				831	// For R = P.Q, this would be:
				832	// for i = 0..31
				833	// R = phi (0, R')
				834	// if ((P >> i) & 1)
				835	// R' = (R >> 1) ^ Q ; R is cycled through the loop, so it must
				836	// else ; be shifted by 1, not i.
				837	// R' = R >> 1
				838	//
				839	// And for the inverse:
				840	// for i = 0..31
				841	// R = phi (P, R')
				842	// if (R & 1)
				843	// R' = (R >> 1) ^ Q
				844	// else
				845	// R' = R >> 1
				846
				847	// The left-shifting idioms share the same pattern:
				848	// select (X & (1 << i)) ? R ^ (Q << i) : R
				849	// Similarly for right-shifting idioms:
				850	// select (X & 1) ? (R >> 1) ^ Q
				851
				852	if (matchLeftShift(SelI, CIV, PV)) {
				853	// If this is a pre-scan, getting this far is sufficient.
				854	if (PreScan)
				855	return true;
				856
				857	// Need to make sure that the SelI goes back into R.
				858	auto *RPhi = dyn_cast<PHINode>(PV.R);
				859	if (!RPhi)
				860	return false;
				861	if (SelI != RPhi->getIncomingValueForBlock(LoopB))
				862	return false;
				863	PV.Res = SelI;
				864
				865	// If X is loop invariant, it must be the input polynomial, and the
				866	// idiom is the basic polynomial multiply.
				867	if (CurLoop->isLoopInvariant(PV.X)) {
				868	PV.P = PV.X;
				869	PV.Inv = false;
				870	} else {
				871	// X is not loop invariant. If X == R, this is the inverse pmpy.
				872	// Otherwise, check for an xor with an invariant value. If the
				873	// variable argument to the xor is R, then this is still a valid
				874	// inverse pmpy.
				875	PV.Inv = true;
				876	if (PV.X != PV.R) {
				877	Value Var = nullptr, Inv = nullptr, X1 = nullptr, X2 = nullptr;
				878	if (!match(PV.X, m_Xor(m_Value(X1), m_Value(X2))))
				879	return false;
				880	auto *I1 = dyn_cast<Instruction>(X1);
				881	auto *I2 = dyn_cast<Instruction>(X2);
				882	if (!I1 \|\| I1->getParent() != LoopB) {
				883	Var = X2;
				884	Inv = X1;
				885	} else if (!I2 \|\| I2->getParent() != LoopB) {
				886	Var = X1;
				887	Inv = X2;
				888	} else
				889	return false;
				890	if (Var != PV.R)
				891	return false;
				892	PV.M = Inv;
				893	}
				894	// The input polynomial P still needs to be determined. It will be
				895	// the entry value of R.
				896	Value *EntryP = RPhi->getIncomingValueForBlock(PrehB);
				897	PV.P = EntryP;
				898	}
				899
				900	return true;
				901	}
				902
				903	if (matchRightShift(SelI, PV)) {
				904	// If this is an inverse pattern, the Q polynomial must be known at
				905	// compile time.
				906	if (PV.Inv && !isa<ConstantInt>(PV.Q))
				907	return false;
				908	if (PreScan)
				909	return true;
				910	// There is no exact matching of right-shift pmpy.
				911	return false;
				912	}
				913
				914	return false;
				915	}
				916
				917
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	918	bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
				919	IntegerType *DestTy) {
				920	IntegerType *T = dyn_cast<IntegerType>(Val->getType());
				921	if (!T \|\| T->getBitWidth() > DestTy->getBitWidth())
				922	return false;
				923	if (T->getBitWidth() == DestTy->getBitWidth())
				924	return true;
				925	// Non-instructions are promotable. The reason why an instruction may not
				926	// be promotable is that it may produce a different result if its operands
				927	// and the result are promoted, for example, it may produce more non-zero
				928	// bits. While it would still be possible to represent the proper result
				929	// in a wider type, it may require adding additional instructions (which
				930	// we don't want to do).
				931	Instruction *In = dyn_cast<Instruction>(Val);
				932	if (!In)
				933	return true;
				934	// The bitwidth of the source type is smaller than the destination.
				935	// Check if the individual operation can be promoted.
				936	switch (In->getOpcode()) {
				937	case Instruction::PHI:
				938	case Instruction::ZExt:
				939	case Instruction::And:
				940	case Instruction::Or:
				941	case Instruction::Xor:
				942	case Instruction::LShr: // Shift right is ok.
				943	case Instruction::Select:
				944	return true;
				945	case Instruction::ICmp:
				946	if (CmpInst *CI = cast<CmpInst>(In))
				947	return CI->isEquality() \|\| CI->isUnsigned();
				948	llvm_unreachable("Cast failed unexpectedly");
				949	case Instruction::Add:
				950	return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
				951	}
				952	return false;
				953	}
				954
				955
				956	void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
				957	IntegerType DestTy, BasicBlock LoopB) {
				958	// Leave boolean values alone.
				959	if (!In->getType()->isIntegerTy(1))
				960	In->mutateType(DestTy);
				961	unsigned DestBW = DestTy->getBitWidth();
				962
				963	// Handle PHIs.
				964	if (PHINode *P = dyn_cast<PHINode>(In)) {
				965	unsigned N = P->getNumIncomingValues();
				966	for (unsigned i = 0; i != N; ++i) {
				967	BasicBlock *InB = P->getIncomingBlock(i);
				968	if (InB == LoopB)
				969	continue;
				970	Value *InV = P->getIncomingValue(i);
				971	IntegerType *Ty = cast<IntegerType>(InV->getType());
				972	// Do not promote values in PHI nodes of type i1.
				973	if (Ty != P->getType()) {
				974	// If the value type does not match the PHI type, the PHI type
				975	// must have been promoted.
				976	assert(Ty->getBitWidth() < DestBW);
				977	InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
				978	P->setIncomingValue(i, InV);
				979	}
				980	}
				981	} else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
				982	Value *Op = Z->getOperand(0);
				983	if (Op->getType() == Z->getType())
				984	Z->replaceAllUsesWith(Op);
				985	Z->eraseFromParent();
				986	return;
				987	}
				988
				989	// Promote immediates.
				990	for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
				991	if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
				992	if (CI->getType()->getBitWidth() < DestBW)
				993	In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
				994	}
				995	}
				996
				997
				998	bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
				999	BasicBlock *ExitB) {
				1000	assert(LoopB);
				1001	// Skip loops where the exit block has more than one predecessor. The values
				1002	// coming from the loop block will be promoted to another type, and so the
				1003	// values coming into the exit block from other predecessors would also have
				1004	// to be promoted.
				1005	if (!ExitB \|\| (ExitB->getSinglePredecessor() != LoopB))
				1006	return false;
				1007	IntegerType *DestTy = getPmpyType();
				1008	// Check if the exit values have types that are no wider than the type
				1009	// that we want to promote to.
				1010	unsigned DestBW = DestTy->getBitWidth();
				1011	for (Instruction &In : *ExitB) {
				1012	PHINode *P = dyn_cast<PHINode>(&In);
				1013	if (!P)
				1014	break;
				1015	if (P->getNumIncomingValues() != 1)
				1016	return false;
				1017	assert(P->getIncomingBlock(0) == LoopB);
				1018	IntegerType *T = dyn_cast<IntegerType>(P->getType());
				1019	if (!T \|\| T->getBitWidth() > DestBW)
				1020	return false;
				1021	}
				1022
				1023	// Check all instructions in the loop.
				1024	for (Instruction &In : *LoopB)
				1025	if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
				1026	return false;
				1027
				1028	// Perform the promotion.
				1029	std::vector<Instruction*> LoopIns;
				1030	std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
				1031	[](Instruction &In) { return &In; });
				1032	for (Instruction *In : LoopIns)
				1033	promoteTo(In, DestTy, LoopB);
				1034
				1035	// Fix up the PHI nodes in the exit block.
				1036	Instruction *EndI = ExitB->getFirstNonPHI();
				1037	BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
				1038	for (auto I = ExitB->begin(); I != End; ++I) {
				1039	PHINode *P = dyn_cast<PHINode>(I);
				1040	if (!P)
				1041	break;
				1042	Type *Ty0 = P->getIncomingValue(0)->getType();
				1043	Type *PTy = P->getType();
				1044	if (PTy != Ty0) {
				1045	assert(Ty0 == DestTy);
				1046	// In order to create the trunc, P must have the promoted type.
				1047	P->mutateType(Ty0);
				1048	Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
				1049	// In order for the RAUW to work, the types of P and T must match.
				1050	P->mutateType(PTy);
				1051	P->replaceAllUsesWith(T);
				1052	// Final update of the P's type.
				1053	P->mutateType(Ty0);
				1054	cast<Instruction>(T)->setOperand(0, P);
				1055	}
				1056	}
				1057
				1058	return true;
				1059	}
				1060
				1061
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1062	bool PolynomialMultiplyRecognize::findCycle(Value Out, Value In,
				1063	ValueSeq &Cycle) {
				1064	// Out = ..., In, ...
				1065	if (Out == In)
				1066	return true;
				1067
				1068	auto *BB = cast<Instruction>(Out)->getParent();
				1069	bool HadPhi = false;
				1070
				1071	for (auto U : Out->users()) {
				1072	auto I = dyn_cast<Instruction>(&U);
				1073	if (I == nullptr \|\| I->getParent() != BB)
				1074	continue;
				1075	// Make sure that there are no multi-iteration cycles, e.g.
				1076	// p1 = phi(p2)
				1077	// p2 = phi(p1)
				1078	// The cycle p1->p2->p1 would span two loop iterations.
				1079	// Check that there is only one phi in the cycle.
				1080	bool IsPhi = isa<PHINode>(I);
				1081	if (IsPhi && HadPhi)
				1082	return false;
				1083	HadPhi \|= IsPhi;
				1084	if (Cycle.count(I))
				1085	return false;
				1086	Cycle.insert(I);
				1087	if (findCycle(I, In, Cycle))
				1088	break;
				1089	Cycle.remove(I);
				1090	}
				1091	return !Cycle.empty();
				1092	}
				1093
				1094
				1095	void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
				1096	ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
				1097	// All the values in the cycle that are between the phi node and the
				1098	// divider instruction will be classified as "early", all other values
				1099	// will be "late".
				1100
				1101	bool IsE = true;
				1102	unsigned I, N = Cycle.size();
				1103	for (I = 0; I < N; ++I) {
				1104	Value *V = Cycle[I];
				1105	if (DivI == V)
				1106	IsE = false;
				1107	else if (!isa<PHINode>(V))
				1108	continue;
				1109	// Stop if found either.
				1110	break;
				1111	}
				1112	// "I" is the index of either DivI or the phi node, whichever was first.
				1113	// "E" is "false" or "true" respectively.
				1114	ValueSeq &First = !IsE ? Early : Late;
				1115	for (unsigned J = 0; J < I; ++J)
				1116	First.insert(Cycle[J]);
				1117
				1118	ValueSeq &Second = IsE ? Early : Late;
				1119	Second.insert(Cycle[I]);
				1120	for (++I; I < N; ++I) {
				1121	Value *V = Cycle[I];
				1122	if (DivI == V \|\| isa<PHINode>(V))
				1123	break;
				1124	Second.insert(V);
				1125	}
				1126
				1127	for (; I < N; ++I)
				1128	First.insert(Cycle[I]);
				1129	}
				1130
				1131
				1132	bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
				1133	ValueSeq &Early, ValueSeq &Late) {
				1134	// Select is an exception, since the condition value does not have to be
				1135	// classified in the same way as the true/false values. The true/false
				1136	// values do have to be both early or both late.
				1137	if (UseI->getOpcode() == Instruction::Select) {
				1138	Value TV = UseI->getOperand(1), FV = UseI->getOperand(2);
				1139	if (Early.count(TV) \|\| Early.count(FV)) {
				1140	if (Late.count(TV) \|\| Late.count(FV))
				1141	return false;
				1142	Early.insert(UseI);
				1143	} else if (Late.count(TV) \|\| Late.count(FV)) {
				1144	if (Early.count(TV) \|\| Early.count(FV))
				1145	return false;
				1146	Late.insert(UseI);
				1147	}
				1148	return true;
				1149	}
				1150
				1151	// Not sure what would be the example of this, but the code below relies
				1152	// on having at least one operand.
				1153	if (UseI->getNumOperands() == 0)
				1154	return true;
				1155
				1156	bool AE = true, AL = true;
				1157	for (auto &I : UseI->operands()) {
				1158	if (Early.count(&*I))
				1159	AL = false;
				1160	else if (Late.count(&*I))
				1161	AE = false;
				1162	}
				1163	// If the operands appear "all early" and "all late" at the same time,
				1164	// then it means that none of them are actually classified as either.
				1165	// This is harmless.
				1166	if (AE && AL)
				1167	return true;
				1168	// Conversely, if they are neither "all early" nor "all late", then
				1169	// we have a mixture of early and late operands that is not a known
				1170	// exception.
				1171	if (!AE && !AL)
				1172	return false;
				1173
				1174	// Check that we have covered the two special cases.
				1175	assert(AE != AL);
				1176
				1177	if (AE)
				1178	Early.insert(UseI);
				1179	else
				1180	Late.insert(UseI);
				1181	return true;
				1182	}
				1183
				1184
				1185	bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) {
				1186	switch (I->getOpcode()) {
				1187	case Instruction::And:
				1188	case Instruction::Or:
				1189	case Instruction::Xor:
				1190	case Instruction::LShr:
				1191	case Instruction::Shl:
				1192	case Instruction::Select:
				1193	case Instruction::ICmp:
				1194	case Instruction::PHI:
				1195	break;
				1196	default:
				1197	return false;
				1198	}
				1199	return true;
				1200	}
				1201
				1202
				1203	bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
				1204	unsigned IterCount) {
				1205	auto *T = dyn_cast<IntegerType>(V->getType());
				1206	if (!T)
				1207	return false;
				1208
				1209	unsigned BW = T->getBitWidth();
				1210	APInt K0(BW, 0), K1(BW, 0);
				1211	computeKnownBits(V, K0, K1, DL);
				1212	return K0.countLeadingOnes() >= IterCount;
				1213	}
				1214
				1215
				1216	bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
				1217	unsigned IterCount) {
				1218	// Assume that all inputs to the value have the high bits zero.
				1219	// Check if the value itself preserves the zeros in the high bits.
				1220	if (auto *C = dyn_cast<ConstantInt>(V))
				1221	return C->getValue().countLeadingZeros() >= IterCount;
				1222
				1223	if (auto *I = dyn_cast<Instruction>(V)) {
				1224	switch (I->getOpcode()) {
				1225	case Instruction::And:
				1226	case Instruction::Or:
				1227	case Instruction::Xor:
				1228	case Instruction::LShr:
				1229	case Instruction::Select:
				1230	case Instruction::ICmp:
				1231	case Instruction::PHI:
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1232	case Instruction::ZExt:
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1233	return true;
				1234	}
				1235	}
				1236
				1237	return false;
				1238	}
				1239
				1240
				1241	bool PolynomialMultiplyRecognize::isOperandShifted(Instruction I, Value Op) {
				1242	unsigned Opc = I->getOpcode();
				1243	if (Opc == Instruction::Shl \|\| Opc == Instruction::LShr)
				1244	return Op != I->getOperand(1);
				1245	return true;
				1246	}
				1247
				1248
				1249	bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
				1250	BasicBlock *ExitB, unsigned IterCount) {
				1251	Value *CIV = getCountIV(LoopB);
				1252	if (CIV == nullptr)
				1253	return false;
				1254	auto *CIVTy = dyn_cast<IntegerType>(CIV->getType());
				1255	if (CIVTy == nullptr)
				1256	return false;
				1257
				1258	ValueSeq RShifts;
				1259	ValueSeq Early, Late, Cycled;
				1260
				1261	// Find all value cycles that contain logical right shifts by 1.
				1262	for (Instruction &I : *LoopB) {
				1263	using namespace PatternMatch;
				1264	Value *V = nullptr;
				1265	if (!match(&I, m_LShr(m_Value(V), m_One())))
				1266	continue;
				1267	ValueSeq C;
				1268	if (!findCycle(&I, V, C))
				1269	continue;
				1270
				1271	// Found a cycle.
				1272	C.insert(&I);
				1273	classifyCycle(&I, C, Early, Late);
				1274	Cycled.insert(C.begin(), C.end());
				1275	RShifts.insert(&I);
				1276	}
				1277
				1278	// Find the set of all values affected by the shift cycles, i.e. all
				1279	// cycled values, and (recursively) all their users.
				1280	ValueSeq Users(Cycled.begin(), Cycled.end());
				1281	for (unsigned i = 0; i < Users.size(); ++i) {
				1282	Value *V = Users[i];
				1283	if (!isa<IntegerType>(V->getType()))
				1284	return false;
				1285	auto *R = cast<Instruction>(V);
				1286	// If the instruction does not commute with shifts, the loop cannot
				1287	// be unshifted.
				1288	if (!commutesWithShift(R))
				1289	return false;
				1290	for (auto I = R->user_begin(), E = R->user_end(); I != E; ++I) {
				1291	auto T = cast<Instruction>(I);
				1292	// Skip users from outside of the loop. They will be handled later.
				1293	// Also, skip the right-shifts and phi nodes, since they mix early
				1294	// and late values.
				1295	if (T->getParent() != LoopB \|\| RShifts.count(T) \|\| isa<PHINode>(T))
				1296	continue;
				1297
				1298	Users.insert(T);
				1299	if (!classifyInst(T, Early, Late))
				1300	return false;
				1301	}
				1302	}
				1303
				1304	if (Users.size() == 0)
				1305	return false;
				1306
				1307	// Verify that high bits remain zero.
				1308	ValueSeq Internal(Users.begin(), Users.end());
				1309	ValueSeq Inputs;
				1310	for (unsigned i = 0; i < Internal.size(); ++i) {
				1311	auto *R = dyn_cast<Instruction>(Internal[i]);
				1312	if (!R)
				1313	continue;
				1314	for (Value *Op : R->operands()) {
				1315	auto *T = dyn_cast<Instruction>(Op);
				1316	if (T && T->getParent() != LoopB)
				1317	Inputs.insert(Op);
				1318	else
				1319	Internal.insert(Op);
				1320	}
				1321	}
				1322	for (Value *V : Inputs)
				1323	if (!highBitsAreZero(V, IterCount))
				1324	return false;
				1325	for (Value *V : Internal)
				1326	if (!keepsHighBitsZero(V, IterCount))
				1327	return false;
				1328
				1329	// Finally, the work can be done. Unshift each user.
				1330	IRBuilder<> IRB(LoopB);
				1331	std::map<Value,Value> ShiftMap;
				1332	typedef std::map<std::pair<Value,Type>,Value*> CastMapType;
				1333	CastMapType CastMap;
				1334
				1335	auto upcast = [] (CastMapType &CM, IRBuilder<> &IRB, Value *V,
				1336	IntegerType Ty) -> Value {
				1337	auto H = CM.find(std::make_pair(V, Ty));
				1338	if (H != CM.end())
				1339	return H->second;
				1340	Value *CV = IRB.CreateIntCast(V, Ty, false);
				1341	CM.insert(std::make_pair(std::make_pair(V, Ty), CV));
				1342	return CV;
				1343	};
				1344
				1345	for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) {
				1346	if (isa<PHINode>(I) \|\| !Users.count(&*I))
				1347	continue;
				1348	using namespace PatternMatch;
				1349	// Match lshr x, 1.
				1350	Value *V = nullptr;
				1351	if (match(&*I, m_LShr(m_Value(V), m_One()))) {
				1352	replaceAllUsesOfWithIn(&*I, V, LoopB);
				1353	continue;
				1354	}
				1355	// For each non-cycled operand, replace it with the corresponding
				1356	// value shifted left.
				1357	for (auto &J : I->operands()) {
				1358	Value *Op = J.get();
				1359	if (!isOperandShifted(&*I, Op))
				1360	continue;
				1361	if (Users.count(Op))
				1362	continue;
				1363	// Skip shifting zeros.
				1364	if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
				1365	continue;
				1366	// Check if we have already generated a shift for this value.
				1367	auto F = ShiftMap.find(Op);
				1368	Value *W = (F != ShiftMap.end()) ? F->second : nullptr;
				1369	if (W == nullptr) {
				1370	IRB.SetInsertPoint(&*I);
				1371	// First, the shift amount will be CIV or CIV+1, depending on
				1372	// whether the value is early or late. Instead of creating CIV+1,
				1373	// do a single shift of the value.
				1374	Value ShAmt = CIV, ShVal = Op;
				1375	auto *VTy = cast<IntegerType>(ShVal->getType());
				1376	auto *ATy = cast<IntegerType>(ShAmt->getType());
				1377	if (Late.count(&*I))
				1378	ShVal = IRB.CreateShl(Op, ConstantInt::get(VTy, 1));
				1379	// Second, the types of the shifted value and the shift amount
				1380	// must match.
				1381	if (VTy != ATy) {
				1382	if (VTy->getBitWidth() < ATy->getBitWidth())
				1383	ShVal = upcast(CastMap, IRB, ShVal, ATy);
				1384	else
				1385	ShAmt = upcast(CastMap, IRB, ShAmt, VTy);
				1386	}
				1387	// Ready to generate the shift and memoize it.
				1388	W = IRB.CreateShl(ShVal, ShAmt);
				1389	ShiftMap.insert(std::make_pair(Op, W));
				1390	}
				1391	I->replaceUsesOfWith(Op, W);
				1392	}
				1393	}
				1394
				1395	// Update the users outside of the loop to account for having left
				1396	// shifts. They would normally be shifted right in the loop, so shift
				1397	// them right after the loop exit.
				1398	// Take advantage of the loop-closed SSA form, which has all the post-
				1399	// loop values in phi nodes.
				1400	IRB.SetInsertPoint(ExitB, ExitB->getFirstInsertionPt());
				1401	for (auto P = ExitB->begin(), Q = ExitB->end(); P != Q; ++P) {
				1402	if (!isa<PHINode>(P))
				1403	break;
				1404	auto *PN = cast<PHINode>(P);
				1405	Value *U = PN->getIncomingValueForBlock(LoopB);
				1406	if (!Users.count(U))
				1407	continue;
				1408	Value *S = IRB.CreateLShr(PN, ConstantInt::get(PN->getType(), IterCount));
				1409	PN->replaceAllUsesWith(S);
				1410	// The above RAUW will create
				1411	// S = lshr S, IterCount
				1412	// so we need to fix it back into
				1413	// S = lshr PN, IterCount
				1414	cast<User>(S)->replaceUsesOfWith(S, PN);
				1415	}
				1416
				1417	return true;
				1418	}
				1419
				1420
				1421	void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
				1422	for (auto &I : *LoopB)
				1423	if (Value *SV = SimplifyInstruction(&I, DL, &TLI, &DT))
				1424	I.replaceAllUsesWith(SV);
				1425
				1426	for (auto I = LoopB->begin(), N = I; I != LoopB->end(); I = N) {
				1427	N = std::next(I);
				1428	RecursivelyDeleteTriviallyDeadInstructions(&*I, &TLI);
				1429	}
				1430	}
				1431
				1432
				1433	unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
				1434	// Arrays of coefficients of Q and the inverse, C.
				1435	// Q[i] = coefficient at x^i.
				1436	std::array<char,32> Q, C;
				1437
				1438	for (unsigned i = 0; i < 32; ++i) {
				1439	Q[i] = QP & 1;
				1440	QP >>= 1;
				1441	}
				1442	assert(Q[0] == 1);
				1443
				1444	// Find C, such that
				1445	// (Q[n]x^n + ... + Q[1]x + Q[0]) * (C[n]x^n + ... + C[1]x + C[0]) = 1
				1446	//
				1447	// For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
				1448	// operations * and + are & and ^ respectively.
				1449	//
				1450	// Find C[i] recursively, by comparing i-th coefficient in the product
				1451	// with 0 (or 1 for i=0).
				1452	//
				1453	// C[0] = 1, since C[0] = Q[0], and Q[0] = 1.
				1454	C[0] = 1;
				1455	for (unsigned i = 1; i < 32; ++i) {
				1456	// Solve for C[i] in:
				1457	// C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
				1458	// This is equivalent to
				1459	// C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
				1460	// which is
				1461	// C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
				1462	unsigned T = 0;
				1463	for (unsigned j = 0; j < i; ++j)
				1464	T = T ^ (C[j] & Q[i-j]);
				1465	C[i] = T;
				1466	}
				1467
				1468	unsigned QV = 0;
				1469	for (unsigned i = 0; i < 32; ++i)
				1470	if (C[i])
				1471	QV \|= (1 << i);
				1472
				1473	return QV;
				1474	}
				1475
				1476
				1477	Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
				1478	ParsedValues &PV) {
				1479	IRBuilder<> B(&*At);
				1480	Module *M = At->getParent()->getParent()->getParent();
				1481	Value *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
				1482
				1483	Value P = PV.P, Q = PV.Q, *P0 = P;
				1484	unsigned IC = PV.IterCount;
				1485
				1486	if (PV.M != nullptr)
				1487	P0 = P = B.CreateXor(P, PV.M);
				1488
				1489	// Create a bit mask to clear the high bits beyond IterCount.
				1490	auto *BMI = ConstantInt::get(P->getType(), APInt::getLowBitsSet(32, IC));
				1491
				1492	if (PV.IterCount != 32)
				1493	P = B.CreateAnd(P, BMI);
				1494
				1495	if (PV.Inv) {
				1496	auto *QI = dyn_cast<ConstantInt>(PV.Q);
				1497	assert(QI && QI->getBitWidth() <= 32);
				1498
				1499	// Again, clearing bits beyond IterCount.
				1500	unsigned M = (1 << PV.IterCount) - 1;
				1501	unsigned Tmp = (QI->getZExtValue() \| 1) & M;
				1502	unsigned QV = getInverseMxN(Tmp) & M;
				1503	auto *QVI = ConstantInt::get(QI->getType(), QV);
				1504	P = B.CreateCall(PMF, {P, QVI});
				1505	P = B.CreateTrunc(P, QI->getType());
				1506	if (IC != 32)
				1507	P = B.CreateAnd(P, BMI);
				1508	}
				1509
				1510	Value *R = B.CreateCall(PMF, {P, Q});
				1511
				1512	if (PV.M != nullptr)
				1513	R = B.CreateXor(R, B.CreateIntCast(P0, R->getType(), false));
				1514
				1515	return R;
				1516	}
				1517
				1518
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1519	void PolynomialMultiplyRecognize::setupSimplifier() {
				1520	Simp.addRule(
				1521	// Sink zext past bitwise operations.
				1522	[](Instruction I, LLVMContext &Ctx) -> Value {
				1523	if (I->getOpcode() != Instruction::ZExt)
				1524	return nullptr;
				1525	Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
				1526	if (!T)
				1527	return nullptr;
				1528	switch (T->getOpcode()) {
				1529	case Instruction::And:
				1530	case Instruction::Or:
				1531	case Instruction::Xor:
				1532	break;
				1533	default:
				1534	return nullptr;
				1535	}
				1536	IRBuilder<> B(Ctx);
				1537	return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
				1538	B.CreateZExt(T->getOperand(0), I->getType()),
				1539	B.CreateZExt(T->getOperand(1), I->getType()));
				1540	});
				1541	Simp.addRule(
				1542	// (xor (and x a) (and y a)) -> (and (xor x y) a)
				1543	[](Instruction I, LLVMContext &Ctx) -> Value {
				1544	if (I->getOpcode() != Instruction::Xor)
				1545	return nullptr;
				1546	Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
				1547	Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
				1548	if (!And0 \|\| !And1)
				1549	return nullptr;
				1550	if (And0->getOpcode() != Instruction::And \|\|
				1551	And1->getOpcode() != Instruction::And)
				1552	return nullptr;
				1553	if (And0->getOperand(1) != And1->getOperand(1))
				1554	return nullptr;
				1555	IRBuilder<> B(Ctx);
				1556	return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
				1557	And0->getOperand(1));
				1558	});
				1559	Simp.addRule(
				1560	// (Op (select c x y) z) -> (select c (Op x z) (Op y z))
				1561	// (Op x (select c y z)) -> (select c (Op x y) (Op x z))
				1562	[](Instruction I, LLVMContext &Ctx) -> Value {
				1563	BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
				1564	if (!BO)
				1565	return nullptr;
				1566	Instruction::BinaryOps Op = BO->getOpcode();
				1567	if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
				1568	IRBuilder<> B(Ctx);
				1569	Value X = Sel->getTrueValue(), Y = Sel->getFalseValue();
				1570	Value *Z = BO->getOperand(1);
				1571	return B.CreateSelect(Sel->getCondition(),
				1572	B.CreateBinOp(Op, X, Z),
				1573	B.CreateBinOp(Op, Y, Z));
				1574	}
				1575	if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
				1576	IRBuilder<> B(Ctx);
				1577	Value *X = BO->getOperand(0);
				1578	Value Y = Sel->getTrueValue(), Z = Sel->getFalseValue();
				1579	return B.CreateSelect(Sel->getCondition(),
				1580	B.CreateBinOp(Op, X, Y),
				1581	B.CreateBinOp(Op, X, Z));
				1582	}
				1583	return nullptr;
				1584	});
				1585	Simp.addRule(
				1586	// (select c (select c x y) z) -> (select c x z)
				1587	// (select c x (select c y z)) -> (select c x z)
				1588	[](Instruction I, LLVMContext &Ctx) -> Value {
				1589	SelectInst *Sel = dyn_cast<SelectInst>(I);
				1590	if (!Sel)
				1591	return nullptr;
				1592	IRBuilder<> B(Ctx);
				1593	Value *C = Sel->getCondition();
				1594	if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
				1595	if (Sel0->getCondition() == C)
				1596	return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
				1597	}
				1598	if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
				1599	if (Sel1->getCondition() == C)
				1600	return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
				1601	}
				1602	return nullptr;
				1603	});
				1604	Simp.addRule(
				1605	// (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
				1606	[](Instruction I, LLVMContext &Ctx) -> Value {
				1607	if (I->getOpcode() != Instruction::Or)
				1608	return nullptr;
				1609	Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
				1610	if (!LShr \|\| LShr->getOpcode() != Instruction::LShr)
				1611	return nullptr;
				1612	ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
				1613	if (!One \|\| One->getZExtValue() != 1)
				1614	return nullptr;
				1615	ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
				1616	if (!Msb \|\| Msb->getZExtValue() != Msb->getType()->getSignBit())
				1617	return nullptr;
				1618	return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
				1619	});
				1620	Simp.addRule(
				1621	// (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
				1622	[](Instruction I, LLVMContext &Ctx) -> Value {
				1623	if (I->getOpcode() != Instruction::LShr)
				1624	return nullptr;
				1625	BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
				1626	if (!BitOp)
				1627	return nullptr;
				1628	switch (BitOp->getOpcode()) {
				1629	case Instruction::And:
				1630	case Instruction::Or:
				1631	case Instruction::Xor:
				1632	break;
				1633	default:
				1634	return nullptr;
				1635	}
				1636	IRBuilder<> B(Ctx);
				1637	Value *S = I->getOperand(1);
				1638	return B.CreateBinOp(BitOp->getOpcode(),
				1639	B.CreateLShr(BitOp->getOperand(0), S),
				1640	B.CreateLShr(BitOp->getOperand(1), S));
				1641	});
				1642	Simp.addRule(
				1643	// (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
				1644	[](Instruction I, LLVMContext &Ctx) -> Value {
				1645	auto IsBitOp = [](unsigned Op) -> bool {
				1646	switch (Op) {
				1647	case Instruction::And:
				1648	case Instruction::Or:
				1649	case Instruction::Xor:
				1650	return true;
				1651	}
				1652	return false;
				1653	};
				1654	BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
				1655	if (!BitOp1 \|\| !IsBitOp(BitOp1->getOpcode()))
				1656	return nullptr;
				1657	BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
				1658	if (!BitOp2 \|\| !IsBitOp(BitOp2->getOpcode()))
				1659	return nullptr;
				1660	ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
				1661	ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
				1662	if (!CA \|\| !CB)
				1663	return nullptr;
				1664	IRBuilder<> B(Ctx);
				1665	Value *X = BitOp2->getOperand(0);
				1666	return B.CreateBinOp(BitOp2->getOpcode(), X,
				1667	B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
				1668	});
				1669	}
				1670
				1671
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1672	bool PolynomialMultiplyRecognize::recognize() {
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1673	DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
				1674	<< *CurLoop << '\n');
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1675	// Restrictions:
				1676	// - The loop must consist of a single block.
				1677	// - The iteration count must be known at compile-time.
				1678	// - The loop must have an induction variable starting from 0, and
				1679	// incremented in each iteration of the loop.
				1680	BasicBlock *LoopB = CurLoop->getHeader();
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1681	DEBUG(dbgs() << "Loop header:\n" << *LoopB);
				1682
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1683	if (LoopB != CurLoop->getLoopLatch())
				1684	return false;
				1685	BasicBlock *ExitB = CurLoop->getExitBlock();
				1686	if (ExitB == nullptr)
				1687	return false;
				1688	BasicBlock *EntryB = CurLoop->getLoopPreheader();
				1689	if (EntryB == nullptr)
				1690	return false;
				1691
				1692	unsigned IterCount = 0;
				1693	const SCEV *CT = SE.getBackedgeTakenCount(CurLoop);
				1694	if (isa<SCEVCouldNotCompute>(CT))
				1695	return false;
				1696	if (auto *CV = dyn_cast<SCEVConstant>(CT))
				1697	IterCount = CV->getValue()->getZExtValue() + 1;
				1698
				1699	Value *CIV = getCountIV(LoopB);
				1700	ParsedValues PV;
				1701	PV.IterCount = IterCount;
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1702	DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1703
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1704	setupSimplifier();
				1705
				1706	// Perform a preliminary scan of select instructions to see if any of them
				1707	// looks like a generator of the polynomial multiply steps. Assume that a
				1708	// loop can only contain a single transformable operation, so stop the
				1709	// traversal after the first reasonable candidate was found.
				1710	// XXX: Currently this approach can modify the loop before being 100% sure
				1711	// that the transformation can be carried out.
				1712	bool FoundPreScan = false;
				1713	for (Instruction &In : *LoopB) {
				1714	SelectInst *SI = dyn_cast<SelectInst>(&In);
				1715	if (!SI)
				1716	continue;
				1717
				1718	Simplifier::Context C(SI);
				1719	Value *T = Simp.simplify(C);
				1720	SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
				1721	DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
				1722	if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
				1723	FoundPreScan = true;
				1724	if (SelI != SI) {
				1725	Value *NewSel = C.materialize(LoopB, SI->getIterator());
				1726	SI->replaceAllUsesWith(NewSel);
				1727	RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI);
				1728	}
				1729	break;
				1730	}
				1731	}
				1732
				1733	if (!FoundPreScan) {
				1734	DEBUG(dbgs() << "Have not found candidates for pmpy\n");
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1735	return false;
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1736	}
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1737
				1738	if (!PV.Left) {
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1739	// The right shift version actually only returns the higher bits of
				1740	// the result (each iteration discards the LSB). If we want to convert it
				1741	// to a left-shifting loop, the working data type must be at least as
				1742	// wide as the target's pmpy instruction.
				1743	if (!promoteTypes(LoopB, ExitB))
				1744	return false;
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1745	convertShiftsToLeft(LoopB, ExitB, IterCount);
				1746	cleanupLoopBody(LoopB);
				1747	}
				1748
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1749	// Scan the loop again, find the generating select instruction.
				1750	bool FoundScan = false;
				1751	for (Instruction &In : *LoopB) {
				1752	SelectInst *SelI = dyn_cast<SelectInst>(&In);
				1753	if (!SelI)
				1754	continue;
				1755	DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
				1756	FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
				1757	if (FoundScan)
				1758	break;
				1759	}
				1760	assert(FoundScan);
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1761
				1762	DEBUG({
				1763	StringRef PP = (PV.M ? "(P+M)" : "P");
				1764	if (!PV.Inv)
				1765	dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
				1766	else
				1767	dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + "
				1768	<< PP << "\n";
				1769	dbgs() << " Res:" << PV.Res << "\n P:" << PV.P << "\n";
				1770	if (PV.M)
				1771	dbgs() << " M:" << *PV.M << "\n";
				1772	dbgs() << " Q:" << *PV.Q << "\n";
				1773	dbgs() << " Iteration count:" << PV.IterCount << "\n";
				1774	});
				1775
				1776	BasicBlock::iterator At(EntryB->getTerminator());
				1777	Value *PM = generate(At, PV);
				1778	if (PM == nullptr)
				1779	return false;
				1780
				1781	if (PM->getType() != PV.Res->getType())
				1782	PM = IRBuilder<>(&*At).CreateIntCast(PM, PV.Res->getType(), false);
				1783
				1784	PV.Res->replaceAllUsesWith(PM);
				1785	PV.Res->eraseFromParent();
				1786	return true;
				1787	}
				1788
				1789
				1790	unsigned HexagonLoopIdiomRecognize::getStoreSizeInBytes(StoreInst *SI) {
				1791	uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
				1792	assert(((SizeInBits & 7) \|\| (SizeInBits >> 32) == 0) &&
				1793	"Don't overflow unsigned.");
				1794	return (unsigned)SizeInBits >> 3;
				1795	}
				1796
				1797
				1798	int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
				1799	if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1)))
				1800	return SC->getAPInt().getSExtValue();
				1801	return 0;
				1802	}
				1803
				1804
				1805	bool HexagonLoopIdiomRecognize::isLegalStore(Loop CurLoop, StoreInst SI) {
Krzysztof Parzyszek	35ce5da	2017-01-27 20:40:14 +0000	[diff] [blame]	1806	// Allow volatile stores if HexagonVolatileMemcpy is enabled.
				1807	if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple())
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1808	return false;
				1809
				1810	Value *StoredVal = SI->getValueOperand();
				1811	Value *StorePtr = SI->getPointerOperand();
				1812
				1813	// Reject stores that are so large that they overflow an unsigned.
				1814	uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
				1815	if ((SizeInBits & 7) \|\| (SizeInBits >> 32) != 0)
				1816	return false;
				1817
				1818	// See if the pointer expression is an AddRec like {base,+,1} on the current
				1819	// loop, which indicates a strided store. If we have something else, it's a
				1820	// random store we can't handle.
				1821	auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
				1822	if (!StoreEv \|\| StoreEv->getLoop() != CurLoop \|\| !StoreEv->isAffine())
				1823	return false;
				1824
				1825	// Check to see if the stride matches the size of the store. If so, then we
				1826	// know that every byte is touched in the loop.
				1827	int Stride = getSCEVStride(StoreEv);
				1828	if (Stride == 0)
				1829	return false;
				1830	unsigned StoreSize = getStoreSizeInBytes(SI);
				1831	if (StoreSize != unsigned(std::abs(Stride)))
				1832	return false;
				1833
				1834	// The store must be feeding a non-volatile load.
				1835	LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
				1836	if (!LI \|\| !LI->isSimple())
				1837	return false;
				1838
				1839	// See if the pointer expression is an AddRec like {base,+,1} on the current
				1840	// loop, which indicates a strided load. If we have something else, it's a
				1841	// random load we can't handle.
				1842	Value *LoadPtr = LI->getPointerOperand();
				1843	auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
				1844	if (!LoadEv \|\| LoadEv->getLoop() != CurLoop \|\| !LoadEv->isAffine())
				1845	return false;
				1846
				1847	// The store and load must share the same stride.
				1848	if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
				1849	return false;
				1850
				1851	// Success. This store can be converted into a memcpy.
				1852	return true;
				1853	}
				1854
				1855
				1856	/// mayLoopAccessLocation - Return true if the specified loop might access the
				1857	/// specified pointer location, which is a loop-strided access. The 'Access'
				1858	/// argument specifies what the verboten forms of access are (read or write).
				1859	static bool
				1860	mayLoopAccessLocation(Value Ptr, ModRefInfo Access, Loop L,
				1861	const SCEV *BECount, unsigned StoreSize,
				1862	AliasAnalysis &AA,
				1863	SmallPtrSetImpl<Instruction *> &Ignored) {
				1864	// Get the location that may be stored across the loop. Since the access
				1865	// is strided positively through memory, we say that the modified location
				1866	// starts at the pointer and has infinite size.
				1867	uint64_t AccessSize = MemoryLocation::UnknownSize;
				1868
				1869	// If the loop iterates a fixed number of times, we can refine the access
				1870	// size to be exactly the size of the memset, which is (BECount+1)*StoreSize
				1871	if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
				1872	AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
				1873
				1874	// TODO: For this to be really effective, we have to dive into the pointer
				1875	// operand in the store. Store to &A[i] of 100 will always return may alias
				1876	// with store of &A[100], we need to StoreLoc to be "A" with size of 100,
				1877	// which will then no-alias a store to &A[100].
				1878	MemoryLocation StoreLoc(Ptr, AccessSize);
				1879
				1880	for (auto *B : L->blocks())
				1881	for (auto &I : *B)
				1882	if (Ignored.count(&I) == 0 && (AA.getModRefInfo(&I, StoreLoc) & Access))
				1883	return true;
				1884
				1885	return false;
				1886	}
				1887
				1888
				1889	void HexagonLoopIdiomRecognize::collectStores(Loop CurLoop, BasicBlock BB,
				1890	SmallVectorImpl<StoreInst*> &Stores) {
				1891	Stores.clear();
				1892	for (Instruction &I : *BB)
				1893	if (StoreInst *SI = dyn_cast<StoreInst>(&I))
				1894	if (isLegalStore(CurLoop, SI))
				1895	Stores.push_back(SI);
				1896	}
				1897
				1898
				1899	bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
				1900	StoreInst SI, const SCEV BECount) {
Michael Kuperstein	e18aad3	2017-01-31 22:48:45 +0000	[diff] [blame]	1901	assert((SI->isSimple() \|\| (SI->isVolatile() && HexagonVolatileMemcpy)) &&
				1902	"Expected only non-volatile stores, or Hexagon-specific memcpy"
				1903	"to volatile destination.");
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1904
				1905	Value *StorePtr = SI->getPointerOperand();
				1906	auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
				1907	unsigned Stride = getSCEVStride(StoreEv);
				1908	unsigned StoreSize = getStoreSizeInBytes(SI);
				1909	if (Stride != StoreSize)
				1910	return false;
				1911
				1912	// See if the pointer expression is an AddRec like {base,+,1} on the current
				1913	// loop, which indicates a strided load. If we have something else, it's a
				1914	// random load we can't handle.
				1915	LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
				1916	auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
				1917
				1918	// The trip count of the loop and the base pointer of the addrec SCEV is
				1919	// guaranteed to be loop invariant, which means that it should dominate the
				1920	// header. This allows us to insert code for it in the preheader.
				1921	BasicBlock *Preheader = CurLoop->getLoopPreheader();
				1922	Instruction *ExpPt = Preheader->getTerminator();
				1923	IRBuilder<> Builder(ExpPt);
				1924	SCEVExpander Expander(SE, DL, "hexagon-loop-idiom");
				1925
				1926	Type IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace());
				1927
				1928	// Okay, we have a strided store "p[i]" of a loaded value. We can turn
				1929	// this into a memcpy/memmove in the loop preheader now if we want. However,
				1930	// this would be unsafe to do if there is anything else in the loop that may
				1931	// read or write the memory region we're storing to. For memcpy, this
				1932	// includes the load that feeds the stores. Check for an alias by generating
				1933	// the base address and checking everything.
				1934	Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(),
				1935	Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt);
				1936	Value *LoadBasePtr = nullptr;
				1937
				1938	bool Overlap = false;
				1939	bool DestVolatile = SI->isVolatile();
				1940	Type *BECountTy = BECount->getType();
				1941
				1942	if (DestVolatile) {
				1943	// The trip count must fit in i32, since it is the type of the "num_words"
				1944	// argument to hexagon_memcpy_forward_vp4cp4n2.
				1945	if (StoreSize != 4 \|\| DL->getTypeSizeInBits(BECountTy) > 32) {
				1946	CleanupAndExit:
				1947	// If we generated new code for the base pointer, clean up.
				1948	Expander.clear();
				1949	if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
				1950	RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
				1951	StoreBasePtr = nullptr;
				1952	}
				1953	if (LoadBasePtr) {
				1954	RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
				1955	LoadBasePtr = nullptr;
				1956	}
				1957	return false;
				1958	}
				1959	}
				1960
				1961	SmallPtrSet<Instruction*, 2> Ignore1;
				1962	Ignore1.insert(SI);
				1963	if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
				1964	StoreSize, *AA, Ignore1)) {
				1965	// Check if the load is the offending instruction.
				1966	Ignore1.insert(LI);
				1967	if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
				1968	StoreSize, *AA, Ignore1)) {
				1969	// Still bad. Nothing we can do.
				1970	goto CleanupAndExit;
				1971	}
				1972	// It worked with the load ignored.
				1973	Overlap = true;
				1974	}
				1975
				1976	if (!Overlap) {
				1977	if (DisableMemcpyIdiom \|\| !HasMemcpy)
				1978	goto CleanupAndExit;
				1979	} else {
				1980	// Don't generate memmove if this function will be inlined. This is
				1981	// because the caller will undergo this transformation after inlining.
				1982	Function *Func = CurLoop->getHeader()->getParent();
				1983	if (Func->hasFnAttribute(Attribute::AlwaysInline))
				1984	goto CleanupAndExit;
				1985
				1986	// In case of a memmove, the call to memmove will be executed instead
				1987	// of the loop, so we need to make sure that there is nothing else in
				1988	// the loop than the load, store and instructions that these two depend
				1989	// on.
				1990	SmallVector<Instruction*,2> Insts;
				1991	Insts.push_back(SI);
				1992	Insts.push_back(LI);
				1993	if (!coverLoop(CurLoop, Insts))
				1994	goto CleanupAndExit;
				1995
				1996	if (DisableMemmoveIdiom \|\| !HasMemmove)
				1997	goto CleanupAndExit;
				1998	bool IsNested = CurLoop->getParentLoop() != 0;
				1999	if (IsNested && OnlyNonNestedMemmove)
				2000	goto CleanupAndExit;
				2001	}
				2002
				2003	// For a memcpy, we have to make sure that the input array is not being
				2004	// mutated by the loop.
				2005	LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(),
				2006	Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt);
				2007
				2008	SmallPtrSet<Instruction*, 2> Ignore2;
				2009	Ignore2.insert(SI);
				2010	if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
				2011	*AA, Ignore2))
				2012	goto CleanupAndExit;
				2013
				2014	// Check the stride.
				2015	bool StridePos = getSCEVStride(LoadEv) >= 0;
				2016
				2017	// Currently, the volatile memcpy only emulates traversing memory forward.
				2018	if (!StridePos && DestVolatile)
				2019	goto CleanupAndExit;
				2020
				2021	bool RuntimeCheck = (Overlap \|\| DestVolatile);
				2022
				2023	BasicBlock *ExitB;
				2024	if (RuntimeCheck) {
				2025	// The runtime check needs a single exit block.
				2026	SmallVector<BasicBlock*, 8> ExitBlocks;
				2027	CurLoop->getUniqueExitBlocks(ExitBlocks);
				2028	if (ExitBlocks.size() != 1)
				2029	goto CleanupAndExit;
				2030	ExitB = ExitBlocks[0];
				2031	}
				2032
				2033	// The # stored bytes is (BECount+1)*Size. Expand the trip count out to
				2034	// pointer size if it isn't already.
				2035	LLVMContext &Ctx = SI->getContext();
				2036	BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
				2037	unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment());
				2038	DebugLoc DLoc = SI->getDebugLoc();
				2039
				2040	const SCEV *NumBytesS =
				2041	SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
				2042	if (StoreSize != 1)
				2043	NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
				2044	SCEV::FlagNUW);
				2045	Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
				2046	if (Instruction *In = dyn_cast<Instruction>(NumBytes))
				2047	if (Value Simp = SimplifyInstruction(In, DL, TLI, DT))
				2048	NumBytes = Simp;
				2049
				2050	CallInst *NewCall;
				2051
				2052	if (RuntimeCheck) {
				2053	unsigned Threshold = RuntimeMemSizeThreshold;
				2054	if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
				2055	uint64_t C = CI->getZExtValue();
				2056	if (Threshold != 0 && C < Threshold)
				2057	goto CleanupAndExit;
				2058	if (C < CompileTimeMemSizeThreshold)
				2059	goto CleanupAndExit;
				2060	}
				2061
				2062	BasicBlock *Header = CurLoop->getHeader();
				2063	Function *Func = Header->getParent();
				2064	Loop *ParentL = LF->getLoopFor(Preheader);
				2065	StringRef HeaderName = Header->getName();
				2066
				2067	// Create a new (empty) preheader, and update the PHI nodes in the
				2068	// header to use the new preheader.
				2069	BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
				2070	Func, Header);
				2071	if (ParentL)
				2072	ParentL->addBasicBlockToLoop(NewPreheader, *LF);
				2073	IRBuilder<>(NewPreheader).CreateBr(Header);
				2074	for (auto &In : *Header) {
				2075	PHINode *PN = dyn_cast<PHINode>(&In);
				2076	if (!PN)
				2077	break;
				2078	int bx = PN->getBasicBlockIndex(Preheader);
				2079	if (bx >= 0)
				2080	PN->setIncomingBlock(bx, NewPreheader);
				2081	}
				2082	DT->addNewBlock(NewPreheader, Preheader);
				2083	DT->changeImmediateDominator(Header, NewPreheader);
				2084
				2085	// Check for safe conditions to execute memmove.
				2086	// If stride is positive, copying things from higher to lower addresses
				2087	// is equivalent to memmove. For negative stride, it's the other way
				2088	// around. Copying forward in memory with positive stride may not be
				2089	// same as memmove since we may be copying values that we just stored
				2090	// in some previous iteration.
				2091	Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
				2092	Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
				2093	Value *LowA = StridePos ? SA : LA;
				2094	Value *HighA = StridePos ? LA : SA;
				2095	Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
				2096	Value *Cond = CmpA;
				2097
				2098	// Check for distance between pointers.
				2099	Value *Dist = Builder.CreateSub(HighA, LowA);
				2100	Value *CmpD = Builder.CreateICmpSLT(NumBytes, Dist);
				2101	Value *CmpEither = Builder.CreateOr(Cond, CmpD);
				2102	Cond = CmpEither;
				2103
				2104	if (Threshold != 0) {
				2105	Type *Ty = NumBytes->getType();
				2106	Value *Thr = ConstantInt::get(Ty, Threshold);
				2107	Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
				2108	Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
				2109	Cond = CmpBoth;
				2110	}
				2111	BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli",
				2112	Func, NewPreheader);
				2113	if (ParentL)
				2114	ParentL->addBasicBlockToLoop(MemmoveB, *LF);
				2115	Instruction *OldT = Preheader->getTerminator();
				2116	Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
				2117	OldT->eraseFromParent();
				2118	Preheader->setName(Preheader->getName()+".old");
				2119	DT->addNewBlock(MemmoveB, Preheader);
				2120	// Find the new immediate dominator of the exit block.
				2121	BasicBlock *ExitD = Preheader;
				2122	for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE; ++PI) {
				2123	BasicBlock PB = PI;
				2124	ExitD = DT->findNearestCommonDominator(ExitD, PB);
				2125	if (!ExitD)
				2126	break;
				2127	}
				2128	// If the prior immediate dominator of ExitB was dominated by the
				2129	// old preheader, then the old preheader becomes the new immediate
				2130	// dominator. Otherwise don't change anything (because the newly
				2131	// added blocks are dominated by the old preheader).
				2132	if (ExitD && DT->dominates(Preheader, ExitD)) {
				2133	DomTreeNode *BN = DT->getNode(ExitB);
				2134	DomTreeNode *DN = DT->getNode(ExitD);
				2135	BN->setIDom(DN);
				2136	}
				2137
				2138	// Add a call to memmove to the conditional block.
				2139	IRBuilder<> CondBuilder(MemmoveB);
				2140	CondBuilder.CreateBr(ExitB);
				2141	CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
				2142
				2143	if (DestVolatile) {
				2144	Type *Int32Ty = Type::getInt32Ty(Ctx);
				2145	Type *Int32PtrTy = Type::getInt32PtrTy(Ctx);
				2146	Type *VoidTy = Type::getVoidTy(Ctx);
				2147	Module *M = Func->getParent();
				2148	Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy,
Serge Guelton	59a2d7b	2017-04-11 15:01:18 +0000	[diff] [blame^]	2149	Int32PtrTy, Int32PtrTy, Int32Ty);
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	2150	Function *Fn = cast<Function>(CF);
				2151	Fn->setLinkage(Function::ExternalLinkage);
				2152
				2153	const SCEV *OneS = SE->getConstant(Int32Ty, 1);
				2154	const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
				2155	const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW);
				2156	Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty,
				2157	MemmoveB->getTerminator());
				2158	if (Instruction *In = dyn_cast<Instruction>(NumWords))
				2159	if (Value Simp = SimplifyInstruction(In, DL, TLI, DT))
				2160	NumWords = Simp;
				2161
				2162	Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy)
				2163	? StoreBasePtr
				2164	: CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy);
				2165	Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy)
				2166	? LoadBasePtr
				2167	: CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
				2168	NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
				2169	} else {
				2170	NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr,
				2171	NumBytes, Alignment);
				2172	}
				2173	} else {
				2174	NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr,
				2175	NumBytes, Alignment);
				2176	// Okay, the memcpy has been formed. Zap the original store and
				2177	// anything that feeds into it.
				2178	RecursivelyDeleteTriviallyDeadInstructions(SI, TLI);
				2179	}
				2180
				2181	NewCall->setDebugLoc(DLoc);
				2182
				2183	DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
				2184	<< *NewCall << "\n"
				2185	<< " from load ptr=" << LoadEv << " at: " << LI << "\n"
				2186	<< " from store ptr=" << StoreEv << " at: " << SI << "\n");
				2187
				2188	return true;
				2189	}
				2190
				2191
				2192	// \brief Check if the instructions in Insts, together with their dependencies
				2193	// cover the loop in the sense that the loop could be safely eliminated once
				2194	// the instructions in Insts are removed.
				2195	bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
				2196	SmallVectorImpl<Instruction*> &Insts) const {
				2197	SmallSet<BasicBlock*,8> LoopBlocks;
				2198	for (auto *B : L->blocks())
				2199	LoopBlocks.insert(B);
				2200
				2201	SetVector<Instruction*> Worklist(Insts.begin(), Insts.end());
				2202
				2203	// Collect all instructions from the loop that the instructions in Insts
				2204	// depend on (plus their dependencies, etc.). These instructions will
				2205	// constitute the expression trees that feed those in Insts, but the trees
				2206	// will be limited only to instructions contained in the loop.
				2207	for (unsigned i = 0; i < Worklist.size(); ++i) {
				2208	Instruction *In = Worklist[i];
				2209	for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
				2210	Instruction *OpI = dyn_cast<Instruction>(I);
				2211	if (!OpI)
				2212	continue;
				2213	BasicBlock *PB = OpI->getParent();
				2214	if (!LoopBlocks.count(PB))
				2215	continue;
				2216	Worklist.insert(OpI);
				2217	}
				2218	}
				2219
				2220	// Scan all instructions in the loop, if any of them have a user outside
				2221	// of the loop, or outside of the expressions collected above, then either
				2222	// the loop has a side-effect visible outside of it, or there are
				2223	// instructions in it that are not involved in the original set Insts.
				2224	for (auto *B : L->blocks()) {
				2225	for (auto &In : *B) {
				2226	if (isa<BranchInst>(In) \|\| isa<DbgInfoIntrinsic>(In))
				2227	continue;
				2228	if (!Worklist.count(&In) && In.mayHaveSideEffects())
				2229	return false;
				2230	for (const auto &K : In.users()) {
				2231	Instruction *UseI = dyn_cast<Instruction>(K);
				2232	if (!UseI)
				2233	continue;
				2234	BasicBlock *UseB = UseI->getParent();
				2235	if (LF->getLoopFor(UseB) != L)
				2236	return false;
				2237	}
				2238	}
				2239	}
				2240
				2241	return true;
				2242	}
				2243
				2244	/// runOnLoopBlock - Process the specified block, which lives in a counted loop
				2245	/// with the specified backedge count. This block is known to be in the current
				2246	/// loop and not in any subloops.
				2247	bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop CurLoop, BasicBlock BB,
				2248	const SCEV BECount, SmallVectorImpl<BasicBlock> &ExitBlocks) {
				2249	// We can only promote stores in this block if they are unconditionally
				2250	// executed in the loop. For a block to be unconditionally executed, it has
				2251	// to dominate all the exit blocks of the loop. Verify this now.
				2252	auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
				2253	return DT->dominates(BB, EB);
				2254	};
				2255	if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB))
				2256	return false;
				2257
				2258	bool MadeChange = false;
				2259	// Look for store instructions, which may be optimized to memset/memcpy.
				2260	SmallVector<StoreInst*,8> Stores;
				2261	collectStores(CurLoop, BB, Stores);
				2262
				2263	// Optimize the store into a memcpy, if it feeds an similarly strided load.
				2264	for (auto &SI : Stores)
				2265	MadeChange \|= processCopyingStore(CurLoop, SI, BECount);
				2266
				2267	return MadeChange;
				2268	}
				2269
				2270
				2271	bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
				2272	PolynomialMultiplyRecognize PMR(L, DL, DT, TLI, SE);
				2273	if (PMR.recognize())
				2274	return true;
				2275
				2276	if (!HasMemcpy && !HasMemmove)
				2277	return false;
				2278
				2279	const SCEV *BECount = SE->getBackedgeTakenCount(L);
				2280	assert(!isa<SCEVCouldNotCompute>(BECount) &&
				2281	"runOnCountableLoop() called on a loop without a predictable"
				2282	"backedge-taken count");
				2283
				2284	SmallVector<BasicBlock *, 8> ExitBlocks;
				2285	L->getUniqueExitBlocks(ExitBlocks);
				2286
				2287	bool Changed = false;
				2288
				2289	// Scan all the blocks in the loop that are not in subloops.
				2290	for (auto *BB : L->getBlocks()) {
				2291	// Ignore blocks in subloops.
				2292	if (LF->getLoopFor(BB) != L)
				2293	continue;
				2294	Changed \|= runOnLoopBlock(L, BB, BECount, ExitBlocks);
				2295	}
				2296
				2297	return Changed;
				2298	}
				2299
				2300
				2301	bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
				2302	const Module &M = *L->getHeader()->getParent()->getParent();
				2303	if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon)
				2304	return false;
				2305
				2306	if (skipLoop(L))
				2307	return false;
				2308
				2309	// If the loop could not be converted to canonical form, it must have an
				2310	// indirectbr in it, just give up.
				2311	if (!L->getLoopPreheader())
				2312	return false;
				2313
				2314	// Disable loop idiom recognition if the function's name is a common idiom.
				2315	StringRef Name = L->getHeader()->getParent()->getName();
				2316	if (Name == "memset" \|\| Name == "memcpy" \|\| Name == "memmove")
				2317	return false;
				2318
				2319	AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
				2320	DL = &L->getHeader()->getModule()->getDataLayout();
				2321	DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
				2322	LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
				2323	TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
				2324	SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
				2325
				2326	HasMemcpy = TLI->has(LibFunc_memcpy);
				2327	HasMemmove = TLI->has(LibFunc_memmove);
				2328
				2329	if (SE->hasLoopInvariantBackedgeTakenCount(L))
				2330	return runOnCountableLoop(L);
				2331	return false;
				2332	}
				2333
				2334
				2335	Pass *llvm::createHexagonLoopIdiomPass() {
				2336	return new HexagonLoopIdiomRecognize();
				2337	}
				2338