Blame - llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp - toolchain/llvm-project

blob: 9aa185fc85a6a3a04f3da8e3258f71d266119662 [file] [log] [blame]

Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1	//===--- HexagonLoopIdiomRecognition.cpp ----------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9
				10	#define DEBUG_TYPE "hexagon-lir"
				11
				12	#include "llvm/ADT/SetVector.h"
				13	#include "llvm/ADT/SmallSet.h"
				14	#include "llvm/Analysis/AliasAnalysis.h"
				15	#include "llvm/Analysis/InstructionSimplify.h"
				16	#include "llvm/Analysis/LoopPass.h"
				17	#include "llvm/Analysis/ScalarEvolution.h"
				18	#include "llvm/Analysis/ScalarEvolutionExpander.h"
				19	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
				20	#include "llvm/Analysis/TargetLibraryInfo.h"
				21	#include "llvm/Analysis/ValueTracking.h"
				22	#include "llvm/IR/DataLayout.h"
				23	#include "llvm/IR/Dominators.h"
				24	#include "llvm/IR/IRBuilder.h"
				25	#include "llvm/IR/PatternMatch.h"
				26	#include "llvm/Transforms/Scalar.h"
				27	#include "llvm/Transforms/Utils/Local.h"
				28	#include "llvm/Support/Debug.h"
Craig Topper	b45eabc	2017-04-26 16:39:58 +0000	[diff] [blame]	29	#include "llvm/Support/KnownBits.h"
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	30	#include "llvm/Support/raw_ostream.h"
				31
				32	#include <algorithm>
				33	#include <array>
				34
				35	using namespace llvm;
				36
				37	static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
				38	cl::Hidden, cl::init(false),
				39	cl::desc("Disable generation of memcpy in loop idiom recognition"));
				40
				41	static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
				42	cl::Hidden, cl::init(false),
				43	cl::desc("Disable generation of memmove in loop idiom recognition"));
				44
				45	static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
				46	cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
				47	"check guarding the memmove."));
				48
				49	static cl::opt<unsigned> CompileTimeMemSizeThreshold(
				50	"compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
				51	cl::desc("Threshold (in bytes) to perform the transformation, if the "
				52	"runtime loop count (mem transfer size) is known at compile-time."));
				53
				54	static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
				55	cl::Hidden, cl::init(true),
				56	cl::desc("Only enable generating memmove in non-nested loops"));
				57
				58	cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy",
				59	cl::Hidden, cl::init(false),
				60	cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
				61
Krzysztof Parzyszek	51fd540	2017-06-01 18:00:47 +0000	[diff] [blame^]	62	static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
				63	cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
				64
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	65	static const char *HexagonVolatileMemcpyName
				66	= "hexagon_memcpy_forward_vp4cp4n2";
				67
				68
				69	namespace llvm {
				70	void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
				71	Pass *createHexagonLoopIdiomPass();
				72	}
				73
				74	namespace {
				75	class HexagonLoopIdiomRecognize : public LoopPass {
				76	public:
				77	static char ID;
				78	explicit HexagonLoopIdiomRecognize() : LoopPass(ID) {
				79	initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
				80	}
				81	StringRef getPassName() const override {
				82	return "Recognize Hexagon-specific loop idioms";
				83	}
				84
				85	void getAnalysisUsage(AnalysisUsage &AU) const override {
				86	AU.addRequired<LoopInfoWrapperPass>();
				87	AU.addRequiredID(LoopSimplifyID);
				88	AU.addRequiredID(LCSSAID);
				89	AU.addRequired<AAResultsWrapperPass>();
				90	AU.addPreserved<AAResultsWrapperPass>();
				91	AU.addRequired<ScalarEvolutionWrapperPass>();
				92	AU.addRequired<DominatorTreeWrapperPass>();
				93	AU.addRequired<TargetLibraryInfoWrapperPass>();
				94	AU.addPreserved<TargetLibraryInfoWrapperPass>();
				95	}
				96
				97	bool runOnLoop(Loop *L, LPPassManager &LPM) override;
				98
				99	private:
				100	unsigned getStoreSizeInBytes(StoreInst *SI);
				101	int getSCEVStride(const SCEVAddRecExpr *StoreEv);
				102	bool isLegalStore(Loop CurLoop, StoreInst SI);
				103	void collectStores(Loop CurLoop, BasicBlock BB,
				104	SmallVectorImpl<StoreInst*> &Stores);
				105	bool processCopyingStore(Loop CurLoop, StoreInst SI, const SCEV *BECount);
				106	bool coverLoop(Loop L, SmallVectorImpl<Instruction> &Insts) const;
				107	bool runOnLoopBlock(Loop CurLoop, BasicBlock BB, const SCEV *BECount,
				108	SmallVectorImpl<BasicBlock*> &ExitBlocks);
				109	bool runOnCountableLoop(Loop *L);
				110
				111	AliasAnalysis *AA;
				112	const DataLayout *DL;
				113	DominatorTree *DT;
				114	LoopInfo *LF;
				115	const TargetLibraryInfo *TLI;
				116	ScalarEvolution *SE;
				117	bool HasMemcpy, HasMemmove;
				118	};
				119	}
				120
				121	char HexagonLoopIdiomRecognize::ID = 0;
				122
				123	INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
				124	"Recognize Hexagon-specific loop idioms", false, false)
				125	INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
				126	INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
				127	INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
				128	INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
				129	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
				130	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
				131	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
				132	INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
				133	"Recognize Hexagon-specific loop idioms", false, false)
				134
				135
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	136	namespace {
				137	struct Simplifier {
				138	typedef std::function<Value* (Instruction*, LLVMContext&)> Rule;
				139
				140	void addRule(const Rule &R) { Rules.push_back(R); }
				141
				142	private:
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	143	struct WorkListType {
				144	WorkListType() = default;
				145
				146	void push_back(Value* V) {
				147	// Do not push back duplicates.
				148	if (!S.count(V)) { Q.push_back(V); S.insert(V); }
				149	}
				150	Value *pop_front_val() {
				151	Value *V = Q.front(); Q.pop_front(); S.erase(V);
				152	return V;
				153	}
				154	bool empty() const { return Q.empty(); }
				155
				156	private:
				157	std::deque<Value*> Q;
				158	std::set<Value*> S;
				159	};
				160
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	161	typedef std::set<Value*> ValueSetType;
				162	std::vector<Rule> Rules;
				163
				164	public:
				165	struct Context {
				166	typedef DenseMap<Value,Value> ValueMapType;
				167
				168	Value *Root;
				169	ValueSetType Used; // The set of all cloned values used by Root.
				170	ValueSetType Clones; // The set of all cloned values.
				171	LLVMContext &Ctx;
				172
				173	Context(Instruction *Exp)
				174	: Ctx(Exp->getParent()->getParent()->getContext()) {
				175	initialize(Exp);
				176	}
				177	~Context() { cleanup(); }
				178	void print(raw_ostream &OS, const Value *V) const;
				179
				180	Value materialize(BasicBlock B, BasicBlock::iterator At);
				181
				182	private:
				183	void initialize(Instruction *Exp);
				184	void cleanup();
				185
				186	template <typename FuncT> void traverse(Value *V, FuncT F);
				187	void record(Value *V);
				188	void use(Value *V);
				189	void unuse(Value *V);
				190
				191	bool equal(const Instruction I, const Instruction J) const;
				192	Value find(Value Tree, Value *Sub) const;
				193	Value subst(Value Tree, Value OldV, Value NewV);
				194	void replace(Value OldV, Value NewV);
				195	void link(Instruction I, BasicBlock B, BasicBlock::iterator At);
				196
				197	friend struct Simplifier;
				198	};
				199
				200	Value *simplify(Context &C);
				201	};
				202
				203	struct PE {
				204	PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
				205	const Simplifier::Context &C;
				206	const Value *V;
				207	};
				208
				209	raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED;
				210	raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
				211	P.C.print(OS, P.V ? P.V : P.C.Root);
				212	return OS;
				213	}
				214	}
				215
				216
				217	template <typename FuncT>
				218	void Simplifier::Context::traverse(Value *V, FuncT F) {
				219	WorkListType Q;
				220	Q.push_back(V);
				221
				222	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	223	Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	224	if (!U \|\| U->getParent())
				225	continue;
				226	if (!F(U))
				227	continue;
				228	for (Value *Op : U->operands())
				229	Q.push_back(Op);
				230	}
				231	}
				232
				233
				234	void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
				235	const auto *U = dyn_cast<const Instruction>(V);
				236	if (!U) {
				237	OS << V << '(' << *V << ')';
				238	return;
				239	}
				240
				241	if (U->getParent()) {
				242	OS << U << '(';
				243	U->printAsOperand(OS, true);
				244	OS << ')';
				245	return;
				246	}
				247
				248	unsigned N = U->getNumOperands();
				249	if (N != 0)
				250	OS << U << '(';
				251	OS << U->getOpcodeName();
				252	for (const Value *Op : U->operands()) {
				253	OS << ' ';
				254	print(OS, Op);
				255	}
				256	if (N != 0)
				257	OS << ')';
				258	}
				259
				260
				261	void Simplifier::Context::initialize(Instruction *Exp) {
				262	// Perform a deep clone of the expression, set Root to the root
				263	// of the clone, and build a map from the cloned values to the
				264	// original ones.
				265	ValueMapType M;
				266	BasicBlock *Block = Exp->getParent();
				267	WorkListType Q;
				268	Q.push_back(Exp);
				269
				270	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	271	Value *V = Q.pop_front_val();
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	272	if (M.find(V) != M.end())
				273	continue;
				274	if (Instruction *U = dyn_cast<Instruction>(V)) {
				275	if (isa<PHINode>(U) \|\| U->getParent() != Block)
				276	continue;
				277	for (Value *Op : U->operands())
				278	Q.push_back(Op);
				279	M.insert({U, U->clone()});
				280	}
				281	}
				282
				283	for (std::pair<Value,Value> P : M) {
				284	Instruction *U = cast<Instruction>(P.second);
				285	for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
				286	auto F = M.find(U->getOperand(i));
				287	if (F != M.end())
				288	U->setOperand(i, F->second);
				289	}
				290	}
				291
				292	auto R = M.find(Exp);
				293	assert(R != M.end());
				294	Root = R->second;
				295
				296	record(Root);
				297	use(Root);
				298	}
				299
				300
				301	void Simplifier::Context::record(Value *V) {
				302	auto Record = [this](Instruction *U) -> bool {
				303	Clones.insert(U);
				304	return true;
				305	};
				306	traverse(V, Record);
				307	}
				308
				309
				310	void Simplifier::Context::use(Value *V) {
				311	auto Use = [this](Instruction *U) -> bool {
				312	Used.insert(U);
				313	return true;
				314	};
				315	traverse(V, Use);
				316	}
				317
				318
				319	void Simplifier::Context::unuse(Value *V) {
				320	if (!isa<Instruction>(V) \|\| cast<Instruction>(V)->getParent() != nullptr)
				321	return;
				322
				323	auto Unuse = [this](Instruction *U) -> bool {
				324	if (!U->use_empty())
				325	return false;
				326	Used.erase(U);
				327	return true;
				328	};
				329	traverse(V, Unuse);
				330	}
				331
				332
				333	Value Simplifier::Context::subst(Value Tree, Value OldV, Value NewV) {
				334	if (Tree == OldV)
				335	return NewV;
				336	if (OldV == NewV)
				337	return Tree;
				338
				339	WorkListType Q;
				340	Q.push_back(Tree);
				341	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	342	Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	343	// If U is not an instruction, or it's not a clone, skip it.
				344	if (!U \|\| U->getParent())
				345	continue;
				346	for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
				347	Value *Op = U->getOperand(i);
				348	if (Op == OldV) {
				349	U->setOperand(i, NewV);
				350	unuse(OldV);
				351	} else {
				352	Q.push_back(Op);
				353	}
				354	}
				355	}
				356	return Tree;
				357	}
				358
				359
				360	void Simplifier::Context::replace(Value OldV, Value NewV) {
				361	if (Root == OldV) {
				362	Root = NewV;
				363	use(Root);
				364	return;
				365	}
				366
				367	// NewV may be a complex tree that has just been created by one of the
				368	// transformation rules. We need to make sure that it is commoned with
				369	// the existing Root to the maximum extent possible.
				370	// Identify all subtrees of NewV (including NewV itself) that have
				371	// equivalent counterparts in Root, and replace those subtrees with
				372	// these counterparts.
				373	WorkListType Q;
				374	Q.push_back(NewV);
				375	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	376	Value *V = Q.pop_front_val();
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	377	Instruction *U = dyn_cast<Instruction>(V);
				378	if (!U \|\| U->getParent())
				379	continue;
				380	if (Value *DupV = find(Root, V)) {
				381	if (DupV != V)
				382	NewV = subst(NewV, V, DupV);
				383	} else {
				384	for (Value *Op : U->operands())
				385	Q.push_back(Op);
				386	}
				387	}
				388
				389	// Now, simply replace OldV with NewV in Root.
				390	Root = subst(Root, OldV, NewV);
				391	use(Root);
				392	}
				393
				394
				395	void Simplifier::Context::cleanup() {
				396	for (Value *V : Clones) {
				397	Instruction *U = cast<Instruction>(V);
				398	if (!U->getParent())
				399	U->dropAllReferences();
				400	}
				401
				402	for (Value *V : Clones) {
				403	Instruction *U = cast<Instruction>(V);
				404	if (!U->getParent())
Reid Kleckner	96ab872	2017-05-18 17:24:10 +0000	[diff] [blame]	405	U->deleteValue();
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	406	}
				407	}
				408
				409
				410	bool Simplifier::Context::equal(const Instruction *I,
				411	const Instruction *J) const {
				412	if (I == J)
				413	return true;
				414	if (!I->isSameOperationAs(J))
				415	return false;
				416	if (isa<PHINode>(I))
				417	return I->isIdenticalTo(J);
				418
				419	for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
				420	Value OpI = I->getOperand(i), OpJ = J->getOperand(i);
				421	if (OpI == OpJ)
				422	continue;
				423	auto *InI = dyn_cast<const Instruction>(OpI);
				424	auto *InJ = dyn_cast<const Instruction>(OpJ);
				425	if (InI && InJ) {
				426	if (!equal(InI, InJ))
				427	return false;
				428	} else if (InI != InJ \|\| !InI)
				429	return false;
				430	}
				431	return true;
				432	}
				433
				434
				435	Value Simplifier::Context::find(Value Tree, Value *Sub) const {
				436	Instruction *SubI = dyn_cast<Instruction>(Sub);
				437	WorkListType Q;
				438	Q.push_back(Tree);
				439
				440	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	441	Value *V = Q.pop_front_val();
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	442	if (V == Sub)
				443	return V;
				444	Instruction *U = dyn_cast<Instruction>(V);
				445	if (!U \|\| U->getParent())
				446	continue;
				447	if (SubI && equal(SubI, U))
				448	return U;
				449	assert(!isa<PHINode>(U));
				450	for (Value *Op : U->operands())
				451	Q.push_back(Op);
				452	}
				453	return nullptr;
				454	}
				455
				456
				457	void Simplifier::Context::link(Instruction I, BasicBlock B,
				458	BasicBlock::iterator At) {
				459	if (I->getParent())
				460	return;
				461
				462	for (Value *Op : I->operands()) {
				463	if (Instruction *OpI = dyn_cast<Instruction>(Op))
				464	link(OpI, B, At);
				465	}
				466
				467	B->getInstList().insert(At, I);
				468	}
				469
				470
				471	Value Simplifier::Context::materialize(BasicBlock B,
				472	BasicBlock::iterator At) {
				473	if (Instruction *RootI = dyn_cast<Instruction>(Root))
				474	link(RootI, B, At);
				475	return Root;
				476	}
				477
				478
				479	Value *Simplifier::simplify(Context &C) {
				480	WorkListType Q;
				481	Q.push_back(C.Root);
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	482	unsigned Count = 0;
Krzysztof Parzyszek	51fd540	2017-06-01 18:00:47 +0000	[diff] [blame^]	483	const unsigned Limit = SimplifyLimit;
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	484
				485	while (!Q.empty()) {
Krzysztof Parzyszek	10fbac0	2017-03-23 23:01:22 +0000	[diff] [blame]	486	if (Count++ >= Limit)
				487	break;
				488	Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	489	if (!U \|\| U->getParent() \|\| !C.Used.count(U))
				490	continue;
				491	bool Changed = false;
				492	for (Rule &R : Rules) {
				493	Value *W = R(U, C.Ctx);
				494	if (!W)
				495	continue;
				496	Changed = true;
				497	C.record(W);
				498	C.replace(U, W);
				499	Q.push_back(C.Root);
				500	break;
				501	}
				502	if (!Changed) {
				503	for (Value *Op : U->operands())
				504	Q.push_back(Op);
				505	}
				506	}
Krzysztof Parzyszek	51fd540	2017-06-01 18:00:47 +0000	[diff] [blame^]	507	return Count < Limit ? C.Root : nullptr;
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	508	}
				509
				510
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	511	//===----------------------------------------------------------------------===//
				512	//
				513	// Implementation of PolynomialMultiplyRecognize
				514	//
				515	//===----------------------------------------------------------------------===//
				516
				517	namespace {
				518	class PolynomialMultiplyRecognize {
				519	public:
				520	explicit PolynomialMultiplyRecognize(Loop *loop, const DataLayout &dl,
				521	const DominatorTree &dt, const TargetLibraryInfo &tli,
				522	ScalarEvolution &se)
				523	: CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {}
				524
				525	bool recognize();
				526	private:
				527	typedef SetVector<Value*> ValueSeq;
				528
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	529	IntegerType *getPmpyType() const {
				530	LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
				531	return IntegerType::get(Ctx, 32);
				532	}
				533	bool isPromotableTo(Value V, IntegerType Ty);
				534	void promoteTo(Instruction In, IntegerType DestTy, BasicBlock *LoopB);
				535	bool promoteTypes(BasicBlock LoopB, BasicBlock ExitB);
				536
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	537	Value getCountIV(BasicBlock BB);
				538	bool findCycle(Value Out, Value In, ValueSeq &Cycle);
				539	void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
				540	ValueSeq &Late);
				541	bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late);
				542	bool commutesWithShift(Instruction *I);
				543	bool highBitsAreZero(Value *V, unsigned IterCount);
				544	bool keepsHighBitsZero(Value *V, unsigned IterCount);
				545	bool isOperandShifted(Instruction I, Value Op);
				546	bool convertShiftsToLeft(BasicBlock LoopB, BasicBlock ExitB,
				547	unsigned IterCount);
				548	void cleanupLoopBody(BasicBlock *LoopB);
				549
				550	struct ParsedValues {
				551	ParsedValues() : M(nullptr), P(nullptr), Q(nullptr), R(nullptr),
				552	X(nullptr), Res(nullptr), IterCount(0), Left(false), Inv(false) {}
				553	Value M, P, Q, R, *X;
				554	Instruction *Res;
				555	unsigned IterCount;
				556	bool Left, Inv;
				557	};
				558
				559	bool matchLeftShift(SelectInst SelI, Value CIV, ParsedValues &PV);
				560	bool matchRightShift(SelectInst *SelI, ParsedValues &PV);
				561	bool scanSelect(SelectInst SI, BasicBlock LoopB, BasicBlock *PrehB,
				562	Value *CIV, ParsedValues &PV, bool PreScan);
				563	unsigned getInverseMxN(unsigned QP);
				564	Value *generate(BasicBlock::iterator At, ParsedValues &PV);
				565
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	566	void setupSimplifier();
				567
				568	Simplifier Simp;
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	569	Loop *CurLoop;
				570	const DataLayout &DL;
				571	const DominatorTree &DT;
				572	const TargetLibraryInfo &TLI;
				573	ScalarEvolution &SE;
				574	};
				575	}
				576
				577
				578	Value PolynomialMultiplyRecognize::getCountIV(BasicBlock BB) {
				579	pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
				580	if (std::distance(PI, PE) != 2)
				581	return nullptr;
				582	BasicBlock PB = (PI == BB) ? std::next(PI) : PI;
				583
				584	for (auto I = BB->begin(), E = BB->end(); I != E && isa<PHINode>(I); ++I) {
				585	auto *PN = cast<PHINode>(I);
				586	Value *InitV = PN->getIncomingValueForBlock(PB);
				587	if (!isa<ConstantInt>(InitV) \|\| !cast<ConstantInt>(InitV)->isZero())
				588	continue;
				589	Value *IterV = PN->getIncomingValueForBlock(BB);
				590	if (!isa<BinaryOperator>(IterV))
				591	continue;
				592	auto *BO = dyn_cast<BinaryOperator>(IterV);
				593	if (BO->getOpcode() != Instruction::Add)
				594	continue;
				595	Value *IncV = nullptr;
				596	if (BO->getOperand(0) == PN)
				597	IncV = BO->getOperand(1);
				598	else if (BO->getOperand(1) == PN)
				599	IncV = BO->getOperand(0);
				600	if (IncV == nullptr)
				601	continue;
				602
				603	if (auto *T = dyn_cast<ConstantInt>(IncV))
				604	if (T->getZExtValue() == 1)
				605	return PN;
				606	}
				607	return nullptr;
				608	}
				609
				610
				611	static void replaceAllUsesOfWithIn(Value I, Value J, BasicBlock *BB) {
				612	for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) {
				613	Use &TheUse = UI.getUse();
				614	++UI;
				615	if (auto *II = dyn_cast<Instruction>(TheUse.getUser()))
				616	if (BB == II->getParent())
				617	II->replaceUsesOfWith(I, J);
				618	}
				619	}
				620
				621
				622	bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
				623	Value *CIV, ParsedValues &PV) {
				624	// Match the following:
				625	// select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
				626	// select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
				627	// The condition may also check for equality with the masked value, i.e
				628	// select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
				629	// select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
				630
				631	Value *CondV = SelI->getCondition();
				632	Value *TrueV = SelI->getTrueValue();
				633	Value *FalseV = SelI->getFalseValue();
				634
				635	using namespace PatternMatch;
				636
				637	CmpInst::Predicate P;
				638	Value A = nullptr, B = nullptr, *C = nullptr;
				639
				640	if (!match(CondV, m_ICmp(P, m_And(m_Value(A), m_Value(B)), m_Value(C))) &&
				641	!match(CondV, m_ICmp(P, m_Value(C), m_And(m_Value(A), m_Value(B)))))
				642	return false;
				643	if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
				644	return false;
				645	// Matched: select (A & B) == C ? ... : ...
				646	// select (A & B) != C ? ... : ...
				647
				648	Value X = nullptr, Sh1 = nullptr;
				649	// Check (A & B) for (X & (1 << i)):
				650	if (match(A, m_Shl(m_One(), m_Specific(CIV)))) {
				651	Sh1 = A;
				652	X = B;
				653	} else if (match(B, m_Shl(m_One(), m_Specific(CIV)))) {
				654	Sh1 = B;
				655	X = A;
				656	} else {
				657	// TODO: Could also check for an induction variable containing single
				658	// bit shifted left by 1 in each iteration.
				659	return false;
				660	}
				661
				662	bool TrueIfZero;
				663
				664	// Check C against the possible values for comparison: 0 and (1 << i):
				665	if (match(C, m_Zero()))
				666	TrueIfZero = (P == CmpInst::ICMP_EQ);
				667	else if (C == Sh1)
				668	TrueIfZero = (P == CmpInst::ICMP_NE);
				669	else
				670	return false;
				671
				672	// So far, matched:
				673	// select (X & (1 << i)) ? ... : ...
				674	// including variations of the check against zero/non-zero value.
				675
				676	Value ShouldSameV = nullptr, ShouldXoredV = nullptr;
				677	if (TrueIfZero) {
				678	ShouldSameV = TrueV;
				679	ShouldXoredV = FalseV;
				680	} else {
				681	ShouldSameV = FalseV;
				682	ShouldXoredV = TrueV;
				683	}
				684
				685	Value Q = nullptr, R = nullptr, Y = nullptr, Z = nullptr;
				686	Value *T = nullptr;
				687	if (match(ShouldXoredV, m_Xor(m_Value(Y), m_Value(Z)))) {
				688	// Matched: select +++ ? ... : Y ^ Z
				689	// select +++ ? Y ^ Z : ...
				690	// where +++ denotes previously checked matches.
				691	if (ShouldSameV == Y)
				692	T = Z;
				693	else if (ShouldSameV == Z)
				694	T = Y;
				695	else
				696	return false;
				697	R = ShouldSameV;
				698	// Matched: select +++ ? R : R ^ T
				699	// select +++ ? R ^ T : R
				700	// depending on TrueIfZero.
				701
				702	} else if (match(ShouldSameV, m_Zero())) {
				703	// Matched: select +++ ? 0 : ...
				704	// select +++ ? ... : 0
				705	if (!SelI->hasOneUse())
				706	return false;
				707	T = ShouldXoredV;
				708	// Matched: select +++ ? 0 : T
				709	// select +++ ? T : 0
				710
				711	Value U = SelI->user_begin();
				712	if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) &&
				713	!match(U, m_Xor(m_Value(R), m_Specific(SelI))))
				714	return false;
				715	// Matched: xor (select +++ ? 0 : T), R
				716	// xor (select +++ ? T : 0), R
				717	} else
				718	return false;
				719
				720	// The xor input value T is isolated into its own match so that it could
				721	// be checked against an induction variable containing a shifted bit
				722	// (todo).
				723	// For now, check against (Q << i).
				724	if (!match(T, m_Shl(m_Value(Q), m_Specific(CIV))) &&
				725	!match(T, m_Shl(m_ZExt(m_Value(Q)), m_ZExt(m_Specific(CIV)))))
				726	return false;
				727	// Matched: select +++ ? R : R ^ (Q << i)
				728	// select +++ ? R ^ (Q << i) : R
				729
				730	PV.X = X;
				731	PV.Q = Q;
				732	PV.R = R;
				733	PV.Left = true;
				734	return true;
				735	}
				736
				737
				738	bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
				739	ParsedValues &PV) {
				740	// Match the following:
				741	// select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
				742	// select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
				743	// The condition may also check for equality with the masked value, i.e
				744	// select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
				745	// select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
				746
				747	Value *CondV = SelI->getCondition();
				748	Value *TrueV = SelI->getTrueValue();
				749	Value *FalseV = SelI->getFalseValue();
				750
				751	using namespace PatternMatch;
				752
				753	Value *C = nullptr;
				754	CmpInst::Predicate P;
				755	bool TrueIfZero;
				756
				757	if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) \|\|
				758	match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))) {
				759	if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
				760	return false;
				761	// Matched: select C == 0 ? ... : ...
				762	// select C != 0 ? ... : ...
				763	TrueIfZero = (P == CmpInst::ICMP_EQ);
				764	} else if (match(CondV, m_ICmp(P, m_Value(C), m_One())) \|\|
				765	match(CondV, m_ICmp(P, m_One(), m_Value(C)))) {
				766	if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
				767	return false;
				768	// Matched: select C == 1 ? ... : ...
				769	// select C != 1 ? ... : ...
				770	TrueIfZero = (P == CmpInst::ICMP_NE);
				771	} else
				772	return false;
				773
				774	Value *X = nullptr;
				775	if (!match(C, m_And(m_Value(X), m_One())) &&
				776	!match(C, m_And(m_One(), m_Value(X))))
				777	return false;
				778	// Matched: select (X & 1) == +++ ? ... : ...
				779	// select (X & 1) != +++ ? ... : ...
				780
				781	Value R = nullptr, Q = nullptr;
				782	if (TrueIfZero) {
				783	// The select's condition is true if the tested bit is 0.
				784	// TrueV must be the shift, FalseV must be the xor.
				785	if (!match(TrueV, m_LShr(m_Value(R), m_One())))
				786	return false;
				787	// Matched: select +++ ? (R >> 1) : ...
				788	if (!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) &&
				789	!match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV))))
				790	return false;
				791	// Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
				792	// with commuting ^.
				793	} else {
				794	// The select's condition is true if the tested bit is 1.
				795	// TrueV must be the xor, FalseV must be the shift.
				796	if (!match(FalseV, m_LShr(m_Value(R), m_One())))
				797	return false;
				798	// Matched: select +++ ? ... : (R >> 1)
				799	if (!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) &&
				800	!match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV))))
				801	return false;
				802	// Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
				803	// with commuting ^.
				804	}
				805
				806	PV.X = X;
				807	PV.Q = Q;
				808	PV.R = R;
				809	PV.Left = false;
				810	return true;
				811	}
				812
				813
				814	bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
				815	BasicBlock LoopB, BasicBlock PrehB, Value *CIV, ParsedValues &PV,
				816	bool PreScan) {
				817	using namespace PatternMatch;
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	818	// The basic pattern for R = P.Q is:
				819	// for i = 0..31
				820	// R = phi (0, R')
				821	// if (P & (1 << i)) ; test-bit(P, i)
				822	// R' = R ^ (Q << i)
				823	//
				824	// Similarly, the basic pattern for R = (P/Q).Q - P
				825	// for i = 0..31
				826	// R = phi(P, R')
				827	// if (R & (1 << i))
				828	// R' = R ^ (Q << i)
				829
				830	// There exist idioms, where instead of Q being shifted left, P is shifted
				831	// right. This produces a result that is shifted right by 32 bits (the
				832	// non-shifted result is 64-bit).
				833	//
				834	// For R = P.Q, this would be:
				835	// for i = 0..31
				836	// R = phi (0, R')
				837	// if ((P >> i) & 1)
				838	// R' = (R >> 1) ^ Q ; R is cycled through the loop, so it must
				839	// else ; be shifted by 1, not i.
				840	// R' = R >> 1
				841	//
				842	// And for the inverse:
				843	// for i = 0..31
				844	// R = phi (P, R')
				845	// if (R & 1)
				846	// R' = (R >> 1) ^ Q
				847	// else
				848	// R' = R >> 1
				849
				850	// The left-shifting idioms share the same pattern:
				851	// select (X & (1 << i)) ? R ^ (Q << i) : R
				852	// Similarly for right-shifting idioms:
				853	// select (X & 1) ? (R >> 1) ^ Q
				854
				855	if (matchLeftShift(SelI, CIV, PV)) {
				856	// If this is a pre-scan, getting this far is sufficient.
				857	if (PreScan)
				858	return true;
				859
				860	// Need to make sure that the SelI goes back into R.
				861	auto *RPhi = dyn_cast<PHINode>(PV.R);
				862	if (!RPhi)
				863	return false;
				864	if (SelI != RPhi->getIncomingValueForBlock(LoopB))
				865	return false;
				866	PV.Res = SelI;
				867
				868	// If X is loop invariant, it must be the input polynomial, and the
				869	// idiom is the basic polynomial multiply.
				870	if (CurLoop->isLoopInvariant(PV.X)) {
				871	PV.P = PV.X;
				872	PV.Inv = false;
				873	} else {
				874	// X is not loop invariant. If X == R, this is the inverse pmpy.
				875	// Otherwise, check for an xor with an invariant value. If the
				876	// variable argument to the xor is R, then this is still a valid
				877	// inverse pmpy.
				878	PV.Inv = true;
				879	if (PV.X != PV.R) {
				880	Value Var = nullptr, Inv = nullptr, X1 = nullptr, X2 = nullptr;
				881	if (!match(PV.X, m_Xor(m_Value(X1), m_Value(X2))))
				882	return false;
				883	auto *I1 = dyn_cast<Instruction>(X1);
				884	auto *I2 = dyn_cast<Instruction>(X2);
				885	if (!I1 \|\| I1->getParent() != LoopB) {
				886	Var = X2;
				887	Inv = X1;
				888	} else if (!I2 \|\| I2->getParent() != LoopB) {
				889	Var = X1;
				890	Inv = X2;
				891	} else
				892	return false;
				893	if (Var != PV.R)
				894	return false;
				895	PV.M = Inv;
				896	}
				897	// The input polynomial P still needs to be determined. It will be
				898	// the entry value of R.
				899	Value *EntryP = RPhi->getIncomingValueForBlock(PrehB);
				900	PV.P = EntryP;
				901	}
				902
				903	return true;
				904	}
				905
				906	if (matchRightShift(SelI, PV)) {
				907	// If this is an inverse pattern, the Q polynomial must be known at
				908	// compile time.
				909	if (PV.Inv && !isa<ConstantInt>(PV.Q))
				910	return false;
				911	if (PreScan)
				912	return true;
				913	// There is no exact matching of right-shift pmpy.
				914	return false;
				915	}
				916
				917	return false;
				918	}
				919
				920
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	921	bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
				922	IntegerType *DestTy) {
				923	IntegerType *T = dyn_cast<IntegerType>(Val->getType());
				924	if (!T \|\| T->getBitWidth() > DestTy->getBitWidth())
				925	return false;
				926	if (T->getBitWidth() == DestTy->getBitWidth())
				927	return true;
				928	// Non-instructions are promotable. The reason why an instruction may not
				929	// be promotable is that it may produce a different result if its operands
				930	// and the result are promoted, for example, it may produce more non-zero
				931	// bits. While it would still be possible to represent the proper result
				932	// in a wider type, it may require adding additional instructions (which
				933	// we don't want to do).
				934	Instruction *In = dyn_cast<Instruction>(Val);
				935	if (!In)
				936	return true;
				937	// The bitwidth of the source type is smaller than the destination.
				938	// Check if the individual operation can be promoted.
				939	switch (In->getOpcode()) {
				940	case Instruction::PHI:
				941	case Instruction::ZExt:
				942	case Instruction::And:
				943	case Instruction::Or:
				944	case Instruction::Xor:
				945	case Instruction::LShr: // Shift right is ok.
				946	case Instruction::Select:
				947	return true;
				948	case Instruction::ICmp:
				949	if (CmpInst *CI = cast<CmpInst>(In))
				950	return CI->isEquality() \|\| CI->isUnsigned();
				951	llvm_unreachable("Cast failed unexpectedly");
				952	case Instruction::Add:
				953	return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
				954	}
				955	return false;
				956	}
				957
				958
				959	void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
				960	IntegerType DestTy, BasicBlock LoopB) {
				961	// Leave boolean values alone.
				962	if (!In->getType()->isIntegerTy(1))
				963	In->mutateType(DestTy);
				964	unsigned DestBW = DestTy->getBitWidth();
				965
				966	// Handle PHIs.
				967	if (PHINode *P = dyn_cast<PHINode>(In)) {
				968	unsigned N = P->getNumIncomingValues();
				969	for (unsigned i = 0; i != N; ++i) {
				970	BasicBlock *InB = P->getIncomingBlock(i);
				971	if (InB == LoopB)
				972	continue;
				973	Value *InV = P->getIncomingValue(i);
				974	IntegerType *Ty = cast<IntegerType>(InV->getType());
				975	// Do not promote values in PHI nodes of type i1.
				976	if (Ty != P->getType()) {
				977	// If the value type does not match the PHI type, the PHI type
				978	// must have been promoted.
				979	assert(Ty->getBitWidth() < DestBW);
				980	InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
				981	P->setIncomingValue(i, InV);
				982	}
				983	}
				984	} else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
				985	Value *Op = Z->getOperand(0);
				986	if (Op->getType() == Z->getType())
				987	Z->replaceAllUsesWith(Op);
				988	Z->eraseFromParent();
				989	return;
				990	}
				991
				992	// Promote immediates.
				993	for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
				994	if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
				995	if (CI->getType()->getBitWidth() < DestBW)
				996	In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
				997	}
				998	}
				999
				1000
				1001	bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
				1002	BasicBlock *ExitB) {
				1003	assert(LoopB);
				1004	// Skip loops where the exit block has more than one predecessor. The values
				1005	// coming from the loop block will be promoted to another type, and so the
				1006	// values coming into the exit block from other predecessors would also have
				1007	// to be promoted.
				1008	if (!ExitB \|\| (ExitB->getSinglePredecessor() != LoopB))
				1009	return false;
				1010	IntegerType *DestTy = getPmpyType();
				1011	// Check if the exit values have types that are no wider than the type
				1012	// that we want to promote to.
				1013	unsigned DestBW = DestTy->getBitWidth();
				1014	for (Instruction &In : *ExitB) {
				1015	PHINode *P = dyn_cast<PHINode>(&In);
				1016	if (!P)
				1017	break;
				1018	if (P->getNumIncomingValues() != 1)
				1019	return false;
				1020	assert(P->getIncomingBlock(0) == LoopB);
				1021	IntegerType *T = dyn_cast<IntegerType>(P->getType());
				1022	if (!T \|\| T->getBitWidth() > DestBW)
				1023	return false;
				1024	}
				1025
				1026	// Check all instructions in the loop.
				1027	for (Instruction &In : *LoopB)
				1028	if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
				1029	return false;
				1030
				1031	// Perform the promotion.
				1032	std::vector<Instruction*> LoopIns;
				1033	std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
				1034	[](Instruction &In) { return &In; });
				1035	for (Instruction *In : LoopIns)
				1036	promoteTo(In, DestTy, LoopB);
				1037
				1038	// Fix up the PHI nodes in the exit block.
				1039	Instruction *EndI = ExitB->getFirstNonPHI();
				1040	BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
				1041	for (auto I = ExitB->begin(); I != End; ++I) {
				1042	PHINode *P = dyn_cast<PHINode>(I);
				1043	if (!P)
				1044	break;
				1045	Type *Ty0 = P->getIncomingValue(0)->getType();
				1046	Type *PTy = P->getType();
				1047	if (PTy != Ty0) {
				1048	assert(Ty0 == DestTy);
				1049	// In order to create the trunc, P must have the promoted type.
				1050	P->mutateType(Ty0);
				1051	Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
				1052	// In order for the RAUW to work, the types of P and T must match.
				1053	P->mutateType(PTy);
				1054	P->replaceAllUsesWith(T);
				1055	// Final update of the P's type.
				1056	P->mutateType(Ty0);
				1057	cast<Instruction>(T)->setOperand(0, P);
				1058	}
				1059	}
				1060
				1061	return true;
				1062	}
				1063
				1064
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1065	bool PolynomialMultiplyRecognize::findCycle(Value Out, Value In,
				1066	ValueSeq &Cycle) {
				1067	// Out = ..., In, ...
				1068	if (Out == In)
				1069	return true;
				1070
				1071	auto *BB = cast<Instruction>(Out)->getParent();
				1072	bool HadPhi = false;
				1073
				1074	for (auto U : Out->users()) {
				1075	auto I = dyn_cast<Instruction>(&U);
				1076	if (I == nullptr \|\| I->getParent() != BB)
				1077	continue;
				1078	// Make sure that there are no multi-iteration cycles, e.g.
				1079	// p1 = phi(p2)
				1080	// p2 = phi(p1)
				1081	// The cycle p1->p2->p1 would span two loop iterations.
				1082	// Check that there is only one phi in the cycle.
				1083	bool IsPhi = isa<PHINode>(I);
				1084	if (IsPhi && HadPhi)
				1085	return false;
				1086	HadPhi \|= IsPhi;
				1087	if (Cycle.count(I))
				1088	return false;
				1089	Cycle.insert(I);
				1090	if (findCycle(I, In, Cycle))
				1091	break;
				1092	Cycle.remove(I);
				1093	}
				1094	return !Cycle.empty();
				1095	}
				1096
				1097
				1098	void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
				1099	ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
				1100	// All the values in the cycle that are between the phi node and the
				1101	// divider instruction will be classified as "early", all other values
				1102	// will be "late".
				1103
				1104	bool IsE = true;
				1105	unsigned I, N = Cycle.size();
				1106	for (I = 0; I < N; ++I) {
				1107	Value *V = Cycle[I];
				1108	if (DivI == V)
				1109	IsE = false;
				1110	else if (!isa<PHINode>(V))
				1111	continue;
				1112	// Stop if found either.
				1113	break;
				1114	}
				1115	// "I" is the index of either DivI or the phi node, whichever was first.
				1116	// "E" is "false" or "true" respectively.
				1117	ValueSeq &First = !IsE ? Early : Late;
				1118	for (unsigned J = 0; J < I; ++J)
				1119	First.insert(Cycle[J]);
				1120
				1121	ValueSeq &Second = IsE ? Early : Late;
				1122	Second.insert(Cycle[I]);
				1123	for (++I; I < N; ++I) {
				1124	Value *V = Cycle[I];
				1125	if (DivI == V \|\| isa<PHINode>(V))
				1126	break;
				1127	Second.insert(V);
				1128	}
				1129
				1130	for (; I < N; ++I)
				1131	First.insert(Cycle[I]);
				1132	}
				1133
				1134
				1135	bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
				1136	ValueSeq &Early, ValueSeq &Late) {
				1137	// Select is an exception, since the condition value does not have to be
				1138	// classified in the same way as the true/false values. The true/false
				1139	// values do have to be both early or both late.
				1140	if (UseI->getOpcode() == Instruction::Select) {
				1141	Value TV = UseI->getOperand(1), FV = UseI->getOperand(2);
				1142	if (Early.count(TV) \|\| Early.count(FV)) {
				1143	if (Late.count(TV) \|\| Late.count(FV))
				1144	return false;
				1145	Early.insert(UseI);
				1146	} else if (Late.count(TV) \|\| Late.count(FV)) {
				1147	if (Early.count(TV) \|\| Early.count(FV))
				1148	return false;
				1149	Late.insert(UseI);
				1150	}
				1151	return true;
				1152	}
				1153
				1154	// Not sure what would be the example of this, but the code below relies
				1155	// on having at least one operand.
				1156	if (UseI->getNumOperands() == 0)
				1157	return true;
				1158
				1159	bool AE = true, AL = true;
				1160	for (auto &I : UseI->operands()) {
				1161	if (Early.count(&*I))
				1162	AL = false;
				1163	else if (Late.count(&*I))
				1164	AE = false;
				1165	}
				1166	// If the operands appear "all early" and "all late" at the same time,
				1167	// then it means that none of them are actually classified as either.
				1168	// This is harmless.
				1169	if (AE && AL)
				1170	return true;
				1171	// Conversely, if they are neither "all early" nor "all late", then
				1172	// we have a mixture of early and late operands that is not a known
				1173	// exception.
				1174	if (!AE && !AL)
				1175	return false;
				1176
				1177	// Check that we have covered the two special cases.
				1178	assert(AE != AL);
				1179
				1180	if (AE)
				1181	Early.insert(UseI);
				1182	else
				1183	Late.insert(UseI);
				1184	return true;
				1185	}
				1186
				1187
				1188	bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) {
				1189	switch (I->getOpcode()) {
				1190	case Instruction::And:
				1191	case Instruction::Or:
				1192	case Instruction::Xor:
				1193	case Instruction::LShr:
				1194	case Instruction::Shl:
				1195	case Instruction::Select:
				1196	case Instruction::ICmp:
				1197	case Instruction::PHI:
				1198	break;
				1199	default:
				1200	return false;
				1201	}
				1202	return true;
				1203	}
				1204
				1205
				1206	bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
				1207	unsigned IterCount) {
				1208	auto *T = dyn_cast<IntegerType>(V->getType());
				1209	if (!T)
				1210	return false;
				1211
Craig Topper	b45eabc	2017-04-26 16:39:58 +0000	[diff] [blame]	1212	KnownBits Known(T->getBitWidth());
				1213	computeKnownBits(V, Known, DL);
Craig Topper	8df66c6	2017-05-12 17:20:30 +0000	[diff] [blame]	1214	return Known.countMinLeadingZeros() >= IterCount;
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1215	}
				1216
				1217
				1218	bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
				1219	unsigned IterCount) {
				1220	// Assume that all inputs to the value have the high bits zero.
				1221	// Check if the value itself preserves the zeros in the high bits.
				1222	if (auto *C = dyn_cast<ConstantInt>(V))
				1223	return C->getValue().countLeadingZeros() >= IterCount;
				1224
				1225	if (auto *I = dyn_cast<Instruction>(V)) {
				1226	switch (I->getOpcode()) {
				1227	case Instruction::And:
				1228	case Instruction::Or:
				1229	case Instruction::Xor:
				1230	case Instruction::LShr:
				1231	case Instruction::Select:
				1232	case Instruction::ICmp:
				1233	case Instruction::PHI:
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1234	case Instruction::ZExt:
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1235	return true;
				1236	}
				1237	}
				1238
				1239	return false;
				1240	}
				1241
				1242
				1243	bool PolynomialMultiplyRecognize::isOperandShifted(Instruction I, Value Op) {
				1244	unsigned Opc = I->getOpcode();
				1245	if (Opc == Instruction::Shl \|\| Opc == Instruction::LShr)
				1246	return Op != I->getOperand(1);
				1247	return true;
				1248	}
				1249
				1250
				1251	bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
				1252	BasicBlock *ExitB, unsigned IterCount) {
				1253	Value *CIV = getCountIV(LoopB);
				1254	if (CIV == nullptr)
				1255	return false;
				1256	auto *CIVTy = dyn_cast<IntegerType>(CIV->getType());
				1257	if (CIVTy == nullptr)
				1258	return false;
				1259
				1260	ValueSeq RShifts;
				1261	ValueSeq Early, Late, Cycled;
				1262
				1263	// Find all value cycles that contain logical right shifts by 1.
				1264	for (Instruction &I : *LoopB) {
				1265	using namespace PatternMatch;
				1266	Value *V = nullptr;
				1267	if (!match(&I, m_LShr(m_Value(V), m_One())))
				1268	continue;
				1269	ValueSeq C;
				1270	if (!findCycle(&I, V, C))
				1271	continue;
				1272
				1273	// Found a cycle.
				1274	C.insert(&I);
				1275	classifyCycle(&I, C, Early, Late);
				1276	Cycled.insert(C.begin(), C.end());
				1277	RShifts.insert(&I);
				1278	}
				1279
				1280	// Find the set of all values affected by the shift cycles, i.e. all
				1281	// cycled values, and (recursively) all their users.
				1282	ValueSeq Users(Cycled.begin(), Cycled.end());
				1283	for (unsigned i = 0; i < Users.size(); ++i) {
				1284	Value *V = Users[i];
				1285	if (!isa<IntegerType>(V->getType()))
				1286	return false;
				1287	auto *R = cast<Instruction>(V);
				1288	// If the instruction does not commute with shifts, the loop cannot
				1289	// be unshifted.
				1290	if (!commutesWithShift(R))
				1291	return false;
				1292	for (auto I = R->user_begin(), E = R->user_end(); I != E; ++I) {
				1293	auto T = cast<Instruction>(I);
				1294	// Skip users from outside of the loop. They will be handled later.
				1295	// Also, skip the right-shifts and phi nodes, since they mix early
				1296	// and late values.
				1297	if (T->getParent() != LoopB \|\| RShifts.count(T) \|\| isa<PHINode>(T))
				1298	continue;
				1299
				1300	Users.insert(T);
				1301	if (!classifyInst(T, Early, Late))
				1302	return false;
				1303	}
				1304	}
				1305
				1306	if (Users.size() == 0)
				1307	return false;
				1308
				1309	// Verify that high bits remain zero.
				1310	ValueSeq Internal(Users.begin(), Users.end());
				1311	ValueSeq Inputs;
				1312	for (unsigned i = 0; i < Internal.size(); ++i) {
				1313	auto *R = dyn_cast<Instruction>(Internal[i]);
				1314	if (!R)
				1315	continue;
				1316	for (Value *Op : R->operands()) {
				1317	auto *T = dyn_cast<Instruction>(Op);
				1318	if (T && T->getParent() != LoopB)
				1319	Inputs.insert(Op);
				1320	else
				1321	Internal.insert(Op);
				1322	}
				1323	}
				1324	for (Value *V : Inputs)
				1325	if (!highBitsAreZero(V, IterCount))
				1326	return false;
				1327	for (Value *V : Internal)
				1328	if (!keepsHighBitsZero(V, IterCount))
				1329	return false;
				1330
				1331	// Finally, the work can be done. Unshift each user.
				1332	IRBuilder<> IRB(LoopB);
				1333	std::map<Value,Value> ShiftMap;
				1334	typedef std::map<std::pair<Value,Type>,Value*> CastMapType;
				1335	CastMapType CastMap;
				1336
				1337	auto upcast = [] (CastMapType &CM, IRBuilder<> &IRB, Value *V,
				1338	IntegerType Ty) -> Value {
				1339	auto H = CM.find(std::make_pair(V, Ty));
				1340	if (H != CM.end())
				1341	return H->second;
				1342	Value *CV = IRB.CreateIntCast(V, Ty, false);
				1343	CM.insert(std::make_pair(std::make_pair(V, Ty), CV));
				1344	return CV;
				1345	};
				1346
				1347	for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) {
				1348	if (isa<PHINode>(I) \|\| !Users.count(&*I))
				1349	continue;
				1350	using namespace PatternMatch;
				1351	// Match lshr x, 1.
				1352	Value *V = nullptr;
				1353	if (match(&*I, m_LShr(m_Value(V), m_One()))) {
				1354	replaceAllUsesOfWithIn(&*I, V, LoopB);
				1355	continue;
				1356	}
				1357	// For each non-cycled operand, replace it with the corresponding
				1358	// value shifted left.
				1359	for (auto &J : I->operands()) {
				1360	Value *Op = J.get();
				1361	if (!isOperandShifted(&*I, Op))
				1362	continue;
				1363	if (Users.count(Op))
				1364	continue;
				1365	// Skip shifting zeros.
				1366	if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
				1367	continue;
				1368	// Check if we have already generated a shift for this value.
				1369	auto F = ShiftMap.find(Op);
				1370	Value *W = (F != ShiftMap.end()) ? F->second : nullptr;
				1371	if (W == nullptr) {
				1372	IRB.SetInsertPoint(&*I);
				1373	// First, the shift amount will be CIV or CIV+1, depending on
				1374	// whether the value is early or late. Instead of creating CIV+1,
				1375	// do a single shift of the value.
				1376	Value ShAmt = CIV, ShVal = Op;
				1377	auto *VTy = cast<IntegerType>(ShVal->getType());
				1378	auto *ATy = cast<IntegerType>(ShAmt->getType());
				1379	if (Late.count(&*I))
				1380	ShVal = IRB.CreateShl(Op, ConstantInt::get(VTy, 1));
				1381	// Second, the types of the shifted value and the shift amount
				1382	// must match.
				1383	if (VTy != ATy) {
				1384	if (VTy->getBitWidth() < ATy->getBitWidth())
				1385	ShVal = upcast(CastMap, IRB, ShVal, ATy);
				1386	else
				1387	ShAmt = upcast(CastMap, IRB, ShAmt, VTy);
				1388	}
				1389	// Ready to generate the shift and memoize it.
				1390	W = IRB.CreateShl(ShVal, ShAmt);
				1391	ShiftMap.insert(std::make_pair(Op, W));
				1392	}
				1393	I->replaceUsesOfWith(Op, W);
				1394	}
				1395	}
				1396
				1397	// Update the users outside of the loop to account for having left
				1398	// shifts. They would normally be shifted right in the loop, so shift
				1399	// them right after the loop exit.
				1400	// Take advantage of the loop-closed SSA form, which has all the post-
				1401	// loop values in phi nodes.
				1402	IRB.SetInsertPoint(ExitB, ExitB->getFirstInsertionPt());
				1403	for (auto P = ExitB->begin(), Q = ExitB->end(); P != Q; ++P) {
				1404	if (!isa<PHINode>(P))
				1405	break;
				1406	auto *PN = cast<PHINode>(P);
				1407	Value *U = PN->getIncomingValueForBlock(LoopB);
				1408	if (!Users.count(U))
				1409	continue;
				1410	Value *S = IRB.CreateLShr(PN, ConstantInt::get(PN->getType(), IterCount));
				1411	PN->replaceAllUsesWith(S);
				1412	// The above RAUW will create
				1413	// S = lshr S, IterCount
				1414	// so we need to fix it back into
				1415	// S = lshr PN, IterCount
				1416	cast<User>(S)->replaceUsesOfWith(S, PN);
				1417	}
				1418
				1419	return true;
				1420	}
				1421
				1422
				1423	void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
				1424	for (auto &I : *LoopB)
Daniel Berlin	4d0fe64	2017-04-28 19:55:38 +0000	[diff] [blame]	1425	if (Value *SV = SimplifyInstruction(&I, {DL, &TLI, &DT}))
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1426	I.replaceAllUsesWith(SV);
				1427
				1428	for (auto I = LoopB->begin(), N = I; I != LoopB->end(); I = N) {
				1429	N = std::next(I);
				1430	RecursivelyDeleteTriviallyDeadInstructions(&*I, &TLI);
				1431	}
				1432	}
				1433
				1434
				1435	unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
				1436	// Arrays of coefficients of Q and the inverse, C.
				1437	// Q[i] = coefficient at x^i.
				1438	std::array<char,32> Q, C;
				1439
				1440	for (unsigned i = 0; i < 32; ++i) {
				1441	Q[i] = QP & 1;
				1442	QP >>= 1;
				1443	}
				1444	assert(Q[0] == 1);
				1445
				1446	// Find C, such that
				1447	// (Q[n]x^n + ... + Q[1]x + Q[0]) * (C[n]x^n + ... + C[1]x + C[0]) = 1
				1448	//
				1449	// For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
				1450	// operations * and + are & and ^ respectively.
				1451	//
				1452	// Find C[i] recursively, by comparing i-th coefficient in the product
				1453	// with 0 (or 1 for i=0).
				1454	//
				1455	// C[0] = 1, since C[0] = Q[0], and Q[0] = 1.
				1456	C[0] = 1;
				1457	for (unsigned i = 1; i < 32; ++i) {
				1458	// Solve for C[i] in:
				1459	// C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
				1460	// This is equivalent to
				1461	// C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
				1462	// which is
				1463	// C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
				1464	unsigned T = 0;
				1465	for (unsigned j = 0; j < i; ++j)
				1466	T = T ^ (C[j] & Q[i-j]);
				1467	C[i] = T;
				1468	}
				1469
				1470	unsigned QV = 0;
				1471	for (unsigned i = 0; i < 32; ++i)
				1472	if (C[i])
				1473	QV \|= (1 << i);
				1474
				1475	return QV;
				1476	}
				1477
				1478
				1479	Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
				1480	ParsedValues &PV) {
				1481	IRBuilder<> B(&*At);
				1482	Module *M = At->getParent()->getParent()->getParent();
				1483	Value *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
				1484
				1485	Value P = PV.P, Q = PV.Q, *P0 = P;
				1486	unsigned IC = PV.IterCount;
				1487
				1488	if (PV.M != nullptr)
				1489	P0 = P = B.CreateXor(P, PV.M);
				1490
				1491	// Create a bit mask to clear the high bits beyond IterCount.
				1492	auto *BMI = ConstantInt::get(P->getType(), APInt::getLowBitsSet(32, IC));
				1493
				1494	if (PV.IterCount != 32)
				1495	P = B.CreateAnd(P, BMI);
				1496
				1497	if (PV.Inv) {
				1498	auto *QI = dyn_cast<ConstantInt>(PV.Q);
				1499	assert(QI && QI->getBitWidth() <= 32);
				1500
				1501	// Again, clearing bits beyond IterCount.
				1502	unsigned M = (1 << PV.IterCount) - 1;
				1503	unsigned Tmp = (QI->getZExtValue() \| 1) & M;
				1504	unsigned QV = getInverseMxN(Tmp) & M;
				1505	auto *QVI = ConstantInt::get(QI->getType(), QV);
				1506	P = B.CreateCall(PMF, {P, QVI});
				1507	P = B.CreateTrunc(P, QI->getType());
				1508	if (IC != 32)
				1509	P = B.CreateAnd(P, BMI);
				1510	}
				1511
				1512	Value *R = B.CreateCall(PMF, {P, Q});
				1513
				1514	if (PV.M != nullptr)
				1515	R = B.CreateXor(R, B.CreateIntCast(P0, R->getType(), false));
				1516
				1517	return R;
				1518	}
				1519
				1520
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1521	void PolynomialMultiplyRecognize::setupSimplifier() {
				1522	Simp.addRule(
				1523	// Sink zext past bitwise operations.
				1524	[](Instruction I, LLVMContext &Ctx) -> Value {
				1525	if (I->getOpcode() != Instruction::ZExt)
				1526	return nullptr;
				1527	Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
				1528	if (!T)
				1529	return nullptr;
				1530	switch (T->getOpcode()) {
				1531	case Instruction::And:
				1532	case Instruction::Or:
				1533	case Instruction::Xor:
				1534	break;
				1535	default:
				1536	return nullptr;
				1537	}
				1538	IRBuilder<> B(Ctx);
				1539	return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
				1540	B.CreateZExt(T->getOperand(0), I->getType()),
				1541	B.CreateZExt(T->getOperand(1), I->getType()));
				1542	});
				1543	Simp.addRule(
				1544	// (xor (and x a) (and y a)) -> (and (xor x y) a)
				1545	[](Instruction I, LLVMContext &Ctx) -> Value {
				1546	if (I->getOpcode() != Instruction::Xor)
				1547	return nullptr;
				1548	Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
				1549	Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
				1550	if (!And0 \|\| !And1)
				1551	return nullptr;
				1552	if (And0->getOpcode() != Instruction::And \|\|
				1553	And1->getOpcode() != Instruction::And)
				1554	return nullptr;
				1555	if (And0->getOperand(1) != And1->getOperand(1))
				1556	return nullptr;
				1557	IRBuilder<> B(Ctx);
				1558	return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
				1559	And0->getOperand(1));
				1560	});
				1561	Simp.addRule(
				1562	// (Op (select c x y) z) -> (select c (Op x z) (Op y z))
				1563	// (Op x (select c y z)) -> (select c (Op x y) (Op x z))
				1564	[](Instruction I, LLVMContext &Ctx) -> Value {
				1565	BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
				1566	if (!BO)
				1567	return nullptr;
				1568	Instruction::BinaryOps Op = BO->getOpcode();
				1569	if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
				1570	IRBuilder<> B(Ctx);
				1571	Value X = Sel->getTrueValue(), Y = Sel->getFalseValue();
				1572	Value *Z = BO->getOperand(1);
				1573	return B.CreateSelect(Sel->getCondition(),
				1574	B.CreateBinOp(Op, X, Z),
				1575	B.CreateBinOp(Op, Y, Z));
				1576	}
				1577	if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
				1578	IRBuilder<> B(Ctx);
				1579	Value *X = BO->getOperand(0);
				1580	Value Y = Sel->getTrueValue(), Z = Sel->getFalseValue();
				1581	return B.CreateSelect(Sel->getCondition(),
				1582	B.CreateBinOp(Op, X, Y),
				1583	B.CreateBinOp(Op, X, Z));
				1584	}
				1585	return nullptr;
				1586	});
				1587	Simp.addRule(
				1588	// (select c (select c x y) z) -> (select c x z)
				1589	// (select c x (select c y z)) -> (select c x z)
				1590	[](Instruction I, LLVMContext &Ctx) -> Value {
				1591	SelectInst *Sel = dyn_cast<SelectInst>(I);
				1592	if (!Sel)
				1593	return nullptr;
				1594	IRBuilder<> B(Ctx);
				1595	Value *C = Sel->getCondition();
				1596	if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
				1597	if (Sel0->getCondition() == C)
				1598	return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
				1599	}
				1600	if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
				1601	if (Sel1->getCondition() == C)
				1602	return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
				1603	}
				1604	return nullptr;
				1605	});
				1606	Simp.addRule(
				1607	// (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
				1608	[](Instruction I, LLVMContext &Ctx) -> Value {
				1609	if (I->getOpcode() != Instruction::Or)
				1610	return nullptr;
				1611	Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
				1612	if (!LShr \|\| LShr->getOpcode() != Instruction::LShr)
				1613	return nullptr;
				1614	ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
				1615	if (!One \|\| One->getZExtValue() != 1)
				1616	return nullptr;
				1617	ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
				1618	if (!Msb \|\| Msb->getZExtValue() != Msb->getType()->getSignBit())
				1619	return nullptr;
				1620	return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
				1621	});
				1622	Simp.addRule(
				1623	// (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
				1624	[](Instruction I, LLVMContext &Ctx) -> Value {
				1625	if (I->getOpcode() != Instruction::LShr)
				1626	return nullptr;
				1627	BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
				1628	if (!BitOp)
				1629	return nullptr;
				1630	switch (BitOp->getOpcode()) {
				1631	case Instruction::And:
				1632	case Instruction::Or:
				1633	case Instruction::Xor:
				1634	break;
				1635	default:
				1636	return nullptr;
				1637	}
				1638	IRBuilder<> B(Ctx);
				1639	Value *S = I->getOperand(1);
				1640	return B.CreateBinOp(BitOp->getOpcode(),
				1641	B.CreateLShr(BitOp->getOperand(0), S),
				1642	B.CreateLShr(BitOp->getOperand(1), S));
				1643	});
				1644	Simp.addRule(
				1645	// (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
				1646	[](Instruction I, LLVMContext &Ctx) -> Value {
				1647	auto IsBitOp = [](unsigned Op) -> bool {
				1648	switch (Op) {
				1649	case Instruction::And:
				1650	case Instruction::Or:
				1651	case Instruction::Xor:
				1652	return true;
				1653	}
				1654	return false;
				1655	};
				1656	BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
				1657	if (!BitOp1 \|\| !IsBitOp(BitOp1->getOpcode()))
				1658	return nullptr;
				1659	BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
				1660	if (!BitOp2 \|\| !IsBitOp(BitOp2->getOpcode()))
				1661	return nullptr;
				1662	ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
				1663	ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
				1664	if (!CA \|\| !CB)
				1665	return nullptr;
				1666	IRBuilder<> B(Ctx);
				1667	Value *X = BitOp2->getOperand(0);
				1668	return B.CreateBinOp(BitOp2->getOpcode(), X,
				1669	B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
				1670	});
				1671	}
				1672
				1673
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1674	bool PolynomialMultiplyRecognize::recognize() {
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1675	DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
				1676	<< *CurLoop << '\n');
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1677	// Restrictions:
				1678	// - The loop must consist of a single block.
				1679	// - The iteration count must be known at compile-time.
				1680	// - The loop must have an induction variable starting from 0, and
				1681	// incremented in each iteration of the loop.
				1682	BasicBlock *LoopB = CurLoop->getHeader();
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1683	DEBUG(dbgs() << "Loop header:\n" << *LoopB);
				1684
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1685	if (LoopB != CurLoop->getLoopLatch())
				1686	return false;
				1687	BasicBlock *ExitB = CurLoop->getExitBlock();
				1688	if (ExitB == nullptr)
				1689	return false;
				1690	BasicBlock *EntryB = CurLoop->getLoopPreheader();
				1691	if (EntryB == nullptr)
				1692	return false;
				1693
				1694	unsigned IterCount = 0;
				1695	const SCEV *CT = SE.getBackedgeTakenCount(CurLoop);
				1696	if (isa<SCEVCouldNotCompute>(CT))
				1697	return false;
				1698	if (auto *CV = dyn_cast<SCEVConstant>(CT))
				1699	IterCount = CV->getValue()->getZExtValue() + 1;
				1700
				1701	Value *CIV = getCountIV(LoopB);
				1702	ParsedValues PV;
				1703	PV.IterCount = IterCount;
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1704	DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1705
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1706	setupSimplifier();
				1707
				1708	// Perform a preliminary scan of select instructions to see if any of them
				1709	// looks like a generator of the polynomial multiply steps. Assume that a
				1710	// loop can only contain a single transformable operation, so stop the
				1711	// traversal after the first reasonable candidate was found.
				1712	// XXX: Currently this approach can modify the loop before being 100% sure
				1713	// that the transformation can be carried out.
				1714	bool FoundPreScan = false;
				1715	for (Instruction &In : *LoopB) {
				1716	SelectInst *SI = dyn_cast<SelectInst>(&In);
				1717	if (!SI)
				1718	continue;
				1719
				1720	Simplifier::Context C(SI);
				1721	Value *T = Simp.simplify(C);
				1722	SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
				1723	DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
				1724	if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
				1725	FoundPreScan = true;
				1726	if (SelI != SI) {
				1727	Value *NewSel = C.materialize(LoopB, SI->getIterator());
				1728	SI->replaceAllUsesWith(NewSel);
				1729	RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI);
				1730	}
				1731	break;
				1732	}
				1733	}
				1734
				1735	if (!FoundPreScan) {
				1736	DEBUG(dbgs() << "Have not found candidates for pmpy\n");
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1737	return false;
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1738	}
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1739
				1740	if (!PV.Left) {
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1741	// The right shift version actually only returns the higher bits of
				1742	// the result (each iteration discards the LSB). If we want to convert it
				1743	// to a left-shifting loop, the working data type must be at least as
				1744	// wide as the target's pmpy instruction.
				1745	if (!promoteTypes(LoopB, ExitB))
				1746	return false;
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1747	convertShiftsToLeft(LoopB, ExitB, IterCount);
				1748	cleanupLoopBody(LoopB);
				1749	}
				1750
Krzysztof Parzyszek	d033d1f	2017-03-21 17:09:27 +0000	[diff] [blame]	1751	// Scan the loop again, find the generating select instruction.
				1752	bool FoundScan = false;
				1753	for (Instruction &In : *LoopB) {
				1754	SelectInst *SelI = dyn_cast<SelectInst>(&In);
				1755	if (!SelI)
				1756	continue;
				1757	DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
				1758	FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
				1759	if (FoundScan)
				1760	break;
				1761	}
				1762	assert(FoundScan);
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1763
				1764	DEBUG({
				1765	StringRef PP = (PV.M ? "(P+M)" : "P");
				1766	if (!PV.Inv)
				1767	dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
				1768	else
				1769	dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + "
				1770	<< PP << "\n";
				1771	dbgs() << " Res:" << PV.Res << "\n P:" << PV.P << "\n";
				1772	if (PV.M)
				1773	dbgs() << " M:" << *PV.M << "\n";
				1774	dbgs() << " Q:" << *PV.Q << "\n";
				1775	dbgs() << " Iteration count:" << PV.IterCount << "\n";
				1776	});
				1777
				1778	BasicBlock::iterator At(EntryB->getTerminator());
				1779	Value *PM = generate(At, PV);
				1780	if (PM == nullptr)
				1781	return false;
				1782
				1783	if (PM->getType() != PV.Res->getType())
				1784	PM = IRBuilder<>(&*At).CreateIntCast(PM, PV.Res->getType(), false);
				1785
				1786	PV.Res->replaceAllUsesWith(PM);
				1787	PV.Res->eraseFromParent();
				1788	return true;
				1789	}
				1790
				1791
				1792	unsigned HexagonLoopIdiomRecognize::getStoreSizeInBytes(StoreInst *SI) {
				1793	uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
				1794	assert(((SizeInBits & 7) \|\| (SizeInBits >> 32) == 0) &&
				1795	"Don't overflow unsigned.");
				1796	return (unsigned)SizeInBits >> 3;
				1797	}
				1798
				1799
				1800	int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
				1801	if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1)))
				1802	return SC->getAPInt().getSExtValue();
				1803	return 0;
				1804	}
				1805
				1806
				1807	bool HexagonLoopIdiomRecognize::isLegalStore(Loop CurLoop, StoreInst SI) {
Krzysztof Parzyszek	35ce5da	2017-01-27 20:40:14 +0000	[diff] [blame]	1808	// Allow volatile stores if HexagonVolatileMemcpy is enabled.
				1809	if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple())
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1810	return false;
				1811
				1812	Value *StoredVal = SI->getValueOperand();
				1813	Value *StorePtr = SI->getPointerOperand();
				1814
				1815	// Reject stores that are so large that they overflow an unsigned.
				1816	uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
				1817	if ((SizeInBits & 7) \|\| (SizeInBits >> 32) != 0)
				1818	return false;
				1819
				1820	// See if the pointer expression is an AddRec like {base,+,1} on the current
				1821	// loop, which indicates a strided store. If we have something else, it's a
				1822	// random store we can't handle.
				1823	auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
				1824	if (!StoreEv \|\| StoreEv->getLoop() != CurLoop \|\| !StoreEv->isAffine())
				1825	return false;
				1826
				1827	// Check to see if the stride matches the size of the store. If so, then we
				1828	// know that every byte is touched in the loop.
				1829	int Stride = getSCEVStride(StoreEv);
				1830	if (Stride == 0)
				1831	return false;
				1832	unsigned StoreSize = getStoreSizeInBytes(SI);
				1833	if (StoreSize != unsigned(std::abs(Stride)))
				1834	return false;
				1835
				1836	// The store must be feeding a non-volatile load.
				1837	LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
				1838	if (!LI \|\| !LI->isSimple())
				1839	return false;
				1840
				1841	// See if the pointer expression is an AddRec like {base,+,1} on the current
				1842	// loop, which indicates a strided load. If we have something else, it's a
				1843	// random load we can't handle.
				1844	Value *LoadPtr = LI->getPointerOperand();
				1845	auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
				1846	if (!LoadEv \|\| LoadEv->getLoop() != CurLoop \|\| !LoadEv->isAffine())
				1847	return false;
				1848
				1849	// The store and load must share the same stride.
				1850	if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
				1851	return false;
				1852
				1853	// Success. This store can be converted into a memcpy.
				1854	return true;
				1855	}
				1856
				1857
				1858	/// mayLoopAccessLocation - Return true if the specified loop might access the
				1859	/// specified pointer location, which is a loop-strided access. The 'Access'
				1860	/// argument specifies what the verboten forms of access are (read or write).
				1861	static bool
				1862	mayLoopAccessLocation(Value Ptr, ModRefInfo Access, Loop L,
				1863	const SCEV *BECount, unsigned StoreSize,
				1864	AliasAnalysis &AA,
				1865	SmallPtrSetImpl<Instruction *> &Ignored) {
				1866	// Get the location that may be stored across the loop. Since the access
				1867	// is strided positively through memory, we say that the modified location
				1868	// starts at the pointer and has infinite size.
				1869	uint64_t AccessSize = MemoryLocation::UnknownSize;
				1870
				1871	// If the loop iterates a fixed number of times, we can refine the access
				1872	// size to be exactly the size of the memset, which is (BECount+1)*StoreSize
				1873	if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
				1874	AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
				1875
				1876	// TODO: For this to be really effective, we have to dive into the pointer
				1877	// operand in the store. Store to &A[i] of 100 will always return may alias
				1878	// with store of &A[100], we need to StoreLoc to be "A" with size of 100,
				1879	// which will then no-alias a store to &A[100].
				1880	MemoryLocation StoreLoc(Ptr, AccessSize);
				1881
				1882	for (auto *B : L->blocks())
				1883	for (auto &I : *B)
				1884	if (Ignored.count(&I) == 0 && (AA.getModRefInfo(&I, StoreLoc) & Access))
				1885	return true;
				1886
				1887	return false;
				1888	}
				1889
				1890
				1891	void HexagonLoopIdiomRecognize::collectStores(Loop CurLoop, BasicBlock BB,
				1892	SmallVectorImpl<StoreInst*> &Stores) {
				1893	Stores.clear();
				1894	for (Instruction &I : *BB)
				1895	if (StoreInst *SI = dyn_cast<StoreInst>(&I))
				1896	if (isLegalStore(CurLoop, SI))
				1897	Stores.push_back(SI);
				1898	}
				1899
				1900
				1901	bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
				1902	StoreInst SI, const SCEV BECount) {
Michael Kuperstein	e18aad3	2017-01-31 22:48:45 +0000	[diff] [blame]	1903	assert((SI->isSimple() \|\| (SI->isVolatile() && HexagonVolatileMemcpy)) &&
				1904	"Expected only non-volatile stores, or Hexagon-specific memcpy"
				1905	"to volatile destination.");
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	1906
				1907	Value *StorePtr = SI->getPointerOperand();
				1908	auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
				1909	unsigned Stride = getSCEVStride(StoreEv);
				1910	unsigned StoreSize = getStoreSizeInBytes(SI);
				1911	if (Stride != StoreSize)
				1912	return false;
				1913
				1914	// See if the pointer expression is an AddRec like {base,+,1} on the current
				1915	// loop, which indicates a strided load. If we have something else, it's a
				1916	// random load we can't handle.
				1917	LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
				1918	auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
				1919
				1920	// The trip count of the loop and the base pointer of the addrec SCEV is
				1921	// guaranteed to be loop invariant, which means that it should dominate the
				1922	// header. This allows us to insert code for it in the preheader.
				1923	BasicBlock *Preheader = CurLoop->getLoopPreheader();
				1924	Instruction *ExpPt = Preheader->getTerminator();
				1925	IRBuilder<> Builder(ExpPt);
				1926	SCEVExpander Expander(SE, DL, "hexagon-loop-idiom");
				1927
				1928	Type IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace());
				1929
				1930	// Okay, we have a strided store "p[i]" of a loaded value. We can turn
				1931	// this into a memcpy/memmove in the loop preheader now if we want. However,
				1932	// this would be unsafe to do if there is anything else in the loop that may
				1933	// read or write the memory region we're storing to. For memcpy, this
				1934	// includes the load that feeds the stores. Check for an alias by generating
				1935	// the base address and checking everything.
				1936	Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(),
				1937	Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt);
				1938	Value *LoadBasePtr = nullptr;
				1939
				1940	bool Overlap = false;
				1941	bool DestVolatile = SI->isVolatile();
				1942	Type *BECountTy = BECount->getType();
				1943
				1944	if (DestVolatile) {
				1945	// The trip count must fit in i32, since it is the type of the "num_words"
				1946	// argument to hexagon_memcpy_forward_vp4cp4n2.
				1947	if (StoreSize != 4 \|\| DL->getTypeSizeInBits(BECountTy) > 32) {
				1948	CleanupAndExit:
				1949	// If we generated new code for the base pointer, clean up.
				1950	Expander.clear();
				1951	if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
				1952	RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
				1953	StoreBasePtr = nullptr;
				1954	}
				1955	if (LoadBasePtr) {
				1956	RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
				1957	LoadBasePtr = nullptr;
				1958	}
				1959	return false;
				1960	}
				1961	}
				1962
				1963	SmallPtrSet<Instruction*, 2> Ignore1;
				1964	Ignore1.insert(SI);
				1965	if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
				1966	StoreSize, *AA, Ignore1)) {
				1967	// Check if the load is the offending instruction.
				1968	Ignore1.insert(LI);
				1969	if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
				1970	StoreSize, *AA, Ignore1)) {
				1971	// Still bad. Nothing we can do.
				1972	goto CleanupAndExit;
				1973	}
				1974	// It worked with the load ignored.
				1975	Overlap = true;
				1976	}
				1977
				1978	if (!Overlap) {
				1979	if (DisableMemcpyIdiom \|\| !HasMemcpy)
				1980	goto CleanupAndExit;
				1981	} else {
				1982	// Don't generate memmove if this function will be inlined. This is
				1983	// because the caller will undergo this transformation after inlining.
				1984	Function *Func = CurLoop->getHeader()->getParent();
				1985	if (Func->hasFnAttribute(Attribute::AlwaysInline))
				1986	goto CleanupAndExit;
				1987
				1988	// In case of a memmove, the call to memmove will be executed instead
				1989	// of the loop, so we need to make sure that there is nothing else in
				1990	// the loop than the load, store and instructions that these two depend
				1991	// on.
				1992	SmallVector<Instruction*,2> Insts;
				1993	Insts.push_back(SI);
				1994	Insts.push_back(LI);
				1995	if (!coverLoop(CurLoop, Insts))
				1996	goto CleanupAndExit;
				1997
				1998	if (DisableMemmoveIdiom \|\| !HasMemmove)
				1999	goto CleanupAndExit;
				2000	bool IsNested = CurLoop->getParentLoop() != 0;
				2001	if (IsNested && OnlyNonNestedMemmove)
				2002	goto CleanupAndExit;
				2003	}
				2004
				2005	// For a memcpy, we have to make sure that the input array is not being
				2006	// mutated by the loop.
				2007	LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(),
				2008	Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt);
				2009
				2010	SmallPtrSet<Instruction*, 2> Ignore2;
				2011	Ignore2.insert(SI);
				2012	if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
				2013	*AA, Ignore2))
				2014	goto CleanupAndExit;
				2015
				2016	// Check the stride.
				2017	bool StridePos = getSCEVStride(LoadEv) >= 0;
				2018
				2019	// Currently, the volatile memcpy only emulates traversing memory forward.
				2020	if (!StridePos && DestVolatile)
				2021	goto CleanupAndExit;
				2022
				2023	bool RuntimeCheck = (Overlap \|\| DestVolatile);
				2024
				2025	BasicBlock *ExitB;
				2026	if (RuntimeCheck) {
				2027	// The runtime check needs a single exit block.
				2028	SmallVector<BasicBlock*, 8> ExitBlocks;
				2029	CurLoop->getUniqueExitBlocks(ExitBlocks);
				2030	if (ExitBlocks.size() != 1)
				2031	goto CleanupAndExit;
				2032	ExitB = ExitBlocks[0];
				2033	}
				2034
				2035	// The # stored bytes is (BECount+1)*Size. Expand the trip count out to
				2036	// pointer size if it isn't already.
				2037	LLVMContext &Ctx = SI->getContext();
				2038	BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
				2039	unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment());
				2040	DebugLoc DLoc = SI->getDebugLoc();
				2041
				2042	const SCEV *NumBytesS =
				2043	SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
				2044	if (StoreSize != 1)
				2045	NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
				2046	SCEV::FlagNUW);
				2047	Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
				2048	if (Instruction *In = dyn_cast<Instruction>(NumBytes))
Daniel Berlin	4d0fe64	2017-04-28 19:55:38 +0000	[diff] [blame]	2049	if (Value Simp = SimplifyInstruction(In, {DL, TLI, DT}))
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	2050	NumBytes = Simp;
				2051
				2052	CallInst *NewCall;
				2053
				2054	if (RuntimeCheck) {
				2055	unsigned Threshold = RuntimeMemSizeThreshold;
				2056	if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
				2057	uint64_t C = CI->getZExtValue();
				2058	if (Threshold != 0 && C < Threshold)
				2059	goto CleanupAndExit;
				2060	if (C < CompileTimeMemSizeThreshold)
				2061	goto CleanupAndExit;
				2062	}
				2063
				2064	BasicBlock *Header = CurLoop->getHeader();
				2065	Function *Func = Header->getParent();
				2066	Loop *ParentL = LF->getLoopFor(Preheader);
				2067	StringRef HeaderName = Header->getName();
				2068
				2069	// Create a new (empty) preheader, and update the PHI nodes in the
				2070	// header to use the new preheader.
				2071	BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
				2072	Func, Header);
				2073	if (ParentL)
				2074	ParentL->addBasicBlockToLoop(NewPreheader, *LF);
				2075	IRBuilder<>(NewPreheader).CreateBr(Header);
				2076	for (auto &In : *Header) {
				2077	PHINode *PN = dyn_cast<PHINode>(&In);
				2078	if (!PN)
				2079	break;
				2080	int bx = PN->getBasicBlockIndex(Preheader);
				2081	if (bx >= 0)
				2082	PN->setIncomingBlock(bx, NewPreheader);
				2083	}
				2084	DT->addNewBlock(NewPreheader, Preheader);
				2085	DT->changeImmediateDominator(Header, NewPreheader);
				2086
				2087	// Check for safe conditions to execute memmove.
				2088	// If stride is positive, copying things from higher to lower addresses
				2089	// is equivalent to memmove. For negative stride, it's the other way
				2090	// around. Copying forward in memory with positive stride may not be
				2091	// same as memmove since we may be copying values that we just stored
				2092	// in some previous iteration.
				2093	Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
				2094	Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
				2095	Value *LowA = StridePos ? SA : LA;
				2096	Value *HighA = StridePos ? LA : SA;
				2097	Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
				2098	Value *Cond = CmpA;
				2099
				2100	// Check for distance between pointers.
				2101	Value *Dist = Builder.CreateSub(HighA, LowA);
				2102	Value *CmpD = Builder.CreateICmpSLT(NumBytes, Dist);
				2103	Value *CmpEither = Builder.CreateOr(Cond, CmpD);
				2104	Cond = CmpEither;
				2105
				2106	if (Threshold != 0) {
				2107	Type *Ty = NumBytes->getType();
				2108	Value *Thr = ConstantInt::get(Ty, Threshold);
				2109	Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
				2110	Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
				2111	Cond = CmpBoth;
				2112	}
				2113	BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli",
				2114	Func, NewPreheader);
				2115	if (ParentL)
				2116	ParentL->addBasicBlockToLoop(MemmoveB, *LF);
				2117	Instruction *OldT = Preheader->getTerminator();
				2118	Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
				2119	OldT->eraseFromParent();
				2120	Preheader->setName(Preheader->getName()+".old");
				2121	DT->addNewBlock(MemmoveB, Preheader);
				2122	// Find the new immediate dominator of the exit block.
				2123	BasicBlock *ExitD = Preheader;
				2124	for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE; ++PI) {
				2125	BasicBlock PB = PI;
				2126	ExitD = DT->findNearestCommonDominator(ExitD, PB);
				2127	if (!ExitD)
				2128	break;
				2129	}
				2130	// If the prior immediate dominator of ExitB was dominated by the
				2131	// old preheader, then the old preheader becomes the new immediate
				2132	// dominator. Otherwise don't change anything (because the newly
				2133	// added blocks are dominated by the old preheader).
				2134	if (ExitD && DT->dominates(Preheader, ExitD)) {
				2135	DomTreeNode *BN = DT->getNode(ExitB);
				2136	DomTreeNode *DN = DT->getNode(ExitD);
				2137	BN->setIDom(DN);
				2138	}
				2139
				2140	// Add a call to memmove to the conditional block.
				2141	IRBuilder<> CondBuilder(MemmoveB);
				2142	CondBuilder.CreateBr(ExitB);
				2143	CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
				2144
				2145	if (DestVolatile) {
				2146	Type *Int32Ty = Type::getInt32Ty(Ctx);
				2147	Type *Int32PtrTy = Type::getInt32PtrTy(Ctx);
				2148	Type *VoidTy = Type::getVoidTy(Ctx);
				2149	Module *M = Func->getParent();
				2150	Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy,
Serge Guelton	59a2d7b	2017-04-11 15:01:18 +0000	[diff] [blame]	2151	Int32PtrTy, Int32PtrTy, Int32Ty);
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	2152	Function *Fn = cast<Function>(CF);
				2153	Fn->setLinkage(Function::ExternalLinkage);
				2154
				2155	const SCEV *OneS = SE->getConstant(Int32Ty, 1);
				2156	const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
				2157	const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW);
				2158	Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty,
				2159	MemmoveB->getTerminator());
				2160	if (Instruction *In = dyn_cast<Instruction>(NumWords))
Daniel Berlin	4d0fe64	2017-04-28 19:55:38 +0000	[diff] [blame]	2161	if (Value Simp = SimplifyInstruction(In, {DL, TLI, DT}))
Krzysztof Parzyszek	c8b9438	2017-01-26 21:41:10 +0000	[diff] [blame]	2162	NumWords = Simp;
				2163
				2164	Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy)
				2165	? StoreBasePtr
				2166	: CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy);
				2167	Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy)
				2168	? LoadBasePtr
				2169	: CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
				2170	NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
				2171	} else {
				2172	NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr,
				2173	NumBytes, Alignment);
				2174	}
				2175	} else {
				2176	NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr,
				2177	NumBytes, Alignment);
				2178	// Okay, the memcpy has been formed. Zap the original store and
				2179	// anything that feeds into it.
				2180	RecursivelyDeleteTriviallyDeadInstructions(SI, TLI);
				2181	}
				2182
				2183	NewCall->setDebugLoc(DLoc);
				2184
				2185	DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
				2186	<< *NewCall << "\n"
				2187	<< " from load ptr=" << LoadEv << " at: " << LI << "\n"
				2188	<< " from store ptr=" << StoreEv << " at: " << SI << "\n");
				2189
				2190	return true;
				2191	}
				2192
				2193
				2194	// \brief Check if the instructions in Insts, together with their dependencies
				2195	// cover the loop in the sense that the loop could be safely eliminated once
				2196	// the instructions in Insts are removed.
				2197	bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
				2198	SmallVectorImpl<Instruction*> &Insts) const {
				2199	SmallSet<BasicBlock*,8> LoopBlocks;
				2200	for (auto *B : L->blocks())
				2201	LoopBlocks.insert(B);
				2202
				2203	SetVector<Instruction*> Worklist(Insts.begin(), Insts.end());
				2204
				2205	// Collect all instructions from the loop that the instructions in Insts
				2206	// depend on (plus their dependencies, etc.). These instructions will
				2207	// constitute the expression trees that feed those in Insts, but the trees
				2208	// will be limited only to instructions contained in the loop.
				2209	for (unsigned i = 0; i < Worklist.size(); ++i) {
				2210	Instruction *In = Worklist[i];
				2211	for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
				2212	Instruction *OpI = dyn_cast<Instruction>(I);
				2213	if (!OpI)
				2214	continue;
				2215	BasicBlock *PB = OpI->getParent();
				2216	if (!LoopBlocks.count(PB))
				2217	continue;
				2218	Worklist.insert(OpI);
				2219	}
				2220	}
				2221
				2222	// Scan all instructions in the loop, if any of them have a user outside
				2223	// of the loop, or outside of the expressions collected above, then either
				2224	// the loop has a side-effect visible outside of it, or there are
				2225	// instructions in it that are not involved in the original set Insts.
				2226	for (auto *B : L->blocks()) {
				2227	for (auto &In : *B) {
				2228	if (isa<BranchInst>(In) \|\| isa<DbgInfoIntrinsic>(In))
				2229	continue;
				2230	if (!Worklist.count(&In) && In.mayHaveSideEffects())
				2231	return false;
				2232	for (const auto &K : In.users()) {
				2233	Instruction *UseI = dyn_cast<Instruction>(K);
				2234	if (!UseI)
				2235	continue;
				2236	BasicBlock *UseB = UseI->getParent();
				2237	if (LF->getLoopFor(UseB) != L)
				2238	return false;
				2239	}
				2240	}
				2241	}
				2242
				2243	return true;
				2244	}
				2245
				2246	/// runOnLoopBlock - Process the specified block, which lives in a counted loop
				2247	/// with the specified backedge count. This block is known to be in the current
				2248	/// loop and not in any subloops.
				2249	bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop CurLoop, BasicBlock BB,
				2250	const SCEV BECount, SmallVectorImpl<BasicBlock> &ExitBlocks) {
				2251	// We can only promote stores in this block if they are unconditionally
				2252	// executed in the loop. For a block to be unconditionally executed, it has
				2253	// to dominate all the exit blocks of the loop. Verify this now.
				2254	auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
				2255	return DT->dominates(BB, EB);
				2256	};
				2257	if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB))
				2258	return false;
				2259
				2260	bool MadeChange = false;
				2261	// Look for store instructions, which may be optimized to memset/memcpy.
				2262	SmallVector<StoreInst*,8> Stores;
				2263	collectStores(CurLoop, BB, Stores);
				2264
				2265	// Optimize the store into a memcpy, if it feeds an similarly strided load.
				2266	for (auto &SI : Stores)
				2267	MadeChange \|= processCopyingStore(CurLoop, SI, BECount);
				2268
				2269	return MadeChange;
				2270	}
				2271
				2272
				2273	bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
				2274	PolynomialMultiplyRecognize PMR(L, DL, DT, TLI, SE);
				2275	if (PMR.recognize())
				2276	return true;
				2277
				2278	if (!HasMemcpy && !HasMemmove)
				2279	return false;
				2280
				2281	const SCEV *BECount = SE->getBackedgeTakenCount(L);
				2282	assert(!isa<SCEVCouldNotCompute>(BECount) &&
				2283	"runOnCountableLoop() called on a loop without a predictable"
				2284	"backedge-taken count");
				2285
				2286	SmallVector<BasicBlock *, 8> ExitBlocks;
				2287	L->getUniqueExitBlocks(ExitBlocks);
				2288
				2289	bool Changed = false;
				2290
				2291	// Scan all the blocks in the loop that are not in subloops.
				2292	for (auto *BB : L->getBlocks()) {
				2293	// Ignore blocks in subloops.
				2294	if (LF->getLoopFor(BB) != L)
				2295	continue;
				2296	Changed \|= runOnLoopBlock(L, BB, BECount, ExitBlocks);
				2297	}
				2298
				2299	return Changed;
				2300	}
				2301
				2302
				2303	bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
				2304	const Module &M = *L->getHeader()->getParent()->getParent();
				2305	if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon)
				2306	return false;
				2307
				2308	if (skipLoop(L))
				2309	return false;
				2310
				2311	// If the loop could not be converted to canonical form, it must have an
				2312	// indirectbr in it, just give up.
				2313	if (!L->getLoopPreheader())
				2314	return false;
				2315
				2316	// Disable loop idiom recognition if the function's name is a common idiom.
				2317	StringRef Name = L->getHeader()->getParent()->getName();
				2318	if (Name == "memset" \|\| Name == "memcpy" \|\| Name == "memmove")
				2319	return false;
				2320
				2321	AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
				2322	DL = &L->getHeader()->getModule()->getDataLayout();
				2323	DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
				2324	LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
				2325	TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
				2326	SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
				2327
				2328	HasMemcpy = TLI->has(LibFunc_memcpy);
				2329	HasMemmove = TLI->has(LibFunc_memmove);
				2330
				2331	if (SE->hasLoopInvariantBackedgeTakenCount(L))
				2332	return runOnCountableLoop(L);
				2333	return false;
				2334	}
				2335
				2336
				2337	Pass *llvm::createHexagonLoopIdiomPass() {
				2338	return new HexagonLoopIdiomRecognize();
				2339	}
				2340