Blame - llvm/lib/Target/ARM/ARMParallelDSP.cpp - toolchain/llvm-project

blob: 890ed2b06126e2b979dce0143133e02b7917e3fa [file] [log] [blame]

Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	1	//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
				2	//
Chandler Carruth	2946cd7	2019-01-19 08:50:56 +0000	[diff] [blame]	3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	// See https://llvm.org/LICENSE.txt for license information.
				5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	6	//
				7	//===----------------------------------------------------------------------===//
				8	//
				9	/// \file
				10	/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
				11	/// purpose of this pass is to do some IR pattern matching to create ACLE
				12	/// DSP intrinsics, which map on these 32-bit SIMD operations.
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame]	13	/// This pass runs only when unaligned accesses are supported/enabled.
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	14	//
				15	//===----------------------------------------------------------------------===//
				16
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	17	#include "llvm/ADT/Statistic.h"
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	18	#include "llvm/ADT/SmallPtrSet.h"
				19	#include "llvm/Analysis/AliasAnalysis.h"
				20	#include "llvm/Analysis/LoopAccessAnalysis.h"
				21	#include "llvm/Analysis/LoopPass.h"
				22	#include "llvm/Analysis/LoopInfo.h"
				23	#include "llvm/IR/Instructions.h"
				24	#include "llvm/IR/NoFolder.h"
				25	#include "llvm/Transforms/Scalar.h"
				26	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
				27	#include "llvm/Transforms/Utils/LoopUtils.h"
				28	#include "llvm/Pass.h"
				29	#include "llvm/PassRegistry.h"
				30	#include "llvm/PassSupport.h"
				31	#include "llvm/Support/Debug.h"
				32	#include "llvm/IR/PatternMatch.h"
				33	#include "llvm/CodeGen/TargetPassConfig.h"
				34	#include "ARM.h"
				35	#include "ARMSubtarget.h"
				36
				37	using namespace llvm;
				38	using namespace PatternMatch;
				39
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	40	#define DEBUG_TYPE "arm-parallel-dsp"
				41
				42	STATISTIC(NumSMLAD , "Number of smlad instructions generated");
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	43
Sjoerd Meijer	3c859b3	2018-08-14 07:43:49 +0000	[diff] [blame]	44	static cl::opt<bool>
				45	DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
				46	cl::desc("Disable the ARM Parallel DSP pass"));
				47
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	48	namespace {
Sam Parker	89a3799	2018-07-23 15:25:59 +0000	[diff] [blame]	49	struct OpChain;
				50	struct BinOpChain;
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	51	class Reduction;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	52
Fangrui Song	58407ca	2018-07-23 17:43:21 +0000	[diff] [blame]	53	using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	54	using ReductionList = SmallVector<Reduction, 8>;
				55	using ValueList = SmallVector<Value*, 8>;
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	56	using MemInstList = SmallVector<LoadInst*, 8>;
Sam Parker	89a3799	2018-07-23 15:25:59 +0000	[diff] [blame]	57	using PMACPair = std::pair<BinOpChain,BinOpChain>;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	58	using PMACPairList = SmallVector<PMACPair, 8>;
				59	using Instructions = SmallVector<Instruction*,16>;
				60	using MemLocList = SmallVector<MemoryLocation, 4>;
				61
Sam Parker	89a3799	2018-07-23 15:25:59 +0000	[diff] [blame]	62	struct OpChain {
				63	Instruction *Root;
				64	ValueList AllValues;
Eli Friedman	b09c778	2018-10-18 19:34:30 +0000	[diff] [blame]	65	MemInstList VecLd; // List of all load instructions.
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	66	MemInstList Loads;
Sam Parker	89a3799	2018-07-23 15:25:59 +0000	[diff] [blame]	67	bool ReadOnly = true;
				68
				69	OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
Jordan Rupprecht	e5daf61	2018-07-23 17:38:05 +0000	[diff] [blame]	70	virtual ~OpChain() = default;
Sam Parker	89a3799	2018-07-23 15:25:59 +0000	[diff] [blame]	71
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	72	void PopulateLoads() {
Sam Parker	89a3799	2018-07-23 15:25:59 +0000	[diff] [blame]	73	for (auto *V : AllValues) {
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	74	if (auto *Ld = dyn_cast<LoadInst>(V))
				75	Loads.push_back(Ld);
Sam Parker	89a3799	2018-07-23 15:25:59 +0000	[diff] [blame]	76	}
				77	}
				78
				79	unsigned size() const { return AllValues.size(); }
				80	};
				81
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	82	// 'BinOpChain' holds the multiplication instructions that are candidates
				83	// for parallel execution.
Sam Parker	89a3799	2018-07-23 15:25:59 +0000	[diff] [blame]	84	struct BinOpChain : public OpChain {
				85	ValueList LHS; // List of all (narrow) left hand operands.
				86	ValueList RHS; // List of all (narrow) right hand operands.
Sam Parker	a023c7a	2018-09-12 09:17:44 +0000	[diff] [blame]	87	bool Exchange = false;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	88
Sam Parker	89a3799	2018-07-23 15:25:59 +0000	[diff] [blame]	89	BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
				90	OpChain(I, lhs), LHS(lhs), RHS(rhs) {
				91	for (auto *V : RHS)
				92	AllValues.push_back(V);
				93	}
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	94
				95	bool AreSymmetrical(BinOpChain *Other);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	96	};
				97
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	98	/// Represent a sequence of multiply-accumulate operations with the aim to
				99	/// perform the multiplications in parallel.
				100	class Reduction {
				101	Instruction *Root = nullptr;
				102	Value *Acc = nullptr;
				103	OpChainList Muls;
				104	PMACPairList MulPairs;
				105	SmallPtrSet<Instruction*, 4> Adds;
				106
				107	public:
				108	Reduction() = delete;
				109
				110	Reduction (Instruction *Add) : Root(Add) { }
				111
				112	/// Record an Add instruction that is a part of the this reduction.
				113	void InsertAdd(Instruction *I) { Adds.insert(I); }
				114
				115	/// Record a BinOpChain, rooted at a Mul instruction, that is a part of
				116	/// this reduction.
				117	void InsertMul(Instruction *I, ValueList &LHS, ValueList &RHS) {
				118	Muls.push_back(make_unique<BinOpChain>(I, LHS, RHS));
				119	}
				120
				121	/// Add the incoming accumulator value, returns true if a value had not
				122	/// already been added. Returning false signals to the user that this
				123	/// reduction already has a value to initialise the accumulator.
				124	bool InsertAcc(Value *V) {
				125	if (Acc)
				126	return false;
				127	Acc = V;
				128	return true;
				129	}
				130
				131	/// Set two BinOpChains, rooted at muls, that can be executed as a single
				132	/// parallel operation.
				133	void AddMulPair(BinOpChain Mul0, BinOpChain Mul1) {
				134	MulPairs.push_back(std::make_pair(Mul0, Mul1));
				135	}
				136
				137	/// Return true if enough mul operations are found that can be executed in
				138	/// parallel.
				139	bool CreateParallelPairs();
				140
				141	/// Return the add instruction which is the root of the reduction.
				142	Instruction *getRoot() { return Root; }
				143
				144	/// Return the incoming value to be accumulated. This maybe null.
				145	Value *getAccumulator() { return Acc; }
				146
				147	/// Return the set of adds that comprise the reduction.
				148	SmallPtrSetImpl<Instruction*> &getAdds() { return Adds; }
				149
				150	/// Return the BinOpChain, rooted at mul instruction, that comprise the
				151	/// the reduction.
				152	OpChainList &getMuls() { return Muls; }
				153
				154	/// Return the BinOpChain, rooted at mul instructions, that have been
				155	/// paired for parallel execution.
				156	PMACPairList &getMulPairs() { return MulPairs; }
				157
				158	/// To finalise, replace the uses of the root with the intrinsic call.
				159	void UpdateRoot(Instruction *SMLAD) {
				160	Root->replaceAllUsesWith(SMLAD);
				161	}
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	162	};
				163
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	164	class WidenedLoad {
				165	LoadInst *NewLd = nullptr;
				166	SmallVector<LoadInst*, 4> Loads;
				167
				168	public:
				169	WidenedLoad(SmallVectorImpl<LoadInst> &Lds, LoadInst Wide)
				170	: NewLd(Wide) {
				171	for (auto *I : Lds)
				172	Loads.push_back(I);
				173	}
				174	LoadInst *getLoad() {
				175	return NewLd;
				176	}
				177	};
				178
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	179	class ARMParallelDSP : public LoopPass {
				180	ScalarEvolution *SE;
				181	AliasAnalysis *AA;
				182	TargetLibraryInfo *TLI;
				183	DominatorTree *DT;
				184	LoopInfo *LI;
				185	Loop *L;
				186	const DataLayout *DL;
				187	Module *M;
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	188	std::map<LoadInst, LoadInst> LoadPairs;
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	189	SmallPtrSet<LoadInst*, 4> OffsetLoads;
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	190	std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	191
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	192	template<unsigned>
				193	bool IsNarrowSequence(Value *V, ValueList &VL);
				194
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	195	bool RecordMemoryOps(BasicBlock *BB);
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	196	void InsertParallelMACs(Reduction &Reduction);
Fangrui Song	6816934	2018-07-03 19:12:27 +0000	[diff] [blame]	197	bool AreSequentialLoads(LoadInst Ld0, LoadInst Ld1, MemInstList &VecMem);
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	198	LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
				199	IntegerType *LoadTy);
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	200	bool CreateParallelPairs(Reduction &R);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	201
				202	/// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
				203	/// Dual performs two signed 16x16-bit multiplications. It adds the
				204	/// products to a 32-bit accumulate operand. Optionally, the instruction can
				205	/// exchange the halfwords of the second operand before performing the
				206	/// arithmetic.
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	207	bool MatchSMLAD(Loop *L);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	208
				209	public:
				210	static char ID;
				211
				212	ARMParallelDSP() : LoopPass(ID) { }
				213
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	214	bool doInitialization(Loop *L, LPPassManager &LPM) override {
				215	LoadPairs.clear();
				216	WideLoads.clear();
				217	return true;
				218	}
				219
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	220	void getAnalysisUsage(AnalysisUsage &AU) const override {
				221	LoopPass::getAnalysisUsage(AU);
				222	AU.addRequired<AssumptionCacheTracker>();
				223	AU.addRequired<ScalarEvolutionWrapperPass>();
				224	AU.addRequired<AAResultsWrapperPass>();
				225	AU.addRequired<TargetLibraryInfoWrapperPass>();
				226	AU.addRequired<LoopInfoWrapperPass>();
				227	AU.addRequired<DominatorTreeWrapperPass>();
				228	AU.addRequired<TargetPassConfig>();
				229	AU.addPreserved<LoopInfoWrapperPass>();
				230	AU.setPreservesCFG();
				231	}
				232
				233	bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
Sjoerd Meijer	3c859b3	2018-08-14 07:43:49 +0000	[diff] [blame]	234	if (DisableParallelDSP)
				235	return false;
Eli Friedman	b27fc95	2019-07-23 20:48:46 +0000	[diff] [blame^]	236	if (skipLoop(TheLoop))
				237	return false;
				238
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	239	L = TheLoop;
				240	SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
				241	AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
				242	TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
				243	DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
				244	LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
				245	auto &TPC = getAnalysis<TargetPassConfig>();
				246
				247	BasicBlock *Header = TheLoop->getHeader();
				248	if (!Header)
				249	return false;
				250
				251	// TODO: We assume the loop header and latch to be the same block.
				252	// This is not a fundamental restriction, but lifting this would just
				253	// require more work to do the transformation and then patch up the CFG.
				254	if (Header != TheLoop->getLoopLatch()) {
				255	LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
				256	"running pass ARMParallelDSP\n");
				257	return false;
				258	}
				259
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	260	if (!TheLoop->getLoopPreheader())
				261	InsertPreheaderForLoop(L, DT, LI, nullptr, true);
Sam Parker	9e73020	2019-03-15 10:19:32 +0000	[diff] [blame]	262
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	263	Function &F = *Header->getParent();
				264	M = F.getParent();
				265	DL = &M->getDataLayout();
				266
				267	auto &TM = TPC.getTM<TargetMachine>();
				268	auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
				269
				270	if (!ST->allowsUnalignedMem()) {
				271	LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
				272	"running pass ARMParallelDSP\n");
				273	return false;
				274	}
				275
				276	if (!ST->hasDSP()) {
				277	LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
				278	"ARMParallelDSP\n");
				279	return false;
				280	}
				281
Sam Parker	9e73020	2019-03-15 10:19:32 +0000	[diff] [blame]	282	if (!ST->isLittle()) {
				283	LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	284	<< "ARMParallelDSP\n");
Sam Parker	9e73020	2019-03-15 10:19:32 +0000	[diff] [blame]	285	return false;
				286	}
				287
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	288	LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	289
Sam Parker	a023c7a	2018-09-12 09:17:44 +0000	[diff] [blame]	290	LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
				291	LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	292
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	293	if (!RecordMemoryOps(Header)) {
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	294	LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
				295	return false;
				296	}
				297
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	298	bool Changes = MatchSMLAD(L);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	299	return Changes;
				300	}
				301	};
				302	}
				303
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	304	template<typename MemInst>
				305	static bool AreSequentialAccesses(MemInst MemOp0, MemInst MemOp1,
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	306	const DataLayout &DL, ScalarEvolution &SE) {
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	307	if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE))
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	308	return true;
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	309	return false;
				310	}
				311
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	312	bool ARMParallelDSP::AreSequentialLoads(LoadInst Ld0, LoadInst Ld1,
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	313	MemInstList &VecMem) {
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	314	if (!Ld0 \|\| !Ld1)
				315	return false;
				316
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	317	if (!LoadPairs.count(Ld0) \|\| LoadPairs[Ld0] != Ld1)
				318	return false;
				319
				320	LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	321	dbgs() << "Ld0:"; Ld0->dump();
				322	dbgs() << "Ld1:"; Ld1->dump();
				323	);
				324
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	325	VecMem.clear();
				326	VecMem.push_back(Ld0);
				327	VecMem.push_back(Ld1);
				328	return true;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	329	}
				330
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	331	// MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
				332	// instructions, which is set to 16. So here we should collect all i8 and i16
				333	// narrow operations.
				334	// TODO: we currently only collect i16, and will support i8 later, so that's
				335	// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
				336	template<unsigned MaxBitWidth>
				337	bool ARMParallelDSP::IsNarrowSequence(Value *V, ValueList &VL) {
				338	ConstantInt *CInt;
				339
				340	if (match(V, m_ConstantInt(CInt))) {
				341	// TODO: if a constant is used, it needs to fit within the bit width.
				342	return false;
				343	}
				344
				345	auto *I = dyn_cast<Instruction>(V);
				346	if (!I)
				347	return false;
				348
				349	Value Val, LHS, *RHS;
				350	if (match(V, m_Trunc(m_Value(Val)))) {
				351	if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
				352	return IsNarrowSequence<MaxBitWidth>(Val, VL);
				353	} else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
				354	// TODO: we need to implement sadd16/sadd8 for this, which enables to
				355	// also do the rewrite for smlad8.ll, but it is unsupported for now.
				356	return false;
				357	} else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
				358	if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
				359	return false;
				360
				361	if (match(Val, m_Load(m_Value()))) {
				362	auto *Ld = cast<LoadInst>(Val);
				363
				364	// Check that these load could be paired.
				365	if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld))
				366	return false;
				367
				368	VL.push_back(Val);
				369	VL.push_back(I);
				370	return true;
				371	}
				372	}
				373	return false;
				374	}
				375
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	376	/// Iterate through the block and record base, offset pairs of loads which can
				377	/// be widened into a single load.
				378	bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	379	SmallVector<LoadInst*, 8> Loads;
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	380	SmallVector<Instruction*, 8> Writes;
				381
				382	// Collect loads and instruction that may write to memory. For now we only
				383	// record loads which are simple, sign-extended and have a single user.
				384	// TODO: Allow zero-extended loads.
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	385	for (auto &I : *BB) {
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	386	if (I.mayWriteToMemory())
				387	Writes.push_back(&I);
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	388	auto *Ld = dyn_cast<LoadInst>(&I);
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	389	if (!Ld \|\| !Ld->isSimple() \|\|
				390	!Ld->hasOneUse() \|\| !isa<SExtInst>(Ld->user_back()))
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	391	continue;
				392	Loads.push_back(Ld);
				393	}
				394
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	395	using InstSet = std::set<Instruction*>;
				396	using DepMap = std::map<Instruction*, InstSet>;
				397	DepMap RAWDeps;
				398
				399	// Record any writes that may alias a load.
				400	const auto Size = LocationSize::unknown();
				401	for (auto Read : Loads) {
				402	for (auto Write : Writes) {
				403	MemoryLocation ReadLoc =
				404	MemoryLocation(Read->getPointerOperand(), Size);
				405
				406	if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
				407	ModRefInfo::ModRef)))
				408	continue;
				409	if (DT->dominates(Write, Read))
				410	RAWDeps[Read].insert(Write);
				411	}
				412	}
				413
				414	// Check whether there's not a write between the two loads which would
				415	// prevent them from being safely merged.
				416	auto SafeToPair = [&](LoadInst Base, LoadInst Offset) {
				417	LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset;
				418	LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;
				419
				420	if (RAWDeps.count(Dominated)) {
				421	InstSet &WritesBefore = RAWDeps[Dominated];
				422
				423	for (auto Before : WritesBefore) {
				424
				425	// We can't move the second load backward, past a write, to merge
				426	// with the first load.
				427	if (DT->dominates(Dominator, Before))
				428	return false;
				429	}
				430	}
				431	return true;
				432	};
				433
				434	// Record base, offset load pairs.
				435	for (auto *Base : Loads) {
				436	for (auto *Offset : Loads) {
				437	if (Base == Offset)
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	438	continue;
				439
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	440	if (AreSequentialAccesses<LoadInst>(Base, Offset, DL, SE) &&
				441	SafeToPair(Base, Offset)) {
				442	LoadPairs[Base] = Offset;
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	443	OffsetLoads.insert(Offset);
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	444	break;
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	445	}
				446	}
				447	}
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	448
				449	LLVM_DEBUG(if (!LoadPairs.empty()) {
				450	dbgs() << "Consecutive load pairs:\n";
				451	for (auto &MapIt : LoadPairs) {
				452	LLVM_DEBUG(dbgs() << *MapIt.first << ", "
				453	<< *MapIt.second << "\n");
				454	}
				455	});
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	456	return LoadPairs.size() > 1;
				457	}
				458
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	459	// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
				460	// multiplications.
				461	// To use SMLAD:
				462	// 1) we first need to find integer add then look for this pattern:
				463	//
				464	// acc0 = ...
				465	// ld0 = load i16
				466	// sext0 = sext i16 %ld0 to i32
				467	// ld1 = load i16
				468	// sext1 = sext i16 %ld1 to i32
				469	// mul0 = mul %sext0, %sext1
				470	// ld2 = load i16
				471	// sext2 = sext i16 %ld2 to i32
				472	// ld3 = load i16
				473	// sext3 = sext i16 %ld3 to i32
				474	// mul1 = mul i32 %sext2, %sext3
				475	// add0 = add i32 %mul0, %acc0
				476	// acc1 = add i32 %add0, %mul1
				477	//
				478	// Which can be selected to:
				479	//
				480	// ldr r0
				481	// ldr r1
				482	// smlad r2, r0, r1, r2
				483	//
				484	// If constants are used instead of loads, these will need to be hoisted
				485	// out and into a register.
				486	//
				487	// If loop invariants are used instead of loads, these need to be packed
				488	// before the loop begins.
				489	//
				490	bool ARMParallelDSP::MatchSMLAD(Loop *L) {
				491	// Search recursively back through the operands to find a tree of values that
				492	// form a multiply-accumulate chain. The search records the Add and Mul
				493	// instructions that form the reduction and allows us to find a single value
				494	// to be used as the initial input to the accumlator.
				495	std::function<bool(Value*, Reduction&)> Search = [&]
				496	(Value *V, Reduction &R) -> bool {
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	497
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	498	// If we find a non-instruction, try to use it as the initial accumulator
				499	// value. This may have already been found during the search in which case
				500	// this function will return false, signaling a search fail.
				501	auto *I = dyn_cast<Instruction>(V);
				502	if (!I)
				503	return R.InsertAcc(V);
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	504
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	505	switch (I->getOpcode()) {
				506	default:
				507	break;
				508	case Instruction::PHI:
				509	// Could be the accumulator value.
				510	return R.InsertAcc(V);
				511	case Instruction::Add: {
				512	// Adds should be adding together two muls, or another add and a mul to
				513	// be within the mac chain. One of the operands may also be the
				514	// accumulator value at which point we should stop searching.
				515	bool ValidLHS = Search(I->getOperand(0), R);
				516	bool ValidRHS = Search(I->getOperand(1), R);
				517	if (!ValidLHS && !ValidLHS)
				518	return false;
				519	else if (ValidLHS && ValidRHS) {
				520	R.InsertAdd(I);
				521	return true;
				522	} else {
				523	R.InsertAdd(I);
				524	return R.InsertAcc(I);
				525	}
				526	}
				527	case Instruction::Mul: {
				528	Value *MulOp0 = I->getOperand(0);
				529	Value *MulOp1 = I->getOperand(1);
				530	if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) {
				531	ValueList LHS;
				532	ValueList RHS;
				533	if (IsNarrowSequence<16>(MulOp0, LHS) &&
				534	IsNarrowSequence<16>(MulOp1, RHS)) {
				535	R.InsertMul(I, LHS, RHS);
				536	return true;
				537	}
				538	}
				539	return false;
				540	}
				541	case Instruction::SExt:
				542	return Search(I->getOperand(0), R);
				543	}
				544	return false;
				545	};
				546
				547	bool Changed = false;
				548	SmallPtrSet<Instruction*, 4> AllAdds;
				549	BasicBlock *Latch = L->getLoopLatch();
				550
				551	for (Instruction &I : reverse(*Latch)) {
				552	if (I.getOpcode() != Instruction::Add)
				553	continue;
				554
				555	if (AllAdds.count(&I))
				556	continue;
				557
				558	const auto *Ty = I.getType();
				559	if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
				560	continue;
				561
				562	Reduction R(&I);
				563	if (!Search(&I, R))
				564	continue;
				565
				566	if (!CreateParallelPairs(R))
				567	continue;
				568
				569	InsertParallelMACs(R);
				570	Changed = true;
				571	AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
				572	}
				573
				574	return Changed;
				575	}
				576
				577	bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
				578
				579	// Not enough mul operations to make a pair.
				580	if (R.getMuls().size() < 2)
				581	return false;
				582
				583	// Check that the muls operate directly upon sign extended loads.
				584	for (auto &MulChain : R.getMuls()) {
				585	// A mul has 2 operands, and a narrow op consist of sext and a load; thus
				586	// we expect at least 4 items in this operand value list.
				587	if (MulChain->size() < 4) {
				588	LLVM_DEBUG(dbgs() << "Operand list too short.\n");
				589	return false;
				590	}
				591	MulChain->PopulateLoads();
				592	ValueList &LHS = static_cast<BinOpChain*>(MulChain.get())->LHS;
				593	ValueList &RHS = static_cast<BinOpChain*>(MulChain.get())->RHS;
				594
				595	// Use +=2 to skip over the expected extend instructions.
				596	for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
				597	if (!isa<LoadInst>(LHS[i]) \|\| !isa<LoadInst>(RHS[i]))
				598	return false;
				599	}
				600	}
				601
				602	auto CanPair = [&](Reduction &R, BinOpChain PMul0, BinOpChain PMul1) {
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	603	if (!PMul0->AreSymmetrical(PMul1))
				604	return false;
				605
				606	// The first elements of each vector should be loads with sexts. If we
				607	// find that its two pairs of consecutive loads, then these can be
				608	// transformed into two wider loads and the users can be replaced with
				609	// DSP intrinsics.
				610	for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) {
				611	auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]);
				612	auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]);
				613	auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]);
				614	auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]);
				615
				616	if (!Ld0 \|\| !Ld1 \|\| !Ld2 \|\| !Ld3)
				617	return false;
				618
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	619	LLVM_DEBUG(dbgs() << "Loads:\n"
				620	<< " - " << *Ld0 << "\n"
				621	<< " - " << *Ld1 << "\n"
				622	<< " - " << *Ld2 << "\n"
				623	<< " - " << *Ld3 << "\n");
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	624
				625	if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
				626	if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
				627	LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	628	R.AddMulPair(PMul0, PMul1);
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	629	return true;
				630	} else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
				631	LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
				632	LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
				633	PMul1->Exchange = true;
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	634	R.AddMulPair(PMul0, PMul1);
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	635	return true;
				636	}
				637	} else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
				638	AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
				639	LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
				640	LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
				641	LLVM_DEBUG(dbgs() << " and swapping muls\n");
				642	PMul0->Exchange = true;
				643	// Only the second operand can be exchanged, so swap the muls.
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	644	R.AddMulPair(PMul1, PMul0);
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	645	return true;
				646	}
				647	}
				648	return false;
				649	};
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	650
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	651	OpChainList &Muls = R.getMuls();
				652	const unsigned Elems = Muls.size();
Sam Parker	a023c7a	2018-09-12 09:17:44 +0000	[diff] [blame]	653	SmallPtrSet<const Instruction*, 4> Paired;
				654	for (unsigned i = 0; i < Elems; ++i) {
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	655	BinOpChain PMul0 = static_cast<BinOpChain>(Muls[i].get());
Sam Parker	a023c7a	2018-09-12 09:17:44 +0000	[diff] [blame]	656	if (Paired.count(PMul0->Root))
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	657	continue;
				658
Sam Parker	a023c7a	2018-09-12 09:17:44 +0000	[diff] [blame]	659	for (unsigned j = 0; j < Elems; ++j) {
				660	if (i == j)
				661	continue;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	662
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	663	BinOpChain PMul1 = static_cast<BinOpChain>(Muls[j].get());
Sam Parker	a023c7a	2018-09-12 09:17:44 +0000	[diff] [blame]	664	if (Paired.count(PMul1->Root))
				665	continue;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	666
Sam Parker	a023c7a	2018-09-12 09:17:44 +0000	[diff] [blame]	667	const Instruction *Mul0 = PMul0->Root;
				668	const Instruction *Mul1 = PMul1->Root;
				669	if (Mul0 == Mul1)
				670	continue;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	671
Sam Parker	a023c7a	2018-09-12 09:17:44 +0000	[diff] [blame]	672	assert(PMul0 != PMul1 && "expected different chains");
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	673
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	674	if (CanPair(R, PMul0, PMul1)) {
Sam Parker	a023c7a	2018-09-12 09:17:44 +0000	[diff] [blame]	675	Paired.insert(Mul0);
				676	Paired.insert(Mul1);
				677	break;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	678	}
				679	}
				680	}
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	681	return !R.getMulPairs().empty();
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	682	}
				683
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	684
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	685	void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
				686
				687	auto CreateSMLADCall = [&](SmallVectorImpl<LoadInst*> &VecLd0,
				688	SmallVectorImpl<LoadInst*> &VecLd1,
				689	Value *Acc, bool Exchange,
				690	Instruction *InsertAfter) {
				691	// Replace the reduction chain with an intrinsic call
				692	IntegerType *Ty = IntegerType::get(M->getContext(), 32);
				693	LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
				694	WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty);
				695	LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
				696	WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty);
				697
				698	Value* Args[] = { WideLd0, WideLd1, Acc };
				699	Function *SMLAD = nullptr;
				700	if (Exchange)
				701	SMLAD = Acc->getType()->isIntegerTy(32) ?
				702	Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
				703	Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
				704	else
				705	SMLAD = Acc->getType()->isIntegerTy(32) ?
				706	Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
				707	Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
				708
				709	IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
				710	++BasicBlock::iterator(InsertAfter));
				711	Instruction *Call = Builder.CreateCall(SMLAD, Args);
				712	NumSMLAD++;
				713	return Call;
				714	};
				715
				716	Instruction *InsertAfter = R.getRoot();
				717	Value *Acc = R.getAccumulator();
				718	if (!Acc)
				719	Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
				720
				721	LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n"
				722	<< "Acc: " << *Acc << "\n");
				723	for (auto &Pair : R.getMulPairs()) {
Sam Parker	a023c7a	2018-09-12 09:17:44 +0000	[diff] [blame]	724	BinOpChain *PMul0 = Pair.first;
				725	BinOpChain *PMul1 = Pair.second;
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	726	LLVM_DEBUG(dbgs() << "Muls:\n"
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	727	<< "- " << *PMul0->Root << "\n"
				728	<< "- " << *PMul1->Root << "\n");
Sam Parker	a023c7a	2018-09-12 09:17:44 +0000	[diff] [blame]	729
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	730	Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
				731	InsertAfter);
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	732	InsertAfter = cast<Instruction>(Acc);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	733	}
Sam Parker	85ad78b	2019-07-11 07:47:50 +0000	[diff] [blame]	734	R.UpdateRoot(cast<Instruction>(Acc));
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	735	}
				736
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	737	LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
				738	IntegerType *LoadTy) {
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	739	assert(Loads.size() == 2 && "currently only support widening two loads");
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	740
				741	LoadInst *Base = Loads[0];
				742	LoadInst *Offset = Loads[1];
				743
				744	Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
				745	Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());
				746
				747	assert((BaseSExt && OffsetSExt)
				748	&& "Loads should have a single, extending, user");
				749
				750	std::function<void(Value, Value)> MoveBefore =
				751	[&](Value A, Value B) -> void {
				752	if (!isa<Instruction>(A) \|\| !isa<Instruction>(B))
				753	return;
				754
				755	auto *Source = cast<Instruction>(A);
				756	auto *Sink = cast<Instruction>(B);
				757
				758	if (DT->dominates(Source, Sink) \|\|
				759	Source->getParent() != Sink->getParent() \|\|
				760	isa<PHINode>(Source) \|\| isa<PHINode>(Sink))
				761	return;
				762
				763	Source->moveBefore(Sink);
				764	for (auto &U : Source->uses())
				765	MoveBefore(Source, U.getUser());
				766	};
				767
				768	// Insert the load at the point of the original dominating load.
				769	LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
				770	IRBuilder<NoFolder> IRB(DomLoad->getParent(),
				771	++BasicBlock::iterator(DomLoad));
				772
				773	// Bitcast the pointer to a wider type and create the wide load, while making
				774	// sure to maintain the original alignment as this prevents ldrd from being
				775	// generated when it could be illegal due to memory alignment.
				776	const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
				777	Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
Eli Friedman	b09c778	2018-10-18 19:34:30 +0000	[diff] [blame]	778	LoadTy->getPointerTo(AddrSpace));
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	779	LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	780	Base->getAlignment());
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	781
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	782	// Make sure everything is in the correct order in the basic block.
				783	MoveBefore(Base->getPointerOperand(), VecPtr);
				784	MoveBefore(VecPtr, WideLoad);
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	785
				786	// From the wide load, create two values that equal the original two loads.
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	787	// Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
				788	// TODO: Support big-endian as well.
				789	Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
				790	BaseSExt->setOperand(0, Bottom);
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	791
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	792	IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
				793	Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	794	Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	795	Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
				796	OffsetSExt->setOperand(0, Trunc);
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	797
Sam Parker	a33e311	2019-05-13 09:23:32 +0000	[diff] [blame]	798	WideLoads.emplace(std::make_pair(Base,
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	799	make_unique<WidenedLoad>(Loads, WideLoad)));
				800	return WideLoad;
Eli Friedman	b09c778	2018-10-18 19:34:30 +0000	[diff] [blame]	801	}
				802
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	803	// Compare the value lists in Other to this chain.
				804	bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
				805	// Element-by-element comparison of Value lists returning true if they are
				806	// instructions with the same opcode or constants with the same value.
				807	auto CompareValueList = [](const ValueList &VL0,
				808	const ValueList &VL1) {
				809	if (VL0.size() != VL1.size()) {
				810	LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
				811	<< VL0.size() << " != " << VL1.size() << "\n");
				812	return false;
				813	}
				814
				815	const unsigned Pairs = VL0.size();
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	816
				817	for (unsigned i = 0; i < Pairs; ++i) {
				818	const Value *V0 = VL0[i];
				819	const Value *V1 = VL1[i];
				820	const auto *Inst0 = dyn_cast<Instruction>(V0);
				821	const auto *Inst1 = dyn_cast<Instruction>(V1);
				822
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	823	if (!Inst0 \|\| !Inst1)
				824	return false;
				825
Sam Parker	4c4ff13	2019-03-14 11:14:13 +0000	[diff] [blame]	826	if (Inst0->isSameOperationAs(Inst1))
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	827	continue;
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	828
				829	const APInt C0, C1;
				830	if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
				831	return false;
				832	}
				833
Sam Parker	453ba91	2018-11-09 09:18:00 +0000	[diff] [blame]	834	return true;
				835	};
				836
				837	return CompareValueList(LHS, Other->LHS) &&
				838	CompareValueList(RHS, Other->RHS);
				839	}
				840
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	841	Pass *llvm::createARMParallelDSPPass() {
				842	return new ARMParallelDSP();
				843	}
				844
				845	char ARMParallelDSP::ID = 0;
				846
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	847	INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
Simon Pilgrim	c09b5e3	2018-06-28 18:37:16 +0000	[diff] [blame]	848	"Transform loops to use DSP intrinsics", false, false)
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	849	INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
Simon Pilgrim	c09b5e3	2018-06-28 18:37:16 +0000	[diff] [blame]	850	"Transform loops to use DSP intrinsics", false, false)