Blame - llvm/lib/Target/ARM/ARMParallelDSP.cpp - toolchain/llvm-project

blob: d6f98141e7f1b9577855e990e1300959db0a0f37 [file] [log] [blame]

Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	1	//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	/// \file
				11	/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
				12	/// purpose of this pass is to do some IR pattern matching to create ACLE
				13	/// DSP intrinsics, which map on these 32-bit SIMD operations.
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	14	/// This pass runs only when unaligned accesses are supported/enabled.
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	15	//
				16	//===----------------------------------------------------------------------===//
				17
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	18	#include "llvm/ADT/Statistic.h"
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	19	#include "llvm/ADT/SmallPtrSet.h"
				20	#include "llvm/Analysis/AliasAnalysis.h"
				21	#include "llvm/Analysis/LoopAccessAnalysis.h"
				22	#include "llvm/Analysis/LoopPass.h"
				23	#include "llvm/Analysis/LoopInfo.h"
				24	#include "llvm/IR/Instructions.h"
				25	#include "llvm/IR/NoFolder.h"
				26	#include "llvm/Transforms/Scalar.h"
				27	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
				28	#include "llvm/Transforms/Utils/LoopUtils.h"
				29	#include "llvm/Pass.h"
				30	#include "llvm/PassRegistry.h"
				31	#include "llvm/PassSupport.h"
				32	#include "llvm/Support/Debug.h"
				33	#include "llvm/IR/PatternMatch.h"
				34	#include "llvm/CodeGen/TargetPassConfig.h"
				35	#include "ARM.h"
				36	#include "ARMSubtarget.h"
				37
				38	using namespace llvm;
				39	using namespace PatternMatch;
				40
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	41	#define DEBUG_TYPE "arm-parallel-dsp"
				42
				43	STATISTIC(NumSMLAD , "Number of smlad instructions generated");
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	44
				45	namespace {
				46	struct ParallelMAC;
				47	struct Reduction;
				48
				49	using ParallelMACList = SmallVector<ParallelMAC, 8>;
				50	using ReductionList = SmallVector<Reduction, 8>;
				51	using ValueList = SmallVector<Value*, 8>;
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	52	using MemInstList = SmallVector<Instruction*, 8>;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	53	using PMACPair = std::pair<ParallelMAC,ParallelMAC>;
				54	using PMACPairList = SmallVector<PMACPair, 8>;
				55	using Instructions = SmallVector<Instruction*,16>;
				56	using MemLocList = SmallVector<MemoryLocation, 4>;
				57
				58	// 'ParallelMAC' and 'Reduction' are just some bookkeeping data structures.
				59	// 'Reduction' contains the phi-node and accumulator statement from where we
				60	// start pattern matching, and 'ParallelMAC' the multiplication
				61	// instructions that are candidates for parallel execution.
				62	struct ParallelMAC {
				63	Instruction *Mul;
				64	ValueList VL; // List of all (narrow) operands of this Mul
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	65	MemInstList VecLd; // List of all load instructions of this Mul
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	66	MemLocList MemLocs; // All memory locations read by this Mul
				67
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	68	// The MAC-chains we currently recognise are simple chains that accumulate
				69	// their results with a reducing integer add statement, and consist of
				70	// a chain of adds and muls, which have only sext and load instructions as
				71	// operands. Thus, these chains don't write memory. We check that this is
				72	// true when we collect the operands, and use this in alias analysis checks
				73	// that different parallel MACs don't interfere with each other.
				74	bool ReadOnly;
				75
				76	ParallelMAC(Instruction *I, ValueList &V, bool RdOnly)
				77	: Mul(I), VL(V), ReadOnly(RdOnly) {};
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	78	};
				79
				80	struct Reduction {
				81	PHINode *Phi; // The Phi-node from where we start
				82	// pattern matching.
				83	Instruction *AccIntAdd; // The accumulating integer add statement,
				84	// i.e, the reduction statement.
				85
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	86	ParallelMACList MACCandidates; // The MAC candidates associated with
				87	// this reduction statement.
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	88	Reduction (PHINode P, Instruction Acc) : Phi(P), AccIntAdd(Acc) { };
				89	};
				90
				91	class ARMParallelDSP : public LoopPass {
				92	ScalarEvolution *SE;
				93	AliasAnalysis *AA;
				94	TargetLibraryInfo *TLI;
				95	DominatorTree *DT;
				96	LoopInfo *LI;
				97	Loop *L;
				98	const DataLayout *DL;
				99	Module *M;
				100
				101	bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
Fangrui Song	6816934	2018-07-03 19:12:27 +0000	[diff] [blame]	102	bool AreSequentialLoads(LoadInst Ld0, LoadInst Ld1, MemInstList &VecMem);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	103	PMACPairList CreateParallelMACPairs(ParallelMACList &Candidates);
				104	Instruction CreateSMLADCall(LoadInst VecLd0, LoadInst *VecLd1,
				105	Instruction Acc, Instruction InsertAfter);
				106
				107	/// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
				108	/// Dual performs two signed 16x16-bit multiplications. It adds the
				109	/// products to a 32-bit accumulate operand. Optionally, the instruction can
				110	/// exchange the halfwords of the second operand before performing the
				111	/// arithmetic.
				112	bool MatchSMLAD(Function &F);
				113
				114	public:
				115	static char ID;
				116
				117	ARMParallelDSP() : LoopPass(ID) { }
				118
				119	void getAnalysisUsage(AnalysisUsage &AU) const override {
				120	LoopPass::getAnalysisUsage(AU);
				121	AU.addRequired<AssumptionCacheTracker>();
				122	AU.addRequired<ScalarEvolutionWrapperPass>();
				123	AU.addRequired<AAResultsWrapperPass>();
				124	AU.addRequired<TargetLibraryInfoWrapperPass>();
				125	AU.addRequired<LoopInfoWrapperPass>();
				126	AU.addRequired<DominatorTreeWrapperPass>();
				127	AU.addRequired<TargetPassConfig>();
				128	AU.addPreserved<LoopInfoWrapperPass>();
				129	AU.setPreservesCFG();
				130	}
				131
				132	bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
				133	L = TheLoop;
				134	SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
				135	AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
				136	TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
				137	DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
				138	LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
				139	auto &TPC = getAnalysis<TargetPassConfig>();
				140
				141	BasicBlock *Header = TheLoop->getHeader();
				142	if (!Header)
				143	return false;
				144
				145	// TODO: We assume the loop header and latch to be the same block.
				146	// This is not a fundamental restriction, but lifting this would just
				147	// require more work to do the transformation and then patch up the CFG.
				148	if (Header != TheLoop->getLoopLatch()) {
				149	LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
				150	"running pass ARMParallelDSP\n");
				151	return false;
				152	}
				153
				154	Function &F = *Header->getParent();
				155	M = F.getParent();
				156	DL = &M->getDataLayout();
				157
				158	auto &TM = TPC.getTM<TargetMachine>();
				159	auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
				160
				161	if (!ST->allowsUnalignedMem()) {
				162	LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
				163	"running pass ARMParallelDSP\n");
				164	return false;
				165	}
				166
				167	if (!ST->hasDSP()) {
				168	LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
				169	"ARMParallelDSP\n");
				170	return false;
				171	}
				172
				173	LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
				174	bool Changes = false;
				175
				176	LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n\n");
				177	Changes = MatchSMLAD(F);
				178	return Changes;
				179	}
				180	};
				181	}
				182
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	183	// MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
				184	// instructions, which is set to 16. So here we should collect all i8 and i16
				185	// narrow operations.
				186	// TODO: we currently only collect i16, and will support i8 later, so that's
				187	// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
				188	template<unsigned MaxBitWidth>
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	189	static bool IsNarrowSequence(Value *V, ValueList &VL) {
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	190	LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	191	ConstantInt *CInt;
				192
				193	if (match(V, m_ConstantInt(CInt))) {
				194	// TODO: if a constant is used, it needs to fit within the bit width.
				195	return false;
				196	}
				197
				198	auto *I = dyn_cast<Instruction>(V);
				199	if (!I)
				200	return false;
				201
				202	Value Val, LHS, *RHS;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	203	if (match(V, m_Trunc(m_Value(Val)))) {
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	204	if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
				205	return IsNarrowSequence<MaxBitWidth>(Val, VL);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	206	} else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
				207	// TODO: we need to implement sadd16/sadd8 for this, which enables to
				208	// also do the rewrite for smlad8.ll, but it is unsupported for now.
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	209	LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
				210	return false;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	211	} else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	212	if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
				213	LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
				214	cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
				215	return false;
				216	}
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	217
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	218	if (match(Val, m_Load(m_Value()))) {
				219	LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
				220	VL.push_back(Val);
				221	VL.push_back(I);
				222	return true;
				223	}
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	224	}
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	225	LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
				226	return false;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	227	}
				228
				229	// Element-by-element comparison of Value lists returning true if they are
				230	// instructions with the same opcode or constants with the same value.
				231	static bool AreSymmetrical(const ValueList &VL0,
				232	const ValueList &VL1) {
				233	if (VL0.size() != VL1.size()) {
				234	LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
				235	<< VL0.size() << " != " << VL1.size() << "\n");
				236	return false;
				237	}
				238
				239	const unsigned Pairs = VL0.size();
				240	LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
				241
				242	for (unsigned i = 0; i < Pairs; ++i) {
				243	const Value *V0 = VL0[i];
				244	const Value *V1 = VL1[i];
				245	const auto *Inst0 = dyn_cast<Instruction>(V0);
				246	const auto *Inst1 = dyn_cast<Instruction>(V1);
				247
				248	LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
				249	dbgs() << "mul1: "; V0->dump();
				250	dbgs() << "mul2: "; V1->dump());
				251
				252	if (!Inst0 \|\| !Inst1)
				253	return false;
				254
				255	if (Inst0->isSameOperationAs(Inst1)) {
				256	LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
				257	continue;
				258	}
				259
				260	const APInt C0, C1;
				261	if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
				262	return false;
				263	}
				264
				265	LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
				266	return true;
				267	}
				268
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	269	template<typename MemInst>
				270	static bool AreSequentialAccesses(MemInst MemOp0, MemInst MemOp1,
				271	MemInstList &VecMem, const DataLayout &DL,
				272	ScalarEvolution &SE) {
				273	if (!MemOp0->isSimple() \|\| !MemOp1->isSimple()) {
				274	LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
				275	return false;
				276	}
				277	if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
				278	VecMem.push_back(MemOp0);
				279	VecMem.push_back(MemOp1);
				280	LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
				281	return true;
				282	}
				283	LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
				284	return false;
				285	}
				286
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	287	bool ARMParallelDSP::AreSequentialLoads(LoadInst Ld0, LoadInst Ld1,
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	288	MemInstList &VecMem) {
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	289	if (!Ld0 \|\| !Ld1)
				290	return false;
				291
				292	LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
				293	dbgs() << "Ld0:"; Ld0->dump();
				294	dbgs() << "Ld1:"; Ld1->dump();
				295	);
				296
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	297	if (!Ld0->hasOneUse() \|\| !Ld1->hasOneUse()) {
				298	LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
				299	return false;
				300	}
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	301
				302	return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, DL, SE);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	303	}
				304
				305	PMACPairList
				306	ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
				307	const unsigned Elems = Candidates.size();
				308	PMACPairList PMACPairs;
				309
				310	if (Elems < 2)
				311	return PMACPairs;
				312
				313	// TODO: for now we simply try to match consecutive pairs i and i+1.
				314	// We can compare all elements, but then we need to compare and evaluate
				315	// different solutions.
				316	for(unsigned i=0; i<Elems-1; i+=2) {
				317	ParallelMAC &PMul0 = Candidates[i];
				318	ParallelMAC &PMul1 = Candidates[i+1];
				319	const Instruction *Mul0 = PMul0.Mul;
				320	const Instruction *Mul1 = PMul1.Mul;
				321
				322	if (Mul0 == Mul1)
				323	continue;
				324
				325	LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
				326	dbgs() << "- "; Mul0->dump();
				327	dbgs() << "- "; Mul1->dump());
				328
				329	const ValueList &VL0 = PMul0.VL;
				330	const ValueList &VL1 = PMul1.VL;
				331
				332	if (!AreSymmetrical(VL0, VL1))
				333	continue;
				334
				335	LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
				336	// The first elements of each vector should be loads with sexts. If we find
				337	// that its two pairs of consecutive loads, then these can be transformed
				338	// into two wider loads and the users can be replaced with DSP
				339	// intrinsics.
				340	for (unsigned x = 0; x < VL0.size(); x += 4) {
				341	auto *Ld0 = dyn_cast<LoadInst>(VL0[x]);
				342	auto *Ld1 = dyn_cast<LoadInst>(VL1[x]);
				343	auto *Ld2 = dyn_cast<LoadInst>(VL0[x+2]);
				344	auto *Ld3 = dyn_cast<LoadInst>(VL1[x+2]);
				345
				346	LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
				347	dbgs() << "\t mul1: "; VL0[x]->dump();
				348	dbgs() << "\t mul2: "; VL1[x]->dump();
				349	dbgs() << "and operands " << x + 2 << ":\n";
				350	dbgs() << "\t mul1: "; VL0[x+2]->dump();
				351	dbgs() << "\t mul2: "; VL1[x+2]->dump());
				352
				353	if (AreSequentialLoads(Ld0, Ld1, Candidates[i].VecLd) &&
				354	AreSequentialLoads(Ld2, Ld3, Candidates[i+1].VecLd)) {
				355	LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
				356	PMACPairs.push_back(std::make_pair(&PMul0, &PMul1));
				357	}
				358	}
				359	}
				360	return PMACPairs;
				361	}
				362
				363	bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
				364	PMACPairList &PMACPairs) {
				365	Instruction *Acc = Reduction.Phi;
				366	Instruction *InsertAfter = Reduction.AccIntAdd;
				367
				368	for (auto &Pair : PMACPairs) {
				369	LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
				370	dbgs() << "- "; Pair.first->Mul->dump();
				371	dbgs() << "- "; Pair.second->Mul->dump());
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	372	auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
				373	auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
				374	Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	375	InsertAfter = Acc;
				376	}
				377
				378	if (Acc != Reduction.Phi) {
				379	LLVM_DEBUG(dbgs() << "Replace Accumulate: "; Acc->dump());
				380	Reduction.AccIntAdd->replaceAllUsesWith(Acc);
				381	return true;
				382	}
				383	return false;
				384	}
				385
				386	static ReductionList MatchReductions(Function &F, Loop *TheLoop,
				387	BasicBlock *Header) {
				388	ReductionList Reductions;
				389	RecurrenceDescriptor RecDesc;
				390	const bool HasFnNoNaNAttr =
				391	F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
				392	const BasicBlock *Latch = TheLoop->getLoopLatch();
				393
				394	// We need a preheader as getIncomingValueForBlock assumes there is one.
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	395	if (!TheLoop->getLoopPreheader()) {
				396	LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	397	return Reductions;
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	398	}
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	399
				400	for (PHINode &Phi : Header->phis()) {
				401	const auto *Ty = Phi.getType();
				402	if (!Ty->isIntegerTy(32))
				403	continue;
				404
				405	const bool IsReduction =
				406	RecurrenceDescriptor::AddReductionVar(&Phi,
				407	RecurrenceDescriptor::RK_IntegerAdd,
				408	TheLoop, HasFnNoNaNAttr, RecDesc);
				409	if (!IsReduction)
				410	continue;
				411
				412	Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
				413	if (!Acc)
				414	continue;
				415
				416	Reductions.push_back(Reduction(&Phi, Acc));
				417	}
				418
				419	LLVM_DEBUG(
				420	dbgs() << "\nAccumulating integer additions (reductions) found:\n";
				421	for (auto R : Reductions) {
				422	dbgs() << "- "; R.Phi->dump();
				423	dbgs() << "-> "; R.AccIntAdd->dump();
				424	}
				425	);
				426	return Reductions;
				427	}
				428
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	429	static void AddMACCandidate(ParallelMACList &Candidates, const Instruction *Acc,
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	430	Value MulOp0, Value MulOp1, int MulOpNum) {
				431	Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
				432	LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
				433	ValueList VL;
				434	if (IsNarrowSequence<16>(MulOp0, VL) &&
				435	IsNarrowSequence<16>(MulOp1, VL)) {
				436	LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	437
				438	bool MayWriteMem = false;
				439	for (auto &V : VL) {
				440	if (dyn_cast<Instruction>(V)->mayWriteToMemory()) {
				441	MayWriteMem = true;
				442	break;
				443	}
				444	}
				445	Candidates.push_back(ParallelMAC(Mul, VL, !MayWriteMem));
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	446	}
				447	}
				448
				449	static ParallelMACList MatchParallelMACs(Reduction &R) {
				450	ParallelMACList Candidates;
				451	const Instruction *Acc = R.AccIntAdd;
				452	Value A, MulOp0, *MulOp1;
				453	LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
				454
				455	// Pattern 1: the accumulator is the RHS of the mul.
				456	while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)),
				457	m_Value(A)))){
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	458	AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	459	Acc = dyn_cast<Instruction>(A);
				460	}
				461	// Pattern 2: the accumulator is the LHS of the mul.
				462	while(match(Acc, m_Add(m_Value(A),
				463	m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	464	AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 1);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	465	Acc = dyn_cast<Instruction>(A);
				466	}
				467
				468	// The last mul in the chain has a slightly different pattern:
				469	// the mul is the first operand
				470	if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A))))
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	471	AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	472
				473	// Because we start at the bottom of the chain, and we work our way up,
				474	// the muls are added in reverse program order to the list.
				475	std::reverse(Candidates.begin(), Candidates.end());
				476	return Candidates;
				477	}
				478
				479	// Collects all instructions that are not part of the MAC chains, which is the
				480	// set of instructions that can potentially alias with the MAC operands.
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	481	static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
				482	Instructions &Writes) {
				483	for (auto &I : *Header) {
				484	if (I.mayReadFromMemory())
				485	Reads.push_back(&I);
				486	if (I.mayWriteToMemory())
				487	Writes.push_back(&I);
				488	}
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	489	}
				490
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	491	// Check whether statements in the basic block that write to memory alias with
				492	// the memory locations accessed by the MAC-chains.
				493	// TODO: we need the read statements when we accept more complicated chains.
				494	static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
				495	Instructions &Writes, ParallelMACList &MACCandidates) {
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	496	LLVM_DEBUG(dbgs() << "Alias checks:\n");
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	497	for (auto &MAC : MACCandidates) {
				498	LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump());
				499
				500	// At the moment, we allow only simple chains that only consist of reads,
				501	// accumulate their result with an integer add, and thus that don't write
				502	// memory, and simply bail if they do.
				503	if (!MAC.ReadOnly)
				504	return true;
				505
				506	// Now for all writes in the basic block, check that they don't alias with
				507	// the memory locations accessed by our MAC-chain:
				508	for (auto *I : Writes) {
				509	LLVM_DEBUG(dbgs() << "- "; I->dump());
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	510	assert(MAC.MemLocs.size() >= 2 && "expecting at least 2 memlocs");
				511	for (auto &MemLoc : MAC.MemLocs) {
				512	if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
				513	ModRefInfo::ModRef))) {
				514	LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
				515	return true;
				516	}
				517	}
				518	}
				519	}
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	520
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	521	LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
				522	return false;
				523	}
				524
				525	static bool SetMemoryLocations(ParallelMACList &Candidates) {
				526	const auto Size = MemoryLocation::UnknownSize;
				527	for (auto &C : Candidates) {
				528	// A mul has 2 operands, and a narrow op consist of sext and a load; thus
				529	// we expect at least 4 items in this operand value list.
				530	if (C.VL.size() < 4) {
				531	LLVM_DEBUG(dbgs() << "Operand list too short.\n");
				532	return false;
				533	}
				534
				535	for (unsigned i = 0; i < C.VL.size(); i += 4) {
				536	auto *LdOp0 = dyn_cast<LoadInst>(C.VL[i]);
				537	auto *LdOp1 = dyn_cast<LoadInst>(C.VL[i+2]);
				538	if (!LdOp0 \|\| !LdOp1)
				539	return false;
				540
				541	C.MemLocs.push_back(MemoryLocation(LdOp0->getPointerOperand(), Size));
				542	C.MemLocs.push_back(MemoryLocation(LdOp1->getPointerOperand(), Size));
				543	}
				544	}
				545	return true;
				546	}
				547
				548	// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
				549	// multiplications.
				550	// To use SMLAD:
				551	// 1) we first need to find integer add reduction PHIs,
				552	// 2) then from the PHI, look for this pattern:
				553	//
				554	// acc0 = phi i32 [0, %entry], [%acc1, %loop.body]
				555	// ld0 = load i16
				556	// sext0 = sext i16 %ld0 to i32
				557	// ld1 = load i16
				558	// sext1 = sext i16 %ld1 to i32
				559	// mul0 = mul %sext0, %sext1
				560	// ld2 = load i16
				561	// sext2 = sext i16 %ld2 to i32
				562	// ld3 = load i16
				563	// sext3 = sext i16 %ld3 to i32
				564	// mul1 = mul i32 %sext2, %sext3
				565	// add0 = add i32 %mul0, %acc0
				566	// acc1 = add i32 %add0, %mul1
				567	//
				568	// Which can be selected to:
				569	//
				570	// ldr.h r0
				571	// ldr.h r1
				572	// smlad r2, r0, r1, r2
				573	//
				574	// If constants are used instead of loads, these will need to be hoisted
				575	// out and into a register.
				576	//
				577	// If loop invariants are used instead of loads, these need to be packed
				578	// before the loop begins.
				579	//
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	580	bool ARMParallelDSP::MatchSMLAD(Function &F) {
				581	BasicBlock *Header = L->getHeader();
				582	LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
				583	dbgs() << "Header block:\n"; Header->dump();
				584	dbgs() << "Loop info:\n\n"; L->dump());
				585
				586	bool Changed = false;
				587	ReductionList Reductions = MatchReductions(F, L, Header);
				588
				589	for (auto &R : Reductions) {
				590	ParallelMACList MACCandidates = MatchParallelMACs(R);
				591	if (!SetMemoryLocations(MACCandidates))
				592	continue;
Sjoerd Meijer	53449da	2018-07-11 12:36:25 +0000	[diff] [blame^]	593	R.MACCandidates = MACCandidates;
				594
				595	LLVM_DEBUG(dbgs() << "MAC candidates:\n";
				596	for (auto &M : R.MACCandidates)
				597	M.Mul->dump();
				598	dbgs() << "\n";);
				599	}
				600
				601	// Collect all instructions that may read or write memory. Our alias
				602	// analysis checks bail out if any of these instructions aliases with an
				603	// instruction from the MAC-chain.
				604	Instructions Reads, Writes;
				605	AliasCandidates(Header, Reads, Writes);
				606
				607	for (auto &R : Reductions) {
				608	if (AreAliased(AA, Reads, Writes, R.MACCandidates))
				609	return false;
				610	PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
				611	Changed \|= InsertParallelMACs(R, PMACPairs);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	612	}
				613
				614	LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
				615	return Changed;
				616	}
				617
				618	static void CreateLoadIns(IRBuilder<NoFolder> &IRB, Instruction *Acc,
				619	LoadInst **VecLd) {
				620	const Type *AccTy = Acc->getType();
				621	const unsigned AddrSpace = (*VecLd)->getPointerAddressSpace();
				622
				623	Value VecPtr = IRB.CreateBitCast((VecLd)->getPointerOperand(),
				624	AccTy->getPointerTo(AddrSpace));
				625	VecLd = IRB.CreateAlignedLoad(VecPtr, (VecLd)->getAlignment());
				626	}
				627
				628	Instruction ARMParallelDSP::CreateSMLADCall(LoadInst VecLd0, LoadInst *VecLd1,
				629	Instruction *Acc,
				630	Instruction *InsertAfter) {
				631	LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n";
				632	dbgs() << "- "; VecLd0->dump();
				633	dbgs() << "- "; VecLd1->dump();
				634	dbgs() << "- "; Acc->dump());
				635
				636	IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
				637	++BasicBlock::iterator(InsertAfter));
				638
				639	// Replace the reduction chain with an intrinsic call
				640	CreateLoadIns(Builder, Acc, &VecLd0);
				641	CreateLoadIns(Builder, Acc, &VecLd1);
				642	Value* Args[] = { VecLd0, VecLd1, Acc };
				643	Function *SMLAD = Intrinsic::getDeclaration(M, Intrinsic::arm_smlad);
				644	CallInst *Call = Builder.CreateCall(SMLAD, Args);
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	645	NumSMLAD++;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	646	return Call;
				647	}
				648
				649	Pass *llvm::createARMParallelDSPPass() {
				650	return new ARMParallelDSP();
				651	}
				652
				653	char ARMParallelDSP::ID = 0;
				654
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	655	INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
Simon Pilgrim	c09b5e3	2018-06-28 18:37:16 +0000	[diff] [blame]	656	"Transform loops to use DSP intrinsics", false, false)
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	657	INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
Simon Pilgrim	c09b5e3	2018-06-28 18:37:16 +0000	[diff] [blame]	658	"Transform loops to use DSP intrinsics", false, false)