Blame - llvm/lib/Target/ARM/ARMParallelDSP.cpp - toolchain/llvm-project

blob: 660b6c8f58088dc39bd3eae19a44f9f639f5fa7e [file] [log] [blame]

Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	1	//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	/// \file
/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
/// purpose of this pass is to do some IR pattern matching to create ACLE
/// DSP intrinsics, which map onto these 32-bit SIMD operations.
				14	//
				15	//===----------------------------------------------------------------------===//
				16
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	17	#include "llvm/ADT/Statistic.h"
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	18	#include "llvm/ADT/SmallPtrSet.h"
				19	#include "llvm/Analysis/AliasAnalysis.h"
				20	#include "llvm/Analysis/LoopAccessAnalysis.h"
				21	#include "llvm/Analysis/LoopPass.h"
				22	#include "llvm/Analysis/LoopInfo.h"
				23	#include "llvm/IR/Instructions.h"
				24	#include "llvm/IR/NoFolder.h"
				25	#include "llvm/Transforms/Scalar.h"
				26	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
				27	#include "llvm/Transforms/Utils/LoopUtils.h"
				28	#include "llvm/Pass.h"
				29	#include "llvm/PassRegistry.h"
				30	#include "llvm/PassSupport.h"
				31	#include "llvm/Support/Debug.h"
				32	#include "llvm/IR/PatternMatch.h"
				33	#include "llvm/CodeGen/TargetPassConfig.h"
				34	#include "ARM.h"
				35	#include "ARMSubtarget.h"
				36
				37	using namespace llvm;
				38	using namespace PatternMatch;
				39
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	40	#define DEBUG_TYPE "arm-parallel-dsp"
				41
				42	STATISTIC(NumSMLAD , "Number of smlad instructions generated");
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	43
				44	namespace {
				45	struct ParallelMAC;
				46	struct Reduction;
				47
				48	using ParallelMACList = SmallVector<ParallelMAC, 8>;
				49	using ReductionList = SmallVector<Reduction, 8>;
				50	using ValueList = SmallVector<Value*, 8>;
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	51	using MemInstList = SmallVector<Instruction*, 8>;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	52	using PMACPair = std::pair<ParallelMAC,ParallelMAC>;
				53	using PMACPairList = SmallVector<PMACPair, 8>;
				54	using Instructions = SmallVector<Instruction*,16>;
				55	using MemLocList = SmallVector<MemoryLocation, 4>;
				56
				57	// 'ParallelMAC' and 'Reduction' are just some bookkeeping data structures.
				58	// 'Reduction' contains the phi-node and accumulator statement from where we
				59	// start pattern matching, and 'ParallelMAC' the multiplication
				60	// instructions that are candidates for parallel execution.
				61	struct ParallelMAC {
				62	Instruction *Mul;
				63	ValueList VL; // List of all (narrow) operands of this Mul
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	64	MemInstList VecLd; // List of all load instructions of this Mul
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	65	MemLocList MemLocs; // All memory locations read by this Mul
				66
				67	ParallelMAC(Instruction *I, ValueList &V) : Mul(I), VL(V) {};
				68	};
				69
				70	struct Reduction {
				71	PHINode *Phi; // The Phi-node from where we start
				72	// pattern matching.
				73	Instruction *AccIntAdd; // The accumulating integer add statement,
				74	// i.e, the reduction statement.
				75
				76	Reduction (PHINode P, Instruction Acc) : Phi(P), AccIntAdd(Acc) { };
				77	};
				78
				79	class ARMParallelDSP : public LoopPass {
				80	ScalarEvolution *SE;
				81	AliasAnalysis *AA;
				82	TargetLibraryInfo *TLI;
				83	DominatorTree *DT;
				84	LoopInfo *LI;
				85	Loop *L;
				86	const DataLayout *DL;
				87	Module *M;
				88
				89	bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
Fangrui Song	6816934	2018-07-03 19:12:27 +0000	[diff] [blame]	90	bool AreSequentialLoads(LoadInst Ld0, LoadInst Ld1, MemInstList &VecMem);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	91	PMACPairList CreateParallelMACPairs(ParallelMACList &Candidates);
				92	Instruction CreateSMLADCall(LoadInst VecLd0, LoadInst *VecLd1,
				93	Instruction Acc, Instruction InsertAfter);
				94
				95	/// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
				96	/// Dual performs two signed 16x16-bit multiplications. It adds the
				97	/// products to a 32-bit accumulate operand. Optionally, the instruction can
				98	/// exchange the halfwords of the second operand before performing the
				99	/// arithmetic.
				100	bool MatchSMLAD(Function &F);
				101
				102	public:
				103	static char ID;
				104
				105	ARMParallelDSP() : LoopPass(ID) { }
				106
				107	void getAnalysisUsage(AnalysisUsage &AU) const override {
				108	LoopPass::getAnalysisUsage(AU);
				109	AU.addRequired<AssumptionCacheTracker>();
				110	AU.addRequired<ScalarEvolutionWrapperPass>();
				111	AU.addRequired<AAResultsWrapperPass>();
				112	AU.addRequired<TargetLibraryInfoWrapperPass>();
				113	AU.addRequired<LoopInfoWrapperPass>();
				114	AU.addRequired<DominatorTreeWrapperPass>();
				115	AU.addRequired<TargetPassConfig>();
				116	AU.addPreserved<LoopInfoWrapperPass>();
				117	AU.setPreservesCFG();
				118	}
				119
				120	bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
				121	L = TheLoop;
				122	SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
				123	AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
				124	TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
				125	DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
				126	LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
				127	auto &TPC = getAnalysis<TargetPassConfig>();
				128
				129	BasicBlock *Header = TheLoop->getHeader();
				130	if (!Header)
				131	return false;
				132
				133	// TODO: We assume the loop header and latch to be the same block.
				134	// This is not a fundamental restriction, but lifting this would just
				135	// require more work to do the transformation and then patch up the CFG.
				136	if (Header != TheLoop->getLoopLatch()) {
				137	LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
				138	"running pass ARMParallelDSP\n");
				139	return false;
				140	}
				141
				142	Function &F = *Header->getParent();
				143	M = F.getParent();
				144	DL = &M->getDataLayout();
				145
				146	auto &TM = TPC.getTM<TargetMachine>();
				147	auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
				148
				149	if (!ST->allowsUnalignedMem()) {
				150	LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
				151	"running pass ARMParallelDSP\n");
				152	return false;
				153	}
				154
				155	if (!ST->hasDSP()) {
				156	LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
				157	"ARMParallelDSP\n");
				158	return false;
				159	}
				160
				161	LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
				162	bool Changes = false;
				163
				164	LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n\n");
				165	Changes = MatchSMLAD(F);
				166	return Changes;
				167	}
				168	};
				169	}
				170
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	171	// MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
				172	// instructions, which is set to 16. So here we should collect all i8 and i16
				173	// narrow operations.
				174	// TODO: we currently only collect i16, and will support i8 later, so that's
				175	// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
				176	template<unsigned MaxBitWidth>
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	177	static bool IsNarrowSequence(Value *V, ValueList &VL) {
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	178	LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	179	ConstantInt *CInt;
				180
				181	if (match(V, m_ConstantInt(CInt))) {
				182	// TODO: if a constant is used, it needs to fit within the bit width.
				183	return false;
				184	}
				185
				186	auto *I = dyn_cast<Instruction>(V);
				187	if (!I)
				188	return false;
				189
				190	Value Val, LHS, *RHS;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	191	if (match(V, m_Trunc(m_Value(Val)))) {
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	192	if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
				193	return IsNarrowSequence<MaxBitWidth>(Val, VL);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	194	} else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
				195	// TODO: we need to implement sadd16/sadd8 for this, which enables to
				196	// also do the rewrite for smlad8.ll, but it is unsupported for now.
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	197	LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
				198	return false;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	199	} else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	200	if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
				201	LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
				202	cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
				203	return false;
				204	}
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	205
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	206	if (match(Val, m_Load(m_Value()))) {
				207	LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
				208	VL.push_back(Val);
				209	VL.push_back(I);
				210	return true;
				211	}
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	212	}
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame]	213	LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
				214	return false;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	215	}
				216
				217	// Element-by-element comparison of Value lists returning true if they are
				218	// instructions with the same opcode or constants with the same value.
				219	static bool AreSymmetrical(const ValueList &VL0,
				220	const ValueList &VL1) {
				221	if (VL0.size() != VL1.size()) {
				222	LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
				223	<< VL0.size() << " != " << VL1.size() << "\n");
				224	return false;
				225	}
				226
				227	const unsigned Pairs = VL0.size();
				228	LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
				229
				230	for (unsigned i = 0; i < Pairs; ++i) {
				231	const Value *V0 = VL0[i];
				232	const Value *V1 = VL1[i];
				233	const auto *Inst0 = dyn_cast<Instruction>(V0);
				234	const auto *Inst1 = dyn_cast<Instruction>(V1);
				235
				236	LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
				237	dbgs() << "mul1: "; V0->dump();
				238	dbgs() << "mul2: "; V1->dump());
				239
				240	if (!Inst0 \|\| !Inst1)
				241	return false;
				242
				243	if (Inst0->isSameOperationAs(Inst1)) {
				244	LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
				245	continue;
				246	}
				247
				248	const APInt C0, C1;
				249	if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
				250	return false;
				251	}
				252
				253	LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
				254	return true;
				255	}
				256
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	257	template<typename MemInst>
				258	static bool AreSequentialAccesses(MemInst MemOp0, MemInst MemOp1,
				259	MemInstList &VecMem, const DataLayout &DL,
				260	ScalarEvolution &SE) {
				261	if (!MemOp0->isSimple() \|\| !MemOp1->isSimple()) {
				262	LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
				263	return false;
				264	}
				265	if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
				266	VecMem.push_back(MemOp0);
				267	VecMem.push_back(MemOp1);
				268	LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
				269	return true;
				270	}
				271	LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
				272	return false;
				273	}
				274
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	275	bool ARMParallelDSP::AreSequentialLoads(LoadInst Ld0, LoadInst Ld1,
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	276	MemInstList &VecMem) {
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	277	if (!Ld0 \|\| !Ld1)
				278	return false;
				279
				280	LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
				281	dbgs() << "Ld0:"; Ld0->dump();
				282	dbgs() << "Ld1:"; Ld1->dump();
				283	);
				284
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	285	if (!Ld0->hasOneUse() \|\| !Ld1->hasOneUse()) {
				286	LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
				287	return false;
				288	}
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	289
				290	return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, DL, SE);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	291	}
				292
				293	PMACPairList
				294	ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
				295	const unsigned Elems = Candidates.size();
				296	PMACPairList PMACPairs;
				297
				298	if (Elems < 2)
				299	return PMACPairs;
				300
				301	// TODO: for now we simply try to match consecutive pairs i and i+1.
				302	// We can compare all elements, but then we need to compare and evaluate
				303	// different solutions.
				304	for(unsigned i=0; i<Elems-1; i+=2) {
				305	ParallelMAC &PMul0 = Candidates[i];
				306	ParallelMAC &PMul1 = Candidates[i+1];
				307	const Instruction *Mul0 = PMul0.Mul;
				308	const Instruction *Mul1 = PMul1.Mul;
				309
				310	if (Mul0 == Mul1)
				311	continue;
				312
				313	LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
				314	dbgs() << "- "; Mul0->dump();
				315	dbgs() << "- "; Mul1->dump());
				316
				317	const ValueList &VL0 = PMul0.VL;
				318	const ValueList &VL1 = PMul1.VL;
				319
				320	if (!AreSymmetrical(VL0, VL1))
				321	continue;
				322
				323	LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
				324	// The first elements of each vector should be loads with sexts. If we find
				325	// that its two pairs of consecutive loads, then these can be transformed
				326	// into two wider loads and the users can be replaced with DSP
				327	// intrinsics.
				328	for (unsigned x = 0; x < VL0.size(); x += 4) {
				329	auto *Ld0 = dyn_cast<LoadInst>(VL0[x]);
				330	auto *Ld1 = dyn_cast<LoadInst>(VL1[x]);
				331	auto *Ld2 = dyn_cast<LoadInst>(VL0[x+2]);
				332	auto *Ld3 = dyn_cast<LoadInst>(VL1[x+2]);
				333
				334	LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
				335	dbgs() << "\t mul1: "; VL0[x]->dump();
				336	dbgs() << "\t mul2: "; VL1[x]->dump();
				337	dbgs() << "and operands " << x + 2 << ":\n";
				338	dbgs() << "\t mul1: "; VL0[x+2]->dump();
				339	dbgs() << "\t mul2: "; VL1[x+2]->dump());
				340
				341	if (AreSequentialLoads(Ld0, Ld1, Candidates[i].VecLd) &&
				342	AreSequentialLoads(Ld2, Ld3, Candidates[i+1].VecLd)) {
				343	LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
				344	PMACPairs.push_back(std::make_pair(&PMul0, &PMul1));
				345	}
				346	}
				347	}
				348	return PMACPairs;
				349	}
				350
				351	bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
				352	PMACPairList &PMACPairs) {
				353	Instruction *Acc = Reduction.Phi;
				354	Instruction *InsertAfter = Reduction.AccIntAdd;
				355
				356	for (auto &Pair : PMACPairs) {
				357	LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
				358	dbgs() << "- "; Pair.first->Mul->dump();
				359	dbgs() << "- "; Pair.second->Mul->dump());
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	360	auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
				361	auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
				362	Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	363	InsertAfter = Acc;
				364	}
				365
				366	if (Acc != Reduction.Phi) {
				367	LLVM_DEBUG(dbgs() << "Replace Accumulate: "; Acc->dump());
				368	Reduction.AccIntAdd->replaceAllUsesWith(Acc);
				369	return true;
				370	}
				371	return false;
				372	}
				373
				374	static ReductionList MatchReductions(Function &F, Loop *TheLoop,
				375	BasicBlock *Header) {
				376	ReductionList Reductions;
				377	RecurrenceDescriptor RecDesc;
				378	const bool HasFnNoNaNAttr =
				379	F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
				380	const BasicBlock *Latch = TheLoop->getLoopLatch();
				381
				382	// We need a preheader as getIncomingValueForBlock assumes there is one.
				383	if (!TheLoop->getLoopPreheader())
				384	return Reductions;
				385
				386	for (PHINode &Phi : Header->phis()) {
				387	const auto *Ty = Phi.getType();
				388	if (!Ty->isIntegerTy(32))
				389	continue;
				390
				391	const bool IsReduction =
				392	RecurrenceDescriptor::AddReductionVar(&Phi,
				393	RecurrenceDescriptor::RK_IntegerAdd,
				394	TheLoop, HasFnNoNaNAttr, RecDesc);
				395	if (!IsReduction)
				396	continue;
				397
				398	Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
				399	if (!Acc)
				400	continue;
				401
				402	Reductions.push_back(Reduction(&Phi, Acc));
				403	}
				404
				405	LLVM_DEBUG(
				406	dbgs() << "\nAccumulating integer additions (reductions) found:\n";
				407	for (auto R : Reductions) {
				408	dbgs() << "- "; R.Phi->dump();
				409	dbgs() << "-> "; R.AccIntAdd->dump();
				410	}
				411	);
				412	return Reductions;
				413	}
				414
				415	static void AddCandidateMAC(ParallelMACList &Candidates, const Instruction *Acc,
				416	Value MulOp0, Value MulOp1, int MulOpNum) {
				417	Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
				418	LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
				419	ValueList VL;
				420	if (IsNarrowSequence<16>(MulOp0, VL) &&
				421	IsNarrowSequence<16>(MulOp1, VL)) {
				422	LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
				423	Candidates.push_back(ParallelMAC(Mul, VL));
				424	}
				425	}
				426
				427	static ParallelMACList MatchParallelMACs(Reduction &R) {
				428	ParallelMACList Candidates;
				429	const Instruction *Acc = R.AccIntAdd;
				430	Value A, MulOp0, *MulOp1;
				431	LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
				432
				433	// Pattern 1: the accumulator is the RHS of the mul.
				434	while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)),
				435	m_Value(A)))){
				436	AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 0);
				437	Acc = dyn_cast<Instruction>(A);
				438	}
				439	// Pattern 2: the accumulator is the LHS of the mul.
				440	while(match(Acc, m_Add(m_Value(A),
				441	m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
				442	AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 1);
				443	Acc = dyn_cast<Instruction>(A);
				444	}
				445
				446	// The last mul in the chain has a slightly different pattern:
				447	// the mul is the first operand
				448	if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A))))
				449	AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 0);
				450
				451	// Because we start at the bottom of the chain, and we work our way up,
				452	// the muls are added in reverse program order to the list.
				453	std::reverse(Candidates.begin(), Candidates.end());
				454	return Candidates;
				455	}
				456
				457	// Collects all instructions that are not part of the MAC chains, which is the
				458	// set of instructions that can potentially alias with the MAC operands.
				459	static Instructions AliasCandidates(BasicBlock *Header,
				460	ParallelMACList &MACCandidates) {
				461	Instructions Aliases;
				462	auto IsMACCandidate = [] (Instruction *I, ParallelMACList &MACCandidates) {
				463	for (auto &MAC : MACCandidates)
				464	for (auto *Val : MAC.VL)
				465	if (I == MAC.Mul \|\| Val == I)
				466	return true;
				467	return false;
				468	};
				469
				470	std::for_each(Header->begin(), Header->end(),
				471	[&Aliases, &MACCandidates, &IsMACCandidate] (Instruction &I) {
				472	if (I.mayReadOrWriteMemory() &&
				473	!IsMACCandidate(&I, MACCandidates))
				474	Aliases.push_back(&I); });
				475	return Aliases;
				476	}
				477
				478	// This compares all instructions from the "alias candidates" set, i.e., all
				479	// instructions that are not part of the MAC-chain, with all instructions in
				480	// the MAC candidate set, to see if instructions are aliased.
				481	static bool AreAliased(AliasAnalysis *AA, Instructions AliasCandidates,
				482	ParallelMACList &MACCandidates) {
				483	LLVM_DEBUG(dbgs() << "Alias checks:\n");
				484	for (auto *I : AliasCandidates) {
				485	LLVM_DEBUG(dbgs() << "- "; I->dump());
				486	for (auto &MAC : MACCandidates) {
				487	LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump());
				488	assert(MAC.MemLocs.size() >= 2 && "expecting at least 2 memlocs");
				489	for (auto &MemLoc : MAC.MemLocs) {
				490	if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
				491	ModRefInfo::ModRef))) {
				492	LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
				493	return true;
				494	}
				495	}
				496	}
				497	}
				498	LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
				499	return false;
				500	}
				501
				502	static bool SetMemoryLocations(ParallelMACList &Candidates) {
				503	const auto Size = MemoryLocation::UnknownSize;
				504	for (auto &C : Candidates) {
				505	// A mul has 2 operands, and a narrow op consist of sext and a load; thus
				506	// we expect at least 4 items in this operand value list.
				507	if (C.VL.size() < 4) {
				508	LLVM_DEBUG(dbgs() << "Operand list too short.\n");
				509	return false;
				510	}
				511
				512	for (unsigned i = 0; i < C.VL.size(); i += 4) {
				513	auto *LdOp0 = dyn_cast<LoadInst>(C.VL[i]);
				514	auto *LdOp1 = dyn_cast<LoadInst>(C.VL[i+2]);
				515	if (!LdOp0 \|\| !LdOp1)
				516	return false;
				517
				518	C.MemLocs.push_back(MemoryLocation(LdOp0->getPointerOperand(), Size));
				519	C.MemLocs.push_back(MemoryLocation(LdOp1->getPointerOperand(), Size));
				520	}
				521	}
				522	return true;
				523	}
				524
				525	// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
				526	// multiplications.
				527	// To use SMLAD:
				528	// 1) we first need to find integer add reduction PHIs,
				529	// 2) then from the PHI, look for this pattern:
				530	//
				531	// acc0 = phi i32 [0, %entry], [%acc1, %loop.body]
				532	// ld0 = load i16
				533	// sext0 = sext i16 %ld0 to i32
				534	// ld1 = load i16
				535	// sext1 = sext i16 %ld1 to i32
				536	// mul0 = mul %sext0, %sext1
				537	// ld2 = load i16
				538	// sext2 = sext i16 %ld2 to i32
				539	// ld3 = load i16
				540	// sext3 = sext i16 %ld3 to i32
				541	// mul1 = mul i32 %sext2, %sext3
				542	// add0 = add i32 %mul0, %acc0
				543	// acc1 = add i32 %add0, %mul1
				544	//
				545	// Which can be selected to:
				546	//
				547	// ldr.h r0
				548	// ldr.h r1
				549	// smlad r2, r0, r1, r2
				550	//
				551	// If constants are used instead of loads, these will need to be hoisted
				552	// out and into a register.
				553	//
				554	// If loop invariants are used instead of loads, these need to be packed
				555	// before the loop begins.
				556	//
				557	// Can only be enabled for cores which support unaligned accesses.
				558	//
				559	bool ARMParallelDSP::MatchSMLAD(Function &F) {
				560	BasicBlock *Header = L->getHeader();
				561	LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
				562	dbgs() << "Header block:\n"; Header->dump();
				563	dbgs() << "Loop info:\n\n"; L->dump());
				564
				565	bool Changed = false;
				566	ReductionList Reductions = MatchReductions(F, L, Header);
				567
				568	for (auto &R : Reductions) {
				569	ParallelMACList MACCandidates = MatchParallelMACs(R);
				570	if (!SetMemoryLocations(MACCandidates))
				571	continue;
				572	Instructions Aliases = AliasCandidates(Header, MACCandidates);
				573	if (AreAliased(AA, Aliases, MACCandidates))
				574	continue;
				575	PMACPairList PMACPairs = CreateParallelMACPairs(MACCandidates);
				576	Changed = InsertParallelMACs(R, PMACPairs) \|\| Changed;
				577	}
				578
				579	LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
				580	return Changed;
				581	}
				582
				583	static void CreateLoadIns(IRBuilder<NoFolder> &IRB, Instruction *Acc,
				584	LoadInst **VecLd) {
				585	const Type *AccTy = Acc->getType();
				586	const unsigned AddrSpace = (*VecLd)->getPointerAddressSpace();
				587
				588	Value VecPtr = IRB.CreateBitCast((VecLd)->getPointerOperand(),
				589	AccTy->getPointerTo(AddrSpace));
				590	VecLd = IRB.CreateAlignedLoad(VecPtr, (VecLd)->getAlignment());
				591	}
				592
				593	Instruction ARMParallelDSP::CreateSMLADCall(LoadInst VecLd0, LoadInst *VecLd1,
				594	Instruction *Acc,
				595	Instruction *InsertAfter) {
				596	LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n";
				597	dbgs() << "- "; VecLd0->dump();
				598	dbgs() << "- "; VecLd1->dump();
				599	dbgs() << "- "; Acc->dump());
				600
				601	IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
				602	++BasicBlock::iterator(InsertAfter));
				603
				604	// Replace the reduction chain with an intrinsic call
				605	CreateLoadIns(Builder, Acc, &VecLd0);
				606	CreateLoadIns(Builder, Acc, &VecLd1);
				607	Value* Args[] = { VecLd0, VecLd1, Acc };
				608	Function *SMLAD = Intrinsic::getDeclaration(M, Intrinsic::arm_smlad);
				609	CallInst *Call = Builder.CreateCall(SMLAD, Args);
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	610	NumSMLAD++;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	611	return Call;
				612	}
				613
				614	Pass *llvm::createARMParallelDSPPass() {
				615	return new ARMParallelDSP();
				616	}
				617
				618	char ARMParallelDSP::ID = 0;
				619
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	620	INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
Simon Pilgrim	c09b5e3	2018-06-28 18:37:16 +0000	[diff] [blame]	621	"Transform loops to use DSP intrinsics", false, false)
Sjoerd Meijer	b3e06fa	2018-07-06 14:47:09 +0000	[diff] [blame]	622	INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
Simon Pilgrim	c09b5e3	2018-06-28 18:37:16 +0000	[diff] [blame]	623	"Transform loops to use DSP intrinsics", false, false)