Blame - llvm/lib/Target/ARM/ARMParallelDSP.cpp - toolchain/llvm-project

blob: 3a24f74fff5c68b1e3d1602ab7d16f2abfc1d47a [file] [log] [blame]

Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	1	//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	/// \file
				11	/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
				12	/// purpose of this pass is to do some IR pattern matching to create ACLE
				13	/// DSP intrinsics, which map on these 32-bit SIMD operations.
				14	//
				15	//===----------------------------------------------------------------------===//
				16
				17	#include "llvm/ADT/SmallPtrSet.h"
				18	#include "llvm/Analysis/AliasAnalysis.h"
				19	#include "llvm/Analysis/LoopAccessAnalysis.h"
				20	#include "llvm/Analysis/LoopPass.h"
				21	#include "llvm/Analysis/LoopInfo.h"
				22	#include "llvm/IR/Instructions.h"
				23	#include "llvm/IR/NoFolder.h"
				24	#include "llvm/Transforms/Scalar.h"
				25	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
				26	#include "llvm/Transforms/Utils/LoopUtils.h"
				27	#include "llvm/Pass.h"
				28	#include "llvm/PassRegistry.h"
				29	#include "llvm/PassSupport.h"
				30	#include "llvm/Support/Debug.h"
				31	#include "llvm/IR/PatternMatch.h"
				32	#include "llvm/CodeGen/TargetPassConfig.h"
				33	#include "ARM.h"
				34	#include "ARMSubtarget.h"
				35
				36	using namespace llvm;
				37	using namespace PatternMatch;
				38
				39	#define DEBUG_TYPE "parallel-dsp"
				40
				41	namespace {
				42	struct ParallelMAC;
				43	struct Reduction;
				44
				45	using ParallelMACList = SmallVector<ParallelMAC, 8>;
				46	using ReductionList = SmallVector<Reduction, 8>;
				47	using ValueList = SmallVector<Value*, 8>;
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	48	using MemInstList = SmallVector<Instruction*, 8>;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	49	using PMACPair = std::pair<ParallelMAC,ParallelMAC>;
				50	using PMACPairList = SmallVector<PMACPair, 8>;
				51	using Instructions = SmallVector<Instruction*,16>;
				52	using MemLocList = SmallVector<MemoryLocation, 4>;
				53
				54	// 'ParallelMAC' and 'Reduction' are just some bookkeeping data structures.
				55	// 'Reduction' contains the phi-node and accumulator statement from where we
				56	// start pattern matching, and 'ParallelMAC' the multiplication
				57	// instructions that are candidates for parallel execution.
				58	struct ParallelMAC {
				59	Instruction *Mul;
				60	ValueList VL; // List of all (narrow) operands of this Mul
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	61	MemInstList VecLd; // List of all load instructions of this Mul
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	62	MemLocList MemLocs; // All memory locations read by this Mul
				63
				64	ParallelMAC(Instruction *I, ValueList &V) : Mul(I), VL(V) {};
				65	};
				66
				67	struct Reduction {
				68	PHINode *Phi; // The Phi-node from where we start
				69	// pattern matching.
				70	Instruction *AccIntAdd; // The accumulating integer add statement,
				71	// i.e, the reduction statement.
				72
				73	Reduction (PHINode P, Instruction Acc) : Phi(P), AccIntAdd(Acc) { };
				74	};
				75
				76	class ARMParallelDSP : public LoopPass {
				77	ScalarEvolution *SE;
				78	AliasAnalysis *AA;
				79	TargetLibraryInfo *TLI;
				80	DominatorTree *DT;
				81	LoopInfo *LI;
				82	Loop *L;
				83	const DataLayout *DL;
				84	Module *M;
				85
				86	bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
Fangrui Song	6816934	2018-07-03 19:12:27 +0000	[diff] [blame]	87	bool AreSequentialLoads(LoadInst Ld0, LoadInst Ld1, MemInstList &VecMem);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	88	PMACPairList CreateParallelMACPairs(ParallelMACList &Candidates);
				89	Instruction CreateSMLADCall(LoadInst VecLd0, LoadInst *VecLd1,
				90	Instruction Acc, Instruction InsertAfter);
				91
				92	/// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
				93	/// Dual performs two signed 16x16-bit multiplications. It adds the
				94	/// products to a 32-bit accumulate operand. Optionally, the instruction can
				95	/// exchange the halfwords of the second operand before performing the
				96	/// arithmetic.
				97	bool MatchSMLAD(Function &F);
				98
				99	public:
				100	static char ID;
				101
				102	ARMParallelDSP() : LoopPass(ID) { }
				103
				104	void getAnalysisUsage(AnalysisUsage &AU) const override {
				105	LoopPass::getAnalysisUsage(AU);
				106	AU.addRequired<AssumptionCacheTracker>();
				107	AU.addRequired<ScalarEvolutionWrapperPass>();
				108	AU.addRequired<AAResultsWrapperPass>();
				109	AU.addRequired<TargetLibraryInfoWrapperPass>();
				110	AU.addRequired<LoopInfoWrapperPass>();
				111	AU.addRequired<DominatorTreeWrapperPass>();
				112	AU.addRequired<TargetPassConfig>();
				113	AU.addPreserved<LoopInfoWrapperPass>();
				114	AU.setPreservesCFG();
				115	}
				116
				117	bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
				118	L = TheLoop;
				119	SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
				120	AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
				121	TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
				122	DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
				123	LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
				124	auto &TPC = getAnalysis<TargetPassConfig>();
				125
				126	BasicBlock *Header = TheLoop->getHeader();
				127	if (!Header)
				128	return false;
				129
				130	// TODO: We assume the loop header and latch to be the same block.
				131	// This is not a fundamental restriction, but lifting this would just
				132	// require more work to do the transformation and then patch up the CFG.
				133	if (Header != TheLoop->getLoopLatch()) {
				134	LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
				135	"running pass ARMParallelDSP\n");
				136	return false;
				137	}
				138
				139	Function &F = *Header->getParent();
				140	M = F.getParent();
				141	DL = &M->getDataLayout();
				142
				143	auto &TM = TPC.getTM<TargetMachine>();
				144	auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
				145
				146	if (!ST->allowsUnalignedMem()) {
				147	LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
				148	"running pass ARMParallelDSP\n");
				149	return false;
				150	}
				151
				152	if (!ST->hasDSP()) {
				153	LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
				154	"ARMParallelDSP\n");
				155	return false;
				156	}
				157
				158	LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
				159	bool Changes = false;
				160
				161	LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n\n");
				162	Changes = MatchSMLAD(F);
				163	return Changes;
				164	}
				165	};
				166	}
				167
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame^]	168	// MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
				169	// instructions, which is set to 16. So here we should collect all i8 and i16
				170	// narrow operations.
				171	// TODO: we currently only collect i16, and will support i8 later, so that's
				172	// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
				173	template<unsigned MaxBitWidth>
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	174	static bool IsNarrowSequence(Value *V, ValueList &VL) {
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame^]	175	LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	176	ConstantInt *CInt;
				177
				178	if (match(V, m_ConstantInt(CInt))) {
				179	// TODO: if a constant is used, it needs to fit within the bit width.
				180	return false;
				181	}
				182
				183	auto *I = dyn_cast<Instruction>(V);
				184	if (!I)
				185	return false;
				186
				187	Value Val, LHS, *RHS;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	188	if (match(V, m_Trunc(m_Value(Val)))) {
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame^]	189	if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
				190	return IsNarrowSequence<MaxBitWidth>(Val, VL);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	191	} else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
				192	// TODO: we need to implement sadd16/sadd8 for this, which enables to
				193	// also do the rewrite for smlad8.ll, but it is unsupported for now.
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame^]	194	LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
				195	return false;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	196	} else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame^]	197	if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
				198	LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
				199	cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
				200	return false;
				201	}
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	202
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame^]	203	if (match(Val, m_Load(m_Value()))) {
				204	LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
				205	VL.push_back(Val);
				206	VL.push_back(I);
				207	return true;
				208	}
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	209	}
Sjoerd Meijer	27be58b	2018-07-05 08:21:40 +0000	[diff] [blame^]	210	LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
				211	return false;
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	212	}
				213
				214	// Element-by-element comparison of Value lists returning true if they are
				215	// instructions with the same opcode or constants with the same value.
				216	static bool AreSymmetrical(const ValueList &VL0,
				217	const ValueList &VL1) {
				218	if (VL0.size() != VL1.size()) {
				219	LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
				220	<< VL0.size() << " != " << VL1.size() << "\n");
				221	return false;
				222	}
				223
				224	const unsigned Pairs = VL0.size();
				225	LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
				226
				227	for (unsigned i = 0; i < Pairs; ++i) {
				228	const Value *V0 = VL0[i];
				229	const Value *V1 = VL1[i];
				230	const auto *Inst0 = dyn_cast<Instruction>(V0);
				231	const auto *Inst1 = dyn_cast<Instruction>(V1);
				232
				233	LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
				234	dbgs() << "mul1: "; V0->dump();
				235	dbgs() << "mul2: "; V1->dump());
				236
				237	if (!Inst0 \|\| !Inst1)
				238	return false;
				239
				240	if (Inst0->isSameOperationAs(Inst1)) {
				241	LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
				242	continue;
				243	}
				244
				245	const APInt C0, C1;
				246	if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
				247	return false;
				248	}
				249
				250	LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
				251	return true;
				252	}
				253
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	254	template<typename MemInst>
				255	static bool AreSequentialAccesses(MemInst MemOp0, MemInst MemOp1,
				256	MemInstList &VecMem, const DataLayout &DL,
				257	ScalarEvolution &SE) {
				258	if (!MemOp0->isSimple() \|\| !MemOp1->isSimple()) {
				259	LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
				260	return false;
				261	}
				262	if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
				263	VecMem.push_back(MemOp0);
				264	VecMem.push_back(MemOp1);
				265	LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
				266	return true;
				267	}
				268	LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
				269	return false;
				270	}
				271
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	272	bool ARMParallelDSP::AreSequentialLoads(LoadInst Ld0, LoadInst Ld1,
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	273	MemInstList &VecMem) {
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	274	if (!Ld0 \|\| !Ld1)
				275	return false;
				276
				277	LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
				278	dbgs() << "Ld0:"; Ld0->dump();
				279	dbgs() << "Ld1:"; Ld1->dump();
				280	);
				281
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	282	if (!Ld0->hasOneUse() \|\| !Ld1->hasOneUse()) {
				283	LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
				284	return false;
				285	}
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	286
				287	return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, DL, SE);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	288	}
				289
				290	PMACPairList
				291	ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
				292	const unsigned Elems = Candidates.size();
				293	PMACPairList PMACPairs;
				294
				295	if (Elems < 2)
				296	return PMACPairs;
				297
				298	// TODO: for now we simply try to match consecutive pairs i and i+1.
				299	// We can compare all elements, but then we need to compare and evaluate
				300	// different solutions.
				301	for(unsigned i=0; i<Elems-1; i+=2) {
				302	ParallelMAC &PMul0 = Candidates[i];
				303	ParallelMAC &PMul1 = Candidates[i+1];
				304	const Instruction *Mul0 = PMul0.Mul;
				305	const Instruction *Mul1 = PMul1.Mul;
				306
				307	if (Mul0 == Mul1)
				308	continue;
				309
				310	LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
				311	dbgs() << "- "; Mul0->dump();
				312	dbgs() << "- "; Mul1->dump());
				313
				314	const ValueList &VL0 = PMul0.VL;
				315	const ValueList &VL1 = PMul1.VL;
				316
				317	if (!AreSymmetrical(VL0, VL1))
				318	continue;
				319
				320	LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
				321	// The first elements of each vector should be loads with sexts. If we find
				322	// that its two pairs of consecutive loads, then these can be transformed
				323	// into two wider loads and the users can be replaced with DSP
				324	// intrinsics.
				325	for (unsigned x = 0; x < VL0.size(); x += 4) {
				326	auto *Ld0 = dyn_cast<LoadInst>(VL0[x]);
				327	auto *Ld1 = dyn_cast<LoadInst>(VL1[x]);
				328	auto *Ld2 = dyn_cast<LoadInst>(VL0[x+2]);
				329	auto *Ld3 = dyn_cast<LoadInst>(VL1[x+2]);
				330
				331	LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
				332	dbgs() << "\t mul1: "; VL0[x]->dump();
				333	dbgs() << "\t mul2: "; VL1[x]->dump();
				334	dbgs() << "and operands " << x + 2 << ":\n";
				335	dbgs() << "\t mul1: "; VL0[x+2]->dump();
				336	dbgs() << "\t mul2: "; VL1[x+2]->dump());
				337
				338	if (AreSequentialLoads(Ld0, Ld1, Candidates[i].VecLd) &&
				339	AreSequentialLoads(Ld2, Ld3, Candidates[i+1].VecLd)) {
				340	LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
				341	PMACPairs.push_back(std::make_pair(&PMul0, &PMul1));
				342	}
				343	}
				344	}
				345	return PMACPairs;
				346	}
				347
				348	bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
				349	PMACPairList &PMACPairs) {
				350	Instruction *Acc = Reduction.Phi;
				351	Instruction *InsertAfter = Reduction.AccIntAdd;
				352
				353	for (auto &Pair : PMACPairs) {
				354	LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
				355	dbgs() << "- "; Pair.first->Mul->dump();
				356	dbgs() << "- "; Pair.second->Mul->dump());
Sam Parker	ffc1681	2018-07-03 12:44:16 +0000	[diff] [blame]	357	auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
				358	auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
				359	Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	360	InsertAfter = Acc;
				361	}
				362
				363	if (Acc != Reduction.Phi) {
				364	LLVM_DEBUG(dbgs() << "Replace Accumulate: "; Acc->dump());
				365	Reduction.AccIntAdd->replaceAllUsesWith(Acc);
				366	return true;
				367	}
				368	return false;
				369	}
				370
				371	static ReductionList MatchReductions(Function &F, Loop *TheLoop,
				372	BasicBlock *Header) {
				373	ReductionList Reductions;
				374	RecurrenceDescriptor RecDesc;
				375	const bool HasFnNoNaNAttr =
				376	F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
				377	const BasicBlock *Latch = TheLoop->getLoopLatch();
				378
				379	// We need a preheader as getIncomingValueForBlock assumes there is one.
				380	if (!TheLoop->getLoopPreheader())
				381	return Reductions;
				382
				383	for (PHINode &Phi : Header->phis()) {
				384	const auto *Ty = Phi.getType();
				385	if (!Ty->isIntegerTy(32))
				386	continue;
				387
				388	const bool IsReduction =
				389	RecurrenceDescriptor::AddReductionVar(&Phi,
				390	RecurrenceDescriptor::RK_IntegerAdd,
				391	TheLoop, HasFnNoNaNAttr, RecDesc);
				392	if (!IsReduction)
				393	continue;
				394
				395	Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
				396	if (!Acc)
				397	continue;
				398
				399	Reductions.push_back(Reduction(&Phi, Acc));
				400	}
				401
				402	LLVM_DEBUG(
				403	dbgs() << "\nAccumulating integer additions (reductions) found:\n";
				404	for (auto R : Reductions) {
				405	dbgs() << "- "; R.Phi->dump();
				406	dbgs() << "-> "; R.AccIntAdd->dump();
				407	}
				408	);
				409	return Reductions;
				410	}
				411
				412	static void AddCandidateMAC(ParallelMACList &Candidates, const Instruction *Acc,
				413	Value MulOp0, Value MulOp1, int MulOpNum) {
				414	Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
				415	LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
				416	ValueList VL;
				417	if (IsNarrowSequence<16>(MulOp0, VL) &&
				418	IsNarrowSequence<16>(MulOp1, VL)) {
				419	LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
				420	Candidates.push_back(ParallelMAC(Mul, VL));
				421	}
				422	}
				423
				424	static ParallelMACList MatchParallelMACs(Reduction &R) {
				425	ParallelMACList Candidates;
				426	const Instruction *Acc = R.AccIntAdd;
				427	Value A, MulOp0, *MulOp1;
				428	LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
				429
				430	// Pattern 1: the accumulator is the RHS of the mul.
				431	while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)),
				432	m_Value(A)))){
				433	AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 0);
				434	Acc = dyn_cast<Instruction>(A);
				435	}
				436	// Pattern 2: the accumulator is the LHS of the mul.
				437	while(match(Acc, m_Add(m_Value(A),
				438	m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
				439	AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 1);
				440	Acc = dyn_cast<Instruction>(A);
				441	}
				442
				443	// The last mul in the chain has a slightly different pattern:
				444	// the mul is the first operand
				445	if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A))))
				446	AddCandidateMAC(Candidates, Acc, MulOp0, MulOp1, 0);
				447
				448	// Because we start at the bottom of the chain, and we work our way up,
				449	// the muls are added in reverse program order to the list.
				450	std::reverse(Candidates.begin(), Candidates.end());
				451	return Candidates;
				452	}
				453
				454	// Collects all instructions that are not part of the MAC chains, which is the
				455	// set of instructions that can potentially alias with the MAC operands.
				456	static Instructions AliasCandidates(BasicBlock *Header,
				457	ParallelMACList &MACCandidates) {
				458	Instructions Aliases;
				459	auto IsMACCandidate = [] (Instruction *I, ParallelMACList &MACCandidates) {
				460	for (auto &MAC : MACCandidates)
				461	for (auto *Val : MAC.VL)
				462	if (I == MAC.Mul \|\| Val == I)
				463	return true;
				464	return false;
				465	};
				466
				467	std::for_each(Header->begin(), Header->end(),
				468	[&Aliases, &MACCandidates, &IsMACCandidate] (Instruction &I) {
				469	if (I.mayReadOrWriteMemory() &&
				470	!IsMACCandidate(&I, MACCandidates))
				471	Aliases.push_back(&I); });
				472	return Aliases;
				473	}
				474
				475	// This compares all instructions from the "alias candidates" set, i.e., all
				476	// instructions that are not part of the MAC-chain, with all instructions in
				477	// the MAC candidate set, to see if instructions are aliased.
				478	static bool AreAliased(AliasAnalysis *AA, Instructions AliasCandidates,
				479	ParallelMACList &MACCandidates) {
				480	LLVM_DEBUG(dbgs() << "Alias checks:\n");
				481	for (auto *I : AliasCandidates) {
				482	LLVM_DEBUG(dbgs() << "- "; I->dump());
				483	for (auto &MAC : MACCandidates) {
				484	LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump());
				485	assert(MAC.MemLocs.size() >= 2 && "expecting at least 2 memlocs");
				486	for (auto &MemLoc : MAC.MemLocs) {
				487	if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
				488	ModRefInfo::ModRef))) {
				489	LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
				490	return true;
				491	}
				492	}
				493	}
				494	}
				495	LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
				496	return false;
				497	}
				498
				499	static bool SetMemoryLocations(ParallelMACList &Candidates) {
				500	const auto Size = MemoryLocation::UnknownSize;
				501	for (auto &C : Candidates) {
				502	// A mul has 2 operands, and a narrow op consist of sext and a load; thus
				503	// we expect at least 4 items in this operand value list.
				504	if (C.VL.size() < 4) {
				505	LLVM_DEBUG(dbgs() << "Operand list too short.\n");
				506	return false;
				507	}
				508
				509	for (unsigned i = 0; i < C.VL.size(); i += 4) {
				510	auto *LdOp0 = dyn_cast<LoadInst>(C.VL[i]);
				511	auto *LdOp1 = dyn_cast<LoadInst>(C.VL[i+2]);
				512	if (!LdOp0 \|\| !LdOp1)
				513	return false;
				514
				515	C.MemLocs.push_back(MemoryLocation(LdOp0->getPointerOperand(), Size));
				516	C.MemLocs.push_back(MemoryLocation(LdOp1->getPointerOperand(), Size));
				517	}
				518	}
				519	return true;
				520	}
				521
				522	// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
				523	// multiplications.
				524	// To use SMLAD:
				525	// 1) we first need to find integer add reduction PHIs,
				526	// 2) then from the PHI, look for this pattern:
				527	//
				528	// acc0 = phi i32 [0, %entry], [%acc1, %loop.body]
				529	// ld0 = load i16
				530	// sext0 = sext i16 %ld0 to i32
				531	// ld1 = load i16
				532	// sext1 = sext i16 %ld1 to i32
				533	// mul0 = mul %sext0, %sext1
				534	// ld2 = load i16
				535	// sext2 = sext i16 %ld2 to i32
				536	// ld3 = load i16
				537	// sext3 = sext i16 %ld3 to i32
				538	// mul1 = mul i32 %sext2, %sext3
				539	// add0 = add i32 %mul0, %acc0
				540	// acc1 = add i32 %add0, %mul1
				541	//
				542	// Which can be selected to:
				543	//
				544	// ldr.h r0
				545	// ldr.h r1
				546	// smlad r2, r0, r1, r2
				547	//
				548	// If constants are used instead of loads, these will need to be hoisted
				549	// out and into a register.
				550	//
				551	// If loop invariants are used instead of loads, these need to be packed
				552	// before the loop begins.
				553	//
				554	// Can only be enabled for cores which support unaligned accesses.
				555	//
				556	bool ARMParallelDSP::MatchSMLAD(Function &F) {
				557	BasicBlock *Header = L->getHeader();
				558	LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
				559	dbgs() << "Header block:\n"; Header->dump();
				560	dbgs() << "Loop info:\n\n"; L->dump());
				561
				562	bool Changed = false;
				563	ReductionList Reductions = MatchReductions(F, L, Header);
				564
				565	for (auto &R : Reductions) {
				566	ParallelMACList MACCandidates = MatchParallelMACs(R);
				567	if (!SetMemoryLocations(MACCandidates))
				568	continue;
				569	Instructions Aliases = AliasCandidates(Header, MACCandidates);
				570	if (AreAliased(AA, Aliases, MACCandidates))
				571	continue;
				572	PMACPairList PMACPairs = CreateParallelMACPairs(MACCandidates);
				573	Changed = InsertParallelMACs(R, PMACPairs) \|\| Changed;
				574	}
				575
				576	LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
				577	return Changed;
				578	}
				579
				580	static void CreateLoadIns(IRBuilder<NoFolder> &IRB, Instruction *Acc,
				581	LoadInst **VecLd) {
				582	const Type *AccTy = Acc->getType();
				583	const unsigned AddrSpace = (*VecLd)->getPointerAddressSpace();
				584
				585	Value VecPtr = IRB.CreateBitCast((VecLd)->getPointerOperand(),
				586	AccTy->getPointerTo(AddrSpace));
				587	VecLd = IRB.CreateAlignedLoad(VecPtr, (VecLd)->getAlignment());
				588	}
				589
				590	Instruction ARMParallelDSP::CreateSMLADCall(LoadInst VecLd0, LoadInst *VecLd1,
				591	Instruction *Acc,
				592	Instruction *InsertAfter) {
				593	LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n";
				594	dbgs() << "- "; VecLd0->dump();
				595	dbgs() << "- "; VecLd1->dump();
				596	dbgs() << "- "; Acc->dump());
				597
				598	IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
				599	++BasicBlock::iterator(InsertAfter));
				600
				601	// Replace the reduction chain with an intrinsic call
				602	CreateLoadIns(Builder, Acc, &VecLd0);
				603	CreateLoadIns(Builder, Acc, &VecLd1);
				604	Value* Args[] = { VecLd0, VecLd1, Acc };
				605	Function *SMLAD = Intrinsic::getDeclaration(M, Intrinsic::arm_smlad);
				606	CallInst *Call = Builder.CreateCall(SMLAD, Args);
				607	return Call;
				608	}
				609
				610	Pass *llvm::createARMParallelDSPPass() {
				611	return new ARMParallelDSP();
				612	}
				613
				614	char ARMParallelDSP::ID = 0;
				615
				616	INITIALIZE_PASS_BEGIN(ARMParallelDSP, "parallel-dsp",
Simon Pilgrim	c09b5e3	2018-06-28 18:37:16 +0000	[diff] [blame]	617	"Transform loops to use DSP intrinsics", false, false)
Sjoerd Meijer	c89ca55	2018-06-28 12:55:29 +0000	[diff] [blame]	618	INITIALIZE_PASS_END(ARMParallelDSP, "parallel-dsp",
Simon Pilgrim	c09b5e3	2018-06-28 18:37:16 +0000	[diff] [blame]	619	"Transform loops to use DSP intrinsics", false, false)