//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
/// purpose of this pass is to do some IR pattern matching to create ACLE
/// DSP intrinsics, which map onto these 32-bit SIMD operations.
/// This pass runs only when unaligned accesses are supported/enabled.
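///
/// As a rough illustration (a hypothetical example, not code taken from this
/// pass), given int16_t arrays a and b, a reduction such as:
///
///   int32_t acc = 0;
///   for (int i = 0; i < N; i += 2)
///     acc += (int32_t)a[i] * b[i] + (int32_t)a[i + 1] * b[i + 1];
///
/// can have each pair of 16-bit multiplies folded into a single smlad.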
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
#include "llvm/Support/Debug.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "ARM.h"
#include "ARMSubtarget.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "arm-parallel-dsp"

STATISTIC(NumSMLAD, "Number of smlad instructions generated");

static cl::opt<bool>
DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
                   cl::desc("Disable the ARM Parallel DSP pass"));

namespace {
  struct MulCandidate;
  class Reduction;

  using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
  using MemInstList = SmallVectorImpl<LoadInst*>;
  using MulPairList = SmallVector<std::pair<MulCandidate*, MulCandidate*>, 8>;

  // 'MulCandidate' holds the multiplication instructions that are candidates
  // for parallel execution.
  struct MulCandidate {
    Instruction *Root;
    Value* LHS;
    Value* RHS;
    bool Exchange = false;
    bool ReadOnly = true;
    SmallVector<LoadInst*, 2> VecLd;    // Container for loads to widen.

    MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
      Root(I), LHS(lhs), RHS(rhs) { }

    bool HasTwoLoadInputs() const {
      return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
    }

    LoadInst *getBaseLoad() const {
      return VecLd.front();
    }
  };

  /// Represent a sequence of multiply-accumulate operations with the aim of
  /// performing the multiplications in parallel.
  class Reduction {
    Instruction *Root = nullptr;
    Value *Acc = nullptr;
    MulCandList Muls;
    MulPairList MulPairs;
    SmallPtrSet<Instruction*, 4> Adds;

  public:
    Reduction() = delete;

    Reduction(Instruction *Add) : Root(Add) { }

    /// Record an Add instruction that is a part of this reduction.
    void InsertAdd(Instruction *I) { Adds.insert(I); }

    /// Record a MulCandidate, rooted at a Mul instruction, that is a part of
    /// this reduction.
    void InsertMul(Instruction *I, Value *LHS, Value *RHS) {
      Muls.push_back(make_unique<MulCandidate>(I, LHS, RHS));
    }

    /// Add the incoming accumulator value; returns true if a value had not
    /// already been added. Returning false signals to the user that this
    /// reduction already has a value to initialise the accumulator.
    bool InsertAcc(Value *V) {
      if (Acc)
        return false;
      Acc = V;
      return true;
    }

    /// Set two MulCandidates, rooted at muls, that can be executed as a single
    /// parallel operation.
    void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1) {
      MulPairs.push_back(std::make_pair(Mul0, Mul1));
    }

    /// Return true if enough mul operations are found that can be executed in
    /// parallel.
    bool CreateParallelPairs();

    /// Return the add instruction which is the root of the reduction.
    Instruction *getRoot() { return Root; }

    bool is64Bit() const { return Root->getType()->isIntegerTy(64); }

    /// Return the incoming value to be accumulated. This may be null.
    Value *getAccumulator() { return Acc; }

    /// Return the set of adds that comprise the reduction.
    SmallPtrSetImpl<Instruction*> &getAdds() { return Adds; }

    /// Return the MulCandidates, rooted at mul instructions, that comprise
    /// the reduction.
    MulCandList &getMuls() { return Muls; }

    /// Return the MulCandidates, rooted at mul instructions, that have been
    /// paired for parallel execution.
    MulPairList &getMulPairs() { return MulPairs; }

    /// To finalise, replace the uses of the root with the intrinsic call.
    void UpdateRoot(Instruction *SMLAD) {
      Root->replaceAllUsesWith(SMLAD);
    }
  };

  class WidenedLoad {
    LoadInst *NewLd = nullptr;
    SmallVector<LoadInst*, 4> Loads;

  public:
    WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
      : NewLd(Wide) {
      for (auto *I : Lds)
        Loads.push_back(I);
    }
    LoadInst *getLoad() {
      return NewLd;
    }
  };

  class ARMParallelDSP : public FunctionPass {
    ScalarEvolution *SE;
    AliasAnalysis *AA;
    TargetLibraryInfo *TLI;
    DominatorTree *DT;
    const DataLayout *DL;
    Module *M;
    std::map<LoadInst*, LoadInst*> LoadPairs;
    SmallPtrSet<LoadInst*, 4> OffsetLoads;
    std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;

    template<unsigned>
    bool IsNarrowSequence(Value *V, Value *&Src);

    bool RecordMemoryOps(BasicBlock *BB);
    void InsertParallelMACs(Reduction &Reduction);
    bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
    LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
    bool CreateParallelPairs(Reduction &R);

    /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
    /// Dual. These perform two signed 16x16-bit multiplications and add the
    /// products to a 32-bit accumulate operand. Optionally, the instruction
    /// can exchange the halfwords of the second operand before performing the
    /// arithmetic.
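    ///
    /// Informally, and purely as an illustration rather than the exact
    /// architectural definition:
    ///   smlad  rd, rn, rm, ra : rd = ra + rn[15:0]*rm[15:0] + rn[31:16]*rm[31:16]
    ///   smladx rd, rn, rm, ra : rd = ra + rn[15:0]*rm[31:16] + rn[31:16]*rm[15:0]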
    bool MatchSMLAD(Function &F);

  public:
    static char ID;

    ARMParallelDSP() : FunctionPass(ID) { }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      FunctionPass::getAnalysisUsage(AU);
      AU.addRequired<AssumptionCacheTracker>();
      AU.addRequired<ScalarEvolutionWrapperPass>();
      AU.addRequired<AAResultsWrapperPass>();
      AU.addRequired<TargetLibraryInfoWrapperPass>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addRequired<TargetPassConfig>();
      AU.addPreserved<ScalarEvolutionWrapperPass>();
      AU.addPreserved<GlobalsAAWrapperPass>();
      AU.setPreservesCFG();
    }

    bool runOnFunction(Function &F) override {
      if (DisableParallelDSP)
        return false;
      if (skipFunction(F))
        return false;

      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
      auto &TPC = getAnalysis<TargetPassConfig>();

      M = F.getParent();
      DL = &M->getDataLayout();

      auto &TM = TPC.getTM<TargetMachine>();
      auto *ST = &TM.getSubtarget<ARMSubtarget>(F);

      if (!ST->allowsUnalignedMem()) {
        LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
                             "running pass ARMParallelDSP\n");
        return false;
      }

      if (!ST->hasDSP()) {
        LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
                             "ARMParallelDSP\n");
        return false;
      }

      if (!ST->isLittle()) {
        LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
                          << "ARMParallelDSP\n");
        return false;
      }

      LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
      LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");

      bool Changes = MatchSMLAD(F);
      return Changes;
    }
  };
}

template<typename MemInst>
static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
                                  const DataLayout &DL, ScalarEvolution &SE) {
  if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE))
    return true;
  return false;
}

bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
                                        MemInstList &VecMem) {
  if (!Ld0 || !Ld1)
    return false;

  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
    return false;

  LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
             dbgs() << "Ld0:"; Ld0->dump();
             dbgs() << "Ld1:"; Ld1->dump();
            );

  VecMem.clear();
  VecMem.push_back(Ld0);
  VecMem.push_back(Ld1);
  return true;
}

// MaxBitWidth: the maximum supported bitwidth of the elements in the DSP
// instructions, which is set to 16. So here we should collect all i8 and i16
// narrow operations.
// TODO: we currently only collect i16, and will support i8 later, so that's
// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
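//
// For example (an illustrative IR fragment, not taken from a test):
//   %ld = load i16, i16* %addr
//   %sext = sext i16 %ld to i32
// is accepted, with Src set to %ld, provided %ld was recorded as part of a
// load pair; a sext of an i8 load, or of something that is not a load, is
// rejected.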
template<unsigned MaxBitWidth>
bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) {
  if (auto *SExt = dyn_cast<SExtInst>(V)) {
    if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
      return false;

    if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
      // Check that this load could be paired.
      if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld))
        return false;

      Src = Ld;
      return true;
    }
  }
  return false;
}

/// Iterate through the block and record base, offset pairs of loads which can
/// be widened into a single load.
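/// For example (illustrative), two sign-extended i16 loads, one from %p and
/// one from %p + 2 bytes, with no intervening may-alias store between them,
/// are recorded as a (base, offset) pair that CreateWideLoad can later
/// replace with a single i32 load.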
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
  SmallVector<LoadInst*, 8> Loads;
  SmallVector<Instruction*, 8> Writes;
  LoadPairs.clear();
  WideLoads.clear();

  // Collect loads and instructions that may write to memory. For now we only
  // record loads which are simple, sign-extended and have a single user.
  // TODO: Allow zero-extended loads.
  for (auto &I : *BB) {
    if (I.mayWriteToMemory())
      Writes.push_back(&I);
    auto *Ld = dyn_cast<LoadInst>(&I);
    if (!Ld || !Ld->isSimple() ||
        !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
      continue;
    Loads.push_back(Ld);
  }

  using InstSet = std::set<Instruction*>;
  using DepMap = std::map<Instruction*, InstSet>;
  DepMap RAWDeps;

  // Record any writes that may alias a load.
  const auto Size = LocationSize::unknown();
  for (auto Read : Loads) {
    for (auto Write : Writes) {
      MemoryLocation ReadLoc =
        MemoryLocation(Read->getPointerOperand(), Size);

      if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
          ModRefInfo::ModRef)))
        continue;
      if (DT->dominates(Write, Read))
        RAWDeps[Read].insert(Write);
    }
  }

  // Check that there isn't a write between the two loads which would prevent
  // them from being safely merged.
  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
    LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset;
    LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;

    if (RAWDeps.count(Dominated)) {
      InstSet &WritesBefore = RAWDeps[Dominated];

      for (auto Before : WritesBefore) {
        // We can't move the second load backward, past a write, to merge
        // with the first load.
        if (DT->dominates(Dominator, Before))
          return false;
      }
    }
    return true;
  };

  // Record base, offset load pairs.
  for (auto *Base : Loads) {
    for (auto *Offset : Loads) {
      if (Base == Offset)
        continue;

      if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) &&
          SafeToPair(Base, Offset)) {
        LoadPairs[Base] = Offset;
        OffsetLoads.insert(Offset);
        break;
      }
    }
  }

  LLVM_DEBUG(if (!LoadPairs.empty()) {
               dbgs() << "Consecutive load pairs:\n";
               for (auto &MapIt : LoadPairs) {
                 LLVM_DEBUG(dbgs() << *MapIt.first << ", "
                                   << *MapIt.second << "\n");
               }
             });
  return LoadPairs.size() > 1;
}

// The pass needs to identify integer add/sub reductions of 16-bit vector
// multiplications.
// To use SMLAD:
// 1) we first need to find an integer add, then look for this pattern:
//
// acc0 = ...
// ld0 = load i16
// sext0 = sext i16 %ld0 to i32
// ld1 = load i16
// sext1 = sext i16 %ld1 to i32
// mul0 = mul i32 %sext0, %sext1
// ld2 = load i16
// sext2 = sext i16 %ld2 to i32
// ld3 = load i16
// sext3 = sext i16 %ld3 to i32
// mul1 = mul i32 %sext2, %sext3
// add0 = add i32 %mul0, %acc0
// acc1 = add i32 %add0, %mul1
//
// Which can be selected to:
//
// ldr r0
// ldr r1
// smlad r2, r0, r1, r2
//
// If constants are used instead of loads, these will need to be hoisted
// out and into a register.
//
// If loop invariants are used instead of loads, these need to be packed
// before the loop begins.
//
bool ARMParallelDSP::MatchSMLAD(Function &F) {
  // Search recursively back through the operands to find a tree of values that
  // form a multiply-accumulate chain. The search records the Add and Mul
  // instructions that form the reduction and allows us to find a single value
  // to be used as the initial input to the accumulator.
  std::function<bool(Value*, BasicBlock*, Reduction&)> Search = [&]
    (Value *V, BasicBlock *BB, Reduction &R) -> bool {

    // If we find a non-instruction, try to use it as the initial accumulator
    // value. This may have already been found during the search in which case
    // this function will return false, signaling a search fail.
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return R.InsertAcc(V);

    if (I->getParent() != BB)
      return false;

    switch (I->getOpcode()) {
    default:
      break;
    case Instruction::PHI:
      // Could be the accumulator value.
      return R.InsertAcc(V);
    case Instruction::Add: {
      // Adds should be adding together two muls, or another add and a mul to
      // be within the mac chain. One of the operands may also be the
      // accumulator value at which point we should stop searching.
      bool ValidLHS = Search(I->getOperand(0), BB, R);
      bool ValidRHS = Search(I->getOperand(1), BB, R);
      if (!ValidLHS && !ValidRHS)
        return false;
      else if (ValidLHS && ValidRHS) {
        R.InsertAdd(I);
        return true;
      } else {
        R.InsertAdd(I);
        return R.InsertAcc(I);
      }
    }
    case Instruction::Mul: {
      Value *MulOp0 = I->getOperand(0);
      Value *MulOp1 = I->getOperand(1);
      if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) {
        Value *LHS = nullptr;
        Value *RHS = nullptr;
        if (IsNarrowSequence<16>(MulOp0, LHS) &&
            IsNarrowSequence<16>(MulOp1, RHS)) {
          R.InsertMul(I, LHS, RHS);
          return true;
        }
      }
      return false;
    }
    case Instruction::SExt:
      return Search(I->getOperand(0), BB, R);
    }
    return false;
  };

  bool Changed = false;

  for (auto &BB : F) {
    SmallPtrSet<Instruction*, 4> AllAdds;
    if (!RecordMemoryOps(&BB))
      continue;

    for (Instruction &I : reverse(BB)) {
      if (I.getOpcode() != Instruction::Add)
        continue;

      if (AllAdds.count(&I))
        continue;

      const auto *Ty = I.getType();
      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
        continue;

      Reduction R(&I);
      if (!Search(&I, &BB, R))
        continue;

      if (!CreateParallelPairs(R))
        continue;

      InsertParallelMACs(R);
      Changed = true;
      AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
    }
  }

  return Changed;
}

bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {

  // Not enough mul operations to make a pair.
  if (R.getMuls().size() < 2)
    return false;

  // Check that the muls operate directly upon sign extended loads.
  for (auto &MulCand : R.getMuls()) {
    if (!MulCand->HasTwoLoadInputs())
      return false;
  }

  auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
    // The first elements of each vector should be loads with sexts. If we
    // find that they are two pairs of consecutive loads, then these can be
    // transformed into two wider loads and the users can be replaced with
    // DSP intrinsics.
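    //
    // For example (illustrative): if Ld0 and Ld1 read consecutive halfwords,
    // and Ld3 and Ld2, in that order, also read consecutive halfwords, the
    // loads can still be widened; PMul1 is then marked with Exchange so that
    // the halfword-swapping form of the intrinsic (e.g. smladx) is selected
    // later.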
    auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
    auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
    auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
    auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);

    LLVM_DEBUG(dbgs() << "Loads:\n"
               << " - " << *Ld0 << "\n"
               << " - " << *Ld1 << "\n"
               << " - " << *Ld2 << "\n"
               << " - " << *Ld3 << "\n");

    if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
      if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        R.AddMulPair(PMul0, PMul1);
        return true;
      } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
        PMul1->Exchange = true;
        R.AddMulPair(PMul0, PMul1);
        return true;
      }
    } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
               AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
      LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
      LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
      LLVM_DEBUG(dbgs() << "    and swapping muls\n");
      PMul0->Exchange = true;
      // Only the second operand can be exchanged, so swap the muls.
      R.AddMulPair(PMul1, PMul0);
      return true;
    }
    return false;
  };

  MulCandList &Muls = R.getMuls();
  const unsigned Elems = Muls.size();
  SmallPtrSet<const Instruction*, 4> Paired;
  for (unsigned i = 0; i < Elems; ++i) {
    MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
    if (Paired.count(PMul0->Root))
      continue;

    for (unsigned j = 0; j < Elems; ++j) {
      if (i == j)
        continue;

      MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
      if (Paired.count(PMul1->Root))
        continue;

      const Instruction *Mul0 = PMul0->Root;
      const Instruction *Mul1 = PMul1->Root;
      if (Mul0 == Mul1)
        continue;

      assert(PMul0 != PMul1 && "expected different chains");

      if (CanPair(R, PMul0, PMul1)) {
        Paired.insert(Mul0);
        Paired.insert(Mul1);
        break;
      }
    }
  }
  return !R.getMulPairs().empty();
}

void ARMParallelDSP::InsertParallelMACs(Reduction &R) {

  auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
                         Value *Acc, bool Exchange,
                         Instruction *InsertAfter) {
    // Replace the reduction chain with an intrinsic call.

    Value* Args[] = { WideLd0, WideLd1, Acc };
    Function *SMLAD = nullptr;
    if (Exchange)
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
    else
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);

    IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                                ++BasicBlock::iterator(InsertAfter));
    Instruction *Call = Builder.CreateCall(SMLAD, Args);
    NumSMLAD++;
    return Call;
  };

  Instruction *InsertAfter = R.getRoot();
  Value *Acc = R.getAccumulator();
  if (!Acc)
    Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);

  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
  LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n"
             << "Acc: " << *Acc << "\n");
  for (auto &Pair : R.getMulPairs()) {
    MulCandidate *LHSMul = Pair.first;
    MulCandidate *RHSMul = Pair.second;
    LLVM_DEBUG(dbgs() << "Muls:\n"
               << "- " << *LHSMul->Root << "\n"
               << "- " << *RHSMul->Root << "\n");
    LoadInst *BaseLHS = LHSMul->getBaseLoad();
    LoadInst *BaseRHS = RHSMul->getBaseLoad();
    LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
      WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty);
    LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
      WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);

    Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
    InsertAfter = cast<Instruction>(Acc);
  }
  R.UpdateRoot(cast<Instruction>(Acc));
}

LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
                                         IntegerType *LoadTy) {
  assert(Loads.size() == 2 && "currently only support widening two loads");

  LoadInst *Base = Loads[0];
  LoadInst *Offset = Loads[1];

  Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
  Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());

  assert((BaseSExt && OffsetSExt)
         && "Loads should have a single, extending, user");

  std::function<void(Value*, Value*)> MoveBefore =
    [&](Value *A, Value *B) -> void {
      if (!isa<Instruction>(A) || !isa<Instruction>(B))
        return;

      auto *Source = cast<Instruction>(A);
      auto *Sink = cast<Instruction>(B);

      if (DT->dominates(Source, Sink) ||
          Source->getParent() != Sink->getParent() ||
          isa<PHINode>(Source) || isa<PHINode>(Sink))
        return;

      Source->moveBefore(Sink);
      for (auto &Op : Source->operands())
        MoveBefore(Op, Source);
    };

  // Insert the load at the point of the original dominating load.
  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
                          ++BasicBlock::iterator(DomLoad));

  // Bitcast the pointer to a wider type and create the wide load, while making
  // sure to maintain the original alignment as this prevents ldrd from being
  // generated when it could be illegal due to memory alignment.
  const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
  Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
                                    LoadTy->getPointerTo(AddrSpace));
  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
                                             Base->getAlignment());

  // Make sure everything is in the correct order in the basic block.
  MoveBefore(Base->getPointerOperand(), VecPtr);
  MoveBefore(VecPtr, WideLoad);

  // From the wide load, create two values that equal the original two loads.
  // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
  // TODO: Support big-endian as well.
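  //
  // For example (illustrative, little-endian): with the two original i16
  // values packed into the wide i32 load, Loads[0] sits in bits [15:0] and
  // Loads[1] in bits [31:16], so Bottom is a plain trunc while Top is a lshr
  // by 16 followed by a trunc.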
698 Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
Sam Parker0dba7912019-08-09 07:48:50 +0000699 Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
700 BaseSExt->replaceAllUsesWith(NewBaseSExt);
Sam Parker4c4ff132019-03-14 11:14:13 +0000701
Sam Parkera33e3112019-05-13 09:23:32 +0000702 IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
703 Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
Sam Parker4c4ff132019-03-14 11:14:13 +0000704 Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
Sam Parkera33e3112019-05-13 09:23:32 +0000705 Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
Sam Parker0dba7912019-08-09 07:48:50 +0000706 Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
707 OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
Sam Parker4c4ff132019-03-14 11:14:13 +0000708
Sam Parkera33e3112019-05-13 09:23:32 +0000709 WideLoads.emplace(std::make_pair(Base,
Sam Parker4c4ff132019-03-14 11:14:13 +0000710 make_unique<WidenedLoad>(Loads, WideLoad)));
711 return WideLoad;
Eli Friedmanb09c7782018-10-18 19:34:30 +0000712}

Pass *llvm::createARMParallelDSPPass() {
  return new ARMParallelDSP();
}

char ARMParallelDSP::ID = 0;

INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
                      "Transform functions to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
                    "Transform functions to use DSP intrinsics", false, false)