blob: e5f6a61852eefc1fdc3bad5ce7cb02009fb98da9 [file] [log] [blame]
Sjoerd Meijerc89ca552018-06-28 12:55:29 +00001//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
/// purpose of this pass is to do some IR pattern matching to create ACLE
/// DSP intrinsics, which map onto these 32-bit SIMD operations.
/// This pass runs only when unaligned accesses are supported/enabled.
Sjoerd Meijerc89ca552018-06-28 12:55:29 +000015//
16//===----------------------------------------------------------------------===//
17
Sjoerd Meijerb3e06fa2018-07-06 14:47:09 +000018#include "llvm/ADT/Statistic.h"
Sjoerd Meijerc89ca552018-06-28 12:55:29 +000019#include "llvm/ADT/SmallPtrSet.h"
20#include "llvm/Analysis/AliasAnalysis.h"
21#include "llvm/Analysis/LoopAccessAnalysis.h"
22#include "llvm/Analysis/LoopPass.h"
23#include "llvm/Analysis/LoopInfo.h"
24#include "llvm/IR/Instructions.h"
25#include "llvm/IR/NoFolder.h"
26#include "llvm/Transforms/Scalar.h"
27#include "llvm/Transforms/Utils/BasicBlockUtils.h"
28#include "llvm/Transforms/Utils/LoopUtils.h"
29#include "llvm/Pass.h"
30#include "llvm/PassRegistry.h"
31#include "llvm/PassSupport.h"
32#include "llvm/Support/Debug.h"
33#include "llvm/IR/PatternMatch.h"
34#include "llvm/CodeGen/TargetPassConfig.h"
35#include "ARM.h"
36#include "ARMSubtarget.h"
37
38using namespace llvm;
39using namespace PatternMatch;
40
Sjoerd Meijerb3e06fa2018-07-06 14:47:09 +000041#define DEBUG_TYPE "arm-parallel-dsp"
42
43STATISTIC(NumSMLAD , "Number of smlad instructions generated");
Sjoerd Meijerc89ca552018-06-28 12:55:29 +000044
Sjoerd Meijer3c859b32018-08-14 07:43:49 +000045static cl::opt<bool>
46DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
47 cl::desc("Disable the ARM Parallel DSP pass"));
48
Sjoerd Meijerc89ca552018-06-28 12:55:29 +000049namespace {
Sam Parker89a37992018-07-23 15:25:59 +000050 struct OpChain;
51 struct BinOpChain;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +000052 struct Reduction;
53
Fangrui Song58407ca2018-07-23 17:43:21 +000054 using OpChainList = SmallVector<std::unique_ptr<OpChain>, 8>;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +000055 using ReductionList = SmallVector<Reduction, 8>;
56 using ValueList = SmallVector<Value*, 8>;
Sam Parkerffc16812018-07-03 12:44:16 +000057 using MemInstList = SmallVector<Instruction*, 8>;
Sam Parker2ef3c0d2018-10-17 13:02:48 +000058 using LoadInstList = SmallVector<LoadInst*, 8>;
Sam Parker89a37992018-07-23 15:25:59 +000059 using PMACPair = std::pair<BinOpChain*,BinOpChain*>;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +000060 using PMACPairList = SmallVector<PMACPair, 8>;
61 using Instructions = SmallVector<Instruction*,16>;
62 using MemLocList = SmallVector<MemoryLocation, 4>;
63
Sam Parker89a37992018-07-23 15:25:59 +000064 struct OpChain {
65 Instruction *Root;
66 ValueList AllValues;
Sam Parker2ef3c0d2018-10-17 13:02:48 +000067 MemInstList VecLd; // List of all sequential load instructions.
68 LoadInstList Loads; // List of all load instructions.
Sam Parker89a37992018-07-23 15:25:59 +000069 MemLocList MemLocs; // All memory locations read by this tree.
70 bool ReadOnly = true;
71
72 OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
Jordan Rupprechte5daf612018-07-23 17:38:05 +000073 virtual ~OpChain() = default;
Sam Parker89a37992018-07-23 15:25:59 +000074
75 void SetMemoryLocations() {
George Burgess IV6ef80022018-10-10 21:28:44 +000076 const auto Size = LocationSize::unknown();
Sam Parker89a37992018-07-23 15:25:59 +000077 for (auto *V : AllValues) {
78 if (auto *I = dyn_cast<Instruction>(V)) {
79 if (I->mayWriteToMemory())
80 ReadOnly = false;
Sam Parker2ef3c0d2018-10-17 13:02:48 +000081 if (auto *Ld = dyn_cast<LoadInst>(V)) {
Sam Parker89a37992018-07-23 15:25:59 +000082 MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
Sam Parker2ef3c0d2018-10-17 13:02:48 +000083 Loads.push_back(Ld);
84 }
Sam Parker89a37992018-07-23 15:25:59 +000085 }
86 }
87 }
88
89 unsigned size() const { return AllValues.size(); }
90 };
91
92 // 'BinOpChain' and 'Reduction' are just some bookkeeping data structures.
Sjoerd Meijerc89ca552018-06-28 12:55:29 +000093 // 'Reduction' contains the phi-node and accumulator statement from where we
Sam Parker89a37992018-07-23 15:25:59 +000094 // start pattern matching, and 'BinOpChain' the multiplication
Sjoerd Meijerc89ca552018-06-28 12:55:29 +000095 // instructions that are candidates for parallel execution.
Sam Parker89a37992018-07-23 15:25:59 +000096 struct BinOpChain : public OpChain {
97 ValueList LHS; // List of all (narrow) left hand operands.
98 ValueList RHS; // List of all (narrow) right hand operands.
Sam Parkera023c7a2018-09-12 09:17:44 +000099 bool Exchange = false;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000100
Sam Parker89a37992018-07-23 15:25:59 +0000101 BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
102 OpChain(I, lhs), LHS(lhs), RHS(rhs) {
103 for (auto *V : RHS)
104 AllValues.push_back(V);
105 }
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000106 };
107
108 struct Reduction {
109 PHINode *Phi; // The Phi-node from where we start
110 // pattern matching.
111 Instruction *AccIntAdd; // The accumulating integer add statement,
112 // i.e, the reduction statement.
113
Sam Parker89a37992018-07-23 15:25:59 +0000114 OpChainList MACCandidates; // The MAC candidates associated with
Sjoerd Meijer53449da2018-07-11 12:36:25 +0000115 // this reduction statement.
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000116 Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
117 };
118
119 class ARMParallelDSP : public LoopPass {
120 ScalarEvolution *SE;
121 AliasAnalysis *AA;
122 TargetLibraryInfo *TLI;
123 DominatorTree *DT;
124 LoopInfo *LI;
125 Loop *L;
126 const DataLayout *DL;
127 Module *M;
128
129 bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
Fangrui Song68169342018-07-03 19:12:27 +0000130 bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
Sam Parker89a37992018-07-23 15:25:59 +0000131 PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000132 Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
Sam Parkera023c7a2018-09-12 09:17:44 +0000133 Instruction *Acc, bool Exchange,
134 Instruction *InsertAfter);
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000135
136 /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
137 /// Dual performs two signed 16x16-bit multiplications. It adds the
138 /// products to a 32-bit accumulate operand. Optionally, the instruction can
139 /// exchange the halfwords of the second operand before performing the
140 /// arithmetic.
141 bool MatchSMLAD(Function &F);
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000142 bool MatchTopBottomMuls(BasicBlock *LoopBody);
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000143
144 public:
145 static char ID;
146
147 ARMParallelDSP() : LoopPass(ID) { }
148
149 void getAnalysisUsage(AnalysisUsage &AU) const override {
150 LoopPass::getAnalysisUsage(AU);
151 AU.addRequired<AssumptionCacheTracker>();
152 AU.addRequired<ScalarEvolutionWrapperPass>();
153 AU.addRequired<AAResultsWrapperPass>();
154 AU.addRequired<TargetLibraryInfoWrapperPass>();
155 AU.addRequired<LoopInfoWrapperPass>();
156 AU.addRequired<DominatorTreeWrapperPass>();
157 AU.addRequired<TargetPassConfig>();
158 AU.addPreserved<LoopInfoWrapperPass>();
159 AU.setPreservesCFG();
160 }
161
162 bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
Sjoerd Meijer3c859b32018-08-14 07:43:49 +0000163 if (DisableParallelDSP)
164 return false;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000165 L = TheLoop;
166 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
167 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
168 TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
169 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
170 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
171 auto &TPC = getAnalysis<TargetPassConfig>();
172
173 BasicBlock *Header = TheLoop->getHeader();
174 if (!Header)
175 return false;
176
177 // TODO: We assume the loop header and latch to be the same block.
178 // This is not a fundamental restriction, but lifting this would just
179 // require more work to do the transformation and then patch up the CFG.
180 if (Header != TheLoop->getLoopLatch()) {
181 LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
182 "running pass ARMParallelDSP\n");
183 return false;
184 }
185
186 Function &F = *Header->getParent();
187 M = F.getParent();
188 DL = &M->getDataLayout();
189
190 auto &TM = TPC.getTM<TargetMachine>();
191 auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
192
193 if (!ST->allowsUnalignedMem()) {
194 LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
195 "running pass ARMParallelDSP\n");
196 return false;
197 }
198
199 if (!ST->hasDSP()) {
200 LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
201 "ARMParallelDSP\n");
202 return false;
203 }
204
205 LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
206 bool Changes = false;
207
Sam Parkera023c7a2018-09-12 09:17:44 +0000208 LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
209 LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000210 Changes = MatchSMLAD(F);
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000211 if (!Changes)
212 Changes = MatchTopBottomMuls(Header);
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000213 return Changes;
214 }
215 };
216}
217
Sjoerd Meijer27be58b2018-07-05 08:21:40 +0000218// MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
219// instructions, which is set to 16. So here we should collect all i8 and i16
220// narrow operations.
221// TODO: we currently only collect i16, and will support i8 later, so that's
222// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
223template<unsigned MaxBitWidth>
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000224static bool IsNarrowSequence(Value *V, ValueList &VL) {
Sjoerd Meijer27be58b2018-07-05 08:21:40 +0000225 LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000226 ConstantInt *CInt;
227
228 if (match(V, m_ConstantInt(CInt))) {
229 // TODO: if a constant is used, it needs to fit within the bit width.
230 return false;
231 }
232
233 auto *I = dyn_cast<Instruction>(V);
234 if (!I)
235 return false;
236
237 Value *Val, *LHS, *RHS;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000238 if (match(V, m_Trunc(m_Value(Val)))) {
Sjoerd Meijer27be58b2018-07-05 08:21:40 +0000239 if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
240 return IsNarrowSequence<MaxBitWidth>(Val, VL);
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000241 } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
242 // TODO: we need to implement sadd16/sadd8 for this, which enables to
243 // also do the rewrite for smlad8.ll, but it is unsupported for now.
Sjoerd Meijer27be58b2018-07-05 08:21:40 +0000244 LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
245 return false;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000246 } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
Sjoerd Meijer27be58b2018-07-05 08:21:40 +0000247 if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
248 LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
249 cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
250 return false;
251 }
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000252
Sjoerd Meijer27be58b2018-07-05 08:21:40 +0000253 if (match(Val, m_Load(m_Value()))) {
254 LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
255 VL.push_back(Val);
256 VL.push_back(I);
257 return true;
258 }
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000259 }
Sjoerd Meijer27be58b2018-07-05 08:21:40 +0000260 LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
261 return false;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000262}
263
264// Element-by-element comparison of Value lists returning true if they are
265// instructions with the same opcode or constants with the same value.
266static bool AreSymmetrical(const ValueList &VL0,
267 const ValueList &VL1) {
268 if (VL0.size() != VL1.size()) {
269 LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
270 << VL0.size() << " != " << VL1.size() << "\n");
271 return false;
272 }
273
274 const unsigned Pairs = VL0.size();
275 LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
276
277 for (unsigned i = 0; i < Pairs; ++i) {
278 const Value *V0 = VL0[i];
279 const Value *V1 = VL1[i];
280 const auto *Inst0 = dyn_cast<Instruction>(V0);
281 const auto *Inst1 = dyn_cast<Instruction>(V1);
282
283 LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
284 dbgs() << "mul1: "; V0->dump();
285 dbgs() << "mul2: "; V1->dump());
286
287 if (!Inst0 || !Inst1)
288 return false;
289
290 if (Inst0->isSameOperationAs(Inst1)) {
291 LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
292 continue;
293 }
294
295 const APInt *C0, *C1;
296 if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
297 return false;
298 }
299
300 LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
301 return true;
302}
303
Sam Parkerffc16812018-07-03 12:44:16 +0000304template<typename MemInst>
305static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
306 MemInstList &VecMem, const DataLayout &DL,
307 ScalarEvolution &SE) {
308 if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
309 LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
310 return false;
311 }
312 if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
Sam Parkera023c7a2018-09-12 09:17:44 +0000313 VecMem.clear();
Sam Parkerffc16812018-07-03 12:44:16 +0000314 VecMem.push_back(MemOp0);
315 VecMem.push_back(MemOp1);
316 LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
317 return true;
318 }
319 LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
320 return false;
321}
322
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000323bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
Sam Parkerffc16812018-07-03 12:44:16 +0000324 MemInstList &VecMem) {
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000325 if (!Ld0 || !Ld1)
326 return false;
327
328 LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
329 dbgs() << "Ld0:"; Ld0->dump();
330 dbgs() << "Ld1:"; Ld1->dump();
331 );
332
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000333 if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) {
334 LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
335 return false;
336 }
Sam Parkerffc16812018-07-03 12:44:16 +0000337
338 return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, *DL, *SE);
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000339}
340
341PMACPairList
Sam Parker89a37992018-07-23 15:25:59 +0000342ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000343 const unsigned Elems = Candidates.size();
344 PMACPairList PMACPairs;
345
346 if (Elems < 2)
347 return PMACPairs;
348
Sam Parkera023c7a2018-09-12 09:17:44 +0000349 SmallPtrSet<const Instruction*, 4> Paired;
350 for (unsigned i = 0; i < Elems; ++i) {
Fangrui Song58407ca2018-07-23 17:43:21 +0000351 BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get());
Sam Parkera023c7a2018-09-12 09:17:44 +0000352 if (Paired.count(PMul0->Root))
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000353 continue;
354
Sam Parkera023c7a2018-09-12 09:17:44 +0000355 for (unsigned j = 0; j < Elems; ++j) {
356 if (i == j)
357 continue;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000358
Sam Parkera023c7a2018-09-12 09:17:44 +0000359 BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[j].get());
360 if (Paired.count(PMul1->Root))
361 continue;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000362
Sam Parkera023c7a2018-09-12 09:17:44 +0000363 const Instruction *Mul0 = PMul0->Root;
364 const Instruction *Mul1 = PMul1->Root;
365 if (Mul0 == Mul1)
366 continue;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000367
Sam Parkera023c7a2018-09-12 09:17:44 +0000368 assert(PMul0 != PMul1 && "expected different chains");
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000369
Sam Parkera023c7a2018-09-12 09:17:44 +0000370 LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
371 dbgs() << "- "; Mul0->dump();
372 dbgs() << "- "; Mul1->dump());
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000373
Sam Parkera023c7a2018-09-12 09:17:44 +0000374 const ValueList &Mul0_LHS = PMul0->LHS;
375 const ValueList &Mul0_RHS = PMul0->RHS;
376 const ValueList &Mul1_LHS = PMul1->LHS;
377 const ValueList &Mul1_RHS = PMul1->RHS;
378
379 if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
380 !AreSymmetrical(Mul0_RHS, Mul1_RHS))
381 continue;
382
383 LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
384 // The first elements of each vector should be loads with sexts. If we
385 // find that its two pairs of consecutive loads, then these can be
386 // transformed into two wider loads and the users can be replaced with
387 // DSP intrinsics.
388 bool Found = false;
389 for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
390 auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
391 auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
392 auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
393 auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
394
395 if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
396 continue;
397
398 LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
399 << "\t Ld0: " << *Ld0 << "\n"
400 << "\t Ld1: " << *Ld1 << "\n"
401 << "and operands " << x + 2 << ":\n"
402 << "\t Ld2: " << *Ld2 << "\n"
403 << "\t Ld3: " << *Ld3 << "\n");
404
405 if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
406 if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
407 LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
408 PMACPairs.push_back(std::make_pair(PMul0, PMul1));
409 Found = true;
410 } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
411 LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
412 LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
413 PMul1->Exchange = true;
414 PMACPairs.push_back(std::make_pair(PMul0, PMul1));
415 Found = true;
416 }
417 } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd)) {
418 if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
419 LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
420 LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
421 LLVM_DEBUG(dbgs() << " and swapping muls\n");
422 PMul0->Exchange = true;
423 // Only the second operand can be exchanged, so swap the muls.
424 PMACPairs.push_back(std::make_pair(PMul1, PMul0));
425 Found = true;
426 }
427 }
428 }
429 if (Found) {
430 Paired.insert(Mul0);
431 Paired.insert(Mul1);
432 break;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000433 }
434 }
435 }
436 return PMACPairs;
437}
438
439bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
440 PMACPairList &PMACPairs) {
441 Instruction *Acc = Reduction.Phi;
442 Instruction *InsertAfter = Reduction.AccIntAdd;
443
444 for (auto &Pair : PMACPairs) {
Sam Parkera023c7a2018-09-12 09:17:44 +0000445 BinOpChain *PMul0 = Pair.first;
446 BinOpChain *PMul1 = Pair.second;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000447 LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
Sam Parkera023c7a2018-09-12 09:17:44 +0000448 dbgs() << "- "; PMul0->Root->dump();
449 dbgs() << "- "; PMul1->Root->dump());
450
451 auto *VecLd0 = cast<LoadInst>(PMul0->VecLd[0]);
452 auto *VecLd1 = cast<LoadInst>(PMul1->VecLd[0]);
453 Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter);
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000454 InsertAfter = Acc;
455 }
456
457 if (Acc != Reduction.Phi) {
458 LLVM_DEBUG(dbgs() << "Replace Accumulate: "; Acc->dump());
459 Reduction.AccIntAdd->replaceAllUsesWith(Acc);
460 return true;
461 }
462 return false;
463}
464
Sam Parker89a37992018-07-23 15:25:59 +0000465static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
466 ReductionList &Reductions) {
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000467 RecurrenceDescriptor RecDesc;
468 const bool HasFnNoNaNAttr =
469 F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
470 const BasicBlock *Latch = TheLoop->getLoopLatch();
471
472 // We need a preheader as getIncomingValueForBlock assumes there is one.
Sjoerd Meijer53449da2018-07-11 12:36:25 +0000473 if (!TheLoop->getLoopPreheader()) {
474 LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
Sam Parker89a37992018-07-23 15:25:59 +0000475 return;
Sjoerd Meijer53449da2018-07-11 12:36:25 +0000476 }
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000477
478 for (PHINode &Phi : Header->phis()) {
479 const auto *Ty = Phi.getType();
Sam Parker01db2982018-09-11 14:01:22 +0000480 if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000481 continue;
482
483 const bool IsReduction =
484 RecurrenceDescriptor::AddReductionVar(&Phi,
485 RecurrenceDescriptor::RK_IntegerAdd,
486 TheLoop, HasFnNoNaNAttr, RecDesc);
487 if (!IsReduction)
488 continue;
489
490 Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
491 if (!Acc)
492 continue;
493
494 Reductions.push_back(Reduction(&Phi, Acc));
495 }
496
497 LLVM_DEBUG(
498 dbgs() << "\nAccumulating integer additions (reductions) found:\n";
Sam Parker89a37992018-07-23 15:25:59 +0000499 for (auto &R : Reductions) {
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000500 dbgs() << "- "; R.Phi->dump();
501 dbgs() << "-> "; R.AccIntAdd->dump();
502 }
503 );
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000504}
505
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000506static void AddMulCandidate(OpChainList &Candidates,
Sam Parker01db2982018-09-11 14:01:22 +0000507 Instruction *Mul,
508 Value *MulOp0, Value *MulOp1) {
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000509 LLVM_DEBUG(dbgs() << "OK, found mul:\t"; Mul->dump());
Sam Parker01db2982018-09-11 14:01:22 +0000510 assert(Mul->getOpcode() == Instruction::Mul &&
511 "expected mul instruction");
Sam Parker89a37992018-07-23 15:25:59 +0000512 ValueList LHS;
513 ValueList RHS;
514 if (IsNarrowSequence<16>(MulOp0, LHS) &&
515 IsNarrowSequence<16>(MulOp1, RHS)) {
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000516 LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
Fangrui Song58407ca2018-07-23 17:43:21 +0000517 Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000518 }
519}
520
Sam Parker89a37992018-07-23 15:25:59 +0000521static void MatchParallelMACSequences(Reduction &R,
522 OpChainList &Candidates) {
Sam Parkera023c7a2018-09-12 09:17:44 +0000523 Instruction *Acc = R.AccIntAdd;
524 LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc);
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000525
Sam Parkera023c7a2018-09-12 09:17:44 +0000526 // Returns false to signal the search should be stopped.
527 std::function<bool(Value*)> Match =
528 [&Candidates, &Match](Value *V) -> bool {
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000529
Sam Parkera023c7a2018-09-12 09:17:44 +0000530 auto *I = dyn_cast<Instruction>(V);
Sam Parker11879112018-09-12 09:58:56 +0000531 if (!I)
Sam Parkera023c7a2018-09-12 09:17:44 +0000532 return false;
Sam Parker01db2982018-09-11 14:01:22 +0000533
Sam Parkera023c7a2018-09-12 09:17:44 +0000534 Value *MulOp0, *MulOp1;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000535
Sam Parkera023c7a2018-09-12 09:17:44 +0000536 switch (I->getOpcode()) {
537 case Instruction::Add:
538 if (Match(I->getOperand(0)) || (Match(I->getOperand(1))))
539 return true;
540 break;
541 case Instruction::Mul:
542 if (match (I, (m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000543 AddMulCandidate(Candidates, I, MulOp0, MulOp1);
Sam Parkera023c7a2018-09-12 09:17:44 +0000544 return false;
545 }
546 break;
547 case Instruction::SExt:
548 if (match (I, (m_SExt(m_Mul(m_Value(MulOp0), m_Value(MulOp1)))))) {
549 Instruction *Mul = cast<Instruction>(I->getOperand(0));
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000550 AddMulCandidate(Candidates, Mul, MulOp0, MulOp1);
Sam Parkera023c7a2018-09-12 09:17:44 +0000551 return false;
552 }
553 break;
554 }
555 return false;
556 };
557
558 while (Match (Acc));
559 LLVM_DEBUG(dbgs() << "Finished matching MAC sequences, found "
560 << Candidates.size() << " candidates.\n");
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000561}
562
563// Collects all instructions that are not part of the MAC chains, which is the
564// set of instructions that can potentially alias with the MAC operands.
Sjoerd Meijer53449da2018-07-11 12:36:25 +0000565static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
566 Instructions &Writes) {
567 for (auto &I : *Header) {
568 if (I.mayReadFromMemory())
569 Reads.push_back(&I);
570 if (I.mayWriteToMemory())
571 Writes.push_back(&I);
572 }
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000573}
574
Sjoerd Meijer53449da2018-07-11 12:36:25 +0000575// Check whether statements in the basic block that write to memory alias with
576// the memory locations accessed by the MAC-chains.
577// TODO: we need the read statements when we accept more complicated chains.
578static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000579 Instructions &Writes, OpChainList &Candidates) {
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000580 LLVM_DEBUG(dbgs() << "Alias checks:\n");
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000581 for (auto &Candidate : Candidates) {
582 LLVM_DEBUG(dbgs() << "mul: "; Candidate->Root->dump());
583 Candidate->SetMemoryLocations();
Sjoerd Meijer53449da2018-07-11 12:36:25 +0000584
585 // At the moment, we allow only simple chains that only consist of reads,
586 // accumulate their result with an integer add, and thus that don't write
587 // memory, and simply bail if they do.
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000588 if (!Candidate->ReadOnly)
Sjoerd Meijer53449da2018-07-11 12:36:25 +0000589 return true;
590
591 // Now for all writes in the basic block, check that they don't alias with
592 // the memory locations accessed by our MAC-chain:
593 for (auto *I : Writes) {
594 LLVM_DEBUG(dbgs() << "- "; I->dump());
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000595 assert(Candidate->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
596 for (auto &MemLoc : Candidate->MemLocs) {
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000597 if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
598 ModRefInfo::ModRef))) {
599 LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
600 return true;
601 }
602 }
603 }
604 }
Sjoerd Meijer53449da2018-07-11 12:36:25 +0000605
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000606 LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
607 return false;
608}
609
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000610static bool CheckMulMemory(OpChainList &Candidates) {
Fangrui Song58407ca2018-07-23 17:43:21 +0000611 for (auto &C : Candidates) {
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000612 // A mul has 2 operands, and a narrow op consist of sext and a load; thus
613 // we expect at least 4 items in this operand value list.
Sam Parker89a37992018-07-23 15:25:59 +0000614 if (C->size() < 4) {
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000615 LLVM_DEBUG(dbgs() << "Operand list too short.\n");
616 return false;
617 }
Fangrui Song58407ca2018-07-23 17:43:21 +0000618 ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
619 ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000620
Sam Parker89a37992018-07-23 15:25:59 +0000621 // Use +=2 to skip over the expected extend instructions.
622 for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
623 if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000624 return false;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000625 }
626 }
627 return true;
628}
629
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000630static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst *BaseLoad,
631 const Type *LoadTy) {
632 const unsigned AddrSpace = BaseLoad->getPointerAddressSpace();
633
634 Value *VecPtr = IRB.CreateBitCast(BaseLoad->getPointerOperand(),
635 LoadTy->getPointerTo(AddrSpace));
636 return IRB.CreateAlignedLoad(VecPtr, BaseLoad->getAlignment());
637}
638
639/// Given two instructions, return the one that comes first in the basic block.
640/// A work around for not being able to do > or < on bb iterators.
641static Instruction* GetFirst(Instruction *A, Instruction *B) {
642 BasicBlock::iterator First(A);
643 BasicBlock::iterator Second(B);
644
645 BasicBlock *BB = A->getParent();
646 assert(BB == B->getParent() &&
647 "Can't compare instructions in different blocks");
648 BasicBlock::iterator Last = BB->end();
649
650 // Iterate through the block, if the 'First' iterator is found, then return
651 // Second.
652 while (Second != Last) {
653 if (Second == First)
654 return B;
655 ++Second;
656 }
657 return A;
658}
659
660/// Attempt to widen loads and use smulbb, smulbt, smultb and smultt muls.
661// TODO: This, like smlad generation, expects the leave operands to be loads
662// that are sign extended. We should be able to handle scalar values as well
663// performing these muls on word x half types to generate smulwb and smulwt.
664bool ARMParallelDSP::MatchTopBottomMuls(BasicBlock *LoopBody) {
665 LLVM_DEBUG(dbgs() << "Attempting to find BT|TB muls.\n");
666
667 OpChainList Candidates;
668 for (auto &I : *LoopBody) {
669 if (I.getOpcode() == Instruction::Mul) {
670 Type *Ty = I.getType();
671 if (Ty->isIntegerTy() &&
672 (Ty->getScalarSizeInBits() == 32 ||
673 Ty->getScalarSizeInBits() == 64))
674 AddMulCandidate(Candidates, &I, I.getOperand(0), I.getOperand(1));
675 }
676 }
677
678 if (Candidates.empty())
679 return false;
680
681 Instructions Reads;
682 Instructions Writes;
683 AliasCandidates(LoopBody, Reads, Writes);
684
685 if (AreAliased(AA, Reads, Writes, Candidates))
686 return false;
687
688 DenseMap<LoadInst*, LoadInst*> SeqLoads;
689 SmallPtrSet<LoadInst*, 8> OffsetLoads;
690
691 for (unsigned i = 0; i < Candidates.size(); ++i) {
692 for (unsigned j = 0; j < Candidates.size(); ++j) {
693 if (i == j)
694 continue;
695
696 OpChain *MulChain0 = Candidates[i].get();
697 OpChain *MulChain1 = Candidates[j].get();
698
699 for (auto *Ld0 : MulChain0->Loads) {
700 if (SeqLoads.count(Ld0) || OffsetLoads.count(Ld0))
701 continue;
702
703 for (auto *Ld1 : MulChain1->Loads) {
704 if (SeqLoads.count(Ld1) || OffsetLoads.count(Ld1))
705 continue;
706
707 MemInstList VecMem;
708 if (AreSequentialLoads(Ld0, Ld1, VecMem)) {
709 SeqLoads[Ld0] = Ld1;
710 OffsetLoads.insert(Ld1);
711 }
712 }
713 }
714 }
715 }
716
717 if (SeqLoads.empty())
718 return false;
719
720 IRBuilder<NoFolder> IRB(LoopBody);
721 const Type *Ty = IntegerType::get(M->getContext(), 32);
722
723 auto IsUserMul = [](Use &U) {
724 auto *Mul = cast<Instruction>(U.getUser());
725 return Mul->getOpcode() == Instruction::Mul;
726 };
727
728 LLVM_DEBUG(dbgs() << "Found some sequential loads, now widening:\n");
729 for (auto &Pair : SeqLoads) {
730 LoadInst *BaseLd = Pair.first;
731 LoadInst *OffsetLd = Pair.second;
732
733 // Check that all the base users are muls.
734 auto *BaseSExt = cast<Instruction>(BaseLd->user_back());
735 for (Use &U : BaseSExt->uses()) {
736 if (!IsUserMul(U))
737 return false;
738 }
739
740 // Check that all the offset users are muls.
741 // TODO We exit early on finding a sext user which isn't a mul, but many
742 // arm instructions would be able to perform the necessary shift too.
743 auto *OffsetSExt = cast<Instruction>(OffsetLd->user_back());
744 for (Use &U : OffsetSExt->uses()) {
745 if (!IsUserMul(U))
746 return false;
747 }
748
749 LLVM_DEBUG(dbgs() << " - with base load: " << *BaseLd << "\n");
750 LLVM_DEBUG(dbgs() << " - with offset load: " << *OffsetLd << "\n");
751 Instruction *InsertPt = GetFirst(BaseLd, OffsetLd);
752 IRB.SetInsertPoint(InsertPt);
753 LoadInst *WideLd = CreateLoadIns(IRB, BaseLd, Ty);
754 LLVM_DEBUG(dbgs() << " - created wide load: " << *WideLd << "\n");
755
756 // Move the pointer operands before their users.
757 std::function<void(Instruction*, Instruction*)> MoveBefore =
758 [&MoveBefore](Instruction *Source, Instruction *Sink) -> void {
759 Source->moveBefore(Sink);
760 for (Use &U : Source->operands()) {
761 Value *Op = U.get();
762 if (auto *I = dyn_cast<Instruction>(Op)) {
763 if (isa<PHINode>(I) || I->getParent() != Source->getParent())
764 continue;
765 MoveBefore(I, Source);
766 }
767 }
768 };
769
770 // If we're inserting the load before BaseLd, we probably need to move the
771 // the pointer operand too. This operand is cast to an i32* in
772 // CreateLoadIns.
773 if (InsertPt != BaseLd) {
774 if (auto *GEP = dyn_cast<GetElementPtrInst>(BaseLd->getPointerOperand()))
775 MoveBefore(GEP, cast<Instruction>(WideLd->getPointerOperand()));
776 }
777
778 // BaseUser needs to: (asr (shl WideLoad, 16), 16)
779 // OffsetUser needs to: (asr WideLoad, 16)
780 auto *Top = cast<Instruction>(IRB.CreateAShr(WideLd, 16));
781 auto *Shl = cast<Instruction>(IRB.CreateShl(WideLd, 16));
782 auto *Bottom = cast<Instruction>(IRB.CreateAShr(Shl, 16));
783
784 BaseSExt->replaceAllUsesWith(Bottom);
785 OffsetSExt->replaceAllUsesWith(Top);
786
787 BaseSExt->eraseFromParent();
788 OffsetSExt->eraseFromParent();
789 BaseLd->eraseFromParent();
790 OffsetLd->eraseFromParent();
791 }
792 LLVM_DEBUG(dbgs() << "Block after top bottom mul replacements:\n"
793 << *LoopBody << "\n");
794 return true;
795}
796
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000797// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
798// multiplications.
799// To use SMLAD:
800// 1) we first need to find integer add reduction PHIs,
801// 2) then from the PHI, look for this pattern:
802//
803// acc0 = phi i32 [0, %entry], [%acc1, %loop.body]
804// ld0 = load i16
805// sext0 = sext i16 %ld0 to i32
806// ld1 = load i16
807// sext1 = sext i16 %ld1 to i32
808// mul0 = mul %sext0, %sext1
809// ld2 = load i16
810// sext2 = sext i16 %ld2 to i32
811// ld3 = load i16
812// sext3 = sext i16 %ld3 to i32
813// mul1 = mul i32 %sext2, %sext3
814// add0 = add i32 %mul0, %acc0
815// acc1 = add i32 %add0, %mul1
816//
817// Which can be selected to:
818//
819// ldr.h r0
820// ldr.h r1
821// smlad r2, r0, r1, r2
822//
823// If constants are used instead of loads, these will need to be hoisted
824// out and into a register.
825//
826// If loop invariants are used instead of loads, these need to be packed
827// before the loop begins.
828//
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000829bool ARMParallelDSP::MatchSMLAD(Function &F) {
830 BasicBlock *Header = L->getHeader();
831 LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
832 dbgs() << "Header block:\n"; Header->dump();
833 dbgs() << "Loop info:\n\n"; L->dump());
834
Sam Parker89a37992018-07-23 15:25:59 +0000835 ReductionList Reductions;
836 MatchReductions(F, L, Header, Reductions);
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000837 if (Reductions.empty())
838 return false;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000839
840 for (auto &R : Reductions) {
Sam Parker89a37992018-07-23 15:25:59 +0000841 OpChainList MACCandidates;
842 MatchParallelMACSequences(R, MACCandidates);
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000843 if (!CheckMulMemory(MACCandidates))
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000844 continue;
Sam Parker89a37992018-07-23 15:25:59 +0000845
Fangrui Song58407ca2018-07-23 17:43:21 +0000846 R.MACCandidates = std::move(MACCandidates);
Sjoerd Meijer53449da2018-07-11 12:36:25 +0000847
848 LLVM_DEBUG(dbgs() << "MAC candidates:\n";
849 for (auto &M : R.MACCandidates)
Sam Parker89a37992018-07-23 15:25:59 +0000850 M->Root->dump();
Sjoerd Meijer53449da2018-07-11 12:36:25 +0000851 dbgs() << "\n";);
852 }
853
854 // Collect all instructions that may read or write memory. Our alias
855 // analysis checks bail out if any of these instructions aliases with an
856 // instruction from the MAC-chain.
857 Instructions Reads, Writes;
858 AliasCandidates(Header, Reads, Writes);
859
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000860 bool Changed = false;
Sjoerd Meijer53449da2018-07-11 12:36:25 +0000861 for (auto &R : Reductions) {
862 if (AreAliased(AA, Reads, Writes, R.MACCandidates))
863 return false;
864 PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
865 Changed |= InsertParallelMACs(R, PMACPairs);
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000866 }
867
868 LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
869 return Changed;
870}
871
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000872Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
Sam Parkera023c7a2018-09-12 09:17:44 +0000873 Instruction *Acc, bool Exchange,
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000874 Instruction *InsertAfter) {
Sam Parkera023c7a2018-09-12 09:17:44 +0000875 LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n"
876 << "- " << *VecLd0 << "\n"
877 << "- " << *VecLd1 << "\n"
878 << "- " << *Acc << "\n"
879 << "Exchange: " << Exchange << "\n");
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000880
881 IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
882 ++BasicBlock::iterator(InsertAfter));
883
884 // Replace the reduction chain with an intrinsic call
Sam Parker01db2982018-09-11 14:01:22 +0000885 const Type *Ty = IntegerType::get(M->getContext(), 32);
Sam Parker2ef3c0d2018-10-17 13:02:48 +0000886 LoadInst *NewLd0 = CreateLoadIns(Builder, &VecLd0[0], Ty);
887 LoadInst *NewLd1 = CreateLoadIns(Builder, &VecLd1[0], Ty);
Sam Parkera023c7a2018-09-12 09:17:44 +0000888 Value* Args[] = { NewLd0, NewLd1, Acc };
889 Function *SMLAD = nullptr;
890 if (Exchange)
891 SMLAD = Acc->getType()->isIntegerTy(32) ?
892 Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
893 Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
894 else
895 SMLAD = Acc->getType()->isIntegerTy(32) ?
896 Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
897 Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000898 CallInst *Call = Builder.CreateCall(SMLAD, Args);
Sjoerd Meijerb3e06fa2018-07-06 14:47:09 +0000899 NumSMLAD++;
Sjoerd Meijerc89ca552018-06-28 12:55:29 +0000900 return Call;
901}
902
903Pass *llvm::createARMParallelDSPPass() {
904 return new ARMParallelDSP();
905}
906
907char ARMParallelDSP::ID = 0;
908
Sjoerd Meijerb3e06fa2018-07-06 14:47:09 +0000909INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
Simon Pilgrimc09b5e32018-06-28 18:37:16 +0000910 "Transform loops to use DSP intrinsics", false, false)
Sjoerd Meijerb3e06fa2018-07-06 14:47:09 +0000911INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
Simon Pilgrimc09b5e32018-06-28 18:37:16 +0000912 "Transform loops to use DSP intrinsics", false, false)