blob: 7455a02b06dc01267ab451dea9509a7047b52c2c [file] [log] [blame]
Eugene Zelenko4d060b72017-07-29 00:56:56 +00001//===- HexagonLoopIdiomRecognition.cpp ------------------------------------===//
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00002//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#define DEBUG_TYPE "hexagon-lir"
11
Eugene Zelenko4d060b72017-07-29 00:56:56 +000012#include "llvm/ADT/APInt.h"
13#include "llvm/ADT/DenseMap.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000014#include "llvm/ADT/SetVector.h"
Eugene Zelenko4d060b72017-07-29 00:56:56 +000015#include "llvm/ADT/SmallPtrSet.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000016#include "llvm/ADT/SmallSet.h"
Eugene Zelenko4d060b72017-07-29 00:56:56 +000017#include "llvm/ADT/SmallVector.h"
18#include "llvm/ADT/StringRef.h"
19#include "llvm/ADT/Triple.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000020#include "llvm/Analysis/AliasAnalysis.h"
21#include "llvm/Analysis/InstructionSimplify.h"
Eugene Zelenko4d060b72017-07-29 00:56:56 +000022#include "llvm/Analysis/LoopInfo.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000023#include "llvm/Analysis/LoopPass.h"
Eugene Zelenko4d060b72017-07-29 00:56:56 +000024#include "llvm/Analysis/MemoryLocation.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000025#include "llvm/Analysis/ScalarEvolution.h"
26#include "llvm/Analysis/ScalarEvolutionExpander.h"
27#include "llvm/Analysis/ScalarEvolutionExpressions.h"
28#include "llvm/Analysis/TargetLibraryInfo.h"
29#include "llvm/Analysis/ValueTracking.h"
Eugene Zelenko4d060b72017-07-29 00:56:56 +000030#include "llvm/IR/Attributes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Constant.h"
33#include "llvm/IR/Constants.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000034#include "llvm/IR/DataLayout.h"
Eugene Zelenko4d060b72017-07-29 00:56:56 +000035#include "llvm/IR/DebugLoc.h"
36#include "llvm/IR/DerivedTypes.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000037#include "llvm/IR/Dominators.h"
Eugene Zelenko4d060b72017-07-29 00:56:56 +000038#include "llvm/IR/Function.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000039#include "llvm/IR/IRBuilder.h"
Eugene Zelenko4d060b72017-07-29 00:56:56 +000040#include "llvm/IR/InstrTypes.h"
41#include "llvm/IR/Instruction.h"
42#include "llvm/IR/Instructions.h"
43#include "llvm/IR/IntrinsicInst.h"
44#include "llvm/IR/Intrinsics.h"
45#include "llvm/IR/Module.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000046#include "llvm/IR/PatternMatch.h"
Eugene Zelenko4d060b72017-07-29 00:56:56 +000047#include "llvm/IR/Type.h"
48#include "llvm/IR/User.h"
49#include "llvm/IR/Value.h"
50#include "llvm/Pass.h"
51#include "llvm/Support/Casting.h"
52#include "llvm/Support/CommandLine.h"
53#include "llvm/Support/Compiler.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000054#include "llvm/Support/Debug.h"
Eugene Zelenko4d060b72017-07-29 00:56:56 +000055#include "llvm/Support/ErrorHandling.h"
Craig Topperb45eabc2017-04-26 16:39:58 +000056#include "llvm/Support/KnownBits.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000057#include "llvm/Support/raw_ostream.h"
Chandler Carruth6bda14b2017-06-06 11:49:48 +000058#include "llvm/Transforms/Scalar.h"
59#include "llvm/Transforms/Utils/Local.h"
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000060#include <algorithm>
61#include <array>
Eugene Zelenko4d060b72017-07-29 00:56:56 +000062#include <cassert>
63#include <cstdint>
64#include <cstdlib>
65#include <deque>
66#include <functional>
67#include <iterator>
68#include <map>
69#include <set>
70#include <utility>
71#include <vector>
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +000072
73using namespace llvm;
74
75static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
76 cl::Hidden, cl::init(false),
77 cl::desc("Disable generation of memcpy in loop idiom recognition"));
78
79static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
80 cl::Hidden, cl::init(false),
81 cl::desc("Disable generation of memmove in loop idiom recognition"));
82
83static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
84 cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
85 "check guarding the memmove."));
86
87static cl::opt<unsigned> CompileTimeMemSizeThreshold(
88 "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
89 cl::desc("Threshold (in bytes) to perform the transformation, if the "
90 "runtime loop count (mem transfer size) is known at compile-time."));
91
92static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
93 cl::Hidden, cl::init(true),
94 cl::desc("Only enable generating memmove in non-nested loops"));
95
96cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy",
97 cl::Hidden, cl::init(false),
98 cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
99
Krzysztof Parzyszek51fd5402017-06-01 18:00:47 +0000100static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
101 cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
102
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000103static const char *HexagonVolatileMemcpyName
104 = "hexagon_memcpy_forward_vp4cp4n2";
105
106
107namespace llvm {
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000108
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000109 void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
110 Pass *createHexagonLoopIdiomPass();
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000111
112} // end namespace llvm
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000113
114namespace {
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000115
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000116 class HexagonLoopIdiomRecognize : public LoopPass {
117 public:
118 static char ID;
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000119
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000120 explicit HexagonLoopIdiomRecognize() : LoopPass(ID) {
121 initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
122 }
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000123
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000124 StringRef getPassName() const override {
125 return "Recognize Hexagon-specific loop idioms";
126 }
127
128 void getAnalysisUsage(AnalysisUsage &AU) const override {
129 AU.addRequired<LoopInfoWrapperPass>();
130 AU.addRequiredID(LoopSimplifyID);
131 AU.addRequiredID(LCSSAID);
132 AU.addRequired<AAResultsWrapperPass>();
133 AU.addPreserved<AAResultsWrapperPass>();
134 AU.addRequired<ScalarEvolutionWrapperPass>();
135 AU.addRequired<DominatorTreeWrapperPass>();
136 AU.addRequired<TargetLibraryInfoWrapperPass>();
137 AU.addPreserved<TargetLibraryInfoWrapperPass>();
138 }
139
140 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
141
142 private:
143 unsigned getStoreSizeInBytes(StoreInst *SI);
144 int getSCEVStride(const SCEVAddRecExpr *StoreEv);
145 bool isLegalStore(Loop *CurLoop, StoreInst *SI);
146 void collectStores(Loop *CurLoop, BasicBlock *BB,
147 SmallVectorImpl<StoreInst*> &Stores);
148 bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount);
149 bool coverLoop(Loop *L, SmallVectorImpl<Instruction*> &Insts) const;
150 bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount,
151 SmallVectorImpl<BasicBlock*> &ExitBlocks);
152 bool runOnCountableLoop(Loop *L);
153
154 AliasAnalysis *AA;
155 const DataLayout *DL;
156 DominatorTree *DT;
157 LoopInfo *LF;
158 const TargetLibraryInfo *TLI;
159 ScalarEvolution *SE;
160 bool HasMemcpy, HasMemmove;
161 };
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000162
163 struct Simplifier {
164 using Rule = std::function<Value * (Instruction *, LLVMContext &)>;
165
166 void addRule(const Rule &R) { Rules.push_back(R); }
167
168 private:
169 struct WorkListType {
170 WorkListType() = default;
171
172 void push_back(Value* V) {
173 // Do not push back duplicates.
174 if (!S.count(V)) { Q.push_back(V); S.insert(V); }
175 }
176
177 Value *pop_front_val() {
178 Value *V = Q.front(); Q.pop_front(); S.erase(V);
179 return V;
180 }
181
182 bool empty() const { return Q.empty(); }
183
184 private:
185 std::deque<Value*> Q;
186 std::set<Value*> S;
187 };
188
189 using ValueSetType = std::set<Value *>;
190
191 std::vector<Rule> Rules;
192
193 public:
194 struct Context {
195 using ValueMapType = DenseMap<Value *, Value *>;
196
197 Value *Root;
198 ValueSetType Used; // The set of all cloned values used by Root.
199 ValueSetType Clones; // The set of all cloned values.
200 LLVMContext &Ctx;
201
202 Context(Instruction *Exp)
203 : Ctx(Exp->getParent()->getParent()->getContext()) {
204 initialize(Exp);
205 }
206
207 ~Context() { cleanup(); }
208
209 void print(raw_ostream &OS, const Value *V) const;
210 Value *materialize(BasicBlock *B, BasicBlock::iterator At);
211
212 private:
213 friend struct Simplifier;
214
215 void initialize(Instruction *Exp);
216 void cleanup();
217
218 template <typename FuncT> void traverse(Value *V, FuncT F);
219 void record(Value *V);
220 void use(Value *V);
221 void unuse(Value *V);
222
223 bool equal(const Instruction *I, const Instruction *J) const;
224 Value *find(Value *Tree, Value *Sub) const;
225 Value *subst(Value *Tree, Value *OldV, Value *NewV);
226 void replace(Value *OldV, Value *NewV);
227 void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At);
228 };
229
230 Value *simplify(Context &C);
231 };
232
233 struct PE {
234 PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
235
236 const Simplifier::Context &C;
237 const Value *V;
238 };
239
240 raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED;
241 raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
242 P.C.print(OS, P.V ? P.V : P.C.Root);
243 return OS;
244 }
245
246} // end anonymous namespace
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000247
248char HexagonLoopIdiomRecognize::ID = 0;
249
250INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
251 "Recognize Hexagon-specific loop idioms", false, false)
252INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
253INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
254INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
255INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
256INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
257INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
258INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
259INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
260 "Recognize Hexagon-specific loop idioms", false, false)
261
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000262template <typename FuncT>
263void Simplifier::Context::traverse(Value *V, FuncT F) {
264 WorkListType Q;
265 Q.push_back(V);
266
267 while (!Q.empty()) {
Krzysztof Parzyszek10fbac02017-03-23 23:01:22 +0000268 Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000269 if (!U || U->getParent())
270 continue;
271 if (!F(U))
272 continue;
273 for (Value *Op : U->operands())
274 Q.push_back(Op);
275 }
276}
277
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000278void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
279 const auto *U = dyn_cast<const Instruction>(V);
280 if (!U) {
281 OS << V << '(' << *V << ')';
282 return;
283 }
284
285 if (U->getParent()) {
286 OS << U << '(';
287 U->printAsOperand(OS, true);
288 OS << ')';
289 return;
290 }
291
292 unsigned N = U->getNumOperands();
293 if (N != 0)
294 OS << U << '(';
295 OS << U->getOpcodeName();
296 for (const Value *Op : U->operands()) {
297 OS << ' ';
298 print(OS, Op);
299 }
300 if (N != 0)
301 OS << ')';
302}
303
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000304void Simplifier::Context::initialize(Instruction *Exp) {
305 // Perform a deep clone of the expression, set Root to the root
306 // of the clone, and build a map from the cloned values to the
307 // original ones.
308 ValueMapType M;
309 BasicBlock *Block = Exp->getParent();
310 WorkListType Q;
311 Q.push_back(Exp);
312
313 while (!Q.empty()) {
Krzysztof Parzyszek10fbac02017-03-23 23:01:22 +0000314 Value *V = Q.pop_front_val();
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000315 if (M.find(V) != M.end())
316 continue;
317 if (Instruction *U = dyn_cast<Instruction>(V)) {
318 if (isa<PHINode>(U) || U->getParent() != Block)
319 continue;
320 for (Value *Op : U->operands())
321 Q.push_back(Op);
322 M.insert({U, U->clone()});
323 }
324 }
325
326 for (std::pair<Value*,Value*> P : M) {
327 Instruction *U = cast<Instruction>(P.second);
328 for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
329 auto F = M.find(U->getOperand(i));
330 if (F != M.end())
331 U->setOperand(i, F->second);
332 }
333 }
334
335 auto R = M.find(Exp);
336 assert(R != M.end());
337 Root = R->second;
338
339 record(Root);
340 use(Root);
341}
342
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000343void Simplifier::Context::record(Value *V) {
344 auto Record = [this](Instruction *U) -> bool {
345 Clones.insert(U);
346 return true;
347 };
348 traverse(V, Record);
349}
350
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000351void Simplifier::Context::use(Value *V) {
352 auto Use = [this](Instruction *U) -> bool {
353 Used.insert(U);
354 return true;
355 };
356 traverse(V, Use);
357}
358
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000359void Simplifier::Context::unuse(Value *V) {
360 if (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != nullptr)
361 return;
362
363 auto Unuse = [this](Instruction *U) -> bool {
364 if (!U->use_empty())
365 return false;
366 Used.erase(U);
367 return true;
368 };
369 traverse(V, Unuse);
370}
371
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000372Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) {
373 if (Tree == OldV)
374 return NewV;
375 if (OldV == NewV)
376 return Tree;
377
378 WorkListType Q;
379 Q.push_back(Tree);
380 while (!Q.empty()) {
Krzysztof Parzyszek10fbac02017-03-23 23:01:22 +0000381 Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000382 // If U is not an instruction, or it's not a clone, skip it.
383 if (!U || U->getParent())
384 continue;
385 for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
386 Value *Op = U->getOperand(i);
387 if (Op == OldV) {
388 U->setOperand(i, NewV);
389 unuse(OldV);
390 } else {
391 Q.push_back(Op);
392 }
393 }
394 }
395 return Tree;
396}
397
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000398void Simplifier::Context::replace(Value *OldV, Value *NewV) {
399 if (Root == OldV) {
400 Root = NewV;
401 use(Root);
402 return;
403 }
404
405 // NewV may be a complex tree that has just been created by one of the
406 // transformation rules. We need to make sure that it is commoned with
407 // the existing Root to the maximum extent possible.
408 // Identify all subtrees of NewV (including NewV itself) that have
409 // equivalent counterparts in Root, and replace those subtrees with
410 // these counterparts.
411 WorkListType Q;
412 Q.push_back(NewV);
413 while (!Q.empty()) {
Krzysztof Parzyszek10fbac02017-03-23 23:01:22 +0000414 Value *V = Q.pop_front_val();
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000415 Instruction *U = dyn_cast<Instruction>(V);
416 if (!U || U->getParent())
417 continue;
418 if (Value *DupV = find(Root, V)) {
419 if (DupV != V)
420 NewV = subst(NewV, V, DupV);
421 } else {
422 for (Value *Op : U->operands())
423 Q.push_back(Op);
424 }
425 }
426
427 // Now, simply replace OldV with NewV in Root.
428 Root = subst(Root, OldV, NewV);
429 use(Root);
430}
431
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000432void Simplifier::Context::cleanup() {
433 for (Value *V : Clones) {
434 Instruction *U = cast<Instruction>(V);
435 if (!U->getParent())
436 U->dropAllReferences();
437 }
438
439 for (Value *V : Clones) {
440 Instruction *U = cast<Instruction>(V);
441 if (!U->getParent())
Reid Kleckner96ab8722017-05-18 17:24:10 +0000442 U->deleteValue();
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000443 }
444}
445
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000446bool Simplifier::Context::equal(const Instruction *I,
447 const Instruction *J) const {
448 if (I == J)
449 return true;
450 if (!I->isSameOperationAs(J))
451 return false;
452 if (isa<PHINode>(I))
453 return I->isIdenticalTo(J);
454
455 for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
456 Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i);
457 if (OpI == OpJ)
458 continue;
459 auto *InI = dyn_cast<const Instruction>(OpI);
460 auto *InJ = dyn_cast<const Instruction>(OpJ);
461 if (InI && InJ) {
462 if (!equal(InI, InJ))
463 return false;
464 } else if (InI != InJ || !InI)
465 return false;
466 }
467 return true;
468}
469
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000470Value *Simplifier::Context::find(Value *Tree, Value *Sub) const {
471 Instruction *SubI = dyn_cast<Instruction>(Sub);
472 WorkListType Q;
473 Q.push_back(Tree);
474
475 while (!Q.empty()) {
Krzysztof Parzyszek10fbac02017-03-23 23:01:22 +0000476 Value *V = Q.pop_front_val();
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000477 if (V == Sub)
478 return V;
479 Instruction *U = dyn_cast<Instruction>(V);
480 if (!U || U->getParent())
481 continue;
482 if (SubI && equal(SubI, U))
483 return U;
484 assert(!isa<PHINode>(U));
485 for (Value *Op : U->operands())
486 Q.push_back(Op);
487 }
488 return nullptr;
489}
490
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000491void Simplifier::Context::link(Instruction *I, BasicBlock *B,
492 BasicBlock::iterator At) {
493 if (I->getParent())
494 return;
495
496 for (Value *Op : I->operands()) {
497 if (Instruction *OpI = dyn_cast<Instruction>(Op))
498 link(OpI, B, At);
499 }
500
501 B->getInstList().insert(At, I);
502}
503
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000504Value *Simplifier::Context::materialize(BasicBlock *B,
505 BasicBlock::iterator At) {
506 if (Instruction *RootI = dyn_cast<Instruction>(Root))
507 link(RootI, B, At);
508 return Root;
509}
510
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000511Value *Simplifier::simplify(Context &C) {
512 WorkListType Q;
513 Q.push_back(C.Root);
Krzysztof Parzyszek10fbac02017-03-23 23:01:22 +0000514 unsigned Count = 0;
Krzysztof Parzyszek51fd5402017-06-01 18:00:47 +0000515 const unsigned Limit = SimplifyLimit;
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000516
517 while (!Q.empty()) {
Krzysztof Parzyszek10fbac02017-03-23 23:01:22 +0000518 if (Count++ >= Limit)
519 break;
520 Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000521 if (!U || U->getParent() || !C.Used.count(U))
522 continue;
523 bool Changed = false;
524 for (Rule &R : Rules) {
525 Value *W = R(U, C.Ctx);
526 if (!W)
527 continue;
528 Changed = true;
529 C.record(W);
530 C.replace(U, W);
531 Q.push_back(C.Root);
532 break;
533 }
534 if (!Changed) {
535 for (Value *Op : U->operands())
536 Q.push_back(Op);
537 }
538 }
Krzysztof Parzyszek51fd5402017-06-01 18:00:47 +0000539 return Count < Limit ? C.Root : nullptr;
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000540}
541
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000542//===----------------------------------------------------------------------===//
543//
544// Implementation of PolynomialMultiplyRecognize
545//
546//===----------------------------------------------------------------------===//
547
548namespace {
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000549
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000550 class PolynomialMultiplyRecognize {
551 public:
552 explicit PolynomialMultiplyRecognize(Loop *loop, const DataLayout &dl,
553 const DominatorTree &dt, const TargetLibraryInfo &tli,
554 ScalarEvolution &se)
555 : CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {}
556
557 bool recognize();
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000558
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000559 private:
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000560 using ValueSeq = SetVector<Value *>;
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000561
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000562 IntegerType *getPmpyType() const {
563 LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
564 return IntegerType::get(Ctx, 32);
565 }
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000566
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000567 bool isPromotableTo(Value *V, IntegerType *Ty);
568 void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB);
569 bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB);
570
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000571 Value *getCountIV(BasicBlock *BB);
572 bool findCycle(Value *Out, Value *In, ValueSeq &Cycle);
573 void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
574 ValueSeq &Late);
575 bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late);
576 bool commutesWithShift(Instruction *I);
577 bool highBitsAreZero(Value *V, unsigned IterCount);
578 bool keepsHighBitsZero(Value *V, unsigned IterCount);
579 bool isOperandShifted(Instruction *I, Value *Op);
580 bool convertShiftsToLeft(BasicBlock *LoopB, BasicBlock *ExitB,
581 unsigned IterCount);
582 void cleanupLoopBody(BasicBlock *LoopB);
583
584 struct ParsedValues {
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000585 ParsedValues() = default;
586
587 Value *M = nullptr;
588 Value *P = nullptr;
589 Value *Q = nullptr;
590 Value *R = nullptr;
591 Value *X = nullptr;
592 Instruction *Res = nullptr;
593 unsigned IterCount = 0;
594 bool Left = false;
595 bool Inv = false;
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000596 };
597
598 bool matchLeftShift(SelectInst *SelI, Value *CIV, ParsedValues &PV);
599 bool matchRightShift(SelectInst *SelI, ParsedValues &PV);
600 bool scanSelect(SelectInst *SI, BasicBlock *LoopB, BasicBlock *PrehB,
601 Value *CIV, ParsedValues &PV, bool PreScan);
602 unsigned getInverseMxN(unsigned QP);
603 Value *generate(BasicBlock::iterator At, ParsedValues &PV);
604
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000605 void setupSimplifier();
606
607 Simplifier Simp;
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000608 Loop *CurLoop;
609 const DataLayout &DL;
610 const DominatorTree &DT;
611 const TargetLibraryInfo &TLI;
612 ScalarEvolution &SE;
613 };
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000614
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000615} // end anonymous namespace
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000616
617Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) {
618 pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
619 if (std::distance(PI, PE) != 2)
620 return nullptr;
621 BasicBlock *PB = (*PI == BB) ? *std::next(PI) : *PI;
622
623 for (auto I = BB->begin(), E = BB->end(); I != E && isa<PHINode>(I); ++I) {
624 auto *PN = cast<PHINode>(I);
625 Value *InitV = PN->getIncomingValueForBlock(PB);
626 if (!isa<ConstantInt>(InitV) || !cast<ConstantInt>(InitV)->isZero())
627 continue;
628 Value *IterV = PN->getIncomingValueForBlock(BB);
629 if (!isa<BinaryOperator>(IterV))
630 continue;
631 auto *BO = dyn_cast<BinaryOperator>(IterV);
632 if (BO->getOpcode() != Instruction::Add)
633 continue;
634 Value *IncV = nullptr;
635 if (BO->getOperand(0) == PN)
636 IncV = BO->getOperand(1);
637 else if (BO->getOperand(1) == PN)
638 IncV = BO->getOperand(0);
639 if (IncV == nullptr)
640 continue;
641
642 if (auto *T = dyn_cast<ConstantInt>(IncV))
643 if (T->getZExtValue() == 1)
644 return PN;
645 }
646 return nullptr;
647}
648
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000649static void replaceAllUsesOfWithIn(Value *I, Value *J, BasicBlock *BB) {
650 for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) {
651 Use &TheUse = UI.getUse();
652 ++UI;
653 if (auto *II = dyn_cast<Instruction>(TheUse.getUser()))
654 if (BB == II->getParent())
655 II->replaceUsesOfWith(I, J);
656 }
657}
658
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000659bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
660 Value *CIV, ParsedValues &PV) {
661 // Match the following:
662 // select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
663 // select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
664 // The condition may also check for equality with the masked value, i.e
665 // select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
666 // select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
667
668 Value *CondV = SelI->getCondition();
669 Value *TrueV = SelI->getTrueValue();
670 Value *FalseV = SelI->getFalseValue();
671
672 using namespace PatternMatch;
673
674 CmpInst::Predicate P;
675 Value *A = nullptr, *B = nullptr, *C = nullptr;
676
677 if (!match(CondV, m_ICmp(P, m_And(m_Value(A), m_Value(B)), m_Value(C))) &&
678 !match(CondV, m_ICmp(P, m_Value(C), m_And(m_Value(A), m_Value(B)))))
679 return false;
680 if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
681 return false;
682 // Matched: select (A & B) == C ? ... : ...
683 // select (A & B) != C ? ... : ...
684
685 Value *X = nullptr, *Sh1 = nullptr;
686 // Check (A & B) for (X & (1 << i)):
687 if (match(A, m_Shl(m_One(), m_Specific(CIV)))) {
688 Sh1 = A;
689 X = B;
690 } else if (match(B, m_Shl(m_One(), m_Specific(CIV)))) {
691 Sh1 = B;
692 X = A;
693 } else {
694 // TODO: Could also check for an induction variable containing single
695 // bit shifted left by 1 in each iteration.
696 return false;
697 }
698
699 bool TrueIfZero;
700
701 // Check C against the possible values for comparison: 0 and (1 << i):
702 if (match(C, m_Zero()))
703 TrueIfZero = (P == CmpInst::ICMP_EQ);
704 else if (C == Sh1)
705 TrueIfZero = (P == CmpInst::ICMP_NE);
706 else
707 return false;
708
709 // So far, matched:
710 // select (X & (1 << i)) ? ... : ...
711 // including variations of the check against zero/non-zero value.
712
713 Value *ShouldSameV = nullptr, *ShouldXoredV = nullptr;
714 if (TrueIfZero) {
715 ShouldSameV = TrueV;
716 ShouldXoredV = FalseV;
717 } else {
718 ShouldSameV = FalseV;
719 ShouldXoredV = TrueV;
720 }
721
722 Value *Q = nullptr, *R = nullptr, *Y = nullptr, *Z = nullptr;
723 Value *T = nullptr;
724 if (match(ShouldXoredV, m_Xor(m_Value(Y), m_Value(Z)))) {
725 // Matched: select +++ ? ... : Y ^ Z
726 // select +++ ? Y ^ Z : ...
727 // where +++ denotes previously checked matches.
728 if (ShouldSameV == Y)
729 T = Z;
730 else if (ShouldSameV == Z)
731 T = Y;
732 else
733 return false;
734 R = ShouldSameV;
735 // Matched: select +++ ? R : R ^ T
736 // select +++ ? R ^ T : R
737 // depending on TrueIfZero.
738
739 } else if (match(ShouldSameV, m_Zero())) {
740 // Matched: select +++ ? 0 : ...
741 // select +++ ? ... : 0
742 if (!SelI->hasOneUse())
743 return false;
744 T = ShouldXoredV;
745 // Matched: select +++ ? 0 : T
746 // select +++ ? T : 0
747
748 Value *U = *SelI->user_begin();
749 if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) &&
750 !match(U, m_Xor(m_Value(R), m_Specific(SelI))))
751 return false;
752 // Matched: xor (select +++ ? 0 : T), R
753 // xor (select +++ ? T : 0), R
754 } else
755 return false;
756
757 // The xor input value T is isolated into its own match so that it could
758 // be checked against an induction variable containing a shifted bit
759 // (todo).
760 // For now, check against (Q << i).
761 if (!match(T, m_Shl(m_Value(Q), m_Specific(CIV))) &&
762 !match(T, m_Shl(m_ZExt(m_Value(Q)), m_ZExt(m_Specific(CIV)))))
763 return false;
764 // Matched: select +++ ? R : R ^ (Q << i)
765 // select +++ ? R ^ (Q << i) : R
766
767 PV.X = X;
768 PV.Q = Q;
769 PV.R = R;
770 PV.Left = true;
771 return true;
772}
773
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000774bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
775 ParsedValues &PV) {
776 // Match the following:
777 // select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
778 // select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
779 // The condition may also check for equality with the masked value, i.e
780 // select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
781 // select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
782
783 Value *CondV = SelI->getCondition();
784 Value *TrueV = SelI->getTrueValue();
785 Value *FalseV = SelI->getFalseValue();
786
787 using namespace PatternMatch;
788
789 Value *C = nullptr;
790 CmpInst::Predicate P;
791 bool TrueIfZero;
792
793 if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) ||
794 match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))) {
795 if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
796 return false;
797 // Matched: select C == 0 ? ... : ...
798 // select C != 0 ? ... : ...
799 TrueIfZero = (P == CmpInst::ICMP_EQ);
800 } else if (match(CondV, m_ICmp(P, m_Value(C), m_One())) ||
801 match(CondV, m_ICmp(P, m_One(), m_Value(C)))) {
802 if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
803 return false;
804 // Matched: select C == 1 ? ... : ...
805 // select C != 1 ? ... : ...
806 TrueIfZero = (P == CmpInst::ICMP_NE);
807 } else
808 return false;
809
810 Value *X = nullptr;
811 if (!match(C, m_And(m_Value(X), m_One())) &&
812 !match(C, m_And(m_One(), m_Value(X))))
813 return false;
814 // Matched: select (X & 1) == +++ ? ... : ...
815 // select (X & 1) != +++ ? ... : ...
816
817 Value *R = nullptr, *Q = nullptr;
818 if (TrueIfZero) {
819 // The select's condition is true if the tested bit is 0.
820 // TrueV must be the shift, FalseV must be the xor.
821 if (!match(TrueV, m_LShr(m_Value(R), m_One())))
822 return false;
823 // Matched: select +++ ? (R >> 1) : ...
824 if (!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) &&
825 !match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV))))
826 return false;
827 // Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
828 // with commuting ^.
829 } else {
830 // The select's condition is true if the tested bit is 1.
831 // TrueV must be the xor, FalseV must be the shift.
832 if (!match(FalseV, m_LShr(m_Value(R), m_One())))
833 return false;
834 // Matched: select +++ ? ... : (R >> 1)
835 if (!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) &&
836 !match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV))))
837 return false;
838 // Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
839 // with commuting ^.
840 }
841
842 PV.X = X;
843 PV.Q = Q;
844 PV.R = R;
845 PV.Left = false;
846 return true;
847}
848
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000849bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
850 BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
851 bool PreScan) {
852 using namespace PatternMatch;
Eugene Zelenko4d060b72017-07-29 00:56:56 +0000853
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +0000854 // The basic pattern for R = P.Q is:
855 // for i = 0..31
856 // R = phi (0, R')
857 // if (P & (1 << i)) ; test-bit(P, i)
858 // R' = R ^ (Q << i)
859 //
860 // Similarly, the basic pattern for R = (P/Q).Q - P
861 // for i = 0..31
862 // R = phi(P, R')
863 // if (R & (1 << i))
864 // R' = R ^ (Q << i)
865
866 // There exist idioms, where instead of Q being shifted left, P is shifted
867 // right. This produces a result that is shifted right by 32 bits (the
868 // non-shifted result is 64-bit).
869 //
870 // For R = P.Q, this would be:
871 // for i = 0..31
872 // R = phi (0, R')
873 // if ((P >> i) & 1)
874 // R' = (R >> 1) ^ Q ; R is cycled through the loop, so it must
875 // else ; be shifted by 1, not i.
876 // R' = R >> 1
877 //
878 // And for the inverse:
879 // for i = 0..31
880 // R = phi (P, R')
881 // if (R & 1)
882 // R' = (R >> 1) ^ Q
883 // else
884 // R' = R >> 1
885
886 // The left-shifting idioms share the same pattern:
887 // select (X & (1 << i)) ? R ^ (Q << i) : R
888 // Similarly for right-shifting idioms:
889 // select (X & 1) ? (R >> 1) ^ Q
890
891 if (matchLeftShift(SelI, CIV, PV)) {
892 // If this is a pre-scan, getting this far is sufficient.
893 if (PreScan)
894 return true;
895
896 // Need to make sure that the SelI goes back into R.
897 auto *RPhi = dyn_cast<PHINode>(PV.R);
898 if (!RPhi)
899 return false;
900 if (SelI != RPhi->getIncomingValueForBlock(LoopB))
901 return false;
902 PV.Res = SelI;
903
904 // If X is loop invariant, it must be the input polynomial, and the
905 // idiom is the basic polynomial multiply.
906 if (CurLoop->isLoopInvariant(PV.X)) {
907 PV.P = PV.X;
908 PV.Inv = false;
909 } else {
910 // X is not loop invariant. If X == R, this is the inverse pmpy.
911 // Otherwise, check for an xor with an invariant value. If the
912 // variable argument to the xor is R, then this is still a valid
913 // inverse pmpy.
914 PV.Inv = true;
915 if (PV.X != PV.R) {
916 Value *Var = nullptr, *Inv = nullptr, *X1 = nullptr, *X2 = nullptr;
917 if (!match(PV.X, m_Xor(m_Value(X1), m_Value(X2))))
918 return false;
919 auto *I1 = dyn_cast<Instruction>(X1);
920 auto *I2 = dyn_cast<Instruction>(X2);
921 if (!I1 || I1->getParent() != LoopB) {
922 Var = X2;
923 Inv = X1;
924 } else if (!I2 || I2->getParent() != LoopB) {
925 Var = X1;
926 Inv = X2;
927 } else
928 return false;
929 if (Var != PV.R)
930 return false;
931 PV.M = Inv;
932 }
933 // The input polynomial P still needs to be determined. It will be
934 // the entry value of R.
935 Value *EntryP = RPhi->getIncomingValueForBlock(PrehB);
936 PV.P = EntryP;
937 }
938
939 return true;
940 }
941
942 if (matchRightShift(SelI, PV)) {
943 // If this is an inverse pattern, the Q polynomial must be known at
944 // compile time.
945 if (PV.Inv && !isa<ConstantInt>(PV.Q))
946 return false;
947 if (PreScan)
948 return true;
949 // There is no exact matching of right-shift pmpy.
950 return false;
951 }
952
953 return false;
954}
955
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000956bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
957 IntegerType *DestTy) {
958 IntegerType *T = dyn_cast<IntegerType>(Val->getType());
959 if (!T || T->getBitWidth() > DestTy->getBitWidth())
960 return false;
961 if (T->getBitWidth() == DestTy->getBitWidth())
962 return true;
963 // Non-instructions are promotable. The reason why an instruction may not
964 // be promotable is that it may produce a different result if its operands
965 // and the result are promoted, for example, it may produce more non-zero
966 // bits. While it would still be possible to represent the proper result
967 // in a wider type, it may require adding additional instructions (which
968 // we don't want to do).
969 Instruction *In = dyn_cast<Instruction>(Val);
970 if (!In)
971 return true;
972 // The bitwidth of the source type is smaller than the destination.
973 // Check if the individual operation can be promoted.
974 switch (In->getOpcode()) {
975 case Instruction::PHI:
976 case Instruction::ZExt:
977 case Instruction::And:
978 case Instruction::Or:
979 case Instruction::Xor:
980 case Instruction::LShr: // Shift right is ok.
981 case Instruction::Select:
982 return true;
983 case Instruction::ICmp:
984 if (CmpInst *CI = cast<CmpInst>(In))
985 return CI->isEquality() || CI->isUnsigned();
986 llvm_unreachable("Cast failed unexpectedly");
987 case Instruction::Add:
988 return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
989 }
990 return false;
991}
992
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +0000993void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
994 IntegerType *DestTy, BasicBlock *LoopB) {
995 // Leave boolean values alone.
996 if (!In->getType()->isIntegerTy(1))
997 In->mutateType(DestTy);
998 unsigned DestBW = DestTy->getBitWidth();
999
1000 // Handle PHIs.
1001 if (PHINode *P = dyn_cast<PHINode>(In)) {
1002 unsigned N = P->getNumIncomingValues();
1003 for (unsigned i = 0; i != N; ++i) {
1004 BasicBlock *InB = P->getIncomingBlock(i);
1005 if (InB == LoopB)
1006 continue;
1007 Value *InV = P->getIncomingValue(i);
1008 IntegerType *Ty = cast<IntegerType>(InV->getType());
1009 // Do not promote values in PHI nodes of type i1.
1010 if (Ty != P->getType()) {
1011 // If the value type does not match the PHI type, the PHI type
1012 // must have been promoted.
1013 assert(Ty->getBitWidth() < DestBW);
1014 InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
1015 P->setIncomingValue(i, InV);
1016 }
1017 }
1018 } else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
1019 Value *Op = Z->getOperand(0);
1020 if (Op->getType() == Z->getType())
1021 Z->replaceAllUsesWith(Op);
1022 Z->eraseFromParent();
1023 return;
1024 }
1025
1026 // Promote immediates.
1027 for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
1028 if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
1029 if (CI->getType()->getBitWidth() < DestBW)
1030 In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
1031 }
1032}
1033
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +00001034bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
1035 BasicBlock *ExitB) {
1036 assert(LoopB);
1037 // Skip loops where the exit block has more than one predecessor. The values
1038 // coming from the loop block will be promoted to another type, and so the
1039 // values coming into the exit block from other predecessors would also have
1040 // to be promoted.
1041 if (!ExitB || (ExitB->getSinglePredecessor() != LoopB))
1042 return false;
1043 IntegerType *DestTy = getPmpyType();
1044 // Check if the exit values have types that are no wider than the type
1045 // that we want to promote to.
1046 unsigned DestBW = DestTy->getBitWidth();
1047 for (Instruction &In : *ExitB) {
1048 PHINode *P = dyn_cast<PHINode>(&In);
1049 if (!P)
1050 break;
1051 if (P->getNumIncomingValues() != 1)
1052 return false;
1053 assert(P->getIncomingBlock(0) == LoopB);
1054 IntegerType *T = dyn_cast<IntegerType>(P->getType());
1055 if (!T || T->getBitWidth() > DestBW)
1056 return false;
1057 }
1058
1059 // Check all instructions in the loop.
1060 for (Instruction &In : *LoopB)
1061 if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
1062 return false;
1063
1064 // Perform the promotion.
1065 std::vector<Instruction*> LoopIns;
1066 std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
1067 [](Instruction &In) { return &In; });
1068 for (Instruction *In : LoopIns)
1069 promoteTo(In, DestTy, LoopB);
1070
1071 // Fix up the PHI nodes in the exit block.
1072 Instruction *EndI = ExitB->getFirstNonPHI();
1073 BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
1074 for (auto I = ExitB->begin(); I != End; ++I) {
1075 PHINode *P = dyn_cast<PHINode>(I);
1076 if (!P)
1077 break;
1078 Type *Ty0 = P->getIncomingValue(0)->getType();
1079 Type *PTy = P->getType();
1080 if (PTy != Ty0) {
1081 assert(Ty0 == DestTy);
1082 // In order to create the trunc, P must have the promoted type.
1083 P->mutateType(Ty0);
1084 Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
1085 // In order for the RAUW to work, the types of P and T must match.
1086 P->mutateType(PTy);
1087 P->replaceAllUsesWith(T);
1088 // Final update of the P's type.
1089 P->mutateType(Ty0);
1090 cast<Instruction>(T)->setOperand(0, P);
1091 }
1092 }
1093
1094 return true;
1095}
1096
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001097bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
1098 ValueSeq &Cycle) {
1099 // Out = ..., In, ...
1100 if (Out == In)
1101 return true;
1102
1103 auto *BB = cast<Instruction>(Out)->getParent();
1104 bool HadPhi = false;
1105
1106 for (auto U : Out->users()) {
1107 auto *I = dyn_cast<Instruction>(&*U);
1108 if (I == nullptr || I->getParent() != BB)
1109 continue;
1110 // Make sure that there are no multi-iteration cycles, e.g.
1111 // p1 = phi(p2)
1112 // p2 = phi(p1)
1113 // The cycle p1->p2->p1 would span two loop iterations.
1114 // Check that there is only one phi in the cycle.
1115 bool IsPhi = isa<PHINode>(I);
1116 if (IsPhi && HadPhi)
1117 return false;
1118 HadPhi |= IsPhi;
1119 if (Cycle.count(I))
1120 return false;
1121 Cycle.insert(I);
1122 if (findCycle(I, In, Cycle))
1123 break;
1124 Cycle.remove(I);
1125 }
1126 return !Cycle.empty();
1127}
1128
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001129void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
1130 ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
1131 // All the values in the cycle that are between the phi node and the
1132 // divider instruction will be classified as "early", all other values
1133 // will be "late".
1134
1135 bool IsE = true;
1136 unsigned I, N = Cycle.size();
1137 for (I = 0; I < N; ++I) {
1138 Value *V = Cycle[I];
1139 if (DivI == V)
1140 IsE = false;
1141 else if (!isa<PHINode>(V))
1142 continue;
1143 // Stop if found either.
1144 break;
1145 }
1146 // "I" is the index of either DivI or the phi node, whichever was first.
1147 // "E" is "false" or "true" respectively.
1148 ValueSeq &First = !IsE ? Early : Late;
1149 for (unsigned J = 0; J < I; ++J)
1150 First.insert(Cycle[J]);
1151
1152 ValueSeq &Second = IsE ? Early : Late;
1153 Second.insert(Cycle[I]);
1154 for (++I; I < N; ++I) {
1155 Value *V = Cycle[I];
1156 if (DivI == V || isa<PHINode>(V))
1157 break;
1158 Second.insert(V);
1159 }
1160
1161 for (; I < N; ++I)
1162 First.insert(Cycle[I]);
1163}
1164
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001165bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
1166 ValueSeq &Early, ValueSeq &Late) {
1167 // Select is an exception, since the condition value does not have to be
1168 // classified in the same way as the true/false values. The true/false
1169 // values do have to be both early or both late.
1170 if (UseI->getOpcode() == Instruction::Select) {
1171 Value *TV = UseI->getOperand(1), *FV = UseI->getOperand(2);
1172 if (Early.count(TV) || Early.count(FV)) {
1173 if (Late.count(TV) || Late.count(FV))
1174 return false;
1175 Early.insert(UseI);
1176 } else if (Late.count(TV) || Late.count(FV)) {
1177 if (Early.count(TV) || Early.count(FV))
1178 return false;
1179 Late.insert(UseI);
1180 }
1181 return true;
1182 }
1183
1184 // Not sure what would be the example of this, but the code below relies
1185 // on having at least one operand.
1186 if (UseI->getNumOperands() == 0)
1187 return true;
1188
1189 bool AE = true, AL = true;
1190 for (auto &I : UseI->operands()) {
1191 if (Early.count(&*I))
1192 AL = false;
1193 else if (Late.count(&*I))
1194 AE = false;
1195 }
1196 // If the operands appear "all early" and "all late" at the same time,
1197 // then it means that none of them are actually classified as either.
1198 // This is harmless.
1199 if (AE && AL)
1200 return true;
1201 // Conversely, if they are neither "all early" nor "all late", then
1202 // we have a mixture of early and late operands that is not a known
1203 // exception.
1204 if (!AE && !AL)
1205 return false;
1206
1207 // Check that we have covered the two special cases.
1208 assert(AE != AL);
1209
1210 if (AE)
1211 Early.insert(UseI);
1212 else
1213 Late.insert(UseI);
1214 return true;
1215}
1216
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001217bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) {
1218 switch (I->getOpcode()) {
1219 case Instruction::And:
1220 case Instruction::Or:
1221 case Instruction::Xor:
1222 case Instruction::LShr:
1223 case Instruction::Shl:
1224 case Instruction::Select:
1225 case Instruction::ICmp:
1226 case Instruction::PHI:
1227 break;
1228 default:
1229 return false;
1230 }
1231 return true;
1232}
1233
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001234bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
1235 unsigned IterCount) {
1236 auto *T = dyn_cast<IntegerType>(V->getType());
1237 if (!T)
1238 return false;
1239
Craig Topperb45eabc2017-04-26 16:39:58 +00001240 KnownBits Known(T->getBitWidth());
1241 computeKnownBits(V, Known, DL);
Craig Topper8df66c62017-05-12 17:20:30 +00001242 return Known.countMinLeadingZeros() >= IterCount;
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001243}
1244
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001245bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
1246 unsigned IterCount) {
1247 // Assume that all inputs to the value have the high bits zero.
1248 // Check if the value itself preserves the zeros in the high bits.
1249 if (auto *C = dyn_cast<ConstantInt>(V))
1250 return C->getValue().countLeadingZeros() >= IterCount;
1251
1252 if (auto *I = dyn_cast<Instruction>(V)) {
1253 switch (I->getOpcode()) {
1254 case Instruction::And:
1255 case Instruction::Or:
1256 case Instruction::Xor:
1257 case Instruction::LShr:
1258 case Instruction::Select:
1259 case Instruction::ICmp:
1260 case Instruction::PHI:
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +00001261 case Instruction::ZExt:
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001262 return true;
1263 }
1264 }
1265
1266 return false;
1267}
1268
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001269bool PolynomialMultiplyRecognize::isOperandShifted(Instruction *I, Value *Op) {
1270 unsigned Opc = I->getOpcode();
1271 if (Opc == Instruction::Shl || Opc == Instruction::LShr)
1272 return Op != I->getOperand(1);
1273 return true;
1274}
1275
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001276bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
1277 BasicBlock *ExitB, unsigned IterCount) {
1278 Value *CIV = getCountIV(LoopB);
1279 if (CIV == nullptr)
1280 return false;
1281 auto *CIVTy = dyn_cast<IntegerType>(CIV->getType());
1282 if (CIVTy == nullptr)
1283 return false;
1284
1285 ValueSeq RShifts;
1286 ValueSeq Early, Late, Cycled;
1287
1288 // Find all value cycles that contain logical right shifts by 1.
1289 for (Instruction &I : *LoopB) {
1290 using namespace PatternMatch;
Eugene Zelenko4d060b72017-07-29 00:56:56 +00001291
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001292 Value *V = nullptr;
1293 if (!match(&I, m_LShr(m_Value(V), m_One())))
1294 continue;
1295 ValueSeq C;
1296 if (!findCycle(&I, V, C))
1297 continue;
1298
1299 // Found a cycle.
1300 C.insert(&I);
1301 classifyCycle(&I, C, Early, Late);
1302 Cycled.insert(C.begin(), C.end());
1303 RShifts.insert(&I);
1304 }
1305
1306 // Find the set of all values affected by the shift cycles, i.e. all
1307 // cycled values, and (recursively) all their users.
1308 ValueSeq Users(Cycled.begin(), Cycled.end());
1309 for (unsigned i = 0; i < Users.size(); ++i) {
1310 Value *V = Users[i];
1311 if (!isa<IntegerType>(V->getType()))
1312 return false;
1313 auto *R = cast<Instruction>(V);
1314 // If the instruction does not commute with shifts, the loop cannot
1315 // be unshifted.
1316 if (!commutesWithShift(R))
1317 return false;
1318 for (auto I = R->user_begin(), E = R->user_end(); I != E; ++I) {
1319 auto *T = cast<Instruction>(*I);
1320 // Skip users from outside of the loop. They will be handled later.
1321 // Also, skip the right-shifts and phi nodes, since they mix early
1322 // and late values.
1323 if (T->getParent() != LoopB || RShifts.count(T) || isa<PHINode>(T))
1324 continue;
1325
1326 Users.insert(T);
1327 if (!classifyInst(T, Early, Late))
1328 return false;
1329 }
1330 }
1331
Eugene Zelenko4d060b72017-07-29 00:56:56 +00001332 if (Users.empty())
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001333 return false;
1334
1335 // Verify that high bits remain zero.
1336 ValueSeq Internal(Users.begin(), Users.end());
1337 ValueSeq Inputs;
1338 for (unsigned i = 0; i < Internal.size(); ++i) {
1339 auto *R = dyn_cast<Instruction>(Internal[i]);
1340 if (!R)
1341 continue;
1342 for (Value *Op : R->operands()) {
1343 auto *T = dyn_cast<Instruction>(Op);
1344 if (T && T->getParent() != LoopB)
1345 Inputs.insert(Op);
1346 else
1347 Internal.insert(Op);
1348 }
1349 }
1350 for (Value *V : Inputs)
1351 if (!highBitsAreZero(V, IterCount))
1352 return false;
1353 for (Value *V : Internal)
1354 if (!keepsHighBitsZero(V, IterCount))
1355 return false;
1356
1357 // Finally, the work can be done. Unshift each user.
1358 IRBuilder<> IRB(LoopB);
1359 std::map<Value*,Value*> ShiftMap;
Eugene Zelenko4d060b72017-07-29 00:56:56 +00001360
1361 using CastMapType = std::map<std::pair<Value *, Type *>, Value *>;
1362
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001363 CastMapType CastMap;
1364
1365 auto upcast = [] (CastMapType &CM, IRBuilder<> &IRB, Value *V,
1366 IntegerType *Ty) -> Value* {
1367 auto H = CM.find(std::make_pair(V, Ty));
1368 if (H != CM.end())
1369 return H->second;
1370 Value *CV = IRB.CreateIntCast(V, Ty, false);
1371 CM.insert(std::make_pair(std::make_pair(V, Ty), CV));
1372 return CV;
1373 };
1374
1375 for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) {
Eugene Zelenko4d060b72017-07-29 00:56:56 +00001376 using namespace PatternMatch;
1377
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001378 if (isa<PHINode>(I) || !Users.count(&*I))
1379 continue;
Eugene Zelenko4d060b72017-07-29 00:56:56 +00001380
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001381 // Match lshr x, 1.
1382 Value *V = nullptr;
1383 if (match(&*I, m_LShr(m_Value(V), m_One()))) {
1384 replaceAllUsesOfWithIn(&*I, V, LoopB);
1385 continue;
1386 }
1387 // For each non-cycled operand, replace it with the corresponding
1388 // value shifted left.
1389 for (auto &J : I->operands()) {
1390 Value *Op = J.get();
1391 if (!isOperandShifted(&*I, Op))
1392 continue;
1393 if (Users.count(Op))
1394 continue;
1395 // Skip shifting zeros.
1396 if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
1397 continue;
1398 // Check if we have already generated a shift for this value.
1399 auto F = ShiftMap.find(Op);
1400 Value *W = (F != ShiftMap.end()) ? F->second : nullptr;
1401 if (W == nullptr) {
1402 IRB.SetInsertPoint(&*I);
1403 // First, the shift amount will be CIV or CIV+1, depending on
1404 // whether the value is early or late. Instead of creating CIV+1,
1405 // do a single shift of the value.
1406 Value *ShAmt = CIV, *ShVal = Op;
1407 auto *VTy = cast<IntegerType>(ShVal->getType());
1408 auto *ATy = cast<IntegerType>(ShAmt->getType());
1409 if (Late.count(&*I))
1410 ShVal = IRB.CreateShl(Op, ConstantInt::get(VTy, 1));
1411 // Second, the types of the shifted value and the shift amount
1412 // must match.
1413 if (VTy != ATy) {
1414 if (VTy->getBitWidth() < ATy->getBitWidth())
1415 ShVal = upcast(CastMap, IRB, ShVal, ATy);
1416 else
1417 ShAmt = upcast(CastMap, IRB, ShAmt, VTy);
1418 }
1419 // Ready to generate the shift and memoize it.
1420 W = IRB.CreateShl(ShVal, ShAmt);
1421 ShiftMap.insert(std::make_pair(Op, W));
1422 }
1423 I->replaceUsesOfWith(Op, W);
1424 }
1425 }
1426
1427 // Update the users outside of the loop to account for having left
1428 // shifts. They would normally be shifted right in the loop, so shift
1429 // them right after the loop exit.
1430 // Take advantage of the loop-closed SSA form, which has all the post-
1431 // loop values in phi nodes.
1432 IRB.SetInsertPoint(ExitB, ExitB->getFirstInsertionPt());
1433 for (auto P = ExitB->begin(), Q = ExitB->end(); P != Q; ++P) {
1434 if (!isa<PHINode>(P))
1435 break;
1436 auto *PN = cast<PHINode>(P);
1437 Value *U = PN->getIncomingValueForBlock(LoopB);
1438 if (!Users.count(U))
1439 continue;
1440 Value *S = IRB.CreateLShr(PN, ConstantInt::get(PN->getType(), IterCount));
1441 PN->replaceAllUsesWith(S);
1442 // The above RAUW will create
1443 // S = lshr S, IterCount
1444 // so we need to fix it back into
1445 // S = lshr PN, IterCount
1446 cast<User>(S)->replaceUsesOfWith(S, PN);
1447 }
1448
1449 return true;
1450}
1451
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001452void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
1453 for (auto &I : *LoopB)
Daniel Berlin4d0fe642017-04-28 19:55:38 +00001454 if (Value *SV = SimplifyInstruction(&I, {DL, &TLI, &DT}))
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001455 I.replaceAllUsesWith(SV);
1456
1457 for (auto I = LoopB->begin(), N = I; I != LoopB->end(); I = N) {
1458 N = std::next(I);
1459 RecursivelyDeleteTriviallyDeadInstructions(&*I, &TLI);
1460 }
1461}
1462
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001463unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
1464 // Arrays of coefficients of Q and the inverse, C.
1465 // Q[i] = coefficient at x^i.
1466 std::array<char,32> Q, C;
1467
1468 for (unsigned i = 0; i < 32; ++i) {
1469 Q[i] = QP & 1;
1470 QP >>= 1;
1471 }
1472 assert(Q[0] == 1);
1473
1474 // Find C, such that
1475 // (Q[n]*x^n + ... + Q[1]*x + Q[0]) * (C[n]*x^n + ... + C[1]*x + C[0]) = 1
1476 //
1477 // For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
1478 // operations * and + are & and ^ respectively.
1479 //
1480 // Find C[i] recursively, by comparing i-th coefficient in the product
1481 // with 0 (or 1 for i=0).
1482 //
1483 // C[0] = 1, since C[0] = Q[0], and Q[0] = 1.
1484 C[0] = 1;
1485 for (unsigned i = 1; i < 32; ++i) {
1486 // Solve for C[i] in:
1487 // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
1488 // This is equivalent to
1489 // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
1490 // which is
1491 // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
1492 unsigned T = 0;
1493 for (unsigned j = 0; j < i; ++j)
1494 T = T ^ (C[j] & Q[i-j]);
1495 C[i] = T;
1496 }
1497
1498 unsigned QV = 0;
1499 for (unsigned i = 0; i < 32; ++i)
1500 if (C[i])
1501 QV |= (1 << i);
1502
1503 return QV;
1504}
1505
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001506Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
1507 ParsedValues &PV) {
1508 IRBuilder<> B(&*At);
1509 Module *M = At->getParent()->getParent()->getParent();
1510 Value *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
1511
1512 Value *P = PV.P, *Q = PV.Q, *P0 = P;
1513 unsigned IC = PV.IterCount;
1514
1515 if (PV.M != nullptr)
1516 P0 = P = B.CreateXor(P, PV.M);
1517
1518 // Create a bit mask to clear the high bits beyond IterCount.
1519 auto *BMI = ConstantInt::get(P->getType(), APInt::getLowBitsSet(32, IC));
1520
1521 if (PV.IterCount != 32)
1522 P = B.CreateAnd(P, BMI);
1523
1524 if (PV.Inv) {
1525 auto *QI = dyn_cast<ConstantInt>(PV.Q);
1526 assert(QI && QI->getBitWidth() <= 32);
1527
1528 // Again, clearing bits beyond IterCount.
1529 unsigned M = (1 << PV.IterCount) - 1;
1530 unsigned Tmp = (QI->getZExtValue() | 1) & M;
1531 unsigned QV = getInverseMxN(Tmp) & M;
1532 auto *QVI = ConstantInt::get(QI->getType(), QV);
1533 P = B.CreateCall(PMF, {P, QVI});
1534 P = B.CreateTrunc(P, QI->getType());
1535 if (IC != 32)
1536 P = B.CreateAnd(P, BMI);
1537 }
1538
1539 Value *R = B.CreateCall(PMF, {P, Q});
1540
1541 if (PV.M != nullptr)
1542 R = B.CreateXor(R, B.CreateIntCast(P0, R->getType(), false));
1543
1544 return R;
1545}
1546
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +00001547void PolynomialMultiplyRecognize::setupSimplifier() {
1548 Simp.addRule(
1549 // Sink zext past bitwise operations.
1550 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1551 if (I->getOpcode() != Instruction::ZExt)
1552 return nullptr;
1553 Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
1554 if (!T)
1555 return nullptr;
1556 switch (T->getOpcode()) {
1557 case Instruction::And:
1558 case Instruction::Or:
1559 case Instruction::Xor:
1560 break;
1561 default:
1562 return nullptr;
1563 }
1564 IRBuilder<> B(Ctx);
1565 return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
1566 B.CreateZExt(T->getOperand(0), I->getType()),
1567 B.CreateZExt(T->getOperand(1), I->getType()));
1568 });
1569 Simp.addRule(
1570 // (xor (and x a) (and y a)) -> (and (xor x y) a)
1571 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1572 if (I->getOpcode() != Instruction::Xor)
1573 return nullptr;
1574 Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
1575 Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
1576 if (!And0 || !And1)
1577 return nullptr;
1578 if (And0->getOpcode() != Instruction::And ||
1579 And1->getOpcode() != Instruction::And)
1580 return nullptr;
1581 if (And0->getOperand(1) != And1->getOperand(1))
1582 return nullptr;
1583 IRBuilder<> B(Ctx);
1584 return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
1585 And0->getOperand(1));
1586 });
1587 Simp.addRule(
1588 // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
1589 // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
1590 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1591 BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
1592 if (!BO)
1593 return nullptr;
1594 Instruction::BinaryOps Op = BO->getOpcode();
1595 if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
1596 IRBuilder<> B(Ctx);
1597 Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue();
1598 Value *Z = BO->getOperand(1);
1599 return B.CreateSelect(Sel->getCondition(),
1600 B.CreateBinOp(Op, X, Z),
1601 B.CreateBinOp(Op, Y, Z));
1602 }
1603 if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
1604 IRBuilder<> B(Ctx);
1605 Value *X = BO->getOperand(0);
1606 Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue();
1607 return B.CreateSelect(Sel->getCondition(),
1608 B.CreateBinOp(Op, X, Y),
1609 B.CreateBinOp(Op, X, Z));
1610 }
1611 return nullptr;
1612 });
1613 Simp.addRule(
1614 // (select c (select c x y) z) -> (select c x z)
1615 // (select c x (select c y z)) -> (select c x z)
1616 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1617 SelectInst *Sel = dyn_cast<SelectInst>(I);
1618 if (!Sel)
1619 return nullptr;
1620 IRBuilder<> B(Ctx);
1621 Value *C = Sel->getCondition();
1622 if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
1623 if (Sel0->getCondition() == C)
1624 return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
1625 }
1626 if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
1627 if (Sel1->getCondition() == C)
1628 return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
1629 }
1630 return nullptr;
1631 });
1632 Simp.addRule(
1633 // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
1634 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1635 if (I->getOpcode() != Instruction::Or)
1636 return nullptr;
1637 Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
1638 if (!LShr || LShr->getOpcode() != Instruction::LShr)
1639 return nullptr;
1640 ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
1641 if (!One || One->getZExtValue() != 1)
1642 return nullptr;
1643 ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
1644 if (!Msb || Msb->getZExtValue() != Msb->getType()->getSignBit())
1645 return nullptr;
1646 return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
1647 });
1648 Simp.addRule(
1649 // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
1650 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1651 if (I->getOpcode() != Instruction::LShr)
1652 return nullptr;
1653 BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
1654 if (!BitOp)
1655 return nullptr;
1656 switch (BitOp->getOpcode()) {
1657 case Instruction::And:
1658 case Instruction::Or:
1659 case Instruction::Xor:
1660 break;
1661 default:
1662 return nullptr;
1663 }
1664 IRBuilder<> B(Ctx);
1665 Value *S = I->getOperand(1);
1666 return B.CreateBinOp(BitOp->getOpcode(),
1667 B.CreateLShr(BitOp->getOperand(0), S),
1668 B.CreateLShr(BitOp->getOperand(1), S));
1669 });
1670 Simp.addRule(
1671 // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
1672 [](Instruction *I, LLVMContext &Ctx) -> Value* {
1673 auto IsBitOp = [](unsigned Op) -> bool {
1674 switch (Op) {
1675 case Instruction::And:
1676 case Instruction::Or:
1677 case Instruction::Xor:
1678 return true;
1679 }
1680 return false;
1681 };
1682 BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
1683 if (!BitOp1 || !IsBitOp(BitOp1->getOpcode()))
1684 return nullptr;
1685 BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
1686 if (!BitOp2 || !IsBitOp(BitOp2->getOpcode()))
1687 return nullptr;
1688 ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
1689 ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
1690 if (!CA || !CB)
1691 return nullptr;
1692 IRBuilder<> B(Ctx);
1693 Value *X = BitOp2->getOperand(0);
1694 return B.CreateBinOp(BitOp2->getOpcode(), X,
1695 B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
1696 });
1697}
1698
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001699bool PolynomialMultiplyRecognize::recognize() {
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +00001700 DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
1701 << *CurLoop << '\n');
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001702 // Restrictions:
1703 // - The loop must consist of a single block.
1704 // - The iteration count must be known at compile-time.
1705 // - The loop must have an induction variable starting from 0, and
1706 // incremented in each iteration of the loop.
1707 BasicBlock *LoopB = CurLoop->getHeader();
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +00001708 DEBUG(dbgs() << "Loop header:\n" << *LoopB);
1709
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001710 if (LoopB != CurLoop->getLoopLatch())
1711 return false;
1712 BasicBlock *ExitB = CurLoop->getExitBlock();
1713 if (ExitB == nullptr)
1714 return false;
1715 BasicBlock *EntryB = CurLoop->getLoopPreheader();
1716 if (EntryB == nullptr)
1717 return false;
1718
1719 unsigned IterCount = 0;
1720 const SCEV *CT = SE.getBackedgeTakenCount(CurLoop);
1721 if (isa<SCEVCouldNotCompute>(CT))
1722 return false;
1723 if (auto *CV = dyn_cast<SCEVConstant>(CT))
1724 IterCount = CV->getValue()->getZExtValue() + 1;
1725
1726 Value *CIV = getCountIV(LoopB);
1727 ParsedValues PV;
1728 PV.IterCount = IterCount;
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +00001729 DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001730
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +00001731 setupSimplifier();
1732
1733 // Perform a preliminary scan of select instructions to see if any of them
1734 // looks like a generator of the polynomial multiply steps. Assume that a
1735 // loop can only contain a single transformable operation, so stop the
1736 // traversal after the first reasonable candidate was found.
1737 // XXX: Currently this approach can modify the loop before being 100% sure
1738 // that the transformation can be carried out.
1739 bool FoundPreScan = false;
1740 for (Instruction &In : *LoopB) {
1741 SelectInst *SI = dyn_cast<SelectInst>(&In);
1742 if (!SI)
1743 continue;
1744
1745 Simplifier::Context C(SI);
1746 Value *T = Simp.simplify(C);
1747 SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
1748 DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
1749 if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
1750 FoundPreScan = true;
1751 if (SelI != SI) {
1752 Value *NewSel = C.materialize(LoopB, SI->getIterator());
1753 SI->replaceAllUsesWith(NewSel);
1754 RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI);
1755 }
1756 break;
1757 }
1758 }
1759
1760 if (!FoundPreScan) {
1761 DEBUG(dbgs() << "Have not found candidates for pmpy\n");
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001762 return false;
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +00001763 }
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001764
1765 if (!PV.Left) {
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +00001766 // The right shift version actually only returns the higher bits of
1767 // the result (each iteration discards the LSB). If we want to convert it
1768 // to a left-shifting loop, the working data type must be at least as
1769 // wide as the target's pmpy instruction.
1770 if (!promoteTypes(LoopB, ExitB))
1771 return false;
Krzysztof Parzyszek9bd4d912017-06-13 13:51:49 +00001772 if (!convertShiftsToLeft(LoopB, ExitB, IterCount))
1773 return false;
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001774 cleanupLoopBody(LoopB);
1775 }
1776
Krzysztof Parzyszekd033d1f2017-03-21 17:09:27 +00001777 // Scan the loop again, find the generating select instruction.
1778 bool FoundScan = false;
1779 for (Instruction &In : *LoopB) {
1780 SelectInst *SelI = dyn_cast<SelectInst>(&In);
1781 if (!SelI)
1782 continue;
1783 DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
1784 FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
1785 if (FoundScan)
1786 break;
1787 }
1788 assert(FoundScan);
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001789
1790 DEBUG({
1791 StringRef PP = (PV.M ? "(P+M)" : "P");
1792 if (!PV.Inv)
1793 dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
1794 else
1795 dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + "
1796 << PP << "\n";
1797 dbgs() << " Res:" << *PV.Res << "\n P:" << *PV.P << "\n";
1798 if (PV.M)
1799 dbgs() << " M:" << *PV.M << "\n";
1800 dbgs() << " Q:" << *PV.Q << "\n";
1801 dbgs() << " Iteration count:" << PV.IterCount << "\n";
1802 });
1803
1804 BasicBlock::iterator At(EntryB->getTerminator());
1805 Value *PM = generate(At, PV);
1806 if (PM == nullptr)
1807 return false;
1808
1809 if (PM->getType() != PV.Res->getType())
1810 PM = IRBuilder<>(&*At).CreateIntCast(PM, PV.Res->getType(), false);
1811
1812 PV.Res->replaceAllUsesWith(PM);
1813 PV.Res->eraseFromParent();
1814 return true;
1815}
1816
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001817unsigned HexagonLoopIdiomRecognize::getStoreSizeInBytes(StoreInst *SI) {
1818 uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
1819 assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) &&
1820 "Don't overflow unsigned.");
1821 return (unsigned)SizeInBits >> 3;
1822}
1823
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001824int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
1825 if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1)))
1826 return SC->getAPInt().getSExtValue();
1827 return 0;
1828}
1829
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001830bool HexagonLoopIdiomRecognize::isLegalStore(Loop *CurLoop, StoreInst *SI) {
Krzysztof Parzyszek35ce5da2017-01-27 20:40:14 +00001831 // Allow volatile stores if HexagonVolatileMemcpy is enabled.
1832 if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple())
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001833 return false;
1834
1835 Value *StoredVal = SI->getValueOperand();
1836 Value *StorePtr = SI->getPointerOperand();
1837
1838 // Reject stores that are so large that they overflow an unsigned.
1839 uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
1840 if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
1841 return false;
1842
1843 // See if the pointer expression is an AddRec like {base,+,1} on the current
1844 // loop, which indicates a strided store. If we have something else, it's a
1845 // random store we can't handle.
1846 auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1847 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
1848 return false;
1849
1850 // Check to see if the stride matches the size of the store. If so, then we
1851 // know that every byte is touched in the loop.
1852 int Stride = getSCEVStride(StoreEv);
1853 if (Stride == 0)
1854 return false;
1855 unsigned StoreSize = getStoreSizeInBytes(SI);
1856 if (StoreSize != unsigned(std::abs(Stride)))
1857 return false;
1858
1859 // The store must be feeding a non-volatile load.
1860 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
1861 if (!LI || !LI->isSimple())
1862 return false;
1863
1864 // See if the pointer expression is an AddRec like {base,+,1} on the current
1865 // loop, which indicates a strided load. If we have something else, it's a
1866 // random load we can't handle.
1867 Value *LoadPtr = LI->getPointerOperand();
1868 auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1869 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
1870 return false;
1871
1872 // The store and load must share the same stride.
1873 if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
1874 return false;
1875
1876 // Success. This store can be converted into a memcpy.
1877 return true;
1878}
1879
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001880/// mayLoopAccessLocation - Return true if the specified loop might access the
1881/// specified pointer location, which is a loop-strided access. The 'Access'
1882/// argument specifies what the verboten forms of access are (read or write).
1883static bool
1884mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
1885 const SCEV *BECount, unsigned StoreSize,
1886 AliasAnalysis &AA,
1887 SmallPtrSetImpl<Instruction *> &Ignored) {
1888 // Get the location that may be stored across the loop. Since the access
1889 // is strided positively through memory, we say that the modified location
1890 // starts at the pointer and has infinite size.
1891 uint64_t AccessSize = MemoryLocation::UnknownSize;
1892
1893 // If the loop iterates a fixed number of times, we can refine the access
1894 // size to be exactly the size of the memset, which is (BECount+1)*StoreSize
1895 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
1896 AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
1897
1898 // TODO: For this to be really effective, we have to dive into the pointer
1899 // operand in the store. Store to &A[i] of 100 will always return may alias
1900 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
1901 // which will then no-alias a store to &A[100].
1902 MemoryLocation StoreLoc(Ptr, AccessSize);
1903
1904 for (auto *B : L->blocks())
1905 for (auto &I : *B)
1906 if (Ignored.count(&I) == 0 && (AA.getModRefInfo(&I, StoreLoc) & Access))
1907 return true;
1908
1909 return false;
1910}
1911
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001912void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB,
1913 SmallVectorImpl<StoreInst*> &Stores) {
1914 Stores.clear();
1915 for (Instruction &I : *BB)
1916 if (StoreInst *SI = dyn_cast<StoreInst>(&I))
1917 if (isLegalStore(CurLoop, SI))
1918 Stores.push_back(SI);
1919}
1920
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001921bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
1922 StoreInst *SI, const SCEV *BECount) {
Michael Kupersteine18aad32017-01-31 22:48:45 +00001923 assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) &&
1924 "Expected only non-volatile stores, or Hexagon-specific memcpy"
1925 "to volatile destination.");
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00001926
1927 Value *StorePtr = SI->getPointerOperand();
1928 auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1929 unsigned Stride = getSCEVStride(StoreEv);
1930 unsigned StoreSize = getStoreSizeInBytes(SI);
1931 if (Stride != StoreSize)
1932 return false;
1933
1934 // See if the pointer expression is an AddRec like {base,+,1} on the current
1935 // loop, which indicates a strided load. If we have something else, it's a
1936 // random load we can't handle.
1937 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
1938 auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1939
1940 // The trip count of the loop and the base pointer of the addrec SCEV is
1941 // guaranteed to be loop invariant, which means that it should dominate the
1942 // header. This allows us to insert code for it in the preheader.
1943 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1944 Instruction *ExpPt = Preheader->getTerminator();
1945 IRBuilder<> Builder(ExpPt);
1946 SCEVExpander Expander(*SE, *DL, "hexagon-loop-idiom");
1947
1948 Type *IntPtrTy = Builder.getIntPtrTy(*DL, SI->getPointerAddressSpace());
1949
1950 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1951 // this into a memcpy/memmove in the loop preheader now if we want. However,
1952 // this would be unsafe to do if there is anything else in the loop that may
1953 // read or write the memory region we're storing to. For memcpy, this
1954 // includes the load that feeds the stores. Check for an alias by generating
1955 // the base address and checking everything.
1956 Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(),
1957 Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt);
1958 Value *LoadBasePtr = nullptr;
1959
1960 bool Overlap = false;
1961 bool DestVolatile = SI->isVolatile();
1962 Type *BECountTy = BECount->getType();
1963
1964 if (DestVolatile) {
1965 // The trip count must fit in i32, since it is the type of the "num_words"
1966 // argument to hexagon_memcpy_forward_vp4cp4n2.
1967 if (StoreSize != 4 || DL->getTypeSizeInBits(BECountTy) > 32) {
1968CleanupAndExit:
1969 // If we generated new code for the base pointer, clean up.
1970 Expander.clear();
1971 if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
1972 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1973 StoreBasePtr = nullptr;
1974 }
1975 if (LoadBasePtr) {
1976 RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
1977 LoadBasePtr = nullptr;
1978 }
1979 return false;
1980 }
1981 }
1982
1983 SmallPtrSet<Instruction*, 2> Ignore1;
1984 Ignore1.insert(SI);
1985 if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
1986 StoreSize, *AA, Ignore1)) {
1987 // Check if the load is the offending instruction.
1988 Ignore1.insert(LI);
1989 if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
1990 StoreSize, *AA, Ignore1)) {
1991 // Still bad. Nothing we can do.
1992 goto CleanupAndExit;
1993 }
1994 // It worked with the load ignored.
1995 Overlap = true;
1996 }
1997
1998 if (!Overlap) {
1999 if (DisableMemcpyIdiom || !HasMemcpy)
2000 goto CleanupAndExit;
2001 } else {
2002 // Don't generate memmove if this function will be inlined. This is
2003 // because the caller will undergo this transformation after inlining.
2004 Function *Func = CurLoop->getHeader()->getParent();
2005 if (Func->hasFnAttribute(Attribute::AlwaysInline))
2006 goto CleanupAndExit;
2007
2008 // In case of a memmove, the call to memmove will be executed instead
2009 // of the loop, so we need to make sure that there is nothing else in
2010 // the loop than the load, store and instructions that these two depend
2011 // on.
2012 SmallVector<Instruction*,2> Insts;
2013 Insts.push_back(SI);
2014 Insts.push_back(LI);
2015 if (!coverLoop(CurLoop, Insts))
2016 goto CleanupAndExit;
2017
2018 if (DisableMemmoveIdiom || !HasMemmove)
2019 goto CleanupAndExit;
Eugene Zelenko4d060b72017-07-29 00:56:56 +00002020 bool IsNested = CurLoop->getParentLoop() != nullptr;
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00002021 if (IsNested && OnlyNonNestedMemmove)
2022 goto CleanupAndExit;
2023 }
2024
2025 // For a memcpy, we have to make sure that the input array is not being
2026 // mutated by the loop.
2027 LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(),
2028 Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt);
2029
2030 SmallPtrSet<Instruction*, 2> Ignore2;
2031 Ignore2.insert(SI);
2032 if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
2033 *AA, Ignore2))
2034 goto CleanupAndExit;
2035
2036 // Check the stride.
2037 bool StridePos = getSCEVStride(LoadEv) >= 0;
2038
2039 // Currently, the volatile memcpy only emulates traversing memory forward.
2040 if (!StridePos && DestVolatile)
2041 goto CleanupAndExit;
2042
2043 bool RuntimeCheck = (Overlap || DestVolatile);
2044
2045 BasicBlock *ExitB;
2046 if (RuntimeCheck) {
2047 // The runtime check needs a single exit block.
2048 SmallVector<BasicBlock*, 8> ExitBlocks;
2049 CurLoop->getUniqueExitBlocks(ExitBlocks);
2050 if (ExitBlocks.size() != 1)
2051 goto CleanupAndExit;
2052 ExitB = ExitBlocks[0];
2053 }
2054
2055 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
2056 // pointer size if it isn't already.
2057 LLVMContext &Ctx = SI->getContext();
2058 BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
2059 unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment());
2060 DebugLoc DLoc = SI->getDebugLoc();
2061
2062 const SCEV *NumBytesS =
2063 SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
2064 if (StoreSize != 1)
2065 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
2066 SCEV::FlagNUW);
2067 Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
2068 if (Instruction *In = dyn_cast<Instruction>(NumBytes))
Daniel Berlin4d0fe642017-04-28 19:55:38 +00002069 if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT}))
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00002070 NumBytes = Simp;
2071
2072 CallInst *NewCall;
2073
2074 if (RuntimeCheck) {
2075 unsigned Threshold = RuntimeMemSizeThreshold;
2076 if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
2077 uint64_t C = CI->getZExtValue();
2078 if (Threshold != 0 && C < Threshold)
2079 goto CleanupAndExit;
2080 if (C < CompileTimeMemSizeThreshold)
2081 goto CleanupAndExit;
2082 }
2083
2084 BasicBlock *Header = CurLoop->getHeader();
2085 Function *Func = Header->getParent();
2086 Loop *ParentL = LF->getLoopFor(Preheader);
2087 StringRef HeaderName = Header->getName();
2088
2089 // Create a new (empty) preheader, and update the PHI nodes in the
2090 // header to use the new preheader.
2091 BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
2092 Func, Header);
2093 if (ParentL)
2094 ParentL->addBasicBlockToLoop(NewPreheader, *LF);
2095 IRBuilder<>(NewPreheader).CreateBr(Header);
2096 for (auto &In : *Header) {
2097 PHINode *PN = dyn_cast<PHINode>(&In);
2098 if (!PN)
2099 break;
2100 int bx = PN->getBasicBlockIndex(Preheader);
2101 if (bx >= 0)
2102 PN->setIncomingBlock(bx, NewPreheader);
2103 }
2104 DT->addNewBlock(NewPreheader, Preheader);
2105 DT->changeImmediateDominator(Header, NewPreheader);
2106
2107 // Check for safe conditions to execute memmove.
2108 // If stride is positive, copying things from higher to lower addresses
2109 // is equivalent to memmove. For negative stride, it's the other way
2110 // around. Copying forward in memory with positive stride may not be
2111 // same as memmove since we may be copying values that we just stored
2112 // in some previous iteration.
2113 Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
2114 Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
2115 Value *LowA = StridePos ? SA : LA;
2116 Value *HighA = StridePos ? LA : SA;
2117 Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
2118 Value *Cond = CmpA;
2119
2120 // Check for distance between pointers.
2121 Value *Dist = Builder.CreateSub(HighA, LowA);
2122 Value *CmpD = Builder.CreateICmpSLT(NumBytes, Dist);
2123 Value *CmpEither = Builder.CreateOr(Cond, CmpD);
2124 Cond = CmpEither;
2125
2126 if (Threshold != 0) {
2127 Type *Ty = NumBytes->getType();
2128 Value *Thr = ConstantInt::get(Ty, Threshold);
2129 Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
2130 Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
2131 Cond = CmpBoth;
2132 }
2133 BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli",
2134 Func, NewPreheader);
2135 if (ParentL)
2136 ParentL->addBasicBlockToLoop(MemmoveB, *LF);
2137 Instruction *OldT = Preheader->getTerminator();
2138 Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
2139 OldT->eraseFromParent();
2140 Preheader->setName(Preheader->getName()+".old");
2141 DT->addNewBlock(MemmoveB, Preheader);
2142 // Find the new immediate dominator of the exit block.
2143 BasicBlock *ExitD = Preheader;
2144 for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE; ++PI) {
2145 BasicBlock *PB = *PI;
2146 ExitD = DT->findNearestCommonDominator(ExitD, PB);
2147 if (!ExitD)
2148 break;
2149 }
2150 // If the prior immediate dominator of ExitB was dominated by the
2151 // old preheader, then the old preheader becomes the new immediate
2152 // dominator. Otherwise don't change anything (because the newly
2153 // added blocks are dominated by the old preheader).
2154 if (ExitD && DT->dominates(Preheader, ExitD)) {
2155 DomTreeNode *BN = DT->getNode(ExitB);
2156 DomTreeNode *DN = DT->getNode(ExitD);
2157 BN->setIDom(DN);
2158 }
2159
2160 // Add a call to memmove to the conditional block.
2161 IRBuilder<> CondBuilder(MemmoveB);
2162 CondBuilder.CreateBr(ExitB);
2163 CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
2164
2165 if (DestVolatile) {
2166 Type *Int32Ty = Type::getInt32Ty(Ctx);
2167 Type *Int32PtrTy = Type::getInt32PtrTy(Ctx);
2168 Type *VoidTy = Type::getVoidTy(Ctx);
2169 Module *M = Func->getParent();
2170 Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy,
Serge Guelton59a2d7b2017-04-11 15:01:18 +00002171 Int32PtrTy, Int32PtrTy, Int32Ty);
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00002172 Function *Fn = cast<Function>(CF);
2173 Fn->setLinkage(Function::ExternalLinkage);
2174
2175 const SCEV *OneS = SE->getConstant(Int32Ty, 1);
2176 const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
2177 const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW);
2178 Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty,
2179 MemmoveB->getTerminator());
2180 if (Instruction *In = dyn_cast<Instruction>(NumWords))
Daniel Berlin4d0fe642017-04-28 19:55:38 +00002181 if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT}))
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00002182 NumWords = Simp;
2183
2184 Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy)
2185 ? StoreBasePtr
2186 : CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy);
2187 Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy)
2188 ? LoadBasePtr
2189 : CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
2190 NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
2191 } else {
2192 NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr,
2193 NumBytes, Alignment);
2194 }
2195 } else {
2196 NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr,
2197 NumBytes, Alignment);
2198 // Okay, the memcpy has been formed. Zap the original store and
2199 // anything that feeds into it.
2200 RecursivelyDeleteTriviallyDeadInstructions(SI, TLI);
2201 }
2202
2203 NewCall->setDebugLoc(DLoc);
2204
2205 DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
2206 << *NewCall << "\n"
2207 << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
2208 << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
2209
2210 return true;
2211}
2212
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00002213// \brief Check if the instructions in Insts, together with their dependencies
2214// cover the loop in the sense that the loop could be safely eliminated once
2215// the instructions in Insts are removed.
2216bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
2217 SmallVectorImpl<Instruction*> &Insts) const {
2218 SmallSet<BasicBlock*,8> LoopBlocks;
2219 for (auto *B : L->blocks())
2220 LoopBlocks.insert(B);
2221
2222 SetVector<Instruction*> Worklist(Insts.begin(), Insts.end());
2223
2224 // Collect all instructions from the loop that the instructions in Insts
2225 // depend on (plus their dependencies, etc.). These instructions will
2226 // constitute the expression trees that feed those in Insts, but the trees
2227 // will be limited only to instructions contained in the loop.
2228 for (unsigned i = 0; i < Worklist.size(); ++i) {
2229 Instruction *In = Worklist[i];
2230 for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
2231 Instruction *OpI = dyn_cast<Instruction>(I);
2232 if (!OpI)
2233 continue;
2234 BasicBlock *PB = OpI->getParent();
2235 if (!LoopBlocks.count(PB))
2236 continue;
2237 Worklist.insert(OpI);
2238 }
2239 }
2240
2241 // Scan all instructions in the loop, if any of them have a user outside
2242 // of the loop, or outside of the expressions collected above, then either
2243 // the loop has a side-effect visible outside of it, or there are
2244 // instructions in it that are not involved in the original set Insts.
2245 for (auto *B : L->blocks()) {
2246 for (auto &In : *B) {
2247 if (isa<BranchInst>(In) || isa<DbgInfoIntrinsic>(In))
2248 continue;
2249 if (!Worklist.count(&In) && In.mayHaveSideEffects())
2250 return false;
2251 for (const auto &K : In.users()) {
2252 Instruction *UseI = dyn_cast<Instruction>(K);
2253 if (!UseI)
2254 continue;
2255 BasicBlock *UseB = UseI->getParent();
2256 if (LF->getLoopFor(UseB) != L)
2257 return false;
2258 }
2259 }
2260 }
2261
2262 return true;
2263}
2264
2265/// runOnLoopBlock - Process the specified block, which lives in a counted loop
2266/// with the specified backedge count. This block is known to be in the current
2267/// loop and not in any subloops.
2268bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB,
2269 const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks) {
2270 // We can only promote stores in this block if they are unconditionally
2271 // executed in the loop. For a block to be unconditionally executed, it has
2272 // to dominate all the exit blocks of the loop. Verify this now.
2273 auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
2274 return DT->dominates(BB, EB);
2275 };
2276 if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB))
2277 return false;
2278
2279 bool MadeChange = false;
2280 // Look for store instructions, which may be optimized to memset/memcpy.
2281 SmallVector<StoreInst*,8> Stores;
2282 collectStores(CurLoop, BB, Stores);
2283
2284 // Optimize the store into a memcpy, if it feeds an similarly strided load.
2285 for (auto &SI : Stores)
2286 MadeChange |= processCopyingStore(CurLoop, SI, BECount);
2287
2288 return MadeChange;
2289}
2290
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00002291bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
2292 PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE);
2293 if (PMR.recognize())
2294 return true;
2295
2296 if (!HasMemcpy && !HasMemmove)
2297 return false;
2298
2299 const SCEV *BECount = SE->getBackedgeTakenCount(L);
2300 assert(!isa<SCEVCouldNotCompute>(BECount) &&
2301 "runOnCountableLoop() called on a loop without a predictable"
2302 "backedge-taken count");
2303
2304 SmallVector<BasicBlock *, 8> ExitBlocks;
2305 L->getUniqueExitBlocks(ExitBlocks);
2306
2307 bool Changed = false;
2308
2309 // Scan all the blocks in the loop that are not in subloops.
2310 for (auto *BB : L->getBlocks()) {
2311 // Ignore blocks in subloops.
2312 if (LF->getLoopFor(BB) != L)
2313 continue;
2314 Changed |= runOnLoopBlock(L, BB, BECount, ExitBlocks);
2315 }
2316
2317 return Changed;
2318}
2319
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00002320bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
2321 const Module &M = *L->getHeader()->getParent()->getParent();
2322 if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon)
2323 return false;
2324
2325 if (skipLoop(L))
2326 return false;
2327
2328 // If the loop could not be converted to canonical form, it must have an
2329 // indirectbr in it, just give up.
2330 if (!L->getLoopPreheader())
2331 return false;
2332
2333 // Disable loop idiom recognition if the function's name is a common idiom.
2334 StringRef Name = L->getHeader()->getParent()->getName();
2335 if (Name == "memset" || Name == "memcpy" || Name == "memmove")
2336 return false;
2337
2338 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2339 DL = &L->getHeader()->getModule()->getDataLayout();
2340 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2341 LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2342 TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
2343 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2344
2345 HasMemcpy = TLI->has(LibFunc_memcpy);
2346 HasMemmove = TLI->has(LibFunc_memmove);
2347
2348 if (SE->hasLoopInvariantBackedgeTakenCount(L))
2349 return runOnCountableLoop(L);
2350 return false;
2351}
2352
Krzysztof Parzyszekc8b94382017-01-26 21:41:10 +00002353Pass *llvm::createHexagonLoopIdiomPass() {
2354 return new HexagonLoopIdiomRecognize();
2355}