//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------*- C++ -*--===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"

using namespace llvm;

#define DEBUG_TYPE "misched"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact a lot of instructions in the
// shader have few dependencies) make the generic scheduler have some
// unpredictable behaviours. For example when register pressure becomes high,
// it can either manage to prevent register pressure from going too high, or
// it can increase register pressure even more than if it hadn't taken
// register pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in VGPR you won't need until the end of the shader.
//
// The scheduling problem for SI can distinguish three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps control keeping register
// usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At that step we take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency for the cache (or global mem) data to get to the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if wavefront count is high, we
// have to try to have as many instructions hiding high latencies as possible.
// The OpenCL doc says for example latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10
// (each of the 10 wavefronts contributes one independent instruction to
// execute while waiting).

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specifically take that into account when
// scheduling, as we expect the driver to be able to preload the constants
// soon.
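//
// As a rough illustration of the intended decomposition (a made-up
// sequence, not the output of any real compile):
//   s0 = S_BUFFER_LOAD ...      ; low latency
//   v0 = IMAGE_SAMPLE ...       ; high latency
//   v1 = V_MAD_F32 s0, ...      ; local computation, consumed right away
//   v2 = V_ADD_F32 v1, v0       ; needs the high latency result
// A block would typically contain the s0/v1 chain (local work whose only
// latencies are low ones), so that block ordering can place it between
// the IMAGE_SAMPLE and its user v2 to hide the sampling latency.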


// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:    return "NOCAND";
  case RegUsage:  return "REGUSAGE";
  case Latency:   return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth:     return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif
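
// tryLess/tryGreater below mirror GenericScheduler's helpers of the same
// name: they return true once the comparison has picked a winner (setting
// TryCand.Reason when TryCand wins, or weakening Cand.Reason when it loses),
// and false on a tie, so that the caller falls through to the next
// criterion.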

static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG

void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as close to the top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (eventually some more low latency instructions)
  // - instructions that depend on the first low latency instructions.
  // If there are a lot of constant loads in the block, the SGPR usage
  // could go quite high; the arbitrary limit of 60 above thus encourages
  // using the already loaded constants (in order to release some SGPRs)
  // before loading more.
  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
              Cand.HasLowLatencyNonWaitedParent,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                 TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}
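
// Note: each pick scans every ready SU and queries the downward pressure
// for each one, so a pick costs one pressure query per ready SU; the SU
// kept is the last one to win a tryCandidateTopDown comparison.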

// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns if the register was set between first and last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7.
  // For example a register written by this block and read both here and in a
  // later block is case 3 or 7 and must be a live-out, while one whose last
  // read is inside the block (case 2 or 6) must not.
  // Since we made the MIs of a block to be packed all together before
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker to not make mistakes for
  // case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of isDefBetween removes case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bit registers
  // for example, but they actually correspond to 4 actual 32 bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    SuccSU->dump(DAG);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block when InOrOutBlock
/// is true, and those outside the block when it is false.
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert (!SU->NumPredsLeft);
  std::vector<SUnit*>::iterator I =
    std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait
  // either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the
  // block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

#ifndef NDEBUG
  for (SIScheduleBlock* S : Succs) {
    if (PredID == S->getID())
      assert(!"Loop in the Block Graph!\n");
  }
#endif
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
  unsigned SuccID = Succ->getID();

  // Check if not already successor.
  for (SIScheduleBlock* S : Succs) {
    if (SuccID == S->getID())
      return;
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(Succ);
#ifndef NDEBUG
  for (SIScheduleBlock* P : Preds) {
    if (SuccID == P->getID())
      assert(!"Loop in the Block Graph!\n");
  }
#endif
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (SIScheduleBlock* S : Succs) {
    S->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  // Scheduled or not, the units are printed in SUnits order.
  for (SUnit* SU : SUnits) {
    SU->dump(DAG);
  }

  dbgs() << "///////////////////////\n";
}

#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
DAG(DAG) {
}

SIScheduleBlockCreator::~SIScheduleBlockCreator() {
}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  } else {
    return B->second;
  }
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  unsigned Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      bool CompatibleGroup = true;
      unsigned ProposedColor = Color;
      for (unsigned j : FormingGroup) {
        // TODO: Currently CompatibleGroup will always be false,
        // because the graph enforces the load order. This
        // can be fixed, but as keeping the load order is often
        // good for performance that causes a performance hit (both
        // the default scheduler and this scheduler).
        // When this scheduler determines a good load order,
        // this can be fixed.
        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
            !DAG->canAddEdge(&DAG->SUnits[j], SU))
          CompatibleGroup = false;
      }
      if (!CompatibleGroup || ++Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        if (!CompatibleGroup) {
          ProposedColor = Color;
          FormingGroup.insert(SU->NodeNum);
        }
        Count = 0;
      } else {
        FormingGroup.insert(SU->NodeNum);
      }
      CurrentColoring[SU->NodeNum] = ProposedColor;
    }
  }
}
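
// For example (thresholds from the code above, grouping itself
// illustrative): with 10 high latency SUs, GroupSize is 3, so the pass
// aims at coloring them {A,A,A},{B,B,B},{C,C,C},{D}, letting up to three
// fetches issue back-to-back before any of their results is waited on.
// (In practice the TODO above means the groups currently degenerate to
// near-singletons.)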

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as its parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as its children.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
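
// Illustration with made-up color IDs: if SU x transitively depends only on
// the high latency group colored 1 while SU y depends on groups 1 and 2, x
// inherits color 1 and y gets a fresh color standing for the combination
// {1, 2}; any later SU depending on exactly {1, 2} reuses y's color.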

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every different combination of colors given by the top-down and
  // bottom-up reserved-node dependency colorings gets its own color.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
    if (Pos != ColorCount.end()) {
      ++ColorCount[color];
    } else {
      ColorCount[color] = 1;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}
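
// SUs without any successor in the DAG are typically exports or stores;
// giving them one common group avoids creating many tiny blocks for them.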

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();

  // Put SUs of same color into same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(
        make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free roots and leaves of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  DEBUG(
    dbgs() << "Blocks created:\n\n";
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

// Two functions taken from Codegen/MachineScheduler.cpp

/// If this iterator is a debug value, increment until reaching the End or a
/// non-debug instruction.
static MachineBasicBlock::const_iterator
nextIfDebug(MachineBasicBlock::const_iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugValue())
      break;
  }
  return I;
}

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  // Cast the return value to nonconst MachineInstr, then cast to an
  // instr_iterator, which does not check for null, finally return a
  // bundle_iterator.
  return MachineBasicBlock::instr_iterator(
    const_cast<MachineInstr*>(
      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
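
// Note: TopDownBlock2Index is used first as an out-degree counter and is
// then overwritten in place with the final topological index, which is why
// the sort needs no separate degree array.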

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first schedule a valid ordering, such that afterwards each Block
  // corresponds to a contiguous range of instructions.
  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  DEBUG(
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().size() == 0)
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + 1)
          Depth = Pred->Depth + 1;
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().size() == 0)
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (SIScheduleBlock *Succ : Block->getSuccs()) {
        if (Height < Succ->Height + 1)
          Height = Succ->Height + 1;
      }
      Block->Height = Height;
    }
  }
}
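
// Note: Depth and Height count blocks along the longest dependency chain
// (a critical path measured in blocks), not latency cycles.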

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      std::map<unsigned, unsigned>::iterator RegPos =
        LiveOutRegsNumUsages[PredID].find(Reg);
      if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
        ++LiveOutRegsNumUsages[PredID][Reg];
      } else {
        LiveOutRegsNumUsages[PredID][Reg] = 1;
      }
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found) {
        if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
          LiveRegsConsumers[Reg] = 1;
        else
          ++LiveRegsConsumers[Reg];
      }
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  DEBUG(
    dbgs() << "Block Order:";
    for (SIScheduleBlock* Block : BlocksScheduled) {
      dbgs() << ' ' << Block->getID();
    }
  );
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (tryLess(TryCand.LastPosHighLatParentScheduled,
              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                 TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
                                          TryCand, Cand, Depth))
    return true;
  if (tryGreater(TryCand.NumHighLatencySuccessors,
                 Cand.NumHighLatencySuccessors,
                 TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
              TryCand, Cand, RegUsage))
    return true;
  if (tryGreater(TryCand.NumSuccessors > 0,
                 Cand.NumSuccessors > 0,
                 TryCand, Cand, Successor))
    return true;
  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
              TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  DEBUG(
    dbgs() << "Picking New Blocks\n";
    dbgs() << "Available: ";
    for (SIScheduleBlock* Block : ReadyBlocks)
      dbgs() << Block->getID() << ' ';
    dbgs() << "\nCurrent Live:\n";
    for (unsigned Reg : LiveRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    dbgs() << '\n';
    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
  );

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                   << getReasonStr(Cand.Reason) << '\n');
    }
  }

  DEBUG(
    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
    dbgs() << "Is a block with high latency instruction: "
           << (Cand.IsHighLatency ? "yes\n" : "no\n");
    dbgs() << "Position of last high latency dependency: "
           << Cand.LastPosHighLatParentScheduled << '\n';
    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
    dbgs() << '\n';
  );

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}

// Tracking of currently alive registers to determine VGPR Usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert (Pos != LiveRegs.end() && // Reg must be live.
            LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
            LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}
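
// LiveRegsConsumers thus acts as a per-register reference count: it is
// seeded with the number of reading blocks and decremented by each
// consuming block; the register leaves LiveRegs when the count reaches 0.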

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (SIScheduleBlock* Block : Parent->getSuccs()) {
    --BlockNumPredsLeft[Block->getID()];
    if (BlockNumPredsLeft[Block->getID()] == 0) {
      ReadyBlocks.push_back(Block);
    }
    // TODO: Improve check. When the dependency between the high latency
    // instructions and the instructions of the other blocks are WAR or WAW
    // there will be no wait triggered. We would like these cases to not
    // update LastPosHighLatencyParentScheduled.
    if (Parent->isHighLatencyBlock())
      LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
  }
}

void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (std::map<unsigned, unsigned>::iterator RegI =
       LiveOutRegsNumUsages[Block->getID()].begin(),
       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
    std::pair<unsigned, unsigned> RegP = *RegI;
    if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
      LiveRegsConsumers[RegP.first] = RegP.second;
    else {
      assert(LiveRegsConsumers[RegP.first] == 0);
      LiveRegsConsumers[RegP.first] += RegP.second;
    }
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
      (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (unsigned Reg : InRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (unsigned Reg : OutRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}
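
// Worked example with made-up weights: a block consuming the last use of a
// 128-bit VGPR tuple (pressure weight 4) while defining one 32-bit VGPR
// (weight 1) yields DiffSetPressure[VGPR set] = -4 + 1 = -3, i.e. picking
// it is expected to lower VGPR pressure.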

// SIScheduler //

struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    SIScheduleBlock *Block = ScheduledBlocks[b];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}
1656
// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  VGPRSetID = SITRI->getVGPR32PressureSet();
  SGPRSetID = SITRI->getSGPR32PressureSet();
}

SIScheduleDAGMI::~SIScheduleDAGMI() {
}

ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

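// Note: this factory is meant to be referenced by a MachineSchedRegistry
// entry outside this file; a sketch of what that registration looks like
// (at the time of writing it lives in AMDGPUTargetMachine.cpp):
//   static MachineSchedRegistry
//   SISchedRegistry("si", "Run SI's custom scheduler",
//                   createSIMachineScheduler);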
// Code adapted from scheduleDAG.cpp.
// Does a topological sort over the SUs and builds both the top-down
// and bottom-up orderings.
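// As a small illustration on a hypothetical DAG: for the dependence edges
// A -> B, A -> C, B -> D, C -> D, one valid top-down order is A, B, C, D;
// TopDownIndex2SU stores the NodeNums in schedule order, while
// TopDownSU2Index maps each NodeNum back to its position in that order.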
void SIScheduleDAGMI::topologicalSort() {
  std::vector<int> TopDownSU2Index;
  unsigned DAGSize = SUnits.size();
  std::vector<SUnit*> WorkList;

  DEBUG(dbgs() << "Topological Sort\n");
  WorkList.reserve(DAGSize);

  TopDownIndex2SU.resize(DAGSize);
  TopDownSU2Index.resize(DAGSize);
  BottomUpIndex2SU.resize(DAGSize);

  // TopDownSU2Index temporarily holds the number of unnumbered successors
  // of each node; a node becomes ready once that count drops to zero.
  WorkList.push_back(&getExitSU());
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &SUnits[i];
    int NodeNum = SU->NodeNum;
    unsigned Degree = SU->Succs.size();
    TopDownSU2Index[NodeNum] = Degree;
    if (Degree == 0) {
      assert(SU->Succs.empty() && "SUnit should have no successors");
      WorkList.push_back(SU);
    }
  }

  // Number the nodes from DAGSize - 1 down to 0, so that every node gets
  // a smaller index than all of its successors.
  int Id = DAGSize;
  while (!WorkList.empty()) {
    SUnit *SU = WorkList.back();
    WorkList.pop_back();
    if (SU->NodeNum < DAGSize) {
      TopDownSU2Index[SU->NodeNum] = --Id;
      TopDownIndex2SU[Id] = SU->NodeNum;
    }
    for (SDep& Pred : SU->Preds) {
      SUnit *PredSU = Pred.getSUnit();
      if (PredSU->NodeNum < DAGSize && !--TopDownSU2Index[PredSU->NodeNum])
        WorkList.push_back(PredSU);
    }
  }

  // The bottom-up ordering is simply the reverse of the top-down one.
  BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(),
                                      TopDownIndex2SU.rend());

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &SUnits[i];
    for (SDep& Pred : SU->Preds) {
      if (Pred.getSUnit()->NodeNum >= DAGSize)
        continue;
      assert(TopDownSU2Index[SU->NodeNum] >
             TopDownSU2Index[Pred.getSUnit()->NodeNum] &&
             "Wrong Top Down topological sorting");
    }
  }
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &SUnits[i];
    for (SDep& Succ : SU->Succs) {
      if (Succ.getSUnit()->NodeNum >= DAGSize)
        continue;
      assert(TopDownSU2Index[SU->NodeNum] <
             TopDownSU2Index[Succ.getSUnit()->NodeNum] &&
             "Wrong Bottom Up topological sorting");
    }
  }
#endif
}

// Move low latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would take into account
// SGPR usage (based on VGPR usage and the corresponding wavefront count),
// and that would try to merge groups of loads when it makes sense, etc.
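// A simplified illustration on a hypothetical schedule:
//   0: s_load (low latency)    1: v_mul (uses 0)
//   2: v_add                   3: s_load (low latency, independent)
// The second s_load is hoisted to position 2 (right after the last low
// latency user), so that v_add now executes between that load and its
// future users, helping to hide the load's latency.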
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    // MinPos is the earliest position that respects SU's predecessors.
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        // Shift the intervening instructions down by one and move SU up
        // to BestPos.
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions that the low latency instructions
    // depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SITII->isLowLatencyInstruction(Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

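// Scheduling a variant consumes the NumPredsLeft/NumSuccsLeft counters of
// the SUnits; restoring them from the backup taken in schedule() allows
// another variant to be evaluated on the same DAG.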
void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
template<typename Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(Iterator First, Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (Iterator RegI = First; RegI != End; ++RegI) {
    unsigned Reg = *RegI;
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == VGPRSetID)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == SGPRSetID)
        SgprUsage += PSetI.getWeight();
    }
  }
}

void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  DEBUG(
    for (SUnit& SU : SUnits)
      SU.dumpAll(this)
  );

  Topo.InitDAGTopologicalSorting();
  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    unsigned BaseLatReg;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg,
                                       OffLatReg, TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

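// The offsets recorded above give the block creation heuristics a way to
// compare the addresses of low latency loads (e.g. to spot consecutive
// accesses).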
  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
#if 0 // To enable when handleMove fix lands
  // If VGPR usage is extremely high, try other well-performing variants
  // which could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other,
  // lower-performing variants that could reduce VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
#endif
  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                 << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
    unsigned BBNum = begin()->getParent()->getNumber();
    dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}