blob: c87d2008a34d28dda9840691ec230bb4ecaec23c [file] [log] [blame]
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief SI Machine Scheduler interface
12//
13//===----------------------------------------------------------------------===//
14
Matt Arsenault43e92fe2016-06-24 06:30:11 +000015#include "AMDGPU.h"
Nicolai Haehnle02c32912016-01-13 16:10:10 +000016#include "SIMachineScheduler.h"
Nicolai Haehnle02c32912016-01-13 16:10:10 +000017#include "llvm/CodeGen/LiveInterval.h"
18#include "llvm/CodeGen/LiveIntervalAnalysis.h"
19#include "llvm/CodeGen/MachineRegisterInfo.h"
20#include "llvm/CodeGen/MachineScheduler.h"
21#include "llvm/CodeGen/RegisterPressure.h"
22
23using namespace llvm;
24
25#define DEBUG_TYPE "misched"
26
27// This scheduler implements a different scheduling algorithm than
28// GenericScheduler.
29//
30// There are several specific architecture behaviours that can't be modelled
31// for GenericScheduler:
32// . When accessing the result of an SGPR load instruction, you have to wait
33// for all the SGPR load instructions before your current instruction to
34// have finished.
35// . When accessing the result of an VGPR load instruction, you have to wait
36// for all the VGPR load instructions previous to the VGPR load instruction
37// you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden
39//
// Moreover some specificities (like the fact a lot of instructions in the
// shader have few dependencies) make the generic scheduler have some
// unpredictable behaviours. For example when register pressure becomes high,
// it can either manage to prevent register pressure from going too high, or
// it can increase register pressure even more than if it hadn't taken
// register pressure into account.
46//
47// Also some other bad behaviours are generated, like loading at the beginning
48// of the shader a constant in VGPR you won't need until the end of the shader.
49//
50// The scheduling problem for SI can distinguish three main parts:
51// . Hiding high latencies (texture sampling, etc)
52// . Hiding low latencies (SGPR constant loading, etc)
53// . Keeping register usage low for better latency hiding and general
54// performance
55//
56// Some other things can also affect performance, but are hard to predict
57// (cache usage, the fact the HW can issue several instructions from different
58// wavefronts if different types, etc)
59//
60// This scheduler tries to solve the scheduling problem by dividing it into
61// simpler sub-problems. It divides the instructions into blocks, schedules
62// locally inside the blocks where it takes care of low latencies, and then
63// chooses the order of the blocks by taking care of high latencies.
64// Dividing the instructions into blocks helps control keeping register
65// usage low.
66//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
79//
Simon Pilgrime995a8082016-11-18 11:04:02 +000080// Second the order of the instructions inside the blocks is chosen.
Nicolai Haehnle02c32912016-01-13 16:10:10 +000081// At that step we do take into account only register usage and hiding
82// low latency instructions
83//
Simon Pilgrime995a8082016-11-18 11:04:02 +000084// Third the block order is chosen, there we try to hide high latencies
Nicolai Haehnle02c32912016-01-13 16:10:10 +000085// and keep register usage low.
86//
87// After the third step, a pass is done to improve the hiding of low
88// latencies.
89//
90// Actually when talking about 'low latency' or 'high latency' it includes
91// both the latency to get the cache (or global mem) data go to the register,
Simon Pilgrime995a8082016-11-18 11:04:02 +000092// and the bandwidth limitations.
Nicolai Haehnle02c32912016-01-13 16:10:10 +000093// Increasing the number of active wavefronts helps hide the former, but it
94// doesn't solve the latter, thus why even if wavefront count is high, we have
95// to try have as many instructions hiding high latencies as possible.
96// The OpenCL doc says for example latency of 400 cycles for a global mem access,
97// which is hidden by 10 instructions if the wavefront count is 10.
98
99// Some figures taken from AMD docs:
100// Both texture and constant L1 caches are 4-way associative with 64 bytes
101// lines.
102// Constant cache is shared with 4 CUs.
103// For texture sampling, the address generation unit receives 4 texture
104// addresses per cycle, thus we could expect texture sampling latency to be
105// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
106// instructions in a wavefront group are executed every 4 cycles),
107// or 16 instructions if the other wavefronts associated to the 3 other VALUs
108// of the CU do texture sampling too. (Don't take these figures too seriously,
109// as I'm not 100% sure of the computation)
110// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
112// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
113// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
114// It means a simple s_buffer_load should take one instruction to hide, as
115// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
116// cache line.
117//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon.
122
123
124// common code //
125
126#ifndef NDEBUG
127
128static const char *getReasonStr(SIScheduleCandReason Reason) {
129 switch (Reason) {
130 case NoCand: return "NOCAND";
131 case RegUsage: return "REGUSAGE";
132 case Latency: return "LATENCY";
133 case Successor: return "SUCCESSOR";
134 case Depth: return "DEPTH";
135 case NodeOrder: return "ORDER";
136 }
137 llvm_unreachable("Unknown reason!");
138}
139
140#endif
141
142static bool tryLess(int TryVal, int CandVal,
143 SISchedulerCandidate &TryCand,
144 SISchedulerCandidate &Cand,
145 SIScheduleCandReason Reason) {
146 if (TryVal < CandVal) {
147 TryCand.Reason = Reason;
148 return true;
149 }
150 if (TryVal > CandVal) {
151 if (Cand.Reason > Reason)
152 Cand.Reason = Reason;
153 return true;
154 }
155 Cand.setRepeat(Reason);
156 return false;
157}
158
159static bool tryGreater(int TryVal, int CandVal,
160 SISchedulerCandidate &TryCand,
161 SISchedulerCandidate &Cand,
162 SIScheduleCandReason Reason) {
163 if (TryVal > CandVal) {
164 TryCand.Reason = Reason;
165 return true;
166 }
167 if (TryVal < CandVal) {
168 if (Cand.Reason > Reason)
169 Cand.Reason = Reason;
170 return true;
171 }
172 Cand.setRepeat(Reason);
173 return false;
174}
175
176// SIScheduleBlock //
177
178void SIScheduleBlock::addUnit(SUnit *SU) {
179 NodeNum2Index[SU->NodeNum] = SUnits.size();
180 SUnits.push_back(SU);
181}
182
183#ifndef NDEBUG
184
185void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
186
187 dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
188 dbgs() << '\n';
189}
190#endif
191
192void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
193 SISchedCandidate &TryCand) {
194 // Initialize the candidate if needed.
195 if (!Cand.isValid()) {
196 TryCand.Reason = NodeOrder;
197 return;
198 }
199
200 if (Cand.SGPRUsage > 60 &&
201 tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
202 return;
203
204 // Schedule low latency instructions as top as possible.
205 // Order of priority is:
206 // . Low latency instructions which do not depend on other low latency
207 // instructions we haven't waited for
208 // . Other instructions which do not depend on low latency instructions
209 // we haven't waited for
210 // . Low latencies
211 // . All other instructions
Simon Pilgrime995a8082016-11-18 11:04:02 +0000212 // Goal is to get: low latency instructions - independent instructions
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000213 // - (eventually some more low latency instructions)
214 // - instructions that depend on the first low latency instructions.
215 // If in the block there is a lot of constant loads, the SGPR usage
216 // could go quite high, thus above the arbitrary limit of 60 will encourage
217 // use the already loaded constants (in order to release some SGPRs) before
218 // loading more.
219 if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
220 Cand.HasLowLatencyNonWaitedParent,
221 TryCand, Cand, SIScheduleCandReason::Depth))
222 return;
223
224 if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
225 TryCand, Cand, SIScheduleCandReason::Depth))
226 return;
227
228 if (TryCand.IsLowLatency &&
229 tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
230 TryCand, Cand, SIScheduleCandReason::Depth))
231 return;
232
233 if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
234 return;
235
236 // Fall through to original instruction order.
237 if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
238 TryCand.Reason = NodeOrder;
239 }
240}
241
242SUnit* SIScheduleBlock::pickNode() {
243 SISchedCandidate TopCand;
244
245 for (SUnit* SU : TopReadySUs) {
246 SISchedCandidate TryCand;
247 std::vector<unsigned> pressure;
248 std::vector<unsigned> MaxPressure;
249 // Predict register usage after this instruction.
250 TryCand.SU = SU;
251 TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
252 TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
253 TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
254 TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
255 TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
256 TryCand.HasLowLatencyNonWaitedParent =
257 HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
258 tryCandidateTopDown(TopCand, TryCand);
259 if (TryCand.Reason != NoCand)
260 TopCand.setBest(TryCand);
261 }
262
263 return TopCand.SU;
264}
265
266
267// Schedule something valid.
268void SIScheduleBlock::fastSchedule() {
269 TopReadySUs.clear();
270 if (Scheduled)
271 undoSchedule();
272
273 for (SUnit* SU : SUnits) {
274 if (!SU->NumPredsLeft)
275 TopReadySUs.push_back(SU);
276 }
277
278 while (!TopReadySUs.empty()) {
279 SUnit *SU = TopReadySUs[0];
280 ScheduledSUnits.push_back(SU);
281 nodeScheduled(SU);
282 }
283
284 Scheduled = true;
285}
286
287// Returns if the register was set between first and last.
288static bool isDefBetween(unsigned Reg,
289 SlotIndex First, SlotIndex Last,
290 const MachineRegisterInfo *MRI,
291 const LiveIntervals *LIS) {
292 for (MachineRegisterInfo::def_instr_iterator
293 UI = MRI->def_instr_begin(Reg),
294 UE = MRI->def_instr_end(); UI != UE; ++UI) {
295 const MachineInstr* MI = &*UI;
296 if (MI->isDebugValue())
297 continue;
Duncan P. N. Exon Smith3ac9cc62016-02-27 06:40:41 +0000298 SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000299 if (InstSlot >= First && InstSlot <= Last)
300 return true;
301 }
302 return false;
303}
304
305void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
306 MachineBasicBlock::iterator EndBlock) {
307 IntervalPressure Pressure, BotPressure;
308 RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
309 LiveIntervals *LIS = DAG->getLIS();
310 MachineRegisterInfo *MRI = DAG->getMRI();
311 DAG->initRPTracker(TopRPTracker);
312 DAG->initRPTracker(BotRPTracker);
313 DAG->initRPTracker(RPTracker);
314
315 // Goes though all SU. RPTracker captures what had to be alive for the SUs
316 // to execute, and what is still alive at the end.
317 for (SUnit* SU : ScheduledSUnits) {
318 RPTracker.setPos(SU->getInstr());
319 RPTracker.advance();
320 }
321
322 // Close the RPTracker to finalize live ins/outs.
323 RPTracker.closeRegion();
324
325 // Initialize the live ins and live outs.
326 TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
327 BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
328
329 // Do not Track Physical Registers, because it messes up.
Matthias Braun5d458612016-01-20 00:23:26 +0000330 for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
331 if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
332 LiveInRegs.insert(RegMaskPair.RegUnit);
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000333 }
334 LiveOutRegs.clear();
335 // There is several possibilities to distinguish:
336 // 1) Reg is not input to any instruction in the block, but is output of one
337 // 2) 1) + read in the block and not needed after it
338 // 3) 1) + read in the block but needed in another block
339 // 4) Reg is input of an instruction but another block will read it too
340 // 5) Reg is input of an instruction and then rewritten in the block.
341 // result is not read in the block (implies used in another block)
342 // 6) Reg is input of an instruction and then rewritten in the block.
343 // result is read in the block and not needed in another block
344 // 7) Reg is input of an instruction and then rewritten in the block.
345 // result is read in the block but also needed in another block
346 // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
347 // We want LiveOutRegs to contain only Regs whose content will be read after
348 // in another block, and whose content was written in the current block,
349 // that is we want it to get 1, 3, 5, 7
350 // Since we made the MIs of a block to be packed all together before
351 // scheduling, then the LiveIntervals were correct, and the RPTracker was
352 // able to correctly handle 5 vs 6, 2 vs 3.
353 // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
354 // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
355 // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
356 // The use of findDefBetween removes the case 4.
Matthias Braun5d458612016-01-20 00:23:26 +0000357 for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
358 unsigned Reg = RegMaskPair.RegUnit;
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000359 if (TargetRegisterInfo::isVirtualRegister(Reg) &&
Duncan P. N. Exon Smith3ac9cc62016-02-27 06:40:41 +0000360 isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
361 LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
362 LIS)) {
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000363 LiveOutRegs.insert(Reg);
364 }
365 }
366
367 // Pressure = sum_alive_registers register size
368 // Internally llvm will represent some registers as big 128 bits registers
369 // for example, but they actually correspond to 4 actual 32 bits registers.
370 // Thus Pressure is not equal to num_alive_registers * constant.
371 LiveInPressure = TopPressure.MaxSetPressure;
372 LiveOutPressure = BotPressure.MaxSetPressure;
373
374 // Prepares TopRPTracker for top down scheduling.
375 TopRPTracker.closeTop();
376}
377
378void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
379 MachineBasicBlock::iterator EndBlock) {
380 if (!Scheduled)
381 fastSchedule();
382
383 // PreScheduling phase to set LiveIn and LiveOut.
384 initRegPressure(BeginBlock, EndBlock);
385 undoSchedule();
386
387 // Schedule for real now.
388
389 TopReadySUs.clear();
390
391 for (SUnit* SU : SUnits) {
392 if (!SU->NumPredsLeft)
393 TopReadySUs.push_back(SU);
394 }
395
396 while (!TopReadySUs.empty()) {
397 SUnit *SU = pickNode();
398 ScheduledSUnits.push_back(SU);
399 TopRPTracker.setPos(SU->getInstr());
400 TopRPTracker.advance();
401 nodeScheduled(SU);
402 }
403
404 // TODO: compute InternalAdditionnalPressure.
405 InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
406
407 // Check everything is right.
408#ifndef NDEBUG
409 assert(SUnits.size() == ScheduledSUnits.size() &&
410 TopReadySUs.empty());
411 for (SUnit* SU : SUnits) {
412 assert(SU->isScheduled &&
413 SU->NumPredsLeft == 0);
414 }
415#endif
416
417 Scheduled = true;
418}
419
420void SIScheduleBlock::undoSchedule() {
421 for (SUnit* SU : SUnits) {
422 SU->isScheduled = false;
423 for (SDep& Succ : SU->Succs) {
424 if (BC->isSUInBlock(Succ.getSUnit(), ID))
425 undoReleaseSucc(SU, &Succ);
426 }
427 }
428 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
429 ScheduledSUnits.clear();
430 Scheduled = false;
431}
432
433void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
434 SUnit *SuccSU = SuccEdge->getSUnit();
435
436 if (SuccEdge->isWeak()) {
437 ++SuccSU->WeakPredsLeft;
438 return;
439 }
440 ++SuccSU->NumPredsLeft;
441}
442
443void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
444 SUnit *SuccSU = SuccEdge->getSUnit();
445
446 if (SuccEdge->isWeak()) {
447 --SuccSU->WeakPredsLeft;
448 return;
449 }
450#ifndef NDEBUG
451 if (SuccSU->NumPredsLeft == 0) {
452 dbgs() << "*** Scheduling failed! ***\n";
453 SuccSU->dump(DAG);
454 dbgs() << " has been released too many times!\n";
455 llvm_unreachable(nullptr);
456 }
457#endif
458
459 --SuccSU->NumPredsLeft;
460}
461
462/// Release Successors of the SU that are in the block or not.
463void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
464 for (SDep& Succ : SU->Succs) {
465 SUnit *SuccSU = Succ.getSUnit();
466
Matt Arsenaultfe358062016-07-19 00:35:22 +0000467 if (SuccSU->NodeNum >= DAG->SUnits.size())
468 continue;
469
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000470 if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
471 continue;
472
473 releaseSucc(SU, &Succ);
474 if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
475 TopReadySUs.push_back(SuccSU);
476 }
477}
478
479void SIScheduleBlock::nodeScheduled(SUnit *SU) {
480 // Is in TopReadySUs
481 assert (!SU->NumPredsLeft);
David Majnemer0d955d02016-08-11 22:21:41 +0000482 std::vector<SUnit *>::iterator I = find(TopReadySUs, SU);
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000483 if (I == TopReadySUs.end()) {
484 dbgs() << "Data Structure Bug in SI Scheduler\n";
485 llvm_unreachable(nullptr);
486 }
487 TopReadySUs.erase(I);
488
489 releaseSuccessors(SU, true);
490 // Scheduling this node will trigger a wait,
491 // thus propagate to other instructions that they do not need to wait either.
492 if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
493 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
494
495 if (DAG->IsLowLatencySU[SU->NodeNum]) {
496 for (SDep& Succ : SU->Succs) {
497 std::map<unsigned, unsigned>::iterator I =
498 NodeNum2Index.find(Succ.getSUnit()->NodeNum);
499 if (I != NodeNum2Index.end())
500 HasLowLatencyNonWaitedParent[I->second] = 1;
501 }
502 }
503 SU->isScheduled = true;
504}
505
506void SIScheduleBlock::finalizeUnits() {
507 // We remove links from outside blocks to enable scheduling inside the block.
508 for (SUnit* SU : SUnits) {
509 releaseSuccessors(SU, false);
510 if (DAG->IsHighLatencySU[SU->NodeNum])
511 HighLatencyBlock = true;
512 }
513 HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
514}
515
516// we maintain ascending order of IDs
517void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
518 unsigned PredID = Pred->getID();
519
520 // Check if not already predecessor.
521 for (SIScheduleBlock* P : Preds) {
522 if (PredID == P->getID())
523 return;
524 }
525 Preds.push_back(Pred);
526
Benjamin Kramer3e9a5d32016-05-27 11:36:04 +0000527 assert(none_of(Succs,
528 [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
529 "Loop in the Block Graph!");
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000530}
531
532void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
533 unsigned SuccID = Succ->getID();
534
535 // Check if not already predecessor.
536 for (SIScheduleBlock* S : Succs) {
537 if (SuccID == S->getID())
538 return;
539 }
540 if (Succ->isHighLatencyBlock())
541 ++NumHighLatencySuccessors;
542 Succs.push_back(Succ);
Benjamin Kramer3e9a5d32016-05-27 11:36:04 +0000543 assert(none_of(Preds,
544 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
545 "Loop in the Block Graph!");
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000546}
547
548#ifndef NDEBUG
549void SIScheduleBlock::printDebug(bool full) {
550 dbgs() << "Block (" << ID << ")\n";
551 if (!full)
552 return;
553
554 dbgs() << "\nContains High Latency Instruction: "
555 << HighLatencyBlock << '\n';
556 dbgs() << "\nDepends On:\n";
557 for (SIScheduleBlock* P : Preds) {
558 P->printDebug(false);
559 }
560
561 dbgs() << "\nSuccessors:\n";
562 for (SIScheduleBlock* S : Succs) {
563 S->printDebug(false);
564 }
565
566 if (Scheduled) {
567 dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
568 << LiveInPressure[DAG->getVGPRSetID()] << '\n';
569 dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
570 << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
571 dbgs() << "LiveIns:\n";
572 for (unsigned Reg : LiveInRegs)
573 dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
574
575 dbgs() << "\nLiveOuts:\n";
576 for (unsigned Reg : LiveOutRegs)
577 dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
578 }
579
580 dbgs() << "\nInstructions:\n";
581 if (!Scheduled) {
582 for (SUnit* SU : SUnits) {
583 SU->dump(DAG);
584 }
585 } else {
586 for (SUnit* SU : SUnits) {
587 SU->dump(DAG);
588 }
589 }
590
591 dbgs() << "///////////////////////\n";
592}
593
594#endif
595
596// SIScheduleBlockCreator //
597
598SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
599DAG(DAG) {
600}
601
602SIScheduleBlockCreator::~SIScheduleBlockCreator() {
603}
604
605SIScheduleBlocks
606SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
607 std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
608 Blocks.find(BlockVariant);
609 if (B == Blocks.end()) {
610 SIScheduleBlocks Res;
611 createBlocksForVariant(BlockVariant);
612 topologicalSort();
613 scheduleInsideBlocks();
614 fillStats();
615 Res.Blocks = CurrentBlocks;
616 Res.TopDownIndex2Block = TopDownIndex2Block;
617 Res.TopDownBlock2Index = TopDownBlock2Index;
618 Blocks[BlockVariant] = Res;
619 return Res;
620 } else {
621 return B->second;
622 }
623}
624
625bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
626 if (SU->NodeNum >= DAG->SUnits.size())
627 return false;
628 return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
629}
630
631void SIScheduleBlockCreator::colorHighLatenciesAlone() {
632 unsigned DAGSize = DAG->SUnits.size();
633
634 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
635 SUnit *SU = &DAG->SUnits[i];
636 if (DAG->IsHighLatencySU[SU->NodeNum]) {
637 CurrentColoring[SU->NodeNum] = NextReservedID++;
638 }
639 }
640}
641
642void SIScheduleBlockCreator::colorHighLatenciesGroups() {
643 unsigned DAGSize = DAG->SUnits.size();
644 unsigned NumHighLatencies = 0;
645 unsigned GroupSize;
646 unsigned Color = NextReservedID;
647 unsigned Count = 0;
648 std::set<unsigned> FormingGroup;
649
650 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
651 SUnit *SU = &DAG->SUnits[i];
652 if (DAG->IsHighLatencySU[SU->NodeNum])
653 ++NumHighLatencies;
654 }
655
656 if (NumHighLatencies == 0)
657 return;
658
659 if (NumHighLatencies <= 6)
660 GroupSize = 2;
661 else if (NumHighLatencies <= 12)
662 GroupSize = 3;
663 else
664 GroupSize = 4;
665
666 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
667 SUnit *SU = &DAG->SUnits[i];
668 if (DAG->IsHighLatencySU[SU->NodeNum]) {
669 unsigned CompatibleGroup = true;
670 unsigned ProposedColor = Color;
671 for (unsigned j : FormingGroup) {
672 // TODO: Currently CompatibleGroup will always be false,
673 // because the graph enforces the load order. This
674 // can be fixed, but as keeping the load order is often
675 // good for performance that causes a performance hit (both
676 // the default scheduler and this scheduler).
677 // When this scheduler determines a good load order,
678 // this can be fixed.
679 if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
680 !DAG->canAddEdge(&DAG->SUnits[j], SU))
681 CompatibleGroup = false;
682 }
683 if (!CompatibleGroup || ++Count == GroupSize) {
684 FormingGroup.clear();
685 Color = ++NextReservedID;
686 if (!CompatibleGroup) {
687 ProposedColor = Color;
688 FormingGroup.insert(SU->NodeNum);
689 }
690 Count = 0;
691 } else {
692 FormingGroup.insert(SU->NodeNum);
693 }
694 CurrentColoring[SU->NodeNum] = ProposedColor;
695 }
696 }
697}
698
699void SIScheduleBlockCreator::colorComputeReservedDependencies() {
700 unsigned DAGSize = DAG->SUnits.size();
701 std::map<std::set<unsigned>, unsigned> ColorCombinations;
702
703 CurrentTopDownReservedDependencyColoring.clear();
704 CurrentBottomUpReservedDependencyColoring.clear();
705
706 CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
707 CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
708
709 // Traverse TopDown, and give different colors to SUs depending
710 // on which combination of High Latencies they depend on.
711
Tom Stellard4a304b32016-05-03 16:30:56 +0000712 for (unsigned SUNum : DAG->TopDownIndex2SU) {
713 SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000714 std::set<unsigned> SUColors;
715
716 // Already given.
717 if (CurrentColoring[SU->NodeNum]) {
718 CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
719 CurrentColoring[SU->NodeNum];
720 continue;
721 }
722
723 for (SDep& PredDep : SU->Preds) {
724 SUnit *Pred = PredDep.getSUnit();
725 if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
726 continue;
727 if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
728 SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
729 }
730 // Color 0 by default.
731 if (SUColors.empty())
732 continue;
733 // Same color than parents.
734 if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
735 CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
736 *SUColors.begin();
737 else {
738 std::map<std::set<unsigned>, unsigned>::iterator Pos =
739 ColorCombinations.find(SUColors);
740 if (Pos != ColorCombinations.end()) {
741 CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
742 } else {
743 CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
744 NextNonReservedID;
745 ColorCombinations[SUColors] = NextNonReservedID++;
746 }
747 }
748 }
749
750 ColorCombinations.clear();
751
752 // Same as before, but BottomUp.
753
Tom Stellard4a304b32016-05-03 16:30:56 +0000754 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
755 SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000756 std::set<unsigned> SUColors;
757
758 // Already given.
759 if (CurrentColoring[SU->NodeNum]) {
760 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
761 CurrentColoring[SU->NodeNum];
762 continue;
763 }
764
765 for (SDep& SuccDep : SU->Succs) {
766 SUnit *Succ = SuccDep.getSUnit();
767 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
768 continue;
769 if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
770 SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
771 }
772 // Keep color 0.
773 if (SUColors.empty())
774 continue;
775 // Same color than parents.
776 if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
777 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
778 *SUColors.begin();
779 else {
780 std::map<std::set<unsigned>, unsigned>::iterator Pos =
781 ColorCombinations.find(SUColors);
782 if (Pos != ColorCombinations.end()) {
783 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
784 } else {
785 CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
786 NextNonReservedID;
787 ColorCombinations[SUColors] = NextNonReservedID++;
788 }
789 }
790 }
791}
792
793void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
794 unsigned DAGSize = DAG->SUnits.size();
795 std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
796
797 // Every combination of colors given by the top down
798 // and bottom up Reserved node dependency
799
800 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
801 SUnit *SU = &DAG->SUnits[i];
802 std::pair<unsigned, unsigned> SUColors;
803
804 // High latency instructions: already given.
805 if (CurrentColoring[SU->NodeNum])
806 continue;
807
808 SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
809 SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
810
811 std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
812 ColorCombinations.find(SUColors);
813 if (Pos != ColorCombinations.end()) {
814 CurrentColoring[SU->NodeNum] = Pos->second;
815 } else {
816 CurrentColoring[SU->NodeNum] = NextNonReservedID;
817 ColorCombinations[SUColors] = NextNonReservedID++;
818 }
819 }
820}
821
822void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
823 unsigned DAGSize = DAG->SUnits.size();
824 std::vector<int> PendingColoring = CurrentColoring;
825
Tom Stellard4a304b32016-05-03 16:30:56 +0000826 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
827 SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000828 std::set<unsigned> SUColors;
829 std::set<unsigned> SUColorsPending;
830
831 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
832 continue;
833
834 if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
835 CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
836 continue;
837
838 for (SDep& SuccDep : SU->Succs) {
839 SUnit *Succ = SuccDep.getSUnit();
840 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
841 continue;
842 if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
843 CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
844 SUColors.insert(CurrentColoring[Succ->NodeNum]);
845 SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
846 }
847 if (SUColors.size() == 1 && SUColorsPending.size() == 1)
848 PendingColoring[SU->NodeNum] = *SUColors.begin();
849 else // TODO: Attribute new colors depending on color
850 // combination of children.
851 PendingColoring[SU->NodeNum] = NextNonReservedID++;
852 }
853 CurrentColoring = PendingColoring;
854}
855
856
857void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
858 unsigned DAGSize = DAG->SUnits.size();
859 unsigned PreviousColor;
860 std::set<unsigned> SeenColors;
861
862 if (DAGSize <= 1)
863 return;
864
865 PreviousColor = CurrentColoring[0];
866
867 for (unsigned i = 1, e = DAGSize; i != e; ++i) {
868 SUnit *SU = &DAG->SUnits[i];
869 unsigned CurrentColor = CurrentColoring[i];
870 unsigned PreviousColorSave = PreviousColor;
871 assert(i == SU->NodeNum);
872
873 if (CurrentColor != PreviousColor)
874 SeenColors.insert(PreviousColor);
875 PreviousColor = CurrentColor;
876
877 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
878 continue;
879
880 if (SeenColors.find(CurrentColor) == SeenColors.end())
881 continue;
882
883 if (PreviousColorSave != CurrentColor)
884 CurrentColoring[i] = NextNonReservedID++;
885 else
886 CurrentColoring[i] = CurrentColoring[i-1];
887 }
888}
889
890void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
891 unsigned DAGSize = DAG->SUnits.size();
892
Tom Stellard4a304b32016-05-03 16:30:56 +0000893 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
894 SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000895 std::set<unsigned> SUColors;
896
897 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
898 continue;
899
900 // No predecessor: Vgpr constant loading.
901 // Low latency instructions usually have a predecessor (the address)
902 if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
903 continue;
904
905 for (SDep& SuccDep : SU->Succs) {
906 SUnit *Succ = SuccDep.getSUnit();
907 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
908 continue;
909 SUColors.insert(CurrentColoring[Succ->NodeNum]);
910 }
911 if (SUColors.size() == 1)
912 CurrentColoring[SU->NodeNum] = *SUColors.begin();
913 }
914}
915
916void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
917 unsigned DAGSize = DAG->SUnits.size();
918
Tom Stellard4a304b32016-05-03 16:30:56 +0000919 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
920 SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000921 std::set<unsigned> SUColors;
922
923 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
924 continue;
925
926 for (SDep& SuccDep : SU->Succs) {
927 SUnit *Succ = SuccDep.getSUnit();
928 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
929 continue;
930 SUColors.insert(CurrentColoring[Succ->NodeNum]);
931 }
932 if (SUColors.size() == 1)
933 CurrentColoring[SU->NodeNum] = *SUColors.begin();
934 }
935}
936
937void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
938 unsigned DAGSize = DAG->SUnits.size();
939
Tom Stellard4a304b32016-05-03 16:30:56 +0000940 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
941 SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000942 std::set<unsigned> SUColors;
943
944 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
945 continue;
946
947 for (SDep& SuccDep : SU->Succs) {
948 SUnit *Succ = SuccDep.getSUnit();
949 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
950 continue;
951 SUColors.insert(CurrentColoring[Succ->NodeNum]);
952 }
953 if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
954 CurrentColoring[SU->NodeNum] = *SUColors.begin();
955 }
956}
957
958void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
959 unsigned DAGSize = DAG->SUnits.size();
960 std::map<unsigned, unsigned> ColorCount;
961
Tom Stellard4a304b32016-05-03 16:30:56 +0000962 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
963 SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000964 unsigned color = CurrentColoring[SU->NodeNum];
965 std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
966 if (Pos != ColorCount.end()) {
967 ++ColorCount[color];
968 } else {
969 ColorCount[color] = 1;
970 }
971 }
972
Tom Stellard4a304b32016-05-03 16:30:56 +0000973 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
974 SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle02c32912016-01-13 16:10:10 +0000975 unsigned color = CurrentColoring[SU->NodeNum];
976 std::set<unsigned> SUColors;
977
978 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
979 continue;
980
981 if (ColorCount[color] > 1)
982 continue;
983
984 for (SDep& SuccDep : SU->Succs) {
985 SUnit *Succ = SuccDep.getSUnit();
986 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
987 continue;
988 SUColors.insert(CurrentColoring[Succ->NodeNum]);
989 }
990 if (SUColors.size() == 1 && *SUColors.begin() != color) {
991 --ColorCount[color];
992 CurrentColoring[SU->NodeNum] = *SUColors.begin();
993 ++ColorCount[*SUColors.begin()];
994 }
995 }
996}
997
998void SIScheduleBlockCreator::cutHugeBlocks() {
999 // TODO
1000}
1001
1002void SIScheduleBlockCreator::regroupNoUserInstructions() {
1003 unsigned DAGSize = DAG->SUnits.size();
1004 int GroupID = NextNonReservedID++;
1005
Tom Stellard4a304b32016-05-03 16:30:56 +00001006 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1007 SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001008 bool hasSuccessor = false;
1009
1010 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1011 continue;
1012
1013 for (SDep& SuccDep : SU->Succs) {
1014 SUnit *Succ = SuccDep.getSUnit();
1015 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1016 continue;
1017 hasSuccessor = true;
1018 }
1019 if (!hasSuccessor)
1020 CurrentColoring[SU->NodeNum] = GroupID;
1021 }
1022}
1023
1024void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
1025 unsigned DAGSize = DAG->SUnits.size();
1026 std::map<unsigned,unsigned> RealID;
1027
1028 CurrentBlocks.clear();
1029 CurrentColoring.clear();
1030 CurrentColoring.resize(DAGSize, 0);
1031 Node2CurrentBlock.clear();
1032
1033 // Restore links previous scheduling variant has overridden.
1034 DAG->restoreSULinksLeft();
1035
1036 NextReservedID = 1;
1037 NextNonReservedID = DAGSize + 1;
1038
1039 DEBUG(dbgs() << "Coloring the graph\n");
1040
1041 if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
1042 colorHighLatenciesGroups();
1043 else
1044 colorHighLatenciesAlone();
1045 colorComputeReservedDependencies();
1046 colorAccordingToReservedDependencies();
1047 colorEndsAccordingToDependencies();
1048 if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
1049 colorForceConsecutiveOrderInGroup();
1050 regroupNoUserInstructions();
1051 colorMergeConstantLoadsNextGroup();
1052 colorMergeIfPossibleNextGroupOnlyForReserved();
1053
1054 // Put SUs of same color into same block
1055 Node2CurrentBlock.resize(DAGSize, -1);
1056 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1057 SUnit *SU = &DAG->SUnits[i];
1058 unsigned Color = CurrentColoring[SU->NodeNum];
1059 if (RealID.find(Color) == RealID.end()) {
1060 int ID = CurrentBlocks.size();
1061 BlockPtrs.push_back(
1062 make_unique<SIScheduleBlock>(DAG, this, ID));
1063 CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
1064 RealID[Color] = ID;
1065 }
1066 CurrentBlocks[RealID[Color]]->addUnit(SU);
1067 Node2CurrentBlock[SU->NodeNum] = RealID[Color];
1068 }
1069
1070 // Build dependencies between blocks.
1071 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1072 SUnit *SU = &DAG->SUnits[i];
1073 int SUID = Node2CurrentBlock[i];
1074 for (SDep& SuccDep : SU->Succs) {
1075 SUnit *Succ = SuccDep.getSUnit();
1076 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1077 continue;
1078 if (Node2CurrentBlock[Succ->NodeNum] != SUID)
1079 CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
1080 }
1081 for (SDep& PredDep : SU->Preds) {
1082 SUnit *Pred = PredDep.getSUnit();
1083 if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
1084 continue;
1085 if (Node2CurrentBlock[Pred->NodeNum] != SUID)
1086 CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
1087 }
1088 }
1089
1090 // Free root and leafs of all blocks to enable scheduling inside them.
1091 for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1092 SIScheduleBlock *Block = CurrentBlocks[i];
1093 Block->finalizeUnits();
1094 }
1095 DEBUG(
1096 dbgs() << "Blocks created:\n\n";
1097 for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1098 SIScheduleBlock *Block = CurrentBlocks[i];
1099 Block->printDebug(true);
1100 }
1101 );
1102}
1103
1104// Two functions taken from Codegen/MachineScheduler.cpp
1105
1106/// If this iterator is a debug value, increment until reaching the End or a
1107/// non-debug instruction.
1108static MachineBasicBlock::const_iterator
1109nextIfDebug(MachineBasicBlock::const_iterator I,
1110 MachineBasicBlock::const_iterator End) {
1111 for(; I != End; ++I) {
1112 if (!I->isDebugValue())
1113 break;
1114 }
1115 return I;
1116}
1117
1118/// Non-const version.
1119static MachineBasicBlock::iterator
1120nextIfDebug(MachineBasicBlock::iterator I,
1121 MachineBasicBlock::const_iterator End) {
1122 // Cast the return value to nonconst MachineInstr, then cast to an
1123 // instr_iterator, which does not check for null, finally return a
1124 // bundle_iterator.
1125 return MachineBasicBlock::instr_iterator(
1126 const_cast<MachineInstr*>(
1127 &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
1128}
1129
1130void SIScheduleBlockCreator::topologicalSort() {
1131 unsigned DAGSize = CurrentBlocks.size();
1132 std::vector<int> WorkList;
1133
1134 DEBUG(dbgs() << "Topological Sort\n");
1135
1136 WorkList.reserve(DAGSize);
1137 TopDownIndex2Block.resize(DAGSize);
1138 TopDownBlock2Index.resize(DAGSize);
1139 BottomUpIndex2Block.resize(DAGSize);
1140
1141 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1142 SIScheduleBlock *Block = CurrentBlocks[i];
1143 unsigned Degree = Block->getSuccs().size();
1144 TopDownBlock2Index[i] = Degree;
1145 if (Degree == 0) {
1146 WorkList.push_back(i);
1147 }
1148 }
1149
1150 int Id = DAGSize;
1151 while (!WorkList.empty()) {
1152 int i = WorkList.back();
1153 SIScheduleBlock *Block = CurrentBlocks[i];
1154 WorkList.pop_back();
1155 TopDownBlock2Index[i] = --Id;
1156 TopDownIndex2Block[Id] = i;
1157 for (SIScheduleBlock* Pred : Block->getPreds()) {
1158 if (!--TopDownBlock2Index[Pred->getID()])
1159 WorkList.push_back(Pred->getID());
1160 }
1161 }
1162
1163#ifndef NDEBUG
1164 // Check correctness of the ordering.
1165 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1166 SIScheduleBlock *Block = CurrentBlocks[i];
1167 for (SIScheduleBlock* Pred : Block->getPreds()) {
1168 assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
1169 "Wrong Top Down topological sorting");
1170 }
1171 }
1172#endif
1173
1174 BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
1175 TopDownIndex2Block.rend());
1176}
1177
1178void SIScheduleBlockCreator::scheduleInsideBlocks() {
1179 unsigned DAGSize = CurrentBlocks.size();
1180
1181 DEBUG(dbgs() << "\nScheduling Blocks\n\n");
1182
1183 // We do schedule a valid scheduling such that a Block corresponds
1184 // to a range of instructions.
1185 DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
1186 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1187 SIScheduleBlock *Block = CurrentBlocks[i];
1188 Block->fastSchedule();
1189 }
1190
1191 // Note: the following code, and the part restoring previous position
1192 // is by far the most expensive operation of the Scheduler.
1193
1194 // Do not update CurrentTop.
1195 MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
1196 std::vector<MachineBasicBlock::iterator> PosOld;
1197 std::vector<MachineBasicBlock::iterator> PosNew;
1198 PosOld.reserve(DAG->SUnits.size());
1199 PosNew.reserve(DAG->SUnits.size());
1200
1201 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1202 int BlockIndice = TopDownIndex2Block[i];
1203 SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1204 std::vector<SUnit*> SUs = Block->getScheduledUnits();
1205
1206 for (SUnit* SU : SUs) {
1207 MachineInstr *MI = SU->getInstr();
1208 MachineBasicBlock::iterator Pos = MI;
1209 PosOld.push_back(Pos);
1210 if (&*CurrentTopFastSched == MI) {
1211 PosNew.push_back(Pos);
1212 CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
1213 DAG->getCurrentBottom());
1214 } else {
1215 // Update the instruction stream.
1216 DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
1217
1218 // Update LiveIntervals.
Simon Pilgrime995a8082016-11-18 11:04:02 +00001219 // Note: Moving all instructions and calling handleMove every time
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001220 // is the most cpu intensive operation of the scheduler.
1221 // It would gain a lot if there was a way to recompute the
1222 // LiveIntervals for the entire scheduling region.
Duncan P. N. Exon Smithbe8f8c42016-02-27 20:14:29 +00001223 DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001224 PosNew.push_back(CurrentTopFastSched);
1225 }
1226 }
1227 }
1228
1229 // Now we have Block of SUs == Block of MI.
1230 // We do the final schedule for the instructions inside the block.
1231 // The property that all the SUs of the Block are grouped together as MI
1232 // is used for correct reg usage tracking.
1233 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1234 SIScheduleBlock *Block = CurrentBlocks[i];
1235 std::vector<SUnit*> SUs = Block->getScheduledUnits();
1236 Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
1237 }
1238
1239 DEBUG(dbgs() << "Restoring MI Pos\n");
1240 // Restore old ordering (which prevents a LIS->handleMove bug).
1241 for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
1242 MachineBasicBlock::iterator POld = PosOld[i-1];
1243 MachineBasicBlock::iterator PNew = PosNew[i-1];
1244 if (PNew != POld) {
1245 // Update the instruction stream.
1246 DAG->getBB()->splice(POld, DAG->getBB(), PNew);
1247
1248 // Update LiveIntervals.
Duncan P. N. Exon Smithbe8f8c42016-02-27 20:14:29 +00001249 DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001250 }
1251 }
1252
1253 DEBUG(
1254 for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1255 SIScheduleBlock *Block = CurrentBlocks[i];
1256 Block->printDebug(true);
1257 }
1258 );
1259}
1260
1261void SIScheduleBlockCreator::fillStats() {
1262 unsigned DAGSize = CurrentBlocks.size();
1263
1264 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1265 int BlockIndice = TopDownIndex2Block[i];
1266 SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1267 if (Block->getPreds().size() == 0)
1268 Block->Depth = 0;
1269 else {
1270 unsigned Depth = 0;
1271 for (SIScheduleBlock *Pred : Block->getPreds()) {
1272 if (Depth < Pred->Depth + 1)
1273 Depth = Pred->Depth + 1;
1274 }
1275 Block->Depth = Depth;
1276 }
1277 }
1278
1279 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1280 int BlockIndice = BottomUpIndex2Block[i];
1281 SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1282 if (Block->getSuccs().size() == 0)
1283 Block->Height = 0;
1284 else {
1285 unsigned Height = 0;
1286 for (SIScheduleBlock *Succ : Block->getSuccs()) {
1287 if (Height < Succ->Height + 1)
1288 Height = Succ->Height + 1;
1289 }
1290 Block->Height = Height;
1291 }
1292 }
1293}
1294
1295// SIScheduleBlockScheduler //
1296
1297SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
1298 SISchedulerBlockSchedulerVariant Variant,
1299 SIScheduleBlocks BlocksStruct) :
1300 DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
1301 LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
1302 SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
1303
1304 // Fill the usage of every output
1305 // Warning: while by construction we always have a link between two blocks
1306 // when one needs a result from the other, the number of users of an output
1307 // is not the sum of child blocks having as input the same virtual register.
1308 // Here is an example. A produces x and y. B eats x and produces x'.
1309 // C eats x' and y. The register coalescer may have attributed the same
1310 // virtual register to x and x'.
1311 // To count accurately, we do a topological sort. In case the register is
1312 // found for several parents, we increment the usage of the one with the
1313 // highest topological index.
1314 LiveOutRegsNumUsages.resize(Blocks.size());
1315 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1316 SIScheduleBlock *Block = Blocks[i];
1317 for (unsigned Reg : Block->getInRegs()) {
1318 bool Found = false;
1319 int topoInd = -1;
1320 for (SIScheduleBlock* Pred: Block->getPreds()) {
1321 std::set<unsigned> PredOutRegs = Pred->getOutRegs();
1322 std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1323
1324 if (RegPos != PredOutRegs.end()) {
1325 Found = true;
1326 if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
1327 topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
1328 }
1329 }
1330 }
1331
1332 if (!Found)
1333 continue;
1334
1335 int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
1336 std::map<unsigned, unsigned>::iterator RegPos =
1337 LiveOutRegsNumUsages[PredID].find(Reg);
1338 if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
1339 ++LiveOutRegsNumUsages[PredID][Reg];
1340 } else {
1341 LiveOutRegsNumUsages[PredID][Reg] = 1;
1342 }
1343 }
1344 }
1345
1346 LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
1347 BlockNumPredsLeft.resize(Blocks.size());
1348 BlockNumSuccsLeft.resize(Blocks.size());
1349
1350 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1351 SIScheduleBlock *Block = Blocks[i];
1352 BlockNumPredsLeft[i] = Block->getPreds().size();
1353 BlockNumSuccsLeft[i] = Block->getSuccs().size();
1354 }
1355
1356#ifndef NDEBUG
1357 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1358 SIScheduleBlock *Block = Blocks[i];
1359 assert(Block->getID() == i);
1360 }
1361#endif
1362
1363 std::set<unsigned> InRegs = DAG->getInRegs();
1364 addLiveRegs(InRegs);
1365
1366 // Fill LiveRegsConsumers for regs that were already
1367 // defined before scheduling.
1368 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1369 SIScheduleBlock *Block = Blocks[i];
1370 for (unsigned Reg : Block->getInRegs()) {
1371 bool Found = false;
1372 for (SIScheduleBlock* Pred: Block->getPreds()) {
1373 std::set<unsigned> PredOutRegs = Pred->getOutRegs();
1374 std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1375
1376 if (RegPos != PredOutRegs.end()) {
1377 Found = true;
1378 break;
1379 }
1380 }
1381
1382 if (!Found) {
1383 if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
1384 LiveRegsConsumers[Reg] = 1;
1385 else
1386 ++LiveRegsConsumers[Reg];
1387 }
1388 }
1389 }
1390
1391 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1392 SIScheduleBlock *Block = Blocks[i];
1393 if (BlockNumPredsLeft[i] == 0) {
1394 ReadyBlocks.push_back(Block);
1395 }
1396 }
1397
1398 while (SIScheduleBlock *Block = pickBlock()) {
1399 BlocksScheduled.push_back(Block);
1400 blockScheduled(Block);
1401 }
1402
1403 DEBUG(
1404 dbgs() << "Block Order:";
1405 for (SIScheduleBlock* Block : BlocksScheduled) {
1406 dbgs() << ' ' << Block->getID();
1407 }
1408 );
1409}
1410
1411bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
1412 SIBlockSchedCandidate &TryCand) {
1413 if (!Cand.isValid()) {
1414 TryCand.Reason = NodeOrder;
1415 return true;
1416 }
1417
1418 // Try to hide high latencies.
1419 if (tryLess(TryCand.LastPosHighLatParentScheduled,
1420 Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
1421 return true;
1422 // Schedule high latencies early so you can hide them better.
1423 if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
1424 TryCand, Cand, Latency))
1425 return true;
1426 if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
1427 TryCand, Cand, Depth))
1428 return true;
1429 if (tryGreater(TryCand.NumHighLatencySuccessors,
1430 Cand.NumHighLatencySuccessors,
1431 TryCand, Cand, Successor))
1432 return true;
1433 return false;
1434}
1435
1436bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
1437 SIBlockSchedCandidate &TryCand) {
1438 if (!Cand.isValid()) {
1439 TryCand.Reason = NodeOrder;
1440 return true;
1441 }
1442
1443 if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
1444 TryCand, Cand, RegUsage))
1445 return true;
1446 if (tryGreater(TryCand.NumSuccessors > 0,
1447 Cand.NumSuccessors > 0,
1448 TryCand, Cand, Successor))
1449 return true;
1450 if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
1451 return true;
1452 if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
1453 TryCand, Cand, RegUsage))
1454 return true;
1455 return false;
1456}
1457
1458SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
1459 SIBlockSchedCandidate Cand;
1460 std::vector<SIScheduleBlock*>::iterator Best;
1461 SIScheduleBlock *Block;
1462 if (ReadyBlocks.empty())
1463 return nullptr;
1464
1465 DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
1466 VregCurrentUsage, SregCurrentUsage);
1467 if (VregCurrentUsage > maxVregUsage)
1468 maxVregUsage = VregCurrentUsage;
1469 if (VregCurrentUsage > maxSregUsage)
1470 maxSregUsage = VregCurrentUsage;
1471 DEBUG(
1472 dbgs() << "Picking New Blocks\n";
1473 dbgs() << "Available: ";
1474 for (SIScheduleBlock* Block : ReadyBlocks)
1475 dbgs() << Block->getID() << ' ';
1476 dbgs() << "\nCurrent Live:\n";
1477 for (unsigned Reg : LiveRegs)
1478 dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
1479 dbgs() << '\n';
1480 dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
1481 dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
1482 );
1483
1484 Cand.Block = nullptr;
1485 for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
1486 E = ReadyBlocks.end(); I != E; ++I) {
1487 SIBlockSchedCandidate TryCand;
1488 TryCand.Block = *I;
1489 TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
1490 TryCand.VGPRUsageDiff =
1491 checkRegUsageImpact(TryCand.Block->getInRegs(),
1492 TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
1493 TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
1494 TryCand.NumHighLatencySuccessors =
1495 TryCand.Block->getNumHighLatencySuccessors();
1496 TryCand.LastPosHighLatParentScheduled =
1497 (unsigned int) std::max<int> (0,
1498 LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
1499 LastPosWaitedHighLatency);
1500 TryCand.Height = TryCand.Block->Height;
1501 // Try not to increase VGPR usage too much, else we may spill.
1502 if (VregCurrentUsage > 120 ||
1503 Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
1504 if (!tryCandidateRegUsage(Cand, TryCand) &&
1505 Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
1506 tryCandidateLatency(Cand, TryCand);
1507 } else {
1508 if (!tryCandidateLatency(Cand, TryCand))
1509 tryCandidateRegUsage(Cand, TryCand);
1510 }
1511 if (TryCand.Reason != NoCand) {
1512 Cand.setBest(TryCand);
1513 Best = I;
1514 DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
1515 << getReasonStr(Cand.Reason) << '\n');
1516 }
1517 }
1518
1519 DEBUG(
1520 dbgs() << "Picking: " << Cand.Block->getID() << '\n';
1521 dbgs() << "Is a block with high latency instruction: "
1522 << (Cand.IsHighLatency ? "yes\n" : "no\n");
1523 dbgs() << "Position of last high latency dependency: "
1524 << Cand.LastPosHighLatParentScheduled << '\n';
1525 dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
1526 dbgs() << '\n';
1527 );
1528
1529 Block = Cand.Block;
1530 ReadyBlocks.erase(Best);
1531 return Block;
1532}
1533
1534// Tracking of currently alive registers to determine VGPR Usage.
1535
1536void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
1537 for (unsigned Reg : Regs) {
1538 // For now only track virtual registers.
1539 if (!TargetRegisterInfo::isVirtualRegister(Reg))
1540 continue;
1541 // If not already in the live set, then add it.
1542 (void) LiveRegs.insert(Reg);
1543 }
1544}
1545
1546void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
1547 std::set<unsigned> &Regs) {
1548 for (unsigned Reg : Regs) {
1549 // For now only track virtual registers.
1550 std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
1551 assert (Pos != LiveRegs.end() && // Reg must be live.
1552 LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
1553 LiveRegsConsumers[Reg] >= 1);
1554 --LiveRegsConsumers[Reg];
1555 if (LiveRegsConsumers[Reg] == 0)
1556 LiveRegs.erase(Pos);
1557 }
1558}
1559
1560void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
1561 for (SIScheduleBlock* Block : Parent->getSuccs()) {
1562 --BlockNumPredsLeft[Block->getID()];
1563 if (BlockNumPredsLeft[Block->getID()] == 0) {
1564 ReadyBlocks.push_back(Block);
1565 }
1566 // TODO: Improve check. When the dependency between the high latency
1567 // instructions and the instructions of the other blocks are WAR or WAW
1568 // there will be no wait triggered. We would like these cases to not
1569 // update LastPosHighLatencyParentScheduled.
1570 if (Parent->isHighLatencyBlock())
1571 LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
1572 }
1573}
1574
1575void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
1576 decreaseLiveRegs(Block, Block->getInRegs());
1577 addLiveRegs(Block->getOutRegs());
1578 releaseBlockSuccs(Block);
1579 for (std::map<unsigned, unsigned>::iterator RegI =
1580 LiveOutRegsNumUsages[Block->getID()].begin(),
1581 E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
1582 std::pair<unsigned, unsigned> RegP = *RegI;
1583 if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
1584 LiveRegsConsumers[RegP.first] = RegP.second;
1585 else {
1586 assert(LiveRegsConsumers[RegP.first] == 0);
1587 LiveRegsConsumers[RegP.first] += RegP.second;
1588 }
1589 }
1590 if (LastPosHighLatencyParentScheduled[Block->getID()] >
1591 (unsigned)LastPosWaitedHighLatency)
1592 LastPosWaitedHighLatency =
1593 LastPosHighLatencyParentScheduled[Block->getID()];
1594 ++NumBlockScheduled;
1595}
1596
1597std::vector<int>
1598SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
1599 std::set<unsigned> &OutRegs) {
1600 std::vector<int> DiffSetPressure;
1601 DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1602
1603 for (unsigned Reg : InRegs) {
1604 // For now only track virtual registers.
1605 if (!TargetRegisterInfo::isVirtualRegister(Reg))
1606 continue;
1607 if (LiveRegsConsumers[Reg] > 1)
1608 continue;
1609 PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1610 for (; PSetI.isValid(); ++PSetI) {
1611 DiffSetPressure[*PSetI] -= PSetI.getWeight();
1612 }
1613 }
1614
1615 for (unsigned Reg : OutRegs) {
1616 // For now only track virtual registers.
1617 if (!TargetRegisterInfo::isVirtualRegister(Reg))
1618 continue;
1619 PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1620 for (; PSetI.isValid(); ++PSetI) {
1621 DiffSetPressure[*PSetI] += PSetI.getWeight();
1622 }
1623 }
1624
1625 return DiffSetPressure;
1626}
1627
1628// SIScheduler //
1629
1630struct SIScheduleBlockResult
1631SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1632 SISchedulerBlockSchedulerVariant ScheduleVariant) {
1633 SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
1634 SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
1635 std::vector<SIScheduleBlock*> ScheduledBlocks;
1636 struct SIScheduleBlockResult Res;
1637
1638 ScheduledBlocks = Scheduler.getBlocks();
1639
1640 for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
1641 SIScheduleBlock *Block = ScheduledBlocks[b];
1642 std::vector<SUnit*> SUs = Block->getScheduledUnits();
1643
1644 for (SUnit* SU : SUs)
1645 Res.SUs.push_back(SU->NodeNum);
1646 }
1647
1648 Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
1649 Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
1650 return Res;
1651}
1652
1653// SIScheduleDAGMI //
1654
1655SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
1656 ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
1657 SITII = static_cast<const SIInstrInfo*>(TII);
1658 SITRI = static_cast<const SIRegisterInfo*>(TRI);
1659
Tom Stellard7c463c92016-08-26 21:16:37 +00001660 VGPRSetID = SITRI->getVGPRPressureSet();
1661 SGPRSetID = SITRI->getSGPRPressureSet();
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001662}
1663
1664SIScheduleDAGMI::~SIScheduleDAGMI() {
1665}
1666
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001667// Code adapted from scheduleDAG.cpp
1668// Does a topological sort over the SUs.
1669// Both TopDown and BottomUp
1670void SIScheduleDAGMI::topologicalSort() {
Tom Stellard1d3940e2016-06-09 23:48:02 +00001671 Topo.InitDAGTopologicalSorting();
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001672
Tom Stellard1d3940e2016-06-09 23:48:02 +00001673 TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
1674 BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001675}
1676
1677// Move low latencies further from their user without
1678// increasing SGPR usage (in general)
1679// This is to be replaced by a better pass that would
1680// take into account SGPR usage (based on VGPR Usage
1681// and the corresponding wavefront count), that would
1682// try to merge groups of loads if it make sense, etc
1683void SIScheduleDAGMI::moveLowLatencies() {
1684 unsigned DAGSize = SUnits.size();
1685 int LastLowLatencyUser = -1;
1686 int LastLowLatencyPos = -1;
1687
1688 for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
1689 SUnit *SU = &SUnits[ScheduledSUnits[i]];
1690 bool IsLowLatencyUser = false;
1691 unsigned MinPos = 0;
1692
1693 for (SDep& PredDep : SU->Preds) {
1694 SUnit *Pred = PredDep.getSUnit();
Duncan P. N. Exon Smith9cfc75c2016-06-30 00:01:54 +00001695 if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001696 IsLowLatencyUser = true;
1697 }
1698 if (Pred->NodeNum >= DAGSize)
1699 continue;
1700 unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
1701 if (PredPos >= MinPos)
1702 MinPos = PredPos + 1;
1703 }
1704
Duncan P. N. Exon Smith9cfc75c2016-06-30 00:01:54 +00001705 if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001706 unsigned BestPos = LastLowLatencyUser + 1;
1707 if ((int)BestPos <= LastLowLatencyPos)
1708 BestPos = LastLowLatencyPos + 1;
1709 if (BestPos < MinPos)
1710 BestPos = MinPos;
1711 if (BestPos < i) {
1712 for (unsigned u = i; u > BestPos; --u) {
1713 ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1714 ScheduledSUnits[u] = ScheduledSUnits[u-1];
1715 }
1716 ScheduledSUnits[BestPos] = SU->NodeNum;
1717 ScheduledSUnitsInv[SU->NodeNum] = BestPos;
1718 }
1719 LastLowLatencyPos = BestPos;
1720 if (IsLowLatencyUser)
1721 LastLowLatencyUser = BestPos;
1722 } else if (IsLowLatencyUser) {
1723 LastLowLatencyUser = i;
1724 // Moves COPY instructions on which depends
1725 // the low latency instructions too.
1726 } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
1727 bool CopyForLowLat = false;
1728 for (SDep& SuccDep : SU->Succs) {
1729 SUnit *Succ = SuccDep.getSUnit();
Duncan P. N. Exon Smith9cfc75c2016-06-30 00:01:54 +00001730 if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001731 CopyForLowLat = true;
1732 }
1733 }
1734 if (!CopyForLowLat)
1735 continue;
1736 if (MinPos < i) {
1737 for (unsigned u = i; u > MinPos; --u) {
1738 ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1739 ScheduledSUnits[u] = ScheduledSUnits[u-1];
1740 }
1741 ScheduledSUnits[MinPos] = SU->NodeNum;
1742 ScheduledSUnitsInv[SU->NodeNum] = MinPos;
1743 }
1744 }
1745 }
1746}
1747
1748void SIScheduleDAGMI::restoreSULinksLeft() {
1749 for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1750 SUnits[i].isScheduled = false;
1751 SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1752 SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1753 SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1754 SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1755 }
1756}
1757
1758// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1759template<typename _Iterator> void
1760SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
1761 unsigned &VgprUsage, unsigned &SgprUsage) {
1762 VgprUsage = 0;
1763 SgprUsage = 0;
1764 for (_Iterator RegI = First; RegI != End; ++RegI) {
1765 unsigned Reg = *RegI;
1766 // For now only track virtual registers
1767 if (!TargetRegisterInfo::isVirtualRegister(Reg))
1768 continue;
1769 PSetIterator PSetI = MRI.getPressureSets(Reg);
1770 for (; PSetI.isValid(); ++PSetI) {
1771 if (*PSetI == VGPRSetID)
1772 VgprUsage += PSetI.getWeight();
1773 else if (*PSetI == SGPRSetID)
1774 SgprUsage += PSetI.getWeight();
1775 }
1776 }
1777}
1778
1779void SIScheduleDAGMI::schedule()
1780{
1781 SmallVector<SUnit*, 8> TopRoots, BotRoots;
1782 SIScheduleBlockResult Best, Temp;
1783 DEBUG(dbgs() << "Preparing Scheduling\n");
1784
1785 buildDAGWithRegPressure();
1786 DEBUG(
1787 for(SUnit& SU : SUnits)
1788 SU.dumpAll(this)
1789 );
1790
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001791 topologicalSort();
1792 findRootsAndBiasEdges(TopRoots, BotRoots);
1793 // We reuse several ScheduleDAGMI and ScheduleDAGMILive
1794 // functions, but to make them happy we must initialize
1795 // the default Scheduler implementation (even if we do not
1796 // run it)
1797 SchedImpl->initialize(this);
1798 initQueues(TopRoots, BotRoots);
1799
1800 // Fill some stats to help scheduling.
1801
1802 SUnitsLinksBackup = SUnits;
1803 IsLowLatencySU.clear();
1804 LowLatencyOffset.clear();
1805 IsHighLatencySU.clear();
1806
1807 IsLowLatencySU.resize(SUnits.size(), 0);
1808 LowLatencyOffset.resize(SUnits.size(), 0);
1809 IsHighLatencySU.resize(SUnits.size(), 0);
1810
1811 for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
1812 SUnit *SU = &SUnits[i];
Chad Rosierc27a18f2016-03-09 16:00:35 +00001813 unsigned BaseLatReg;
1814 int64_t OffLatReg;
Duncan P. N. Exon Smith9cfc75c2016-06-30 00:01:54 +00001815 if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001816 IsLowLatencySU[i] = 1;
Duncan P. N. Exon Smith9cfc75c2016-06-30 00:01:54 +00001817 if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
1818 TRI))
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001819 LowLatencyOffset[i] = OffLatReg;
Duncan P. N. Exon Smith9cfc75c2016-06-30 00:01:54 +00001820 } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001821 IsHighLatencySU[i] = 1;
1822 }
1823
1824 SIScheduler Scheduler(this);
1825 Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
1826 SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
Matt Arsenault105c2a22016-07-01 18:03:46 +00001827
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001828 // if VGPR usage is extremely high, try other good performing variants
1829 // which could lead to lower VGPR usage
1830 if (Best.MaxVGPRUsage > 180) {
1831 std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
1832 { LatenciesAlone, BlockRegUsageLatency },
1833// { LatenciesAlone, BlockRegUsage },
1834 { LatenciesGrouped, BlockLatencyRegUsage },
1835// { LatenciesGrouped, BlockRegUsageLatency },
1836// { LatenciesGrouped, BlockRegUsage },
1837 { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1838// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1839// { LatenciesAlonePlusConsecutive, BlockRegUsage }
1840 };
1841 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1842 Temp = Scheduler.scheduleVariant(v.first, v.second);
1843 if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
1844 Best = Temp;
1845 }
1846 }
1847 // if VGPR usage is still extremely high, we may spill. Try other variants
1848 // which are less performing, but that could lead to lower VGPR usage.
1849 if (Best.MaxVGPRUsage > 200) {
1850 std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
1851// { LatenciesAlone, BlockRegUsageLatency },
1852 { LatenciesAlone, BlockRegUsage },
1853// { LatenciesGrouped, BlockLatencyRegUsage },
1854 { LatenciesGrouped, BlockRegUsageLatency },
1855 { LatenciesGrouped, BlockRegUsage },
1856// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1857 { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1858 { LatenciesAlonePlusConsecutive, BlockRegUsage }
1859 };
1860 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1861 Temp = Scheduler.scheduleVariant(v.first, v.second);
1862 if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
1863 Best = Temp;
1864 }
1865 }
Matt Arsenault105c2a22016-07-01 18:03:46 +00001866
Nicolai Haehnle02c32912016-01-13 16:10:10 +00001867 ScheduledSUnits = Best.SUs;
1868 ScheduledSUnitsInv.resize(SUnits.size());
1869
1870 for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
1871 ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
1872 }
1873
1874 moveLowLatencies();
1875
1876 // Tell the outside world about the result of the scheduling.
1877
1878 assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
1879 TopRPTracker.setPos(CurrentTop);
1880
1881 for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
1882 E = ScheduledSUnits.end(); I != E; ++I) {
1883 SUnit *SU = &SUnits[*I];
1884
1885 scheduleMI(SU, true);
1886
1887 DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
1888 << *SU->getInstr());
1889 }
1890
1891 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
1892
1893 placeDebugValues();
1894
1895 DEBUG({
1896 unsigned BBNum = begin()->getParent()->getNumber();
1897 dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
1898 dumpSchedule();
1899 dbgs() << '\n';
1900 });
1901}