//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIMachineScheduler.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "misched"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact a lot of instructions in the
// shader have few dependencies) make the generic scheduler have some
// unpredictable behaviours. For example when register pressure becomes high,
// it can either manage to prevent register pressure from going too high, or
// it can increase register pressure even more than if it hadn't taken
// register pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in VGPR you won't need until the end of the shader.
//
// The scheduling problem for SI can distinguish three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps keep register usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At that step we take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen, there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency to get the cache (or global mem) data into the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if the wavefront count is
// high, we still have to try to have as many instructions hiding high
// latencies as possible.
// The OpenCL doc gives for example a latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64-byte
// lines.
// Constant cache is shared between 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated with the 3 other
// VALUs of the CU do texture sampling too. (Don't take these figures too
// seriously, as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared between 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Units".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specifically take that into account when
// scheduling, as we expect the driver to be able to preload the constants
// soon.

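// As a back-of-the-envelope check of the figures above, the helper below
// (an illustrative sketch only: it is not referenced by the scheduler, and
// the 4-cycles-per-instruction issue rate is an assumption taken from the
// comment) tests whether a given latency is covered by the available
// independent work. With 10 wavefronts each providing 10 independent
// instructions, a 400-cycle global memory access is exactly hidden.
static inline bool latencyIsHidden(unsigned LatencyCycles,
                                   unsigned WavefrontCount,
                                   unsigned IndepInstsPerWavefront) {
  // Each wavefront issues one instruction every 4 cycles on a SIMD, so the
  // independent work covers WavefrontCount * IndepInsts * 4 cycles.
  return WavefrontCount * IndepInstsPerWavefront * 4 >= LatencyCycles;
}
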
// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:    return "NOCAND";
  case RegUsage:  return "REGUSAGE";
  case Latency:   return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth:     return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

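// Compare one criterion of two candidates. A strict win for TryCand records
// Reason on it and returns true; a strict win for Cand also returns true
// (the comparison is decided) while keeping the strongest reason seen so far
// on Cand; a tie records a repeat of this criterion and returns false so
// that the next criterion can decide.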
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as close to the top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (eventually some more low latency instructions)
  // - instructions that depend on the first low latency instructions.
  // If the block contains a lot of constant loads, the SGPR usage could go
  // quite high; the arbitrary limit of 60 above encourages using the already
  // loaded constants (in order to release some SGPRs) before loading more.
  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
              Cand.HasLowLatencyNonWaitedParent,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                 TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}

// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}
// Returns true if the register was set between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situation 4, 5, 6, 7
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7
  // Since we made the MIs of a block be packed all together before
  // scheduling, the LiveIntervals are correct, and the RPTracker is
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker not to make mistakes for
  // case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
  // The use of findDefBetween removes the case 4.
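  // Concrete example with hypothetical vregs: in a block containing
  //   %b = op %a      (%a defined by an earlier block and also read later)
  //   %c = op %b
  // where %c is read by a later block and %b only locally: %a is case 4
  // and must stay out of LiveOutRegs (isDefBetween filters it out, since
  // the block never defines it), while %c is case 1 and must be kept.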
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally LLVM will represent some registers as big 128-bit registers
  // for example, but they actually correspond to 4 actual 32-bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
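  // For instance (assumed sizes): two live 32-bit VGPRs plus one live
  // 128-bit tuple contribute 2 + 4 = 6 units of VGPR set pressure, not 3.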
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    SuccSU->dump(DAG);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block
/// (InOrOutBlock = true) or those outside of it (InOrOutBlock = false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger the wait, so the other pending
  // instructions do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                               SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(std::make_pair(Succ, Kind));

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  // Same output whether scheduled or not: SUnits are kept in node order.
  for (SUnit* SU : SUnits) {
    SU->dump(DAG);
  }

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
DAG(DAG) {
}

SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  } else {
    return B->second;
  }
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  unsigned Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      bool CompatibleGroup = true;
      unsigned ProposedColor = Color;
      for (unsigned j : FormingGroup) {
        // TODO: Currently CompatibleGroup will always be false,
        // because the graph enforces the load order. This
        // can be fixed, but as keeping the load order is often
        // good for performance, fixing it causes a performance hit (for
        // both the default scheduler and this scheduler).
        // When this scheduler determines a good load order,
        // this can be fixed.
        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
            !DAG->canAddEdge(&DAG->SUnits[j], SU))
          CompatibleGroup = false;
      }
      if (!CompatibleGroup || ++Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        if (!CompatibleGroup) {
          ProposedColor = Color;
          FormingGroup.insert(SU->NodeNum);
        }
        Count = 0;
      } else {
        FormingGroup.insert(SU->NodeNum);
      }
      CurrentColoring[SU->NodeNum] = ProposedColor;
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
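
// A minimal sketch of the color-combination idiom used above and below
// (illustrative only, not called by the scheduler): every distinct set of
// parent colors is mapped to a fresh color, so two nodes end up with the
// same color exactly when they depend on the same combination of reserved
// (high latency) results.
static unsigned colorForCombination(
    std::map<std::set<unsigned>, unsigned> &Combinations,
    const std::set<unsigned> &ParentColors, unsigned &NextID) {
  std::map<std::set<unsigned>, unsigned>::iterator Pos =
    Combinations.find(ParentColors);
  if (Pos != Combinations.end())
    return Pos->second; // Combination already seen: reuse its color.
  Combinations[ParentColors] = NextID; // New combination: allocate a color.
  return NextID++;
}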

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down and bottom up
  // Reserved node dependency colorings gets its own color.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}

void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links that a previous scheduling variant may have overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();

  // Put SUs of same color into same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free roots and leaves of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  DEBUG(
    dbgs() << "Blocks created:\n\n";
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

// Helper taken from CodeGen/MachineScheduler.cpp: non-const version of
// nextIfDebug.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugValue())
      break;
  }
  return I;
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

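  // Kahn's algorithm seeded from the sink blocks: TopDownBlock2Index is
  // temporarily used as a count of unprocessed successors, and a block is
  // pushed on the worklist once all of its successors have been assigned
  // an index.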
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // First produce a valid (not yet final) schedule, so that each Block
  // corresponds to a contiguous range of instructions.
  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  DEBUG(
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + 1)
          Depth = Pred->Depth + 1;
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + 1);
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have assigned the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  DEBUG(
    dbgs() << "Block Order:";
    for (SIScheduleBlock* Block : BlocksScheduled) {
      dbgs() << ' ' << Block->getID();
    }
    dbgs() << '\n';
  );
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (tryLess(TryCand.LastPosHighLatParentScheduled,
              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                 TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
                                          TryCand, Cand, Depth))
    return true;
  if (tryGreater(TryCand.NumHighLatencySuccessors,
                 Cand.NumHighLatencySuccessors,
                 TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
              TryCand, Cand, RegUsage))
    return true;
  if (tryGreater(TryCand.NumSuccessors > 0,
                 Cand.NumSuccessors > 0,
                 TryCand, Cand, Successor))
    return true;
  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
              TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  DEBUG(
    dbgs() << "Picking New Blocks\n";
    dbgs() << "Available: ";
    for (SIScheduleBlock* Block : ReadyBlocks)
      dbgs() << Block->getID() << ' ';
    dbgs() << "\nCurrent Live:\n";
    for (unsigned Reg : LiveRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    dbgs() << '\n';
    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
  );

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
        LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
          LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                   << getReasonStr(Cand.Reason) << '\n');
    }
  }

  DEBUG(
    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
    dbgs() << "Is a block with high latency instruction: "
           << (Cand.IsHighLatency ? "yes\n" : "no\n");
    dbgs() << "Position of last high latency dependency: "
           << Cand.LastPosHighLatParentScheduled << '\n';
    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
    dbgs() << '\n';
  );

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}
1559
// Track the currently live registers to determine VGPR usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

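// Retire the input registers of a just-scheduled block: decrement each
// register's consumer count, and drop the register from the live set once
// its last consumer has been scheduled.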
void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert(Pos != LiveRegs.end() && // Reg must be live.
           LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
           LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

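// Release the successors of a scheduled block: a successor becomes ready
// once all of its predecessor blocks have been scheduled. Data successors
// of a high-latency block also record the position at which that parent
// was scheduled, which feeds the latency heuristic in pickBlock.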
void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
  }
}

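// Update the scheduler state once Block has been scheduled: retire its
// input registers, make its outputs live with their consumer counts,
// release its successors, and advance the high-latency wait position.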
void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (std::map<unsigned, unsigned>::iterator RegI =
       LiveOutRegsNumUsages[Block->getID()].begin(),
       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
    std::pair<unsigned, unsigned> RegP = *RegI;
    // We produce this register, thus it must not be previously alive.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
      (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

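// Estimate, per register pressure set, how scheduling a block with the
// given input and output registers would change pressure: an input whose
// last consumer this block is frees its registers (negative delta), while
// each output adds its weight (positive delta). For example, assuming unit
// pressure weights, a block consuming the last use of one 32-bit VGPR and
// defining two would yield a VGPR-set delta of +1.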
std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (unsigned Reg : InRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (unsigned Reg : OutRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

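// Run one complete block-scheduling pass: cut the DAG into blocks with the
// requested creator variant, order the blocks with the requested scheduler
// variant, and return the flattened SU order along with the peak SGPR and
// VGPR usage observed.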
struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    SIScheduleBlock *Block = ScheduledBlocks[b];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  VGPRSetID = SITRI->getVGPRPressureSet();
  SGPRSetID = SITRI->getSGPRPressureSet();
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from scheduleDAG.cpp.
// Does a topological sort over the SUs.
// Both top-down and bottom-up orderings are computed.
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low-latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would take SGPR usage
// into account (based on VGPR usage and the corresponding wavefront
// count), and that would try to merge groups of loads when it makes
// sense, etc.
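// For example, given the order [a, b, load, use(load)] where the load does
// not depend on a or b, the load is hoisted just after the last user of a
// previous low-latency instruction (here the front), giving its latency
// two instructions of slack: [load, a, b, use(load)].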
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions on which the low-latency
    // instructions depend.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

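// The block-scheduling variants tried in schedule() each consume the
// NumPredsLeft/NumSuccsLeft counters of the SUs; restore them (and the
// scheduled flags) from the backup taken before the first run, so that
// another variant can be tried.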
void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the VGPR and SGPR usage corresponding to some virtual registers.
template<typename Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(Iterator First, Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (Iterator RegI = First; RegI != End; ++RegI) {
    unsigned Reg = *RegI;
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == VGPRSetID)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == SGPRSetID)
        SgprUsage += PSetI.getWeight();
    }
  }
}

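// Entry point for scheduling one region:
// 1. Build the DAG with register pressure information.
// 2. Topologically sort the SUs and record low/high latency properties.
// 3. Schedule with the default variant; if the resulting VGPR usage is
//    very high, retry with variants increasingly biased toward register
//    usage, keeping the best result.
// 4. Hoist low-latency instructions, then emit the final order.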
void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  DEBUG(
    for (SUnit &SU : SUnits)
      SU.dumpAll(this)
  );

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    unsigned BaseLatReg;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
                                       TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // If VGPR usage is extremely high, try other well-performing variants
  // that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // that perform worse, but that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                 << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
    unsigned BBNum = begin()->getParent()->getNumber();
    dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}