//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------*- C++ -*--===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"

using namespace llvm;

#define DEBUG_TYPE "misched"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact a lot of instructions in the
// shader have few dependencies) make the generic scheduler have some
// unpredictable behaviours. For example when register pressure becomes high,
// it can either manage to prevent register pressure from going too high, or
// it can increase register pressure even more than if it hadn't taken
// register pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in VGPR you won't need until the end of the shader.
//
// The scheduling problem for SI can distinguish three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps control keeping register
// usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At that step we take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency for the cache (or global mem) data to get to the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if wavefront count is high, we
// have to try to have as many instructions hiding high latencies as possible.
// The OpenCL doc says for example latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10
// (each of the 10 wavefronts contributes one independent instruction to
// execute while waiting).

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specifically take that into account when
// scheduling, as we expect the driver to be able to preload the constants
// soon.
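//
// As a rough illustration of the intended decomposition (a made-up
// sequence, not the output of any real compile):
//   s0 = S_BUFFER_LOAD ...      ; low latency
//   v0 = IMAGE_SAMPLE ...       ; high latency
//   v1 = V_MAD_F32 s0, ...      ; local computation, consumed right away
//   v2 = V_ADD_F32 v1, v0       ; needs the high latency result
// A block would typically contain the s0/v1 chain (local work whose only
// latencies are low ones), so that block ordering can place it between
// the IMAGE_SAMPLE and its user v2 to hide the sampling latency.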


// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:    return "NOCAND";
  case RegUsage:  return "REGUSAGE";
  case Latency:   return "LATENCY";
  case Successor: return "SUCCESSOR";
  case Depth:     return "DEPTH";
  case NodeOrder: return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif
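
// tryLess/tryGreater below mirror GenericScheduler's helpers of the same
// name: they return true once the comparison has picked a winner (setting
// TryCand.Reason when TryCand wins, or weakening Cand.Reason when it loses),
// and false on a tie, so that the caller falls through to the next
// criterion.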

static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG

void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as close to the top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (eventually some more low latency instructions)
  // - instructions that depend on the first low latency instructions.
  // If there are a lot of constant loads in the block, the SGPR usage
  // could go quite high; the arbitrary limit of 60 above thus encourages
  // using the already loaded constants (in order to release some SGPRs)
  // before loading more.
  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
              Cand.HasLowLatencyNonWaitedParent,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                 TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
              TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}
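
// Note: each pick scans every ready SU and queries the downward pressure
// for each one, so a pick costs one pressure query per ready SU; the SU
// kept is the last one to win a tryCandidateTopDown comparison.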

// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns if the register was set between first and last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7.
  // For example a register written by this block and read both here and in a
  // later block is case 3 or 7 and must be a live-out, while one whose last
  // read is inside the block (case 2 or 6) must not.
  // Since we made the MIs of a block to be packed all together before
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker to not make mistakes for
  // case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of isDefBetween removes case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bit registers
  // for example, but they actually correspond to 4 actual 32 bit registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    SuccSU->dump(DAG);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block when InOrOutBlock
/// is true, and those outside the block when it is false.
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert (!SU->NumPredsLeft);
  std::vector<SUnit*>::iterator I =
    std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait
  // either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the
  // block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

#ifndef NDEBUG
  for (SIScheduleBlock* S : Succs) {
    if (PredID == S->getID())
      assert(!"Loop in the Block Graph!\n");
  }
#endif
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
  unsigned SuccID = Succ->getID();

  // Check if not already successor.
  for (SIScheduleBlock* S : Succs) {
    if (SuccID == S->getID())
      return;
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(Succ);
#ifndef NDEBUG
  for (SIScheduleBlock* P : Preds) {
    if (SuccID == P->getID())
      assert(!"Loop in the Block Graph!\n");
  }
#endif
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (SIScheduleBlock* S : Succs) {
    S->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  // Scheduled or not, the units are printed in SUnits order.
  for (SUnit* SU : SUnits) {
    SU->dump(DAG);
  }

  dbgs() << "///////////////////////\n";
}

#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
DAG(DAG) {
}

SIScheduleBlockCreator::~SIScheduleBlockCreator() {
}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  } else {
    return B->second;
  }
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  unsigned Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      bool CompatibleGroup = true;
      unsigned ProposedColor = Color;
      for (unsigned j : FormingGroup) {
        // TODO: Currently CompatibleGroup will always be false,
        // because the graph enforces the load order. This
        // can be fixed, but as keeping the load order is often
        // good for performance that causes a performance hit (both
        // the default scheduler and this scheduler).
        // When this scheduler determines a good load order,
        // this can be fixed.
        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
            !DAG->canAddEdge(&DAG->SUnits[j], SU))
          CompatibleGroup = false;
      }
      if (!CompatibleGroup || ++Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        if (!CompatibleGroup) {
          ProposedColor = Color;
          FormingGroup.insert(SU->NodeNum);
        }
        Count = 0;
      } else {
        FormingGroup.insert(SU->NodeNum);
      }
      CurrentColoring[SU->NodeNum] = ProposedColor;
    }
  }
}
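
// For example (thresholds from the code above, grouping itself
// illustrative): with 10 high latency SUs, GroupSize is 3, so the pass
// aims at coloring them {A,A,A},{B,B,B},{C,C,C},{D}, letting up to three
// fetches issue back-to-back before any of their results is waited on.
// (In practice the TODO above means the groups currently degenerate to
// near-singletons.)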

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as its parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as its children.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
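
// Illustration with made-up color IDs: if SU x transitively depends only on
// the high latency group colored 1 while SU y depends on groups 1 and 2, x
// inherits color 1 and y gets a fresh color standing for the combination
// {1, 2}; any later SU depending on exactly {1, 2} reuses y's color.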

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every different combination of colors given by the top-down and
  // bottom-up reserved-node dependency colorings gets its own color.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
    if (Pos != ColorCount.end()) {
      ++ColorCount[color];
    } else {
      ColorCount[color] = 1;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}
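
// SUs without any successor in the DAG are typically exports or stores;
// giving them one common group avoids creating many tiny blocks for them.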

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();

  // Put SUs of same color into same block.
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(
        make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free roots and leaves of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  DEBUG(
    dbgs() << "Blocks created:\n\n";
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

// Two functions taken from Codegen/MachineScheduler.cpp

/// If this iterator is a debug value, increment until reaching the End or a
/// non-debug instruction.
static MachineBasicBlock::const_iterator
nextIfDebug(MachineBasicBlock::const_iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugValue())
      break;
  }
  return I;
}

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  // Cast the return value to nonconst MachineInstr, then cast to an
  // instr_iterator, which does not check for null, finally return a
  // bundle_iterator.
  return MachineBasicBlock::instr_iterator(
    const_cast<MachineInstr*>(
      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
}

void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
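
// Note: TopDownBlock2Index is used first as an out-degree counter and is
// then overwritten in place with the final topological index, which is why
// the sort needs no separate degree array.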

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We first schedule a valid ordering, such that afterwards each Block
  // corresponds to a contiguous range of instructions.
  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  DEBUG(
    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
      SIScheduleBlock *Block = CurrentBlocks[i];
      Block->printDebug(true);
    }
  );
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().size() == 0)
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + 1)
          Depth = Pred->Depth + 1;
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().size() == 0)
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (SIScheduleBlock *Succ : Block->getSuccs()) {
        if (Height < Succ->Height + 1)
          Height = Succ->Height + 1;
      }
      Block->Height = Height;
    }
  }
}
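
// Note: Depth and Height count blocks along the longest dependency chain
// (a critical path measured in blocks), not latency cycles.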

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      std::map<unsigned, unsigned>::iterator RegPos =
        LiveOutRegsNumUsages[PredID].find(Reg);
      if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
        ++LiveOutRegsNumUsages[PredID][Reg];
      } else {
        LiveOutRegsNumUsages[PredID][Reg] = 1;
      }
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found) {
        if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
          LiveRegsConsumers[Reg] = 1;
        else
          ++LiveRegsConsumers[Reg];
      }
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  DEBUG(
    dbgs() << "Block Order:";
    for (SIScheduleBlock* Block : BlocksScheduled) {
      dbgs() << ' ' << Block->getID();
    }
  );
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (tryLess(TryCand.LastPosHighLatParentScheduled,
              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                 TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
                                          TryCand, Cand, Depth))
    return true;
  if (tryGreater(TryCand.NumHighLatencySuccessors,
                 Cand.NumHighLatencySuccessors,
                 TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
              TryCand, Cand, RegUsage))
    return true;
  if (tryGreater(TryCand.NumSuccessors > 0,
                 Cand.NumSuccessors > 0,
                 TryCand, Cand, Successor))
    return true;
  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
              TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  DEBUG(
    dbgs() << "Picking New Blocks\n";
    dbgs() << "Available: ";
    for (SIScheduleBlock* Block : ReadyBlocks)
      dbgs() << Block->getID() << ' ';
    dbgs() << "\nCurrent Live:\n";
    for (unsigned Reg : LiveRegs)
      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    dbgs() << '\n';
    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
  );

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                   << getReasonStr(Cand.Reason) << '\n');
    }
  }

  DEBUG(
    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
    dbgs() << "Is a block with high latency instruction: "
           << (Cand.IsHighLatency ? "yes\n" : "no\n");
    dbgs() << "Position of last high latency dependency: "
           << Cand.LastPosHighLatParentScheduled << '\n';
    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
    dbgs() << '\n';
  );

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}

// Tracking of currently alive registers to determine VGPR Usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert (Pos != LiveRegs.end() && // Reg must be live.
            LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
            LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}
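
// LiveRegsConsumers thus acts as a per-register reference count: it is
// seeded with the number of reading blocks and decremented by each
// consuming block; the register leaves LiveRegs when the count reaches 0.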

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (SIScheduleBlock* Block : Parent->getSuccs()) {
    --BlockNumPredsLeft[Block->getID()];
    if (BlockNumPredsLeft[Block->getID()] == 0) {
      ReadyBlocks.push_back(Block);
    }
    // TODO: Improve check. When the dependency between the high latency
    // instructions and the instructions of the other blocks are WAR or WAW
    // there will be no wait triggered. We would like these cases to not
    // update LastPosHighLatencyParentScheduled.
    if (Parent->isHighLatencyBlock())
      LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
  }
}

void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (std::map<unsigned, unsigned>::iterator RegI =
       LiveOutRegsNumUsages[Block->getID()].begin(),
       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
    std::pair<unsigned, unsigned> RegP = *RegI;
    if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
      LiveRegsConsumers[RegP.first] = RegP.second;
    else {
      assert(LiveRegsConsumers[RegP.first] == 0);
      LiveRegsConsumers[RegP.first] += RegP.second;
    }
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
      (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (unsigned Reg : InRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (unsigned Reg : OutRegs) {
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}
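
// Worked example with made-up weights: a block consuming the last use of a
// 128-bit VGPR tuple (pressure weight 4) while defining one 32-bit VGPR
// (weight 1) yields DiffSetPressure[VGPR set] = -4 + 1 = -3, i.e. picking
// it is expected to lower VGPR pressure.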

// SIScheduler //

struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    SIScheduleBlock *Block = ScheduledBlocks[b];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}
1656
// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  VGPRSetID = SITRI->getVGPR32PressureSet();
  SGPRSetID = SITRI->getSGPR32PressureSet();
}

SIScheduleDAGMI::~SIScheduleDAGMI() {
}

ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

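// Note: this factory is meant to be referenced by a MachineSchedRegistry
// entry outside this file; a sketch of what that registration looks like
// (at the time of writing it lives in AMDGPUTargetMachine.cpp):
//   static MachineSchedRegistry
//   SISchedRegistry("si", "Run SI's custom scheduler",
//                   createSIMachineScheduler);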
// Code adapted from scheduleDAG.cpp.
// Does a topological sort over the SUs and builds both the top-down
// and bottom-up orderings.
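// As a small illustration on a hypothetical DAG: for the dependence edges
// A -> B, A -> C, B -> D, C -> D, one valid top-down order is A, B, C, D;
// TopDownIndex2SU stores the NodeNums in schedule order, while
// TopDownSU2Index maps each NodeNum back to its position in that order.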
void SIScheduleDAGMI::topologicalSort() {
  std::vector<int> TopDownSU2Index;
  unsigned DAGSize = SUnits.size();
  std::vector<SUnit*> WorkList;

  DEBUG(dbgs() << "Topological Sort\n");
  WorkList.reserve(DAGSize);

  TopDownIndex2SU.resize(DAGSize);
  TopDownSU2Index.resize(DAGSize);
  BottomUpIndex2SU.resize(DAGSize);

  // TopDownSU2Index temporarily holds the number of unnumbered successors
  // of each node; a node becomes ready once that count drops to zero.
  WorkList.push_back(&getExitSU());
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &SUnits[i];
    int NodeNum = SU->NodeNum;
    unsigned Degree = SU->Succs.size();
    TopDownSU2Index[NodeNum] = Degree;
    if (Degree == 0) {
      assert(SU->Succs.empty() && "SUnit should have no successors");
      WorkList.push_back(SU);
    }
  }

  // Number the nodes from DAGSize - 1 down to 0, so that every node gets
  // a smaller index than all of its successors.
  int Id = DAGSize;
  while (!WorkList.empty()) {
    SUnit *SU = WorkList.back();
    WorkList.pop_back();
    if (SU->NodeNum < DAGSize) {
      TopDownSU2Index[SU->NodeNum] = --Id;
      TopDownIndex2SU[Id] = SU->NodeNum;
    }
    for (SDep& Pred : SU->Preds) {
      SUnit *PredSU = Pred.getSUnit();
      if (PredSU->NodeNum < DAGSize && !--TopDownSU2Index[PredSU->NodeNum])
        WorkList.push_back(PredSU);
    }
  }

  // The bottom-up ordering is simply the reverse of the top-down one.
  BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(),
                                      TopDownIndex2SU.rend());

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &SUnits[i];
    for (SDep& Pred : SU->Preds) {
      if (Pred.getSUnit()->NodeNum >= DAGSize)
        continue;
      assert(TopDownSU2Index[SU->NodeNum] >
             TopDownSU2Index[Pred.getSUnit()->NodeNum] &&
             "Wrong Top Down topological sorting");
    }
  }
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &SUnits[i];
    for (SDep& Succ : SU->Succs) {
      if (Succ.getSUnit()->NodeNum >= DAGSize)
        continue;
      assert(TopDownSU2Index[SU->NodeNum] <
             TopDownSU2Index[Succ.getSUnit()->NodeNum] &&
             "Wrong Bottom Up topological sorting");
    }
  }
#endif
}

// Move low latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would take into account
// SGPR usage (based on VGPR usage and the corresponding wavefront count),
// and that would try to merge groups of loads when it makes sense, etc.
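// A simplified illustration on a hypothetical schedule:
//   0: s_load (low latency)    1: v_mul (uses 0)
//   2: v_add                   3: s_load (low latency, independent)
// The second s_load is hoisted to position 2 (right after the last low
// latency user), so that v_add now executes between that load and its
// future users, helping to hide the load's latency.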
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    // MinPos is the earliest position that respects SU's predecessors.
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        // Shift the intervening instructions down by one and move SU up
        // to BestPos.
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also move the COPY instructions that the low latency instructions
    // depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SITII->isLowLatencyInstruction(Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}

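// Scheduling a variant consumes the NumPredsLeft/NumSuccsLeft counters of
// the SUnits; restoring them from the backup taken in schedule() allows
// another variant to be evaluated on the same DAG.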
void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}

// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
template<typename Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(Iterator First, Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (Iterator RegI = First; RegI != End; ++RegI) {
    unsigned Reg = *RegI;
    // For now only track virtual registers.
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == VGPRSetID)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == SGPRSetID)
        SgprUsage += PSetI.getWeight();
    }
  }
}

void SIScheduleDAGMI::schedule() {
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  DEBUG(
    for (SUnit& SU : SUnits)
      SU.dumpAll(this)
  );

  Topo.InitDAGTopologicalSorting();
  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    unsigned BaseLatReg;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg,
                                       OffLatReg, TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

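// The offsets recorded above give the block creation heuristics a way to
// compare the addresses of low latency loads (e.g. to spot consecutive
// accesses).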
  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
#if 0 // To enable when handleMove fix lands
  // If VGPR usage is extremely high, try other well-performing variants
  // which could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other,
  // lower-performing variants that could reduce VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
#endif
  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                 << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  DEBUG({
    unsigned BBNum = begin()->getParent()->getNumber();
    dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}