Blame - llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp - toolchain/llvm-project

blob: 8de1c4a51e2d47a878b2c959fbc880035a62c0ce [file] [log] [blame]

Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1	//===-- SIMachineScheduler.cpp - SI Scheduler Interface -- C++ ------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	/// \file
				11	/// \brief SI Machine Scheduler interface
				12	//
				13	//===----------------------------------------------------------------------===//
				14
Matt Arsenault	43e92fe	2016-06-24 06:30:11 +0000	[diff] [blame]	15	#include "AMDGPU.h"
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	16	#include "SIMachineScheduler.h"
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	17	#include "llvm/CodeGen/LiveInterval.h"
				18	#include "llvm/CodeGen/LiveIntervalAnalysis.h"
				19	#include "llvm/CodeGen/MachineRegisterInfo.h"
				20	#include "llvm/CodeGen/MachineScheduler.h"
				21	#include "llvm/CodeGen/RegisterPressure.h"
				22
				23	using namespace llvm;
				24
				25	#define DEBUG_TYPE "misched"
				26
				27	// This scheduler implements a different scheduling algorithm than
				28	// GenericScheduler.
				29	//
				30	// There are several specific architecture behaviours that can't be modelled
				31	// for GenericScheduler:
				32	// . When accessing the result of an SGPR load instruction, you have to wait
				33	// for all the SGPR load instructions before your current instruction to
				34	// have finished.
				35	// . When accessing the result of an VGPR load instruction, you have to wait
				36	// for all the VGPR load instructions previous to the VGPR load instruction
				37	// you are interested in to finish.
				38	// . The less the register pressure, the best load latencies are hidden
				39	//
				40	// Moreover some specifities (like the fact a lot of instructions in the shader
				41	// have few dependencies) makes the generic scheduler have some unpredictable
				42	// behaviours. For example when register pressure becomes high, it can either
				43	// manage to prevent register pressure from going too high, or it can
				44	// increase register pressure even more than if it hadn't taken register
				45	// pressure into account.
				46	//
				47	// Also some other bad behaviours are generated, like loading at the beginning
				48	// of the shader a constant in VGPR you won't need until the end of the shader.
				49	//
				50	// The scheduling problem for SI can distinguish three main parts:
				51	// . Hiding high latencies (texture sampling, etc)
				52	// . Hiding low latencies (SGPR constant loading, etc)
				53	// . Keeping register usage low for better latency hiding and general
				54	// performance
				55	//
				56	// Some other things can also affect performance, but are hard to predict
				57	// (cache usage, the fact the HW can issue several instructions from different
				58	// wavefronts if different types, etc)
				59	//
				60	// This scheduler tries to solve the scheduling problem by dividing it into
				61	// simpler sub-problems. It divides the instructions into blocks, schedules
				62	// locally inside the blocks where it takes care of low latencies, and then
				63	// chooses the order of the blocks by taking care of high latencies.
				64	// Dividing the instructions into blocks helps control keeping register
				65	// usage low.
				66	//
				67	// First the instructions are put into blocks.
				68	// We want the blocks help control register usage and hide high latencies
				69	// later. To help control register usage, we typically want all local
				70	// computations, when for example you create a result that can be comsummed
				71	// right away, to be contained in a block. Block inputs and outputs would
				72	// typically be important results that are needed in several locations of
				73	// the shader. Since we do want blocks to help hide high latencies, we want
				74	// the instructions inside the block to have a minimal set of dependencies
				75	// on high latencies. It will make it easy to pick blocks to hide specific
				76	// high latencies.
				77	// The block creation algorithm is divided into several steps, and several
				78	// variants can be tried during the scheduling process.
				79	//
				80	// Second the order of the instructions inside the blocks is choosen.
				81	// At that step we do take into account only register usage and hiding
				82	// low latency instructions
				83	//
				84	// Third the block order is choosen, there we try to hide high latencies
				85	// and keep register usage low.
				86	//
				87	// After the third step, a pass is done to improve the hiding of low
				88	// latencies.
				89	//
				90	// Actually when talking about 'low latency' or 'high latency' it includes
				91	// both the latency to get the cache (or global mem) data go to the register,
				92	// and the bandwith limitations.
				93	// Increasing the number of active wavefronts helps hide the former, but it
				94	// doesn't solve the latter, thus why even if wavefront count is high, we have
				95	// to try have as many instructions hiding high latencies as possible.
				96	// The OpenCL doc says for example latency of 400 cycles for a global mem access,
				97	// which is hidden by 10 instructions if the wavefront count is 10.
				98
				99	// Some figures taken from AMD docs:
				100	// Both texture and constant L1 caches are 4-way associative with 64 bytes
				101	// lines.
				102	// Constant cache is shared with 4 CUs.
				103	// For texture sampling, the address generation unit receives 4 texture
				104	// addresses per cycle, thus we could expect texture sampling latency to be
				105	// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
				106	// instructions in a wavefront group are executed every 4 cycles),
				107	// or 16 instructions if the other wavefronts associated to the 3 other VALUs
				108	// of the CU do texture sampling too. (Don't take these figures too seriously,
				109	// as I'm not 100% sure of the computation)
				110	// Data exports should get similar latency.
				111	// For constant loading, the cache is shader with 4 CUs.
				112	// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
				113	// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
				114	// It means a simple s_buffer_load should take one instruction to hide, as
				115	// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
				116	// cache line.
				117	//
				118	// As of today the driver doesn't preload the constants in cache, thus the
				119	// first loads get extra latency. The doc says global memory access can be
				120	// 300-600 cycles. We do not specially take that into account when scheduling
				121	// As we expect the driver to be able to preload the constants soon.
				122
				123
				124	// common code //
				125
				126	#ifndef NDEBUG
				127
				128	static const char *getReasonStr(SIScheduleCandReason Reason) {
				129	switch (Reason) {
				130	case NoCand: return "NOCAND";
				131	case RegUsage: return "REGUSAGE";
				132	case Latency: return "LATENCY";
				133	case Successor: return "SUCCESSOR";
				134	case Depth: return "DEPTH";
				135	case NodeOrder: return "ORDER";
				136	}
				137	llvm_unreachable("Unknown reason!");
				138	}
				139
				140	#endif
				141
				142	static bool tryLess(int TryVal, int CandVal,
				143	SISchedulerCandidate &TryCand,
				144	SISchedulerCandidate &Cand,
				145	SIScheduleCandReason Reason) {
				146	if (TryVal < CandVal) {
				147	TryCand.Reason = Reason;
				148	return true;
				149	}
				150	if (TryVal > CandVal) {
				151	if (Cand.Reason > Reason)
				152	Cand.Reason = Reason;
				153	return true;
				154	}
				155	Cand.setRepeat(Reason);
				156	return false;
				157	}
				158
				159	static bool tryGreater(int TryVal, int CandVal,
				160	SISchedulerCandidate &TryCand,
				161	SISchedulerCandidate &Cand,
				162	SIScheduleCandReason Reason) {
				163	if (TryVal > CandVal) {
				164	TryCand.Reason = Reason;
				165	return true;
				166	}
				167	if (TryVal < CandVal) {
				168	if (Cand.Reason > Reason)
				169	Cand.Reason = Reason;
				170	return true;
				171	}
				172	Cand.setRepeat(Reason);
				173	return false;
				174	}
				175
				176	// SIScheduleBlock //
				177
				178	void SIScheduleBlock::addUnit(SUnit *SU) {
				179	NodeNum2Index[SU->NodeNum] = SUnits.size();
				180	SUnits.push_back(SU);
				181	}
				182
				183	#ifndef NDEBUG
				184
				185	void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
				186
				187	dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
				188	dbgs() << '\n';
				189	}
				190	#endif
				191
				192	void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
				193	SISchedCandidate &TryCand) {
				194	// Initialize the candidate if needed.
				195	if (!Cand.isValid()) {
				196	TryCand.Reason = NodeOrder;
				197	return;
				198	}
				199
				200	if (Cand.SGPRUsage > 60 &&
				201	tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
				202	return;
				203
				204	// Schedule low latency instructions as top as possible.
				205	// Order of priority is:
				206	// . Low latency instructions which do not depend on other low latency
				207	// instructions we haven't waited for
				208	// . Other instructions which do not depend on low latency instructions
				209	// we haven't waited for
				210	// . Low latencies
				211	// . All other instructions
				212	// Goal is to get: low latency instructions - independant instructions
				213	// - (eventually some more low latency instructions)
				214	// - instructions that depend on the first low latency instructions.
				215	// If in the block there is a lot of constant loads, the SGPR usage
				216	// could go quite high, thus above the arbitrary limit of 60 will encourage
				217	// use the already loaded constants (in order to release some SGPRs) before
				218	// loading more.
				219	if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
				220	Cand.HasLowLatencyNonWaitedParent,
				221	TryCand, Cand, SIScheduleCandReason::Depth))
				222	return;
				223
				224	if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
				225	TryCand, Cand, SIScheduleCandReason::Depth))
				226	return;
				227
				228	if (TryCand.IsLowLatency &&
				229	tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
				230	TryCand, Cand, SIScheduleCandReason::Depth))
				231	return;
				232
				233	if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
				234	return;
				235
				236	// Fall through to original instruction order.
				237	if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
				238	TryCand.Reason = NodeOrder;
				239	}
				240	}
				241
				242	SUnit* SIScheduleBlock::pickNode() {
				243	SISchedCandidate TopCand;
				244
				245	for (SUnit* SU : TopReadySUs) {
				246	SISchedCandidate TryCand;
				247	std::vector<unsigned> pressure;
				248	std::vector<unsigned> MaxPressure;
				249	// Predict register usage after this instruction.
				250	TryCand.SU = SU;
				251	TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
				252	TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
				253	TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
				254	TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
				255	TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
				256	TryCand.HasLowLatencyNonWaitedParent =
				257	HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
				258	tryCandidateTopDown(TopCand, TryCand);
				259	if (TryCand.Reason != NoCand)
				260	TopCand.setBest(TryCand);
				261	}
				262
				263	return TopCand.SU;
				264	}
				265
				266
				267	// Schedule something valid.
				268	void SIScheduleBlock::fastSchedule() {
				269	TopReadySUs.clear();
				270	if (Scheduled)
				271	undoSchedule();
				272
				273	for (SUnit* SU : SUnits) {
				274	if (!SU->NumPredsLeft)
				275	TopReadySUs.push_back(SU);
				276	}
				277
				278	while (!TopReadySUs.empty()) {
				279	SUnit *SU = TopReadySUs[0];
				280	ScheduledSUnits.push_back(SU);
				281	nodeScheduled(SU);
				282	}
				283
				284	Scheduled = true;
				285	}
				286
				287	// Returns if the register was set between first and last.
				288	static bool isDefBetween(unsigned Reg,
				289	SlotIndex First, SlotIndex Last,
				290	const MachineRegisterInfo *MRI,
				291	const LiveIntervals *LIS) {
				292	for (MachineRegisterInfo::def_instr_iterator
				293	UI = MRI->def_instr_begin(Reg),
				294	UE = MRI->def_instr_end(); UI != UE; ++UI) {
				295	const MachineInstr* MI = &*UI;
				296	if (MI->isDebugValue())
				297	continue;
Duncan P. N. Exon Smith	3ac9cc6	2016-02-27 06:40:41 +0000	[diff] [blame]	298	SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	299	if (InstSlot >= First && InstSlot <= Last)
				300	return true;
				301	}
				302	return false;
				303	}
				304
				305	void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
				306	MachineBasicBlock::iterator EndBlock) {
				307	IntervalPressure Pressure, BotPressure;
				308	RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
				309	LiveIntervals *LIS = DAG->getLIS();
				310	MachineRegisterInfo *MRI = DAG->getMRI();
				311	DAG->initRPTracker(TopRPTracker);
				312	DAG->initRPTracker(BotRPTracker);
				313	DAG->initRPTracker(RPTracker);
				314
				315	// Goes though all SU. RPTracker captures what had to be alive for the SUs
				316	// to execute, and what is still alive at the end.
				317	for (SUnit* SU : ScheduledSUnits) {
				318	RPTracker.setPos(SU->getInstr());
				319	RPTracker.advance();
				320	}
				321
				322	// Close the RPTracker to finalize live ins/outs.
				323	RPTracker.closeRegion();
				324
				325	// Initialize the live ins and live outs.
				326	TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
				327	BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
				328
				329	// Do not Track Physical Registers, because it messes up.
Matthias Braun	5d45861	2016-01-20 00:23:26 +0000	[diff] [blame]	330	for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
				331	if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
				332	LiveInRegs.insert(RegMaskPair.RegUnit);
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	333	}
				334	LiveOutRegs.clear();
				335	// There is several possibilities to distinguish:
				336	// 1) Reg is not input to any instruction in the block, but is output of one
				337	// 2) 1) + read in the block and not needed after it
				338	// 3) 1) + read in the block but needed in another block
				339	// 4) Reg is input of an instruction but another block will read it too
				340	// 5) Reg is input of an instruction and then rewritten in the block.
				341	// result is not read in the block (implies used in another block)
				342	// 6) Reg is input of an instruction and then rewritten in the block.
				343	// result is read in the block and not needed in another block
				344	// 7) Reg is input of an instruction and then rewritten in the block.
				345	// result is read in the block but also needed in another block
				346	// LiveInRegs will contains all the regs in situation 4, 5, 6, 7
				347	// We want LiveOutRegs to contain only Regs whose content will be read after
				348	// in another block, and whose content was written in the current block,
				349	// that is we want it to get 1, 3, 5, 7
				350	// Since we made the MIs of a block to be packed all together before
				351	// scheduling, then the LiveIntervals were correct, and the RPTracker was
				352	// able to correctly handle 5 vs 6, 2 vs 3.
				353	// (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
				354	// The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
				355	// Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
				356	// The use of findDefBetween removes the case 4.
Matthias Braun	5d45861	2016-01-20 00:23:26 +0000	[diff] [blame]	357	for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
				358	unsigned Reg = RegMaskPair.RegUnit;
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	359	if (TargetRegisterInfo::isVirtualRegister(Reg) &&
Duncan P. N. Exon Smith	3ac9cc6	2016-02-27 06:40:41 +0000	[diff] [blame]	360	isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
				361	LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
				362	LIS)) {
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	363	LiveOutRegs.insert(Reg);
				364	}
				365	}
				366
				367	// Pressure = sum_alive_registers register size
				368	// Internally llvm will represent some registers as big 128 bits registers
				369	// for example, but they actually correspond to 4 actual 32 bits registers.
				370	// Thus Pressure is not equal to num_alive_registers * constant.
				371	LiveInPressure = TopPressure.MaxSetPressure;
				372	LiveOutPressure = BotPressure.MaxSetPressure;
				373
				374	// Prepares TopRPTracker for top down scheduling.
				375	TopRPTracker.closeTop();
				376	}
				377
				378	void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
				379	MachineBasicBlock::iterator EndBlock) {
				380	if (!Scheduled)
				381	fastSchedule();
				382
				383	// PreScheduling phase to set LiveIn and LiveOut.
				384	initRegPressure(BeginBlock, EndBlock);
				385	undoSchedule();
				386
				387	// Schedule for real now.
				388
				389	TopReadySUs.clear();
				390
				391	for (SUnit* SU : SUnits) {
				392	if (!SU->NumPredsLeft)
				393	TopReadySUs.push_back(SU);
				394	}
				395
				396	while (!TopReadySUs.empty()) {
				397	SUnit *SU = pickNode();
				398	ScheduledSUnits.push_back(SU);
				399	TopRPTracker.setPos(SU->getInstr());
				400	TopRPTracker.advance();
				401	nodeScheduled(SU);
				402	}
				403
				404	// TODO: compute InternalAdditionnalPressure.
				405	InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
				406
				407	// Check everything is right.
				408	#ifndef NDEBUG
				409	assert(SUnits.size() == ScheduledSUnits.size() &&
				410	TopReadySUs.empty());
				411	for (SUnit* SU : SUnits) {
				412	assert(SU->isScheduled &&
				413	SU->NumPredsLeft == 0);
				414	}
				415	#endif
				416
				417	Scheduled = true;
				418	}
				419
				420	void SIScheduleBlock::undoSchedule() {
				421	for (SUnit* SU : SUnits) {
				422	SU->isScheduled = false;
				423	for (SDep& Succ : SU->Succs) {
				424	if (BC->isSUInBlock(Succ.getSUnit(), ID))
				425	undoReleaseSucc(SU, &Succ);
				426	}
				427	}
				428	HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
				429	ScheduledSUnits.clear();
				430	Scheduled = false;
				431	}
				432
				433	void SIScheduleBlock::undoReleaseSucc(SUnit SU, SDep SuccEdge) {
				434	SUnit *SuccSU = SuccEdge->getSUnit();
				435
				436	if (SuccEdge->isWeak()) {
				437	++SuccSU->WeakPredsLeft;
				438	return;
				439	}
				440	++SuccSU->NumPredsLeft;
				441	}
				442
				443	void SIScheduleBlock::releaseSucc(SUnit SU, SDep SuccEdge) {
				444	SUnit *SuccSU = SuccEdge->getSUnit();
				445
				446	if (SuccEdge->isWeak()) {
				447	--SuccSU->WeakPredsLeft;
				448	return;
				449	}
				450	#ifndef NDEBUG
				451	if (SuccSU->NumPredsLeft == 0) {
				452	dbgs() << "* Scheduling failed! *\n";
				453	SuccSU->dump(DAG);
				454	dbgs() << " has been released too many times!\n";
				455	llvm_unreachable(nullptr);
				456	}
				457	#endif
				458
				459	--SuccSU->NumPredsLeft;
				460	}
				461
				462	/// Release Successors of the SU that are in the block or not.
				463	void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
				464	for (SDep& Succ : SU->Succs) {
				465	SUnit *SuccSU = Succ.getSUnit();
				466
				467	if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
				468	continue;
				469
				470	releaseSucc(SU, &Succ);
				471	if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
				472	TopReadySUs.push_back(SuccSU);
				473	}
				474	}
				475
				476	void SIScheduleBlock::nodeScheduled(SUnit *SU) {
				477	// Is in TopReadySUs
				478	assert (!SU->NumPredsLeft);
				479	std::vector<SUnit*>::iterator I =
				480	std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
				481	if (I == TopReadySUs.end()) {
				482	dbgs() << "Data Structure Bug in SI Scheduler\n";
				483	llvm_unreachable(nullptr);
				484	}
				485	TopReadySUs.erase(I);
				486
				487	releaseSuccessors(SU, true);
				488	// Scheduling this node will trigger a wait,
				489	// thus propagate to other instructions that they do not need to wait either.
				490	if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
				491	HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
				492
				493	if (DAG->IsLowLatencySU[SU->NodeNum]) {
				494	for (SDep& Succ : SU->Succs) {
				495	std::map<unsigned, unsigned>::iterator I =
				496	NodeNum2Index.find(Succ.getSUnit()->NodeNum);
				497	if (I != NodeNum2Index.end())
				498	HasLowLatencyNonWaitedParent[I->second] = 1;
				499	}
				500	}
				501	SU->isScheduled = true;
				502	}
				503
				504	void SIScheduleBlock::finalizeUnits() {
				505	// We remove links from outside blocks to enable scheduling inside the block.
				506	for (SUnit* SU : SUnits) {
				507	releaseSuccessors(SU, false);
				508	if (DAG->IsHighLatencySU[SU->NodeNum])
				509	HighLatencyBlock = true;
				510	}
				511	HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
				512	}
				513
				514	// we maintain ascending order of IDs
				515	void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
				516	unsigned PredID = Pred->getID();
				517
				518	// Check if not already predecessor.
				519	for (SIScheduleBlock* P : Preds) {
				520	if (PredID == P->getID())
				521	return;
				522	}
				523	Preds.push_back(Pred);
				524
Benjamin Kramer	3e9a5d3	2016-05-27 11:36:04 +0000	[diff] [blame]	525	assert(none_of(Succs,
				526	[=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
				527	"Loop in the Block Graph!");
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	528	}
				529
				530	void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
				531	unsigned SuccID = Succ->getID();
				532
				533	// Check if not already predecessor.
				534	for (SIScheduleBlock* S : Succs) {
				535	if (SuccID == S->getID())
				536	return;
				537	}
				538	if (Succ->isHighLatencyBlock())
				539	++NumHighLatencySuccessors;
				540	Succs.push_back(Succ);
Benjamin Kramer	3e9a5d3	2016-05-27 11:36:04 +0000	[diff] [blame]	541	assert(none_of(Preds,
				542	[=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
				543	"Loop in the Block Graph!");
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	544	}
				545
				546	#ifndef NDEBUG
				547	void SIScheduleBlock::printDebug(bool full) {
				548	dbgs() << "Block (" << ID << ")\n";
				549	if (!full)
				550	return;
				551
				552	dbgs() << "\nContains High Latency Instruction: "
				553	<< HighLatencyBlock << '\n';
				554	dbgs() << "\nDepends On:\n";
				555	for (SIScheduleBlock* P : Preds) {
				556	P->printDebug(false);
				557	}
				558
				559	dbgs() << "\nSuccessors:\n";
				560	for (SIScheduleBlock* S : Succs) {
				561	S->printDebug(false);
				562	}
				563
				564	if (Scheduled) {
				565	dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
				566	<< LiveInPressure[DAG->getVGPRSetID()] << '\n';
				567	dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
				568	<< LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
				569	dbgs() << "LiveIns:\n";
				570	for (unsigned Reg : LiveInRegs)
				571	dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
				572
				573	dbgs() << "\nLiveOuts:\n";
				574	for (unsigned Reg : LiveOutRegs)
				575	dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
				576	}
				577
				578	dbgs() << "\nInstructions:\n";
				579	if (!Scheduled) {
				580	for (SUnit* SU : SUnits) {
				581	SU->dump(DAG);
				582	}
				583	} else {
				584	for (SUnit* SU : SUnits) {
				585	SU->dump(DAG);
				586	}
				587	}
				588
				589	dbgs() << "///////////////////////\n";
				590	}
				591
				592	#endif
				593
				594	// SIScheduleBlockCreator //
				595
				596	SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
				597	DAG(DAG) {
				598	}
				599
				600	SIScheduleBlockCreator::~SIScheduleBlockCreator() {
				601	}
				602
				603	SIScheduleBlocks
				604	SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
				605	std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
				606	Blocks.find(BlockVariant);
				607	if (B == Blocks.end()) {
				608	SIScheduleBlocks Res;
				609	createBlocksForVariant(BlockVariant);
				610	topologicalSort();
				611	scheduleInsideBlocks();
				612	fillStats();
				613	Res.Blocks = CurrentBlocks;
				614	Res.TopDownIndex2Block = TopDownIndex2Block;
				615	Res.TopDownBlock2Index = TopDownBlock2Index;
				616	Blocks[BlockVariant] = Res;
				617	return Res;
				618	} else {
				619	return B->second;
				620	}
				621	}
				622
				623	bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
				624	if (SU->NodeNum >= DAG->SUnits.size())
				625	return false;
				626	return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
				627	}
				628
				629	void SIScheduleBlockCreator::colorHighLatenciesAlone() {
				630	unsigned DAGSize = DAG->SUnits.size();
				631
				632	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				633	SUnit *SU = &DAG->SUnits[i];
				634	if (DAG->IsHighLatencySU[SU->NodeNum]) {
				635	CurrentColoring[SU->NodeNum] = NextReservedID++;
				636	}
				637	}
				638	}
				639
				640	void SIScheduleBlockCreator::colorHighLatenciesGroups() {
				641	unsigned DAGSize = DAG->SUnits.size();
				642	unsigned NumHighLatencies = 0;
				643	unsigned GroupSize;
				644	unsigned Color = NextReservedID;
				645	unsigned Count = 0;
				646	std::set<unsigned> FormingGroup;
				647
				648	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				649	SUnit *SU = &DAG->SUnits[i];
				650	if (DAG->IsHighLatencySU[SU->NodeNum])
				651	++NumHighLatencies;
				652	}
				653
				654	if (NumHighLatencies == 0)
				655	return;
				656
				657	if (NumHighLatencies <= 6)
				658	GroupSize = 2;
				659	else if (NumHighLatencies <= 12)
				660	GroupSize = 3;
				661	else
				662	GroupSize = 4;
				663
				664	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				665	SUnit *SU = &DAG->SUnits[i];
				666	if (DAG->IsHighLatencySU[SU->NodeNum]) {
				667	unsigned CompatibleGroup = true;
				668	unsigned ProposedColor = Color;
				669	for (unsigned j : FormingGroup) {
				670	// TODO: Currently CompatibleGroup will always be false,
				671	// because the graph enforces the load order. This
				672	// can be fixed, but as keeping the load order is often
				673	// good for performance that causes a performance hit (both
				674	// the default scheduler and this scheduler).
				675	// When this scheduler determines a good load order,
				676	// this can be fixed.
				677	if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) \|\|
				678	!DAG->canAddEdge(&DAG->SUnits[j], SU))
				679	CompatibleGroup = false;
				680	}
				681	if (!CompatibleGroup \|\| ++Count == GroupSize) {
				682	FormingGroup.clear();
				683	Color = ++NextReservedID;
				684	if (!CompatibleGroup) {
				685	ProposedColor = Color;
				686	FormingGroup.insert(SU->NodeNum);
				687	}
				688	Count = 0;
				689	} else {
				690	FormingGroup.insert(SU->NodeNum);
				691	}
				692	CurrentColoring[SU->NodeNum] = ProposedColor;
				693	}
				694	}
				695	}
				696
				697	void SIScheduleBlockCreator::colorComputeReservedDependencies() {
				698	unsigned DAGSize = DAG->SUnits.size();
				699	std::map<std::set<unsigned>, unsigned> ColorCombinations;
				700
				701	CurrentTopDownReservedDependencyColoring.clear();
				702	CurrentBottomUpReservedDependencyColoring.clear();
				703
				704	CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
				705	CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
				706
				707	// Traverse TopDown, and give different colors to SUs depending
				708	// on which combination of High Latencies they depend on.
				709
Tom Stellard	4a304b3	2016-05-03 16:30:56 +0000	[diff] [blame]	710	for (unsigned SUNum : DAG->TopDownIndex2SU) {
				711	SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	712	std::set<unsigned> SUColors;
				713
				714	// Already given.
				715	if (CurrentColoring[SU->NodeNum]) {
				716	CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
				717	CurrentColoring[SU->NodeNum];
				718	continue;
				719	}
				720
				721	for (SDep& PredDep : SU->Preds) {
				722	SUnit *Pred = PredDep.getSUnit();
				723	if (PredDep.isWeak() \|\| Pred->NodeNum >= DAGSize)
				724	continue;
				725	if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
				726	SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
				727	}
				728	// Color 0 by default.
				729	if (SUColors.empty())
				730	continue;
				731	// Same color than parents.
				732	if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
				733	CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
				734	*SUColors.begin();
				735	else {
				736	std::map<std::set<unsigned>, unsigned>::iterator Pos =
				737	ColorCombinations.find(SUColors);
				738	if (Pos != ColorCombinations.end()) {
				739	CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
				740	} else {
				741	CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
				742	NextNonReservedID;
				743	ColorCombinations[SUColors] = NextNonReservedID++;
				744	}
				745	}
				746	}
				747
				748	ColorCombinations.clear();
				749
				750	// Same as before, but BottomUp.
				751
Tom Stellard	4a304b3	2016-05-03 16:30:56 +0000	[diff] [blame]	752	for (unsigned SUNum : DAG->BottomUpIndex2SU) {
				753	SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	754	std::set<unsigned> SUColors;
				755
				756	// Already given.
				757	if (CurrentColoring[SU->NodeNum]) {
				758	CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
				759	CurrentColoring[SU->NodeNum];
				760	continue;
				761	}
				762
				763	for (SDep& SuccDep : SU->Succs) {
				764	SUnit *Succ = SuccDep.getSUnit();
				765	if (SuccDep.isWeak() \|\| Succ->NodeNum >= DAGSize)
				766	continue;
				767	if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
				768	SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
				769	}
				770	// Keep color 0.
				771	if (SUColors.empty())
				772	continue;
				773	// Same color than parents.
				774	if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
				775	CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
				776	*SUColors.begin();
				777	else {
				778	std::map<std::set<unsigned>, unsigned>::iterator Pos =
				779	ColorCombinations.find(SUColors);
				780	if (Pos != ColorCombinations.end()) {
				781	CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
				782	} else {
				783	CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
				784	NextNonReservedID;
				785	ColorCombinations[SUColors] = NextNonReservedID++;
				786	}
				787	}
				788	}
				789	}
				790
				791	void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
				792	unsigned DAGSize = DAG->SUnits.size();
				793	std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
				794
				795	// Every combination of colors given by the top down
				796	// and bottom up Reserved node dependency
				797
				798	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				799	SUnit *SU = &DAG->SUnits[i];
				800	std::pair<unsigned, unsigned> SUColors;
				801
				802	// High latency instructions: already given.
				803	if (CurrentColoring[SU->NodeNum])
				804	continue;
				805
				806	SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
				807	SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
				808
				809	std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
				810	ColorCombinations.find(SUColors);
				811	if (Pos != ColorCombinations.end()) {
				812	CurrentColoring[SU->NodeNum] = Pos->second;
				813	} else {
				814	CurrentColoring[SU->NodeNum] = NextNonReservedID;
				815	ColorCombinations[SUColors] = NextNonReservedID++;
				816	}
				817	}
				818	}
				819
				820	void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
				821	unsigned DAGSize = DAG->SUnits.size();
				822	std::vector<int> PendingColoring = CurrentColoring;
				823
Tom Stellard	4a304b3	2016-05-03 16:30:56 +0000	[diff] [blame]	824	for (unsigned SUNum : DAG->BottomUpIndex2SU) {
				825	SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	826	std::set<unsigned> SUColors;
				827	std::set<unsigned> SUColorsPending;
				828
				829	if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
				830	continue;
				831
				832	if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 \|\|
				833	CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
				834	continue;
				835
				836	for (SDep& SuccDep : SU->Succs) {
				837	SUnit *Succ = SuccDep.getSUnit();
				838	if (SuccDep.isWeak() \|\| Succ->NodeNum >= DAGSize)
				839	continue;
				840	if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 \|\|
				841	CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
				842	SUColors.insert(CurrentColoring[Succ->NodeNum]);
				843	SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
				844	}
				845	if (SUColors.size() == 1 && SUColorsPending.size() == 1)
				846	PendingColoring[SU->NodeNum] = *SUColors.begin();
				847	else // TODO: Attribute new colors depending on color
				848	// combination of children.
				849	PendingColoring[SU->NodeNum] = NextNonReservedID++;
				850	}
				851	CurrentColoring = PendingColoring;
				852	}
				853
				854
				855	void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
				856	unsigned DAGSize = DAG->SUnits.size();
				857	unsigned PreviousColor;
				858	std::set<unsigned> SeenColors;
				859
				860	if (DAGSize <= 1)
				861	return;
				862
				863	PreviousColor = CurrentColoring[0];
				864
				865	for (unsigned i = 1, e = DAGSize; i != e; ++i) {
				866	SUnit *SU = &DAG->SUnits[i];
				867	unsigned CurrentColor = CurrentColoring[i];
				868	unsigned PreviousColorSave = PreviousColor;
				869	assert(i == SU->NodeNum);
				870
				871	if (CurrentColor != PreviousColor)
				872	SeenColors.insert(PreviousColor);
				873	PreviousColor = CurrentColor;
				874
				875	if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
				876	continue;
				877
				878	if (SeenColors.find(CurrentColor) == SeenColors.end())
				879	continue;
				880
				881	if (PreviousColorSave != CurrentColor)
				882	CurrentColoring[i] = NextNonReservedID++;
				883	else
				884	CurrentColoring[i] = CurrentColoring[i-1];
				885	}
				886	}
				887
				888	void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
				889	unsigned DAGSize = DAG->SUnits.size();
				890
Tom Stellard	4a304b3	2016-05-03 16:30:56 +0000	[diff] [blame]	891	for (unsigned SUNum : DAG->BottomUpIndex2SU) {
				892	SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	893	std::set<unsigned> SUColors;
				894
				895	if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
				896	continue;
				897
				898	// No predecessor: Vgpr constant loading.
				899	// Low latency instructions usually have a predecessor (the address)
				900	if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
				901	continue;
				902
				903	for (SDep& SuccDep : SU->Succs) {
				904	SUnit *Succ = SuccDep.getSUnit();
				905	if (SuccDep.isWeak() \|\| Succ->NodeNum >= DAGSize)
				906	continue;
				907	SUColors.insert(CurrentColoring[Succ->NodeNum]);
				908	}
				909	if (SUColors.size() == 1)
				910	CurrentColoring[SU->NodeNum] = *SUColors.begin();
				911	}
				912	}
				913
				914	void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
				915	unsigned DAGSize = DAG->SUnits.size();
				916
Tom Stellard	4a304b3	2016-05-03 16:30:56 +0000	[diff] [blame]	917	for (unsigned SUNum : DAG->BottomUpIndex2SU) {
				918	SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	919	std::set<unsigned> SUColors;
				920
				921	if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
				922	continue;
				923
				924	for (SDep& SuccDep : SU->Succs) {
				925	SUnit *Succ = SuccDep.getSUnit();
				926	if (SuccDep.isWeak() \|\| Succ->NodeNum >= DAGSize)
				927	continue;
				928	SUColors.insert(CurrentColoring[Succ->NodeNum]);
				929	}
				930	if (SUColors.size() == 1)
				931	CurrentColoring[SU->NodeNum] = *SUColors.begin();
				932	}
				933	}
				934
				935	void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
				936	unsigned DAGSize = DAG->SUnits.size();
				937
Tom Stellard	4a304b3	2016-05-03 16:30:56 +0000	[diff] [blame]	938	for (unsigned SUNum : DAG->BottomUpIndex2SU) {
				939	SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	940	std::set<unsigned> SUColors;
				941
				942	if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
				943	continue;
				944
				945	for (SDep& SuccDep : SU->Succs) {
				946	SUnit *Succ = SuccDep.getSUnit();
				947	if (SuccDep.isWeak() \|\| Succ->NodeNum >= DAGSize)
				948	continue;
				949	SUColors.insert(CurrentColoring[Succ->NodeNum]);
				950	}
				951	if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
				952	CurrentColoring[SU->NodeNum] = *SUColors.begin();
				953	}
				954	}
				955
				956	void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
				957	unsigned DAGSize = DAG->SUnits.size();
				958	std::map<unsigned, unsigned> ColorCount;
				959
Tom Stellard	4a304b3	2016-05-03 16:30:56 +0000	[diff] [blame]	960	for (unsigned SUNum : DAG->BottomUpIndex2SU) {
				961	SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	962	unsigned color = CurrentColoring[SU->NodeNum];
				963	std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
				964	if (Pos != ColorCount.end()) {
				965	++ColorCount[color];
				966	} else {
				967	ColorCount[color] = 1;
				968	}
				969	}
				970
Tom Stellard	4a304b3	2016-05-03 16:30:56 +0000	[diff] [blame]	971	for (unsigned SUNum : DAG->BottomUpIndex2SU) {
				972	SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	973	unsigned color = CurrentColoring[SU->NodeNum];
				974	std::set<unsigned> SUColors;
				975
				976	if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
				977	continue;
				978
				979	if (ColorCount[color] > 1)
				980	continue;
				981
				982	for (SDep& SuccDep : SU->Succs) {
				983	SUnit *Succ = SuccDep.getSUnit();
				984	if (SuccDep.isWeak() \|\| Succ->NodeNum >= DAGSize)
				985	continue;
				986	SUColors.insert(CurrentColoring[Succ->NodeNum]);
				987	}
				988	if (SUColors.size() == 1 && *SUColors.begin() != color) {
				989	--ColorCount[color];
				990	CurrentColoring[SU->NodeNum] = *SUColors.begin();
				991	++ColorCount[*SUColors.begin()];
				992	}
				993	}
				994	}
				995
				996	void SIScheduleBlockCreator::cutHugeBlocks() {
				997	// TODO
				998	}
				999
				1000	void SIScheduleBlockCreator::regroupNoUserInstructions() {
				1001	unsigned DAGSize = DAG->SUnits.size();
				1002	int GroupID = NextNonReservedID++;
				1003
Tom Stellard	4a304b3	2016-05-03 16:30:56 +0000	[diff] [blame]	1004	for (unsigned SUNum : DAG->BottomUpIndex2SU) {
				1005	SUnit *SU = &DAG->SUnits[SUNum];
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1006	bool hasSuccessor = false;
				1007
				1008	if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
				1009	continue;
				1010
				1011	for (SDep& SuccDep : SU->Succs) {
				1012	SUnit *Succ = SuccDep.getSUnit();
				1013	if (SuccDep.isWeak() \|\| Succ->NodeNum >= DAGSize)
				1014	continue;
				1015	hasSuccessor = true;
				1016	}
				1017	if (!hasSuccessor)
				1018	CurrentColoring[SU->NodeNum] = GroupID;
				1019	}
				1020	}
				1021
				1022	void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
				1023	unsigned DAGSize = DAG->SUnits.size();
				1024	std::map<unsigned,unsigned> RealID;
				1025
				1026	CurrentBlocks.clear();
				1027	CurrentColoring.clear();
				1028	CurrentColoring.resize(DAGSize, 0);
				1029	Node2CurrentBlock.clear();
				1030
				1031	// Restore links previous scheduling variant has overridden.
				1032	DAG->restoreSULinksLeft();
				1033
				1034	NextReservedID = 1;
				1035	NextNonReservedID = DAGSize + 1;
				1036
				1037	DEBUG(dbgs() << "Coloring the graph\n");
				1038
				1039	if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
				1040	colorHighLatenciesGroups();
				1041	else
				1042	colorHighLatenciesAlone();
				1043	colorComputeReservedDependencies();
				1044	colorAccordingToReservedDependencies();
				1045	colorEndsAccordingToDependencies();
				1046	if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
				1047	colorForceConsecutiveOrderInGroup();
				1048	regroupNoUserInstructions();
				1049	colorMergeConstantLoadsNextGroup();
				1050	colorMergeIfPossibleNextGroupOnlyForReserved();
				1051
				1052	// Put SUs of same color into same block
				1053	Node2CurrentBlock.resize(DAGSize, -1);
				1054	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				1055	SUnit *SU = &DAG->SUnits[i];
				1056	unsigned Color = CurrentColoring[SU->NodeNum];
				1057	if (RealID.find(Color) == RealID.end()) {
				1058	int ID = CurrentBlocks.size();
				1059	BlockPtrs.push_back(
				1060	make_unique<SIScheduleBlock>(DAG, this, ID));
				1061	CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
				1062	RealID[Color] = ID;
				1063	}
				1064	CurrentBlocks[RealID[Color]]->addUnit(SU);
				1065	Node2CurrentBlock[SU->NodeNum] = RealID[Color];
				1066	}
				1067
				1068	// Build dependencies between blocks.
				1069	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				1070	SUnit *SU = &DAG->SUnits[i];
				1071	int SUID = Node2CurrentBlock[i];
				1072	for (SDep& SuccDep : SU->Succs) {
				1073	SUnit *Succ = SuccDep.getSUnit();
				1074	if (SuccDep.isWeak() \|\| Succ->NodeNum >= DAGSize)
				1075	continue;
				1076	if (Node2CurrentBlock[Succ->NodeNum] != SUID)
				1077	CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
				1078	}
				1079	for (SDep& PredDep : SU->Preds) {
				1080	SUnit *Pred = PredDep.getSUnit();
				1081	if (PredDep.isWeak() \|\| Pred->NodeNum >= DAGSize)
				1082	continue;
				1083	if (Node2CurrentBlock[Pred->NodeNum] != SUID)
				1084	CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
				1085	}
				1086	}
				1087
				1088	// Free root and leafs of all blocks to enable scheduling inside them.
				1089	for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
				1090	SIScheduleBlock *Block = CurrentBlocks[i];
				1091	Block->finalizeUnits();
				1092	}
				1093	DEBUG(
				1094	dbgs() << "Blocks created:\n\n";
				1095	for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
				1096	SIScheduleBlock *Block = CurrentBlocks[i];
				1097	Block->printDebug(true);
				1098	}
				1099	);
				1100	}
				1101
				1102	// Two functions taken from Codegen/MachineScheduler.cpp
				1103
				1104	/// If this iterator is a debug value, increment until reaching the End or a
				1105	/// non-debug instruction.
				1106	static MachineBasicBlock::const_iterator
				1107	nextIfDebug(MachineBasicBlock::const_iterator I,
				1108	MachineBasicBlock::const_iterator End) {
				1109	for(; I != End; ++I) {
				1110	if (!I->isDebugValue())
				1111	break;
				1112	}
				1113	return I;
				1114	}
				1115
				1116	/// Non-const version.
				1117	static MachineBasicBlock::iterator
				1118	nextIfDebug(MachineBasicBlock::iterator I,
				1119	MachineBasicBlock::const_iterator End) {
				1120	// Cast the return value to nonconst MachineInstr, then cast to an
				1121	// instr_iterator, which does not check for null, finally return a
				1122	// bundle_iterator.
				1123	return MachineBasicBlock::instr_iterator(
				1124	const_cast<MachineInstr*>(
				1125	&*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
				1126	}
				1127
				1128	void SIScheduleBlockCreator::topologicalSort() {
				1129	unsigned DAGSize = CurrentBlocks.size();
				1130	std::vector<int> WorkList;
				1131
				1132	DEBUG(dbgs() << "Topological Sort\n");
				1133
				1134	WorkList.reserve(DAGSize);
				1135	TopDownIndex2Block.resize(DAGSize);
				1136	TopDownBlock2Index.resize(DAGSize);
				1137	BottomUpIndex2Block.resize(DAGSize);
				1138
				1139	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				1140	SIScheduleBlock *Block = CurrentBlocks[i];
				1141	unsigned Degree = Block->getSuccs().size();
				1142	TopDownBlock2Index[i] = Degree;
				1143	if (Degree == 0) {
				1144	WorkList.push_back(i);
				1145	}
				1146	}
				1147
				1148	int Id = DAGSize;
				1149	while (!WorkList.empty()) {
				1150	int i = WorkList.back();
				1151	SIScheduleBlock *Block = CurrentBlocks[i];
				1152	WorkList.pop_back();
				1153	TopDownBlock2Index[i] = --Id;
				1154	TopDownIndex2Block[Id] = i;
				1155	for (SIScheduleBlock* Pred : Block->getPreds()) {
				1156	if (!--TopDownBlock2Index[Pred->getID()])
				1157	WorkList.push_back(Pred->getID());
				1158	}
				1159	}
				1160
				1161	#ifndef NDEBUG
				1162	// Check correctness of the ordering.
				1163	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				1164	SIScheduleBlock *Block = CurrentBlocks[i];
				1165	for (SIScheduleBlock* Pred : Block->getPreds()) {
				1166	assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
				1167	"Wrong Top Down topological sorting");
				1168	}
				1169	}
				1170	#endif
				1171
				1172	BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
				1173	TopDownIndex2Block.rend());
				1174	}
				1175
				1176	void SIScheduleBlockCreator::scheduleInsideBlocks() {
				1177	unsigned DAGSize = CurrentBlocks.size();
				1178
				1179	DEBUG(dbgs() << "\nScheduling Blocks\n\n");
				1180
				1181	// We do schedule a valid scheduling such that a Block corresponds
				1182	// to a range of instructions.
				1183	DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
				1184	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				1185	SIScheduleBlock *Block = CurrentBlocks[i];
				1186	Block->fastSchedule();
				1187	}
				1188
				1189	// Note: the following code, and the part restoring previous position
				1190	// is by far the most expensive operation of the Scheduler.
				1191
				1192	// Do not update CurrentTop.
				1193	MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
				1194	std::vector<MachineBasicBlock::iterator> PosOld;
				1195	std::vector<MachineBasicBlock::iterator> PosNew;
				1196	PosOld.reserve(DAG->SUnits.size());
				1197	PosNew.reserve(DAG->SUnits.size());
				1198
				1199	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				1200	int BlockIndice = TopDownIndex2Block[i];
				1201	SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
				1202	std::vector<SUnit*> SUs = Block->getScheduledUnits();
				1203
				1204	for (SUnit* SU : SUs) {
				1205	MachineInstr *MI = SU->getInstr();
				1206	MachineBasicBlock::iterator Pos = MI;
				1207	PosOld.push_back(Pos);
				1208	if (&*CurrentTopFastSched == MI) {
				1209	PosNew.push_back(Pos);
				1210	CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
				1211	DAG->getCurrentBottom());
				1212	} else {
				1213	// Update the instruction stream.
				1214	DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
				1215
				1216	// Update LiveIntervals.
				1217	// Note: Moving all instructions and calling handleMove everytime
				1218	// is the most cpu intensive operation of the scheduler.
				1219	// It would gain a lot if there was a way to recompute the
				1220	// LiveIntervals for the entire scheduling region.
Duncan P. N. Exon Smith	be8f8c4	2016-02-27 20:14:29 +0000	[diff] [blame]	1221	DAG->getLIS()->handleMove(MI, /UpdateFlags=*/true);
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1222	PosNew.push_back(CurrentTopFastSched);
				1223	}
				1224	}
				1225	}
				1226
				1227	// Now we have Block of SUs == Block of MI.
				1228	// We do the final schedule for the instructions inside the block.
				1229	// The property that all the SUs of the Block are grouped together as MI
				1230	// is used for correct reg usage tracking.
				1231	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				1232	SIScheduleBlock *Block = CurrentBlocks[i];
				1233	std::vector<SUnit*> SUs = Block->getScheduledUnits();
				1234	Block->schedule((SUs.begin())->getInstr(), (SUs.rbegin())->getInstr());
				1235	}
				1236
				1237	DEBUG(dbgs() << "Restoring MI Pos\n");
				1238	// Restore old ordering (which prevents a LIS->handleMove bug).
				1239	for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
				1240	MachineBasicBlock::iterator POld = PosOld[i-1];
				1241	MachineBasicBlock::iterator PNew = PosNew[i-1];
				1242	if (PNew != POld) {
				1243	// Update the instruction stream.
				1244	DAG->getBB()->splice(POld, DAG->getBB(), PNew);
				1245
				1246	// Update LiveIntervals.
Duncan P. N. Exon Smith	be8f8c4	2016-02-27 20:14:29 +0000	[diff] [blame]	1247	DAG->getLIS()->handleMove(POld, /UpdateFlags=*/true);
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1248	}
				1249	}
				1250
				1251	DEBUG(
				1252	for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
				1253	SIScheduleBlock *Block = CurrentBlocks[i];
				1254	Block->printDebug(true);
				1255	}
				1256	);
				1257	}
				1258
				1259	void SIScheduleBlockCreator::fillStats() {
				1260	unsigned DAGSize = CurrentBlocks.size();
				1261
				1262	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				1263	int BlockIndice = TopDownIndex2Block[i];
				1264	SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
				1265	if (Block->getPreds().size() == 0)
				1266	Block->Depth = 0;
				1267	else {
				1268	unsigned Depth = 0;
				1269	for (SIScheduleBlock *Pred : Block->getPreds()) {
				1270	if (Depth < Pred->Depth + 1)
				1271	Depth = Pred->Depth + 1;
				1272	}
				1273	Block->Depth = Depth;
				1274	}
				1275	}
				1276
				1277	for (unsigned i = 0, e = DAGSize; i != e; ++i) {
				1278	int BlockIndice = BottomUpIndex2Block[i];
				1279	SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
				1280	if (Block->getSuccs().size() == 0)
				1281	Block->Height = 0;
				1282	else {
				1283	unsigned Height = 0;
				1284	for (SIScheduleBlock *Succ : Block->getSuccs()) {
				1285	if (Height < Succ->Height + 1)
				1286	Height = Succ->Height + 1;
				1287	}
				1288	Block->Height = Height;
				1289	}
				1290	}
				1291	}
				1292
				1293	// SIScheduleBlockScheduler //
				1294
				1295	SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
				1296	SISchedulerBlockSchedulerVariant Variant,
				1297	SIScheduleBlocks BlocksStruct) :
				1298	DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
				1299	LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
				1300	SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
				1301
				1302	// Fill the usage of every output
				1303	// Warning: while by construction we always have a link between two blocks
				1304	// when one needs a result from the other, the number of users of an output
				1305	// is not the sum of child blocks having as input the same virtual register.
				1306	// Here is an example. A produces x and y. B eats x and produces x'.
				1307	// C eats x' and y. The register coalescer may have attributed the same
				1308	// virtual register to x and x'.
				1309	// To count accurately, we do a topological sort. In case the register is
				1310	// found for several parents, we increment the usage of the one with the
				1311	// highest topological index.
				1312	LiveOutRegsNumUsages.resize(Blocks.size());
				1313	for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
				1314	SIScheduleBlock *Block = Blocks[i];
				1315	for (unsigned Reg : Block->getInRegs()) {
				1316	bool Found = false;
				1317	int topoInd = -1;
				1318	for (SIScheduleBlock* Pred: Block->getPreds()) {
				1319	std::set<unsigned> PredOutRegs = Pred->getOutRegs();
				1320	std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
				1321
				1322	if (RegPos != PredOutRegs.end()) {
				1323	Found = true;
				1324	if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
				1325	topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
				1326	}
				1327	}
				1328	}
				1329
				1330	if (!Found)
				1331	continue;
				1332
				1333	int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
				1334	std::map<unsigned, unsigned>::iterator RegPos =
				1335	LiveOutRegsNumUsages[PredID].find(Reg);
				1336	if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
				1337	++LiveOutRegsNumUsages[PredID][Reg];
				1338	} else {
				1339	LiveOutRegsNumUsages[PredID][Reg] = 1;
				1340	}
				1341	}
				1342	}
				1343
				1344	LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
				1345	BlockNumPredsLeft.resize(Blocks.size());
				1346	BlockNumSuccsLeft.resize(Blocks.size());
				1347
				1348	for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
				1349	SIScheduleBlock *Block = Blocks[i];
				1350	BlockNumPredsLeft[i] = Block->getPreds().size();
				1351	BlockNumSuccsLeft[i] = Block->getSuccs().size();
				1352	}
				1353
				1354	#ifndef NDEBUG
				1355	for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
				1356	SIScheduleBlock *Block = Blocks[i];
				1357	assert(Block->getID() == i);
				1358	}
				1359	#endif
				1360
				1361	std::set<unsigned> InRegs = DAG->getInRegs();
				1362	addLiveRegs(InRegs);
				1363
				1364	// Fill LiveRegsConsumers for regs that were already
				1365	// defined before scheduling.
				1366	for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
				1367	SIScheduleBlock *Block = Blocks[i];
				1368	for (unsigned Reg : Block->getInRegs()) {
				1369	bool Found = false;
				1370	for (SIScheduleBlock* Pred: Block->getPreds()) {
				1371	std::set<unsigned> PredOutRegs = Pred->getOutRegs();
				1372	std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
				1373
				1374	if (RegPos != PredOutRegs.end()) {
				1375	Found = true;
				1376	break;
				1377	}
				1378	}
				1379
				1380	if (!Found) {
				1381	if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
				1382	LiveRegsConsumers[Reg] = 1;
				1383	else
				1384	++LiveRegsConsumers[Reg];
				1385	}
				1386	}
				1387	}
				1388
				1389	for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
				1390	SIScheduleBlock *Block = Blocks[i];
				1391	if (BlockNumPredsLeft[i] == 0) {
				1392	ReadyBlocks.push_back(Block);
				1393	}
				1394	}
				1395
				1396	while (SIScheduleBlock *Block = pickBlock()) {
				1397	BlocksScheduled.push_back(Block);
				1398	blockScheduled(Block);
				1399	}
				1400
				1401	DEBUG(
				1402	dbgs() << "Block Order:";
				1403	for (SIScheduleBlock* Block : BlocksScheduled) {
				1404	dbgs() << ' ' << Block->getID();
				1405	}
				1406	);
				1407	}
				1408
				1409	bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
				1410	SIBlockSchedCandidate &TryCand) {
				1411	if (!Cand.isValid()) {
				1412	TryCand.Reason = NodeOrder;
				1413	return true;
				1414	}
				1415
				1416	// Try to hide high latencies.
				1417	if (tryLess(TryCand.LastPosHighLatParentScheduled,
				1418	Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
				1419	return true;
				1420	// Schedule high latencies early so you can hide them better.
				1421	if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
				1422	TryCand, Cand, Latency))
				1423	return true;
				1424	if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
				1425	TryCand, Cand, Depth))
				1426	return true;
				1427	if (tryGreater(TryCand.NumHighLatencySuccessors,
				1428	Cand.NumHighLatencySuccessors,
				1429	TryCand, Cand, Successor))
				1430	return true;
				1431	return false;
				1432	}
				1433
				1434	bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
				1435	SIBlockSchedCandidate &TryCand) {
				1436	if (!Cand.isValid()) {
				1437	TryCand.Reason = NodeOrder;
				1438	return true;
				1439	}
				1440
				1441	if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
				1442	TryCand, Cand, RegUsage))
				1443	return true;
				1444	if (tryGreater(TryCand.NumSuccessors > 0,
				1445	Cand.NumSuccessors > 0,
				1446	TryCand, Cand, Successor))
				1447	return true;
				1448	if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
				1449	return true;
				1450	if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
				1451	TryCand, Cand, RegUsage))
				1452	return true;
				1453	return false;
				1454	}
				1455
				1456	SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
				1457	SIBlockSchedCandidate Cand;
				1458	std::vector<SIScheduleBlock*>::iterator Best;
				1459	SIScheduleBlock *Block;
				1460	if (ReadyBlocks.empty())
				1461	return nullptr;
				1462
				1463	DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
				1464	VregCurrentUsage, SregCurrentUsage);
				1465	if (VregCurrentUsage > maxVregUsage)
				1466	maxVregUsage = VregCurrentUsage;
				1467	if (VregCurrentUsage > maxSregUsage)
				1468	maxSregUsage = VregCurrentUsage;
				1469	DEBUG(
				1470	dbgs() << "Picking New Blocks\n";
				1471	dbgs() << "Available: ";
				1472	for (SIScheduleBlock* Block : ReadyBlocks)
				1473	dbgs() << Block->getID() << ' ';
				1474	dbgs() << "\nCurrent Live:\n";
				1475	for (unsigned Reg : LiveRegs)
				1476	dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
				1477	dbgs() << '\n';
				1478	dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
				1479	dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
				1480	);
				1481
				1482	Cand.Block = nullptr;
				1483	for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
				1484	E = ReadyBlocks.end(); I != E; ++I) {
				1485	SIBlockSchedCandidate TryCand;
				1486	TryCand.Block = *I;
				1487	TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
				1488	TryCand.VGPRUsageDiff =
				1489	checkRegUsageImpact(TryCand.Block->getInRegs(),
				1490	TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
				1491	TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
				1492	TryCand.NumHighLatencySuccessors =
				1493	TryCand.Block->getNumHighLatencySuccessors();
				1494	TryCand.LastPosHighLatParentScheduled =
				1495	(unsigned int) std::max<int> (0,
				1496	LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
				1497	LastPosWaitedHighLatency);
				1498	TryCand.Height = TryCand.Block->Height;
				1499	// Try not to increase VGPR usage too much, else we may spill.
				1500	if (VregCurrentUsage > 120 \|\|
				1501	Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
				1502	if (!tryCandidateRegUsage(Cand, TryCand) &&
				1503	Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
				1504	tryCandidateLatency(Cand, TryCand);
				1505	} else {
				1506	if (!tryCandidateLatency(Cand, TryCand))
				1507	tryCandidateRegUsage(Cand, TryCand);
				1508	}
				1509	if (TryCand.Reason != NoCand) {
				1510	Cand.setBest(TryCand);
				1511	Best = I;
				1512	DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
				1513	<< getReasonStr(Cand.Reason) << '\n');
				1514	}
				1515	}
				1516
				1517	DEBUG(
				1518	dbgs() << "Picking: " << Cand.Block->getID() << '\n';
				1519	dbgs() << "Is a block with high latency instruction: "
				1520	<< (Cand.IsHighLatency ? "yes\n" : "no\n");
				1521	dbgs() << "Position of last high latency dependency: "
				1522	<< Cand.LastPosHighLatParentScheduled << '\n';
				1523	dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
				1524	dbgs() << '\n';
				1525	);
				1526
				1527	Block = Cand.Block;
				1528	ReadyBlocks.erase(Best);
				1529	return Block;
				1530	}
				1531
				1532	// Tracking of currently alive registers to determine VGPR Usage.
				1533
				1534	void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
				1535	for (unsigned Reg : Regs) {
				1536	// For now only track virtual registers.
				1537	if (!TargetRegisterInfo::isVirtualRegister(Reg))
				1538	continue;
				1539	// If not already in the live set, then add it.
				1540	(void) LiveRegs.insert(Reg);
				1541	}
				1542	}
				1543
				1544	void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
				1545	std::set<unsigned> &Regs) {
				1546	for (unsigned Reg : Regs) {
				1547	// For now only track virtual registers.
				1548	std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
				1549	assert (Pos != LiveRegs.end() && // Reg must be live.
				1550	LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
				1551	LiveRegsConsumers[Reg] >= 1);
				1552	--LiveRegsConsumers[Reg];
				1553	if (LiveRegsConsumers[Reg] == 0)
				1554	LiveRegs.erase(Pos);
				1555	}
				1556	}
				1557
				1558	void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
				1559	for (SIScheduleBlock* Block : Parent->getSuccs()) {
				1560	--BlockNumPredsLeft[Block->getID()];
				1561	if (BlockNumPredsLeft[Block->getID()] == 0) {
				1562	ReadyBlocks.push_back(Block);
				1563	}
				1564	// TODO: Improve check. When the dependency between the high latency
				1565	// instructions and the instructions of the other blocks are WAR or WAW
				1566	// there will be no wait triggered. We would like these cases to not
				1567	// update LastPosHighLatencyParentScheduled.
				1568	if (Parent->isHighLatencyBlock())
				1569	LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
				1570	}
				1571	}
				1572
				1573	void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
				1574	decreaseLiveRegs(Block, Block->getInRegs());
				1575	addLiveRegs(Block->getOutRegs());
				1576	releaseBlockSuccs(Block);
				1577	for (std::map<unsigned, unsigned>::iterator RegI =
				1578	LiveOutRegsNumUsages[Block->getID()].begin(),
				1579	E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
				1580	std::pair<unsigned, unsigned> RegP = *RegI;
				1581	if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
				1582	LiveRegsConsumers[RegP.first] = RegP.second;
				1583	else {
				1584	assert(LiveRegsConsumers[RegP.first] == 0);
				1585	LiveRegsConsumers[RegP.first] += RegP.second;
				1586	}
				1587	}
				1588	if (LastPosHighLatencyParentScheduled[Block->getID()] >
				1589	(unsigned)LastPosWaitedHighLatency)
				1590	LastPosWaitedHighLatency =
				1591	LastPosHighLatencyParentScheduled[Block->getID()];
				1592	++NumBlockScheduled;
				1593	}
				1594
				1595	std::vector<int>
				1596	SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
				1597	std::set<unsigned> &OutRegs) {
				1598	std::vector<int> DiffSetPressure;
				1599	DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
				1600
				1601	for (unsigned Reg : InRegs) {
				1602	// For now only track virtual registers.
				1603	if (!TargetRegisterInfo::isVirtualRegister(Reg))
				1604	continue;
				1605	if (LiveRegsConsumers[Reg] > 1)
				1606	continue;
				1607	PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
				1608	for (; PSetI.isValid(); ++PSetI) {
				1609	DiffSetPressure[*PSetI] -= PSetI.getWeight();
				1610	}
				1611	}
				1612
				1613	for (unsigned Reg : OutRegs) {
				1614	// For now only track virtual registers.
				1615	if (!TargetRegisterInfo::isVirtualRegister(Reg))
				1616	continue;
				1617	PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
				1618	for (; PSetI.isValid(); ++PSetI) {
				1619	DiffSetPressure[*PSetI] += PSetI.getWeight();
				1620	}
				1621	}
				1622
				1623	return DiffSetPressure;
				1624	}
				1625
				1626	// SIScheduler //
				1627
				1628	struct SIScheduleBlockResult
				1629	SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
				1630	SISchedulerBlockSchedulerVariant ScheduleVariant) {
				1631	SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
				1632	SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
				1633	std::vector<SIScheduleBlock*> ScheduledBlocks;
				1634	struct SIScheduleBlockResult Res;
				1635
				1636	ScheduledBlocks = Scheduler.getBlocks();
				1637
				1638	for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
				1639	SIScheduleBlock *Block = ScheduledBlocks[b];
				1640	std::vector<SUnit*> SUs = Block->getScheduledUnits();
				1641
				1642	for (SUnit* SU : SUs)
				1643	Res.SUs.push_back(SU->NodeNum);
				1644	}
				1645
				1646	Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
				1647	Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
				1648	return Res;
				1649	}
				1650
				1651	// SIScheduleDAGMI //
				1652
				1653	SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
				1654	ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
				1655	SITII = static_cast<const SIInstrInfo*>(TII);
				1656	SITRI = static_cast<const SIRegisterInfo*>(TRI);
				1657
				1658	VGPRSetID = SITRI->getVGPR32PressureSet();
				1659	SGPRSetID = SITRI->getSGPR32PressureSet();
				1660	}
				1661
				1662	SIScheduleDAGMI::~SIScheduleDAGMI() {
				1663	}
				1664
				1665	ScheduleDAGInstrs llvm::createSIMachineScheduler(MachineSchedContext C) {
				1666	return new SIScheduleDAGMI(C);
				1667	}
				1668
				1669	// Code adapted from scheduleDAG.cpp
				1670	// Does a topological sort over the SUs.
				1671	// Both TopDown and BottomUp
				1672	void SIScheduleDAGMI::topologicalSort() {
Tom Stellard	1d3940e	2016-06-09 23:48:02 +0000	[diff] [blame]	1673	Topo.InitDAGTopologicalSorting();
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1674
Tom Stellard	1d3940e	2016-06-09 23:48:02 +0000	[diff] [blame]	1675	TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
				1676	BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1677	}
				1678
				1679	// Move low latencies further from their user without
				1680	// increasing SGPR usage (in general)
				1681	// This is to be replaced by a better pass that would
				1682	// take into account SGPR usage (based on VGPR Usage
				1683	// and the corresponding wavefront count), that would
				1684	// try to merge groups of loads if it make sense, etc
				1685	void SIScheduleDAGMI::moveLowLatencies() {
				1686	unsigned DAGSize = SUnits.size();
				1687	int LastLowLatencyUser = -1;
				1688	int LastLowLatencyPos = -1;
				1689
				1690	for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
				1691	SUnit *SU = &SUnits[ScheduledSUnits[i]];
				1692	bool IsLowLatencyUser = false;
				1693	unsigned MinPos = 0;
				1694
				1695	for (SDep& PredDep : SU->Preds) {
				1696	SUnit *Pred = PredDep.getSUnit();
Duncan P. N. Exon Smith	9cfc75c	2016-06-30 00:01:54 +0000	[diff] [blame]	1697	if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1698	IsLowLatencyUser = true;
				1699	}
				1700	if (Pred->NodeNum >= DAGSize)
				1701	continue;
				1702	unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
				1703	if (PredPos >= MinPos)
				1704	MinPos = PredPos + 1;
				1705	}
				1706
Duncan P. N. Exon Smith	9cfc75c	2016-06-30 00:01:54 +0000	[diff] [blame]	1707	if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1708	unsigned BestPos = LastLowLatencyUser + 1;
				1709	if ((int)BestPos <= LastLowLatencyPos)
				1710	BestPos = LastLowLatencyPos + 1;
				1711	if (BestPos < MinPos)
				1712	BestPos = MinPos;
				1713	if (BestPos < i) {
				1714	for (unsigned u = i; u > BestPos; --u) {
				1715	++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
				1716	ScheduledSUnits[u] = ScheduledSUnits[u-1];
				1717	}
				1718	ScheduledSUnits[BestPos] = SU->NodeNum;
				1719	ScheduledSUnitsInv[SU->NodeNum] = BestPos;
				1720	}
				1721	LastLowLatencyPos = BestPos;
				1722	if (IsLowLatencyUser)
				1723	LastLowLatencyUser = BestPos;
				1724	} else if (IsLowLatencyUser) {
				1725	LastLowLatencyUser = i;
				1726	// Moves COPY instructions on which depends
				1727	// the low latency instructions too.
				1728	} else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
				1729	bool CopyForLowLat = false;
				1730	for (SDep& SuccDep : SU->Succs) {
				1731	SUnit *Succ = SuccDep.getSUnit();
Duncan P. N. Exon Smith	9cfc75c	2016-06-30 00:01:54 +0000	[diff] [blame]	1732	if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1733	CopyForLowLat = true;
				1734	}
				1735	}
				1736	if (!CopyForLowLat)
				1737	continue;
				1738	if (MinPos < i) {
				1739	for (unsigned u = i; u > MinPos; --u) {
				1740	++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
				1741	ScheduledSUnits[u] = ScheduledSUnits[u-1];
				1742	}
				1743	ScheduledSUnits[MinPos] = SU->NodeNum;
				1744	ScheduledSUnitsInv[SU->NodeNum] = MinPos;
				1745	}
				1746	}
				1747	}
				1748	}
				1749
				1750	void SIScheduleDAGMI::restoreSULinksLeft() {
				1751	for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
				1752	SUnits[i].isScheduled = false;
				1753	SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
				1754	SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
				1755	SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
				1756	SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
				1757	}
				1758	}
				1759
				1760	// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
				1761	template<typename _Iterator> void
				1762	SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
				1763	unsigned &VgprUsage, unsigned &SgprUsage) {
				1764	VgprUsage = 0;
				1765	SgprUsage = 0;
				1766	for (_Iterator RegI = First; RegI != End; ++RegI) {
				1767	unsigned Reg = *RegI;
				1768	// For now only track virtual registers
				1769	if (!TargetRegisterInfo::isVirtualRegister(Reg))
				1770	continue;
				1771	PSetIterator PSetI = MRI.getPressureSets(Reg);
				1772	for (; PSetI.isValid(); ++PSetI) {
				1773	if (*PSetI == VGPRSetID)
				1774	VgprUsage += PSetI.getWeight();
				1775	else if (*PSetI == SGPRSetID)
				1776	SgprUsage += PSetI.getWeight();
				1777	}
				1778	}
				1779	}
				1780
				1781	void SIScheduleDAGMI::schedule()
				1782	{
				1783	SmallVector<SUnit*, 8> TopRoots, BotRoots;
				1784	SIScheduleBlockResult Best, Temp;
				1785	DEBUG(dbgs() << "Preparing Scheduling\n");
				1786
				1787	buildDAGWithRegPressure();
				1788	DEBUG(
				1789	for(SUnit& SU : SUnits)
				1790	SU.dumpAll(this)
				1791	);
				1792
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1793	topologicalSort();
				1794	findRootsAndBiasEdges(TopRoots, BotRoots);
				1795	// We reuse several ScheduleDAGMI and ScheduleDAGMILive
				1796	// functions, but to make them happy we must initialize
				1797	// the default Scheduler implementation (even if we do not
				1798	// run it)
				1799	SchedImpl->initialize(this);
				1800	initQueues(TopRoots, BotRoots);
				1801
				1802	// Fill some stats to help scheduling.
				1803
				1804	SUnitsLinksBackup = SUnits;
				1805	IsLowLatencySU.clear();
				1806	LowLatencyOffset.clear();
				1807	IsHighLatencySU.clear();
				1808
				1809	IsLowLatencySU.resize(SUnits.size(), 0);
				1810	LowLatencyOffset.resize(SUnits.size(), 0);
				1811	IsHighLatencySU.resize(SUnits.size(), 0);
				1812
				1813	for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
				1814	SUnit *SU = &SUnits[i];
Chad Rosier	c27a18f	2016-03-09 16:00:35 +0000	[diff] [blame]	1815	unsigned BaseLatReg;
				1816	int64_t OffLatReg;
Duncan P. N. Exon Smith	9cfc75c	2016-06-30 00:01:54 +0000	[diff] [blame]	1817	if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1818	IsLowLatencySU[i] = 1;
Duncan P. N. Exon Smith	9cfc75c	2016-06-30 00:01:54 +0000	[diff] [blame]	1819	if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
				1820	TRI))
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1821	LowLatencyOffset[i] = OffLatReg;
Duncan P. N. Exon Smith	9cfc75c	2016-06-30 00:01:54 +0000	[diff] [blame]	1822	} else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1823	IsHighLatencySU[i] = 1;
				1824	}
				1825
				1826	SIScheduler Scheduler(this);
				1827	Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
				1828	SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
Matt Arsenault	105c2a2	2016-07-01 18:03:46 +0000	[diff] [blame^]	1829
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1830	// if VGPR usage is extremely high, try other good performing variants
				1831	// which could lead to lower VGPR usage
				1832	if (Best.MaxVGPRUsage > 180) {
				1833	std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
				1834	{ LatenciesAlone, BlockRegUsageLatency },
				1835	// { LatenciesAlone, BlockRegUsage },
				1836	{ LatenciesGrouped, BlockLatencyRegUsage },
				1837	// { LatenciesGrouped, BlockRegUsageLatency },
				1838	// { LatenciesGrouped, BlockRegUsage },
				1839	{ LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
				1840	// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
				1841	// { LatenciesAlonePlusConsecutive, BlockRegUsage }
				1842	};
				1843	for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
				1844	Temp = Scheduler.scheduleVariant(v.first, v.second);
				1845	if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
				1846	Best = Temp;
				1847	}
				1848	}
				1849	// if VGPR usage is still extremely high, we may spill. Try other variants
				1850	// which are less performing, but that could lead to lower VGPR usage.
				1851	if (Best.MaxVGPRUsage > 200) {
				1852	std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
				1853	// { LatenciesAlone, BlockRegUsageLatency },
				1854	{ LatenciesAlone, BlockRegUsage },
				1855	// { LatenciesGrouped, BlockLatencyRegUsage },
				1856	{ LatenciesGrouped, BlockRegUsageLatency },
				1857	{ LatenciesGrouped, BlockRegUsage },
				1858	// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
				1859	{ LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
				1860	{ LatenciesAlonePlusConsecutive, BlockRegUsage }
				1861	};
				1862	for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
				1863	Temp = Scheduler.scheduleVariant(v.first, v.second);
				1864	if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
				1865	Best = Temp;
				1866	}
				1867	}
Matt Arsenault	105c2a2	2016-07-01 18:03:46 +0000	[diff] [blame^]	1868
Nicolai Haehnle	02c3291	2016-01-13 16:10:10 +0000	[diff] [blame]	1869	ScheduledSUnits = Best.SUs;
				1870	ScheduledSUnitsInv.resize(SUnits.size());
				1871
				1872	for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
				1873	ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
				1874	}
				1875
				1876	moveLowLatencies();
				1877
				1878	// Tell the outside world about the result of the scheduling.
				1879
				1880	assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
				1881	TopRPTracker.setPos(CurrentTop);
				1882
				1883	for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
				1884	E = ScheduledSUnits.end(); I != E; ++I) {
				1885	SUnit SU = &SUnits[I];
				1886
				1887	scheduleMI(SU, true);
				1888
				1889	DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
				1890	<< *SU->getInstr());
				1891	}
				1892
				1893	assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
				1894
				1895	placeDebugValues();
				1896
				1897	DEBUG({
				1898	unsigned BBNum = begin()->getParent()->getNumber();
				1899	dbgs() << "* Final schedule for BB#" << BBNum << " *\n";
				1900	dumpSchedule();
				1901	dbgs() << '\n';
				1902	});
				1903	}