Blame - llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp - toolchain/llvm-project

blob: e22166d03e9aeefe0d7aada584866ae30e42707c [file] [log] [blame]

Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	1	//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===/
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	/// \file
				11	/// \brief Insert wait instructions for memory reads and writes.
				12	///
				13	/// Memory reads and writes are issued asynchronously, so we need to insert
				14	/// S_WAITCNT instructions when we want to access any of their results or
				15	/// overwrite any register that's used asynchronously.
				16	//
				17	//===----------------------------------------------------------------------===//
				18
				19	#include "AMDGPU.h"
				20	#include "AMDGPUSubtarget.h"
				21	#include "SIDefines.h"
				22	#include "SIInstrInfo.h"
				23	#include "SIMachineFunctionInfo.h"
				24	#include "Utils/AMDGPUBaseInfo.h"
				25	#include "llvm/ADT/PostOrderIterator.h"
				26	#include "llvm/CodeGen/MachineFunction.h"
				27	#include "llvm/CodeGen/MachineFunctionPass.h"
				28	#include "llvm/CodeGen/MachineInstrBuilder.h"
				29	#include "llvm/CodeGen/MachineRegisterInfo.h"
				30
				31	#define DEBUG_TYPE "si-insert-waitcnts"
				32
				33	using namespace llvm;
				34
				35	namespace {
				36
// Class of object that encapsulates the latest instruction counter score
// associated with the operand. Used for determining whether an
// s_waitcnt instruction needs to be emitted.
				40
// Bit mask with only the bit for counter type `t` set. The masks are OR-ed
// together to record which counters need an s_waitcnt.
#define CNT_MASK(t) (1u << (t))

// The wait counters tracked by this pass. NUM_INST_CNTS is the number of
// counters and sizes the per-counter arrays.
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

// Half-open range [first, second) of scoreboard slots occupied by an operand.
// {-1, -1} is returned by getRegInterval() when the operand is not tracked.
typedef std::pair<signed, signed> RegInterval;

// Per-counter maxima and register-count limits, read through
// BlockWaitcntBrackets::getWaitCountMax().
// NOTE(review): the code that fills this in is not in the visible part of the
// file -- presumably set per subtarget before the pass runs; confirm.
struct {
int32_t VmcntMax;
int32_t ExpcntMax;
int32_t LgkmcntMax;
int32_t NumVGPRsMax;
int32_t NumSGPRsMax;
} HardwareLimits;

// Hardware encoding values bounding the VGPR and SGPR ranges. getRegInterval()
// subtracts VGPR0 / SGPR0 from an operand's encoding value to get its slot and
// asserts VGPR encodings lie in [VGPR0, VGPRL].
// NOTE(review): SGPRL is not referenced in the visible code; presumably the
// last SGPR encoding -- confirm against the initialization site.
struct {
unsigned VGPR0;
unsigned VGPRL;
unsigned SGPR0;
unsigned SGPRL;
} RegisterEncoding;
				61
// Kinds of events that advance a wait counter. Each event maps to exactly one
// InstCounterType through BlockWaitcntBrackets::eventCounter(); the latest
// score of every event kind is kept so that a mix of kinds inside one bracket
// can be detected. NUM_WAIT_EVENTS sizes the per-event arrays.
enum WaitEventType {
VMEM_ACCESS, // vector-memory read & write
LDS_ACCESS, // lds read & write
GDS_ACCESS, // gds read & write
SQ_MESSAGE, // send message
SMEM_ACCESS, // scalar-memory read & write
EXP_GPR_LOCK, // export holding on its data src
GDS_GPR_LOCK, // GDS holding on its data and addr src
EXP_POS_ACCESS, // write to export position
EXP_PARAM_ACCESS, // write to export parameter
VMW_GPR_LOCK, // vector-memory write holding on its data src
NUM_WAIT_EVENTS,
};
				75
// Layout of the scoreboard slot numbers used by getRegInterval(),
// setRegScore() and getRegScore().
// The mapping is:
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};

// Loop header that iterates `w` over every WaitEventType value in declaration
// order, from the first enumerator up to (not including) NUM_WAIT_EVENTS.
#define ForAllWaitEventType(w) \
for (enum WaitEventType w = (enum WaitEventType)0; \
(w) < (enum WaitEventType)NUM_WAIT_EVENTS; \
(w) = (enum WaitEventType)((w) + 1))
				94
				95	// This is a per-basic-block object that maintains current score brackets
				96	// of each wait-counter, and a per-register scoreboard for each wait-couner.
				97	// We also maintain the latest score for every event type that can change the
				98	// waitcnt in order to know if there are multiple types of events within
				99	// the brackets. When multiple types of event happen in the bracket,
				100	// wait-count may get decreased out of order, therefore we need to put in
				101	// "s_waitcnt 0" before use.
				102	class BlockWaitcntBrackets {
				103	public:
				104	static int32_t getWaitCountMax(InstCounterType T) {
				105	switch (T) {
				106	case VM_CNT:
				107	return HardwareLimits.VmcntMax;
				108	case LGKM_CNT:
				109	return HardwareLimits.LgkmcntMax;
				110	case EXP_CNT:
				111	return HardwareLimits.ExpcntMax;
				112	default:
				113	break;
				114	}
				115	return 0;
				116	};
				117
				118	void setScoreLB(InstCounterType T, int32_t Val) {
				119	assert(T < NUM_INST_CNTS);
				120	if (T >= NUM_INST_CNTS)
				121	return;
				122	ScoreLBs[T] = Val;
				123	};
				124
				125	void setScoreUB(InstCounterType T, int32_t Val) {
				126	assert(T < NUM_INST_CNTS);
				127	if (T >= NUM_INST_CNTS)
				128	return;
				129	ScoreUBs[T] = Val;
				130	if (T == EXP_CNT) {
				131	int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
				132	if (ScoreLBs[T] < UB)
				133	ScoreLBs[T] = UB;
				134	}
				135	};
				136
				137	int32_t getScoreLB(InstCounterType T) {
				138	assert(T < NUM_INST_CNTS);
				139	if (T >= NUM_INST_CNTS)
				140	return 0;
				141	return ScoreLBs[T];
				142	};
				143
				144	int32_t getScoreUB(InstCounterType T) {
				145	assert(T < NUM_INST_CNTS);
				146	if (T >= NUM_INST_CNTS)
				147	return 0;
				148	return ScoreUBs[T];
				149	};
				150
				151	// Mapping from event to counter.
				152	InstCounterType eventCounter(WaitEventType E) {
				153	switch (E) {
				154	case VMEM_ACCESS:
				155	return VM_CNT;
				156	case LDS_ACCESS:
				157	case GDS_ACCESS:
				158	case SQ_MESSAGE:
				159	case SMEM_ACCESS:
				160	return LGKM_CNT;
				161	case EXP_GPR_LOCK:
				162	case GDS_GPR_LOCK:
				163	case VMW_GPR_LOCK:
				164	case EXP_POS_ACCESS:
				165	case EXP_PARAM_ACCESS:
				166	return EXP_CNT;
				167	default:
				168	llvm_unreachable("unhandled event type");
				169	}
				170	return NUM_INST_CNTS;
				171	}
				172
				173	void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
				174	if (GprNo < NUM_ALL_VGPRS) {
				175	if (GprNo > VgprUB) {
				176	VgprUB = GprNo;
				177	}
				178	VgprScores[T][GprNo] = Val;
				179	} else {
				180	assert(T == LGKM_CNT);
				181	if (GprNo - NUM_ALL_VGPRS > SgprUB) {
				182	SgprUB = GprNo - NUM_ALL_VGPRS;
				183	}
				184	SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
				185	}
				186	}
				187
				188	int32_t getRegScore(int GprNo, InstCounterType T) {
				189	if (GprNo < NUM_ALL_VGPRS) {
				190	return VgprScores[T][GprNo];
				191	}
				192	return SgprScores[GprNo - NUM_ALL_VGPRS];
				193	}
				194
				195	void clear() {
				196	memset(ScoreLBs, 0, sizeof(ScoreLBs));
				197	memset(ScoreUBs, 0, sizeof(ScoreUBs));
				198	memset(EventUBs, 0, sizeof(EventUBs));
				199	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				200	T = (enum InstCounterType)(T + 1)) {
				201	memset(VgprScores[T], 0, sizeof(VgprScores[T]));
				202	}
				203	memset(SgprScores, 0, sizeof(SgprScores));
				204	}
				205
				206	RegInterval getRegInterval(const MachineInstr MI, const SIInstrInfo TII,
				207	const MachineRegisterInfo *MRI,
				208	const SIRegisterInfo *TRI, unsigned OpNo,
				209	bool Def) const;
				210
				211	void setExpScore(const MachineInstr MI, const SIInstrInfo TII,
				212	const SIRegisterInfo TRI, const MachineRegisterInfo MRI,
				213	unsigned OpNo, int32_t Val);
				214
				215	void setWaitAtBeginning() { WaitAtBeginning = true; }
				216	void clearWaitAtBeginning() { WaitAtBeginning = false; }
				217	bool getWaitAtBeginning() const { return WaitAtBeginning; }
				218	void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
				219	int32_t getMaxVGPR() const { return VgprUB; }
				220	int32_t getMaxSGPR() const { return SgprUB; }
				221	int32_t getEventUB(enum WaitEventType W) const {
				222	assert(W < NUM_WAIT_EVENTS);
				223	return EventUBs[W];
				224	}
				225	bool counterOutOfOrder(InstCounterType T);
				226	unsigned int updateByWait(InstCounterType T, int ScoreToWait);
				227	void updateByEvent(const SIInstrInfo TII, const SIRegisterInfo TRI,
				228	const MachineRegisterInfo *MRI, WaitEventType E,
				229	MachineInstr &MI);
				230
				231	BlockWaitcntBrackets()
				232	: WaitAtBeginning(false), ValidLoop(false), MixedExpTypes(false),
				233	LoopRegion(NULL), PostOrder(0), Waitcnt(NULL), VgprUB(0), SgprUB(0) {
				234	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				235	T = (enum InstCounterType)(T + 1)) {
				236	memset(VgprScores[T], 0, sizeof(VgprScores[T]));
				237	}
				238	}
				239	~BlockWaitcntBrackets(){};
				240
				241	bool hasPendingSMEM() const {
				242	return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
				243	EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
				244	}
				245
				246	bool hasPendingFlat() const {
				247	return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
				248	LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) \|\|
				249	(LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
				250	LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
				251	}
				252
				253	void setPendingFlat() {
				254	LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
				255	LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
				256	}
				257
				258	int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
				259
				260	void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
				261
				262	bool getRevisitLoop() const { return RevisitLoop; }
				263	void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
				264
				265	void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
				266	int32_t getPostOrder() const { return PostOrder; }
				267
				268	void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
				269	void clearWaitcnt() { Waitcnt = NULL; }
				270	MachineInstr *getWaitcnt() const { return Waitcnt; }
				271
				272	bool mixedExpTypes() const { return MixedExpTypes; }
				273	void setMixedExpTypes(bool MixedExpTypesIn) {
				274	MixedExpTypes = MixedExpTypesIn;
				275	}
				276
				277	void print(raw_ostream &);
				278	void dump() { print(dbgs()); }
				279
				280	private:
				281	bool WaitAtBeginning;
				282	bool RevisitLoop;
				283	bool ValidLoop;
				284	bool MixedExpTypes;
				285	MachineLoop *LoopRegion;
				286	int32_t PostOrder;
				287	MachineInstr *Waitcnt;
				288	int32_t ScoreLBs[NUM_INST_CNTS] = {0};
				289	int32_t ScoreUBs[NUM_INST_CNTS] = {0};
				290	int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
				291	// Remember the last flat memory operation.
				292	int32_t LastFlat[NUM_INST_CNTS] = {0};
				293	// wait_cnt scores for every vgpr.
				294	// Keep track of the VgprUB and SgprUB to make merge at join efficient.
				295	int32_t VgprUB;
				296	int32_t SgprUB;
				297	int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
				298	// Wait cnt scores for every sgpr, only lgkmcnt is relevant.
				299	int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
				300	};
				301
				302	// This is a per-loop-region object that records waitcnt status at the end of
				303	// loop footer from the previous iteration. We also maintain an iteration
				304	// count to track the number of times the loop has been visited. When it
				305	// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
				306	// at the end of the loop footer.
				307	class LoopWaitcntData {
				308	public:
				309	void incIterCnt() { IterCnt++; }
				310	void resetIterCnt() { IterCnt = 0; }
				311	int32_t getIterCnt() { return IterCnt; }
				312
				313	LoopWaitcntData() : LfWaitcnt(NULL), IterCnt(0) {}
				314	~LoopWaitcntData(){};
				315
				316	void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
				317	MachineInstr *getWaitcnt() const { return LfWaitcnt; }
				318
				319	void print() {
				320	DEBUG(dbgs() << " iteration " << IterCnt << '\n';);
				321	return;
				322	}
				323
				324	private:
				325	// s_waitcnt added at the end of loop footer to stablize wait scores
				326	// at the end of the loop footer.
				327	MachineInstr *LfWaitcnt;
				328	// Number of iterations the loop has been visited, not including the initial
				329	// walk over.
				330	int32_t IterCnt;
				331	};
				332
				333	class SIInsertWaitcnts : public MachineFunctionPass {
				334
				335	private:
				336	const SISubtarget *ST;
				337	const SIInstrInfo *TII;
				338	const SIRegisterInfo *TRI;
				339	const MachineRegisterInfo *MRI;
				340	const MachineLoopInfo *MLI;
				341	AMDGPU::IsaInfo::IsaVersion IV;
				342	AMDGPUAS AMDGPUASI;
				343
				344	DenseSet<MachineBasicBlock *> BlockVisitedSet;
				345	DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
				346	DenseSet<MachineInstr *> VCCZBugHandledSet;
				347
				348	DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
				349	BlockWaitcntBracketsMap;
				350
				351	DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
				352
				353	DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
				354
				355	std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
				356
				357	public:
				358	static char ID;
				359
				360	SIInsertWaitcnts()
				361	: MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr),
				362	MRI(nullptr), MLI(nullptr) {}
				363
				364	bool runOnMachineFunction(MachineFunction &MF) override;
				365
				366	StringRef getPassName() const override {
				367	return "SI insert wait instructions";
				368	}
				369
				370	void getAnalysisUsage(AnalysisUsage &AU) const override {
				371	AU.setPreservesCFG();
				372	AU.addRequired<MachineLoopInfo>();
				373	MachineFunctionPass::getAnalysisUsage(AU);
				374	}
				375
				376	void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
				377	// The waitcnt information is copied because it changes as the block is
				378	// traversed.
				379	KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket));
				380	}
				381
				382	MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
				383	BlockWaitcntBrackets *ScoreBrackets);
				384	void updateEventWaitCntAfter(MachineInstr &Inst,
				385	BlockWaitcntBrackets *ScoreBrackets);
				386	void mergeInputScoreBrackets(MachineBasicBlock &Block);
				387	MachineBasicBlock loopBottom(const MachineLoop Loop);
				388	void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
				389	void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
				390	};
				391
				392	} // End anonymous namespace.
				393
				394	RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
				395	const SIInstrInfo *TII,
				396	const MachineRegisterInfo *MRI,
				397	const SIRegisterInfo *TRI,
				398	unsigned OpNo,
				399	bool Def) const {
				400	const MachineOperand &Op = MI->getOperand(OpNo);
				401	if (!Op.isReg() \|\| !TRI->isInAllocatableClass(Op.getReg()) \|\|
				402	(Def && !Op.isDef()))
				403	return {-1, -1};
				404
				405	// A use via a PW operand does not need a waitcnt.
				406	// A partial write is not a WAW.
				407	assert(!Op.getSubReg() \|\| !Op.isUndef());
				408
				409	RegInterval Result;
				410	const MachineRegisterInfo &MRIA = *MRI;
				411
				412	unsigned Reg = TRI->getEncodingValue(Op.getReg());
				413
				414	if (TRI->isVGPR(MRIA, Op.getReg())) {
				415	assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
				416	Result.first = Reg - RegisterEncoding.VGPR0;
				417	assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
				418	} else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
				419	assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
				420	Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
				421	assert(Result.first >= NUM_ALL_VGPRS &&
				422	Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
				423	}
				424	// TODO: Handle TTMP
				425	// else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
				426	else
				427	return {-1, -1};
				428
				429	const MachineInstr &MIA = *MI;
				430	const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
Krzysztof Parzyszek	44e25f3	2017-04-24 18:55:33 +0000	[diff] [blame]	431	unsigned Size = TRI->getRegSizeInBits(*RC);
				432	Result.second = Result.first + (Size / 32);
Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	433
				434	return Result;
				435	}
				436
				437	void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
				438	const SIInstrInfo *TII,
				439	const SIRegisterInfo *TRI,
				440	const MachineRegisterInfo *MRI,
				441	unsigned OpNo, int32_t Val) {
				442	RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
				443	DEBUG({
				444	const MachineOperand &Opnd = MI->getOperand(OpNo);
				445	assert(TRI->isVGPR(*MRI, Opnd.getReg()));
				446	});
				447	for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
				448	setRegScore(RegNo, EXP_CNT, Val);
				449	}
				450	}
				451
				452	void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
				453	const SIRegisterInfo *TRI,
				454	const MachineRegisterInfo *MRI,
				455	WaitEventType E, MachineInstr &Inst) {
				456	const MachineRegisterInfo &MRIA = *MRI;
				457	InstCounterType T = eventCounter(E);
				458	int32_t CurrScore = getScoreUB(T) + 1;
				459	// EventUB and ScoreUB need to be update regardless if this event changes
				460	// the score of a register or not.
				461	// Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
				462	EventUBs[E] = CurrScore;
				463	setScoreUB(T, CurrScore);
				464
				465	if (T == EXP_CNT) {
				466	// Check for mixed export types. If they are mixed, then a waitcnt exp(0)
				467	// is required.
				468	if (!MixedExpTypes) {
				469	MixedExpTypes = counterOutOfOrder(EXP_CNT);
				470	}
				471
				472	// Put score on the source vgprs. If this is a store, just use those
				473	// specific register(s).
				474	if (TII->isDS(Inst) && (Inst.mayStore() \|\| Inst.mayLoad())) {
				475	// All GDS operations must protect their address register (same as
				476	// export.)
				477	if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
				478	Inst.getOpcode() != AMDGPU::DS_CONSUME) {
				479	setExpScore(
				480	&Inst, TII, TRI, MRI,
				481	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
				482	CurrScore);
				483	}
				484	if (Inst.mayStore()) {
				485	setExpScore(
				486	&Inst, TII, TRI, MRI,
				487	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
				488	CurrScore);
				489	if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
				490	AMDGPU::OpName::data1) != -1) {
				491	setExpScore(&Inst, TII, TRI, MRI,
				492	AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
				493	AMDGPU::OpName::data1),
				494	CurrScore);
				495	}
				496	} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
				497	Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
				498	Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
				499	Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
				500	Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
				501	Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
				502	Inst.getOpcode() != AMDGPU::DS_APPEND &&
				503	Inst.getOpcode() != AMDGPU::DS_CONSUME &&
				504	Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
				505	for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
				506	const MachineOperand &Op = Inst.getOperand(I);
				507	if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
				508	setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
				509	}
				510	}
				511	}
				512	} else if (TII->isFLAT(Inst)) {
				513	if (Inst.mayStore()) {
				514	setExpScore(
				515	&Inst, TII, TRI, MRI,
				516	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
				517	CurrScore);
				518	} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
				519	setExpScore(
				520	&Inst, TII, TRI, MRI,
				521	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
				522	CurrScore);
				523	}
				524	} else if (TII->isMIMG(Inst)) {
				525	if (Inst.mayStore()) {
				526	setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
				527	} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
				528	setExpScore(
				529	&Inst, TII, TRI, MRI,
				530	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
				531	CurrScore);
				532	}
				533	} else if (TII->isMTBUF(Inst)) {
				534	if (Inst.mayStore()) {
				535	setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
				536	}
				537	} else if (TII->isMUBUF(Inst)) {
				538	if (Inst.mayStore()) {
				539	setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
				540	} else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
				541	setExpScore(
				542	&Inst, TII, TRI, MRI,
				543	AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
				544	CurrScore);
				545	}
				546	} else {
				547	if (TII->isEXP(Inst)) {
				548	// For export the destination registers are really temps that
				549	// can be used as the actual source after export patching, so
				550	// we need to treat them like sources and set the EXP_CNT
				551	// score.
				552	for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
				553	MachineOperand &DefMO = Inst.getOperand(I);
				554	if (DefMO.isReg() && DefMO.isDef() &&
				555	TRI->isVGPR(MRIA, DefMO.getReg())) {
				556	setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
				557	CurrScore);
				558	}
				559	}
				560	}
				561	for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
				562	MachineOperand &MO = Inst.getOperand(I);
				563	if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
				564	setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
				565	}
				566	}
				567	}
				568	#if 0 // TODO: check if this is handled by MUBUF code above.
				569	} else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD \|\|
				570	Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 \|\|
				571	Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
				572	MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
				573	unsigned OpNo;//TODO: find the OpNo for this operand;
				574	RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
				575	for (signed RegNo = Interval.first; RegNo < Interval.second;
				576	++RegNo) {
				577	setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
				578	}
				579	#endif
				580	} else {
				581	// Match the score to the destination registers.
				582	for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
				583	RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
				584	if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
				585	continue;
				586	for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
				587	setRegScore(RegNo, T, CurrScore);
				588	}
				589	}
				590	if (TII->isDS(Inst) && Inst.mayStore()) {
				591	setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
				592	}
				593	}
				594	}
				595
				596	void BlockWaitcntBrackets::print(raw_ostream &OS) {
				597	OS << '\n';
				598	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				599	T = (enum InstCounterType)(T + 1)) {
				600	int LB = getScoreLB(T);
				601	int UB = getScoreUB(T);
				602
				603	switch (T) {
				604	case VM_CNT:
				605	OS << " VM_CNT(" << UB - LB << "): ";
				606	break;
				607	case LGKM_CNT:
				608	OS << " LGKM_CNT(" << UB - LB << "): ";
				609	break;
				610	case EXP_CNT:
				611	OS << " EXP_CNT(" << UB - LB << "): ";
				612	break;
				613	default:
				614	OS << " UNKNOWN(" << UB - LB << "): ";
				615	break;
				616	}
				617
				618	if (LB < UB) {
				619	// Print vgpr scores.
				620	for (int J = 0; J <= getMaxVGPR(); J++) {
				621	int RegScore = getRegScore(J, T);
				622	if (RegScore <= LB)
				623	continue;
				624	int RelScore = RegScore - LB - 1;
				625	if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
				626	OS << RelScore << ":v" << J << " ";
				627	} else {
				628	OS << RelScore << ":ds ";
				629	}
				630	}
				631	// Also need to print sgpr scores for lgkm_cnt.
				632	if (T == LGKM_CNT) {
				633	for (int J = 0; J <= getMaxSGPR(); J++) {
				634	int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
				635	if (RegScore <= LB)
				636	continue;
				637	int RelScore = RegScore - LB - 1;
				638	OS << RelScore << ":s" << J << " ";
				639	}
				640	}
				641	}
				642	OS << '\n';
				643	}
				644	OS << '\n';
				645	return;
				646	}
				647
				648	unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
				649	int ScoreToWait) {
				650	unsigned int NeedWait = 0;
				651	if (ScoreToWait == -1) {
				652	// The score to wait is unknown. This implies that it was not encountered
				653	// during the path of the CFG walk done during the current traversal but
				654	// may be seen on a different path. Emit an s_wait counter with a
				655	// conservative value of 0 for the counter.
				656	NeedWait = CNT_MASK(T);
				657	setScoreLB(T, getScoreUB(T));
				658	return NeedWait;
				659	}
				660
				661	// If the score of src_operand falls within the bracket, we need an
				662	// s_waitcnt instruction.
				663	const int32_t LB = getScoreLB(T);
				664	const int32_t UB = getScoreUB(T);
				665	if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
				666	if (T == VM_CNT && hasPendingFlat()) {
				667	// If there is a pending FLAT operation, and this is a VM waitcnt,
				668	// then we need to force a waitcnt 0 for VM.
				669	NeedWait = CNT_MASK(T);
				670	setScoreLB(T, getScoreUB(T));
				671	} else if (counterOutOfOrder(T)) {
				672	// Counter can get decremented out-of-order when there
				673	// are multiple types event in the brack. Also emit an s_wait counter
				674	// with a conservative value of 0 for the counter.
				675	NeedWait = CNT_MASK(T);
				676	setScoreLB(T, getScoreUB(T));
				677	} else {
				678	NeedWait = CNT_MASK(T);
				679	setScoreLB(T, ScoreToWait);
				680	}
				681	}
				682
				683	return NeedWait;
				684	}
				685
				686	// Where there are multiple types of event in the bracket of a counter,
				687	// the decrement may go out of order.
				688	bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
				689	switch (T) {
				690	case VM_CNT:
				691	return false;
				692	case LGKM_CNT: {
				693	if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
				694	EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
				695	// Scalar memory read always can go out of order.
				696	return true;
				697	}
				698	int NumEventTypes = 0;
				699	if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
				700	EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
				701	NumEventTypes++;
				702	}
				703	if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
				704	EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
				705	NumEventTypes++;
				706	}
				707	if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
				708	EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
				709	NumEventTypes++;
				710	}
				711	if (NumEventTypes <= 1) {
				712	return false;
				713	}
				714	break;
				715	}
				716	case EXP_CNT: {
				717	// If there has been a mixture of export types, then a waitcnt exp(0) is
				718	// required.
				719	if (MixedExpTypes)
				720	return true;
				721	int NumEventTypes = 0;
				722	if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
				723	EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
				724	NumEventTypes++;
				725	}
				726	if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
				727	EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
				728	NumEventTypes++;
				729	}
				730	if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
				731	EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
				732	NumEventTypes++;
				733	}
				734	if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
				735	EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
				736	NumEventTypes++;
				737	}
				738
				739	if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
				740	EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
				741	NumEventTypes++;
				742	}
				743
				744	if (NumEventTypes <= 1) {
				745	return false;
				746	}
				747	break;
				748	}
				749	default:
				750	break;
				751	}
				752	return true;
				753	}
				754
// Pass registration under the DEBUG_TYPE name ("si-insert-waitcnts") with the
// description "SI Insert Waitcnts".
// NOTE(review): the two `false` arguments are presumably the cfg-only and
// is-analysis flags of INITIALIZE_PASS -- confirm against PassSupport.h.
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
false)

// Definition of the static pass identifier declared in SIInsertWaitcnts.
char SIInsertWaitcnts::ID = 0;

// Reference to the pass identifier, visible outside this file.
char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

// Factory returning a newly allocated SIInsertWaitcnts pass.
FunctionPass *llvm::createSIInsertWaitcntsPass() {
return new SIInsertWaitcnts();
}
				767
				768	static bool readsVCCZ(const MachineInstr &MI) {
				769	unsigned Opc = MI.getOpcode();
				770	return (Opc == AMDGPU::S_CBRANCH_VCCNZ \|\| Opc == AMDGPU::S_CBRANCH_VCCZ) &&
				771	!MI.getOperand(1).isUndef();
				772	}
				773
				774	/// \brief Generate s_waitcnt instruction to be placed before cur_Inst.
				775	/// Instructions of a given type are returned in order,
				776	/// but instructions of different types can complete out of order.
				777	/// We rely on this in-order completion
				778	/// and simply assign a score to the memory access instructions.
				779	/// We keep track of the active "score bracket" to determine
				780	/// if an access of a memory read requires an s_waitcnt
				781	/// and if so what the value of each counter is.
				782	/// The "score bracket" is bound by the lower bound and upper bound
				783	/// scores (_score_LB and _score_ub respectively).
				784	MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
				785	MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
				786	// To emit, or not to emit - that's the question!
				787	// Start with an assumption that there is no need to emit.
				788	unsigned int EmitSwaitcnt = 0;
				789	// s_waitcnt instruction to return; default is NULL.
				790	MachineInstr *SWaitInst = nullptr;
				791	// No need to wait before phi. If a phi-move exists, then the wait should
				792	// has been inserted before the move. If a phi-move does not exist, then
				793	// wait should be inserted before the real use. The same is true for
				794	// sc-merge. It is not a coincident that all these cases correspond to the
				795	// instructions that are skipped in the assembling loop.
				796	bool NeedLineMapping = false; // TODO: Check on this.
				797	if (MI.isDebugValue() &&
				798	// TODO: any other opcode?
				799	!NeedLineMapping) {
				800	return SWaitInst;
				801	}
				802
				803	// See if an s_waitcnt is forced at block entry, or is needed at
				804	// program end.
				805	if (ScoreBrackets->getWaitAtBeginning()) {
				806	// Note that we have already cleared the state, so we don't need to update
				807	// it.
				808	ScoreBrackets->clearWaitAtBeginning();
				809	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				810	T = (enum InstCounterType)(T + 1)) {
				811	EmitSwaitcnt \|= CNT_MASK(T);
				812	ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
				813	}
				814	}
				815
				816	// See if this instruction has a forced S_WAITCNT VM.
				817	// TODO: Handle other cases of NeedsWaitcntVmBefore()
				818	else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 \|\|
				819	MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC \|\|
				820	MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
				821	EmitSwaitcnt \|=
				822	ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
				823	}
				824
				825	// All waits must be resolved at call return.
				826	// NOTE: this could be improved with knowledge of all call sites or
				827	// with knowledge of the called routines.
				828	if (MI.getOpcode() == AMDGPU::RETURN \|\|
Mark Searles	11d0a04	2017-05-31 16:44:23 +0000	[diff] [blame^]	829	MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG \|\|
				830	MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	831	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				832	T = (enum InstCounterType)(T + 1)) {
				833	if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
				834	ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
				835	EmitSwaitcnt \|= CNT_MASK(T);
				836	}
				837	}
				838	}
				839	// Resolve vm waits before gs-done.
				840	else if ((MI.getOpcode() == AMDGPU::S_SENDMSG \|\|
				841	MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
				842	((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
				843	AMDGPU::SendMsg::ID_GS_DONE)) {
				844	if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
				845	ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
				846	EmitSwaitcnt \|= CNT_MASK(VM_CNT);
				847	}
				848	}
				849	#if 0 // TODO: the following blocks of logic when we have fence.
				850	else if (MI.getOpcode() == SC_FENCE) {
				851	const unsigned int group_size =
				852	context->shader_info->GetMaxThreadGroupSize();
				853	// group_size == 0 means thread group size is unknown at compile time
				854	const bool group_is_multi_wave =
				855	(group_size == 0 \|\| group_size > target_info->GetWaveFrontSize());
				856	const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
				857
				858	for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
				859	SCRegType src_type = Inst->GetSrcType(i);
				860	switch (src_type) {
				861	case SCMEM_LDS:
				862	if (group_is_multi_wave \|\|
				863	context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
				864	EmitSwaitcnt \|= ScoreBrackets->updateByWait(LGKM_CNT,
				865	ScoreBrackets->getScoreUB(LGKM_CNT));
				866	// LDS may have to wait for VM_CNT after buffer load to LDS
				867	if (target_info->HasBufferLoadToLDS()) {
				868	EmitSwaitcnt \|= ScoreBrackets->updateByWait(VM_CNT,
				869	ScoreBrackets->getScoreUB(VM_CNT));
				870	}
				871	}
				872	break;
				873
				874	case SCMEM_GDS:
				875	if (group_is_multi_wave \|\| fence_is_global) {
				876	EmitSwaitcnt \|= ScoreBrackets->updateByWait(EXP_CNT,
				877	ScoreBrackets->getScoreUB(EXP_CNT));
				878	EmitSwaitcnt \|= ScoreBrackets->updateByWait(LGKM_CNT,
				879	ScoreBrackets->getScoreUB(LGKM_CNT));
				880	}
				881	break;
				882
				883	case SCMEM_UAV:
				884	case SCMEM_TFBUF:
				885	case SCMEM_RING:
				886	case SCMEM_SCATTER:
				887	if (group_is_multi_wave \|\| fence_is_global) {
				888	EmitSwaitcnt \|= ScoreBrackets->updateByWait(EXP_CNT,
				889	ScoreBrackets->getScoreUB(EXP_CNT));
				890	EmitSwaitcnt \|= ScoreBrackets->updateByWait(VM_CNT,
				891	ScoreBrackets->getScoreUB(VM_CNT));
				892	}
				893	break;
				894
				895	case SCMEM_SCRATCH:
				896	default:
				897	break;
				898	}
				899	}
				900	}
				901	#endif
				902
				903	// Export & GDS instructions do not read the EXEC mask until after the export
				904	// is granted (which can occur well after the instruction is issued).
				905	// The shader program must flush all EXP operations on the export-count
				906	// before overwriting the EXEC mask.
				907	else {
				908	if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
				909	// Export and GDS are tracked individually, either may trigger a waitcnt
				910	// for EXEC.
				911	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				912	EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
				913	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				914	EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
				915	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				916	EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
				917	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				918	EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
				919	}
				920
				921	#if 0 // TODO: the following code to handle CALL.
				922	// The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
				923	// However, there is a problem with EXP_CNT, because the call cannot
				924	// easily tell if a register is used in the function, and if it did, then
				925	// the referring instruction would have to have an S_WAITCNT, which is
				926	// dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
				927	// before the call.
				928	if (MI.getOpcode() == SC_CALL) {
				929	if (ScoreBrackets->getScoreUB(EXP_CNT) >
				930	ScoreBrackets->getScoreLB(EXP_CNT)) {
				931	ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
				932	EmitSwaitcnt \|= CNT_MASK(EXP_CNT);
				933	}
				934	}
				935	#endif
				936
				937	// Look at the source operands of every instruction to see if
				938	// any of them results from a previous memory operation that affects
				939	// its current usage. If so, an s_waitcnt instruction needs to be
				940	// emitted.
				941	// If the source operand was defined by a load, add the s_waitcnt
				942	// instruction.
				943	for (const MachineMemOperand *Memop : MI.memoperands()) {
				944	unsigned AS = Memop->getAddrSpace();
				945	if (AS != AMDGPUASI.LOCAL_ADDRESS)
				946	continue;
				947	unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
				948	// VM_CNT is only relevant to vgpr or LDS.
				949	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				950	VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
				951	}
				952	for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
				953	const MachineOperand &Op = MI.getOperand(I);
				954	const MachineRegisterInfo &MRIA = *MRI;
				955	RegInterval Interval =
				956	ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
				957	for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
				958	if (TRI->isVGPR(MRIA, Op.getReg())) {
				959	// VM_CNT is only relevant to vgpr or LDS.
				960	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				961	VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
				962	}
				963	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				964	LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
				965	}
				966	}
				967	// End of for loop that looks at all source operands to decide vm_wait_cnt
				968	// and lgk_wait_cnt.
				969
				970	// Two cases are handled for destination operands:
				971	// 1) If the destination operand was defined by a load, add the s_waitcnt
				972	// instruction to guarantee the right WAW order.
				973	// 2) If a destination operand that was used by a recent export/store ins,
				974	// add s_waitcnt on exp_cnt to guarantee the WAR order.
				975	if (MI.mayStore()) {
				976	for (const MachineMemOperand *Memop : MI.memoperands()) {
				977	unsigned AS = Memop->getAddrSpace();
				978	if (AS != AMDGPUASI.LOCAL_ADDRESS)
				979	continue;
				980	unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
				981	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				982	VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
				983	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				984	EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
				985	}
				986	}
				987	for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
				988	MachineOperand &Def = MI.getOperand(I);
				989	const MachineRegisterInfo &MRIA = *MRI;
				990	RegInterval Interval =
				991	ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
				992	for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
				993	if (TRI->isVGPR(MRIA, Def.getReg())) {
				994	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				995	VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
				996	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				997	EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
				998	}
				999	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				1000	LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
				1001	}
				1002	} // End of for loop that looks at all dest operands.
				1003	}
				1004
				1005	// TODO: Tie force zero to a compiler triage option.
				1006	bool ForceZero = false;
				1007
				1008	// Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
				1009	// occurs before the instruction. Doing it here prevents any additional
				1010	// S_WAITCNTs from being emitted if the instruction was marked as
				1011	// requiring a WAITCNT beforehand.
				1012	if (MI.getOpcode() == AMDGPU::S_BARRIER && ST->needWaitcntBeforeBarrier()) {
				1013	EmitSwaitcnt \|=
				1014	ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
				1015	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				1016	EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
				1017	EmitSwaitcnt \|= ScoreBrackets->updateByWait(
				1018	LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
				1019	}
				1020
				1021	// TODO: Remove this work-around, enable the assert for Bug 457939
				1022	// after fixing the scheduler. Also, the Shader Compiler code is
				1023	// independent of target.
				1024	if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
				1025	if (ScoreBrackets->getScoreLB(LGKM_CNT) <
				1026	ScoreBrackets->getScoreUB(LGKM_CNT) &&
				1027	ScoreBrackets->hasPendingSMEM()) {
				1028	// Wait on everything, not just LGKM. vccz reads usually come from
				1029	// terminators, and we always wait on everything at the end of the
				1030	// block, so if we only wait on LGKM here, we might end up with
				1031	// another s_waitcnt inserted right after this if there are non-LGKM
				1032	// instructions still outstanding.
				1033	ForceZero = true;
				1034	EmitSwaitcnt = true;
				1035	}
				1036	}
				1037
				1038	// Does this operand processing indicate s_wait counter update?
				1039	if (EmitSwaitcnt) {
				1040	int CntVal[NUM_INST_CNTS];
				1041
				1042	bool UseDefaultWaitcntStrategy = true;
				1043	if (ForceZero) {
				1044	// Force all waitcnts to 0.
				1045	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				1046	T = (enum InstCounterType)(T + 1)) {
				1047	ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
				1048	}
				1049	CntVal[VM_CNT] = 0;
				1050	CntVal[EXP_CNT] = 0;
				1051	CntVal[LGKM_CNT] = 0;
				1052	UseDefaultWaitcntStrategy = false;
				1053	}
				1054
				1055	if (UseDefaultWaitcntStrategy) {
				1056	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				1057	T = (enum InstCounterType)(T + 1)) {
				1058	if (EmitSwaitcnt & CNT_MASK(T)) {
				1059	int Delta =
				1060	ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
				1061	int MaxDelta = ScoreBrackets->getWaitCountMax(T);
				1062	if (Delta >= MaxDelta) {
				1063	Delta = -1;
				1064	if (T != EXP_CNT) {
				1065	ScoreBrackets->setScoreLB(
				1066	T, ScoreBrackets->getScoreUB(T) - MaxDelta);
				1067	}
				1068	EmitSwaitcnt &= ~CNT_MASK(T);
				1069	}
				1070	CntVal[T] = Delta;
				1071	} else {
				1072	// If we are not waiting for a particular counter then encode
				1073	// it as -1 which means "don't care."
				1074	CntVal[T] = -1;
				1075	}
				1076	}
				1077	}
				1078
				1079	// If we are not waiting on any counter we can skip the wait altogether.
				1080	if (EmitSwaitcnt != 0) {
				1081	MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
				1082	int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
				1083	if (!OldWaitcnt \|\| (AMDGPU::decodeVmcnt(IV, Imm) !=
				1084	(CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) \|\|
				1085	(AMDGPU::decodeExpcnt(IV, Imm) !=
				1086	(CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) \|\|
				1087	(AMDGPU::decodeLgkmcnt(IV, Imm) !=
				1088	(CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
				1089	MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
				1090	if (ContainingLoop) {
Kannan Narayanan	5e73b04	2017-05-05 21:10:17 +0000	[diff] [blame]	1091	MachineBasicBlock *TBB = ContainingLoop->getHeader();
Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	1092	BlockWaitcntBrackets *ScoreBracket =
				1093	BlockWaitcntBracketsMap[TBB].get();
				1094	if (!ScoreBracket) {
				1095	assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
				1096	BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>();
				1097	ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
				1098	}
				1099	ScoreBracket->setRevisitLoop(true);
				1100	DEBUG(dbgs() << "set-revisit: block"
Kannan Narayanan	5e73b04	2017-05-05 21:10:17 +0000	[diff] [blame]	1101	<< ContainingLoop->getHeader()->getNumber() << '\n';);
Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	1102	}
				1103	}
				1104
				1105	// Update an existing waitcount, or make a new one.
				1106	MachineFunction &MF = *MI.getParent()->getParent();
				1107	if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
				1108	SWaitInst = OldWaitcnt;
				1109	} else {
				1110	SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
				1111	MI.getDebugLoc());
				1112	CompilerGeneratedWaitcntSet.insert(SWaitInst);
				1113	}
				1114
				1115	const MachineOperand &Op =
				1116	MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
				1117	IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
				1118	SWaitInst->addOperand(MF, Op);
				1119
				1120	if (CntVal[EXP_CNT] == 0) {
				1121	ScoreBrackets->setMixedExpTypes(false);
				1122	}
				1123	}
				1124	}
				1125
				1126	return SWaitInst;
				1127	}
				1128
				1129	void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
				1130	MachineInstr *Waitcnt) {
				1131	if (MBB.empty()) {
				1132	MBB.push_back(Waitcnt);
				1133	return;
				1134	}
				1135
				1136	MachineBasicBlock::iterator It = MBB.end();
				1137	MachineInstr MI = &(--It);
				1138	if (MI->isBranch()) {
				1139	MBB.insert(It, Waitcnt);
				1140	} else {
				1141	MBB.push_back(Waitcnt);
				1142	}
				1143
				1144	return;
				1145	}
				1146
				1147	void SIInsertWaitcnts::updateEventWaitCntAfter(
				1148	MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
				1149	// Now look at the instruction opcode. If it is a memory access
				1150	// instruction, update the upper-bound of the appropriate counter's
				1151	// bracket and the destination operand scores.
				1152	// TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
Mark Searles	11d0a04	2017-05-31 16:44:23 +0000	[diff] [blame^]	1153	uint64_t TSFlags = Inst.getDesc().TSFlags;
				1154	if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) {
				1155	if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) &&
				1156	TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) {
Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	1157	ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
				1158	ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
				1159	} else {
				1160	ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
				1161	}
				1162	} else if (TII->isFLAT(Inst)) {
				1163	assert(Inst.mayLoad() \|\| Inst.mayStore());
				1164	ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
				1165	ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
				1166
				1167	// This is a flat memory operation. Check to see if it has memory
				1168	// tokens for both LDS and Memory, and if so mark it as a flat.
				1169	bool FoundLDSMem = false;
				1170	for (const MachineMemOperand *Memop : Inst.memoperands()) {
				1171	unsigned AS = Memop->getAddrSpace();
				1172	if (AS == AMDGPUASI.LOCAL_ADDRESS \|\| AS == AMDGPUASI.FLAT_ADDRESS)
				1173	FoundLDSMem = true;
				1174	}
				1175
				1176	// This is a flat memory operation, so note it - it will require
				1177	// that both the VM and LGKM be flushed to zero if it is pending when
				1178	// a VM or LGKM dependency occurs.
				1179	if (FoundLDSMem) {
				1180	ScoreBrackets->setPendingFlat();
				1181	}
				1182	} else if (SIInstrInfo::isVMEM(Inst) &&
				1183	// TODO: get a better carve out.
				1184	Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
				1185	Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
				1186	Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
				1187	ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
				1188	if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
Mark Searles	11d0a04	2017-05-31 16:44:23 +0000	[diff] [blame^]	1189	(Inst.mayStore() \|\| AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	1190	ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
				1191	}
				1192	} else if (TII->isSMRD(Inst)) {
				1193	ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
				1194	} else {
				1195	switch (Inst.getOpcode()) {
				1196	case AMDGPU::S_SENDMSG:
				1197	case AMDGPU::S_SENDMSGHALT:
				1198	ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
				1199	break;
				1200	case AMDGPU::EXP:
				1201	case AMDGPU::EXP_DONE: {
				1202	int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
				1203	if (Imm >= 32 && Imm <= 63)
				1204	ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
				1205	else if (Imm >= 12 && Imm <= 15)
				1206	ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
				1207	else
				1208	ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
				1209	break;
				1210	}
				1211	case AMDGPU::S_MEMTIME:
				1212	case AMDGPU::S_MEMREALTIME:
				1213	ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
				1214	break;
				1215	default:
				1216	break;
				1217	}
				1218	}
				1219	}
				1220
				1221	void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
				1222	BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
				1223	int32_t MaxPending[NUM_INST_CNTS] = {0};
				1224	int32_t MaxFlat[NUM_INST_CNTS] = {0};
				1225	bool MixedExpTypes = false;
				1226
				1227	// Clear the score bracket state.
				1228	ScoreBrackets->clear();
				1229
				1230	// Compute the number of pending elements on block entry.
				1231
				1232	// IMPORTANT NOTE: If iterative handling of loops is added, the code will
				1233	// need to handle single BBs with backedges to themselves. This means that
				1234	// they will need to retain and not clear their initial state.
				1235
				1236	// See if there are any uninitialized predecessors. If so, emit an
				1237	// s_waitcnt 0 at the beginning of the block.
				1238	for (MachineBasicBlock *pred : Block.predecessors()) {
				1239	BlockWaitcntBrackets *PredScoreBrackets =
				1240	BlockWaitcntBracketsMap[pred].get();
				1241	bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
				1242	if (!Visited \|\| PredScoreBrackets->getWaitAtBeginning()) {
				1243	break;
				1244	}
				1245	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				1246	T = (enum InstCounterType)(T + 1)) {
				1247	int span =
				1248	PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
				1249	MaxPending[T] = std::max(MaxPending[T], span);
				1250	span =
				1251	PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
				1252	MaxFlat[T] = std::max(MaxFlat[T], span);
				1253	}
				1254
				1255	MixedExpTypes \|= PredScoreBrackets->mixedExpTypes();
				1256	}
				1257
				1258	// TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
				1259	// Also handle kills for exit block.
				1260	if (Block.succ_empty() && !KillWaitBrackets.empty()) {
				1261	for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
				1262	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				1263	T = (enum InstCounterType)(T + 1)) {
				1264	int Span = KillWaitBrackets[I]->getScoreUB(T) -
				1265	KillWaitBrackets[I]->getScoreLB(T);
				1266	MaxPending[T] = std::max(MaxPending[T], Span);
				1267	Span = KillWaitBrackets[I]->pendingFlat(T) -
				1268	KillWaitBrackets[I]->getScoreLB(T);
				1269	MaxFlat[T] = std::max(MaxFlat[T], Span);
				1270	}
				1271
				1272	MixedExpTypes \|= KillWaitBrackets[I]->mixedExpTypes();
				1273	}
				1274	}
				1275
				1276	// Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
				1277	for (MachineBasicBlock *Pred : Block.predecessors()) {
				1278	BlockWaitcntBrackets *PredScoreBrackets =
				1279	BlockWaitcntBracketsMap[Pred].get();
				1280	bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
				1281	if (!Visited \|\| PredScoreBrackets->getWaitAtBeginning()) {
				1282	break;
				1283	}
				1284
				1285	int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
				1286	PredScoreBrackets->getScoreLB(EXP_CNT);
				1287	MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
				1288	int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
				1289	PredScoreBrackets->getScoreLB(EXP_CNT);
				1290	MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
				1291	}
				1292
				1293	// TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
				1294	if (Block.succ_empty() && !KillWaitBrackets.empty()) {
				1295	for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
				1296	int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
				1297	KillWaitBrackets[I]->getScoreLB(EXP_CNT);
				1298	MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
				1299	int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
				1300	KillWaitBrackets[I]->getScoreLB(EXP_CNT);
				1301	MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
				1302	}
				1303	}
				1304
				1305	#if 0
				1306	// LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
				1307	// TODO: how does LC distinguish between function entry and main entry?
				1308	// If this is the entry to a function, force a wait.
				1309	MachineBasicBlock &Entry = Block.getParent()->front();
				1310	if (Entry.getNumber() == Block.getNumber()) {
				1311	ScoreBrackets->setWaitAtBeginning();
				1312	return;
				1313	}
				1314	#endif
				1315
				1316	// Now set the current Block's brackets to the largest ending bracket.
				1317	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				1318	T = (enum InstCounterType)(T + 1)) {
				1319	ScoreBrackets->setScoreUB(T, MaxPending[T]);
				1320	ScoreBrackets->setScoreLB(T, 0);
				1321	ScoreBrackets->setLastFlat(T, MaxFlat[T]);
				1322	}
				1323
				1324	ScoreBrackets->setMixedExpTypes(MixedExpTypes);
				1325
				1326	// Set the register scoreboard.
				1327	for (MachineBasicBlock *Pred : Block.predecessors()) {
				1328	if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
				1329	break;
				1330	}
				1331
				1332	BlockWaitcntBrackets *PredScoreBrackets =
				1333	BlockWaitcntBracketsMap[Pred].get();
				1334
				1335	// Now merge the gpr_reg_score information
				1336	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				1337	T = (enum InstCounterType)(T + 1)) {
				1338	int PredLB = PredScoreBrackets->getScoreLB(T);
				1339	int PredUB = PredScoreBrackets->getScoreUB(T);
				1340	if (PredLB < PredUB) {
				1341	int PredScale = MaxPending[T] - PredUB;
				1342	// Merge vgpr scores.
				1343	for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
				1344	int PredRegScore = PredScoreBrackets->getRegScore(J, T);
				1345	if (PredRegScore <= PredLB)
				1346	continue;
				1347	int NewRegScore = PredScale + PredRegScore;
				1348	ScoreBrackets->setRegScore(
				1349	J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
				1350	}
				1351	// Also need to merge sgpr scores for lgkm_cnt.
				1352	if (T == LGKM_CNT) {
				1353	for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
				1354	int PredRegScore =
				1355	PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
				1356	if (PredRegScore <= PredLB)
				1357	continue;
				1358	int NewRegScore = PredScale + PredRegScore;
				1359	ScoreBrackets->setRegScore(
				1360	J + NUM_ALL_VGPRS, LGKM_CNT,
				1361	std::max(
				1362	ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
				1363	NewRegScore));
				1364	}
				1365	}
				1366	}
				1367	}
				1368
				1369	// Also merge the WaitEvent information.
				1370	ForAllWaitEventType(W) {
				1371	enum InstCounterType T = PredScoreBrackets->eventCounter(W);
				1372	int PredEventUB = PredScoreBrackets->getEventUB(W);
				1373	if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
				1374	int NewEventUB =
				1375	MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
				1376	if (NewEventUB > 0) {
				1377	ScoreBrackets->setEventUB(
				1378	W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
				1379	}
				1380	}
				1381	}
				1382	}
				1383
				1384	// TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
				1385	// Set the register scoreboard.
				1386	if (Block.succ_empty() && !KillWaitBrackets.empty()) {
				1387	for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
				1388	// Now merge the gpr_reg_score information.
				1389	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				1390	T = (enum InstCounterType)(T + 1)) {
				1391	int PredLB = KillWaitBrackets[I]->getScoreLB(T);
				1392	int PredUB = KillWaitBrackets[I]->getScoreUB(T);
				1393	if (PredLB < PredUB) {
				1394	int PredScale = MaxPending[T] - PredUB;
				1395	// Merge vgpr scores.
				1396	for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
				1397	int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
				1398	if (PredRegScore <= PredLB)
				1399	continue;
				1400	int NewRegScore = PredScale + PredRegScore;
				1401	ScoreBrackets->setRegScore(
				1402	J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
				1403	}
				1404	// Also need to merge sgpr scores for lgkm_cnt.
				1405	if (T == LGKM_CNT) {
				1406	for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
				1407	int PredRegScore =
				1408	KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
				1409	if (PredRegScore <= PredLB)
				1410	continue;
				1411	int NewRegScore = PredScale + PredRegScore;
				1412	ScoreBrackets->setRegScore(
				1413	J + NUM_ALL_VGPRS, LGKM_CNT,
				1414	std::max(
				1415	ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
				1416	NewRegScore));
				1417	}
				1418	}
				1419	}
				1420	}
				1421
				1422	// Also merge the WaitEvent information.
				1423	ForAllWaitEventType(W) {
				1424	enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
				1425	int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
				1426	if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
				1427	int NewEventUB =
				1428	MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
				1429	if (NewEventUB > 0) {
				1430	ScoreBrackets->setEventUB(
				1431	W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
				1432	}
				1433	}
				1434	}
				1435	}
				1436	}
				1437
				1438	// Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
				1439	// sequencing predecessors, because changes to EXEC require waitcnts due to
				1440	// the delayed nature of these operations.
				1441	for (MachineBasicBlock *Pred : Block.predecessors()) {
				1442	if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
				1443	break;
				1444	}
				1445
				1446	BlockWaitcntBrackets *PredScoreBrackets =
				1447	BlockWaitcntBracketsMap[Pred].get();
				1448
				1449	int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
				1450	if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
				1451	int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
				1452	PredScoreBrackets->getScoreUB(EXP_CNT);
				1453	if (new_gds_ub > 0) {
				1454	ScoreBrackets->setEventUB(
				1455	GDS_GPR_LOCK,
				1456	std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
				1457	}
				1458	}
				1459	int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
				1460	if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
				1461	int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
				1462	PredScoreBrackets->getScoreUB(EXP_CNT);
				1463	if (new_exp_ub > 0) {
				1464	ScoreBrackets->setEventUB(
				1465	EXP_GPR_LOCK,
				1466	std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
				1467	}
				1468	}
				1469	}
				1470	}
				1471
				1472	/// Return the "bottom" block of a loop. This differs from
				1473	/// MachineLoop::getBottomBlock in that it works even if the loop is
				1474	/// discontiguous.
				1475	MachineBasicBlock SIInsertWaitcnts::loopBottom(const MachineLoop Loop) {
				1476	MachineBasicBlock *Bottom = Loop->getHeader();
				1477	for (MachineBasicBlock *MBB : Loop->blocks())
				1478	if (MBB->getNumber() > Bottom->getNumber())
				1479	Bottom = MBB;
				1480	return Bottom;
				1481	}
				1482
				1483	// Generate s_waitcnt instructions where needed.
				1484	void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
				1485	MachineBasicBlock &Block) {
				1486	// Initialize the state information.
				1487	mergeInputScoreBrackets(Block);
				1488
				1489	BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
				1490
				1491	DEBUG({
				1492	dbgs() << "Block" << Block.getNumber();
				1493	ScoreBrackets->dump();
				1494	});
				1495
				1496	bool InsertNOP = false;
				1497
				1498	// Walk over the instructions.
				1499	for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
				1500	Iter != E;) {
				1501	MachineInstr &Inst = *Iter;
				1502	// Remove any previously existing waitcnts.
				1503	if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
				1504	// TODO: Register the old waitcnt and optimize the following waitcnts.
				1505	// Leaving the previously existing waitcnts is conservatively correct.
				1506	if (CompilerGeneratedWaitcntSet.find(&Inst) ==
				1507	CompilerGeneratedWaitcntSet.end())
				1508	++Iter;
				1509	else {
				1510	ScoreBrackets->setWaitcnt(&Inst);
				1511	++Iter;
				1512	Inst.removeFromParent();
				1513	}
				1514	continue;
				1515	}
				1516
				1517	// Kill instructions generate a conditional branch to the endmain block.
				1518	// Merge the current waitcnt state into the endmain block information.
				1519	// TODO: Are there other flavors of KILL instruction?
				1520	if (Inst.getOpcode() == AMDGPU::KILL) {
				1521	addKillWaitBracket(ScoreBrackets);
				1522	}
				1523
				1524	bool VCCZBugWorkAround = false;
				1525	if (readsVCCZ(Inst) &&
				1526	(VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
				1527	if (ScoreBrackets->getScoreLB(LGKM_CNT) <
				1528	ScoreBrackets->getScoreUB(LGKM_CNT) &&
				1529	ScoreBrackets->hasPendingSMEM()) {
				1530	if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
				1531	VCCZBugWorkAround = true;
				1532	}
				1533	}
				1534
				1535	// Generate an s_waitcnt instruction to be placed before
				1536	// cur_Inst, if needed.
				1537	MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
				1538
				1539	if (SWaitInst) {
				1540	Block.insert(Inst, SWaitInst);
				1541	if (ScoreBrackets->getWaitcnt() != SWaitInst) {
				1542	DEBUG(dbgs() << "insertWaitcntInBlock\n"
				1543	<< "Old Instr: " << Inst << '\n'
				1544	<< "New Instr: " << *SWaitInst << '\n';);
				1545	}
				1546	}
				1547
				1548	updateEventWaitCntAfter(Inst, ScoreBrackets);
				1549
				1550	#if 0 // TODO: implement resource type check controlled by options with ub = LB.
				1551	// If this instruction generates a S_SETVSKIP because it is an
				1552	// indexed resource, and we are on Tahiti, then it will also force
				1553	// an S_WAITCNT vmcnt(0)
				1554	if (RequireCheckResourceType(Inst, context)) {
				1555	// Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
				1556	ScoreBrackets->setScoreLB(VM_CNT,
				1557	ScoreBrackets->getScoreUB(VM_CNT));
				1558	}
				1559	#endif
				1560
				1561	ScoreBrackets->clearWaitcnt();
				1562
				1563	if (SWaitInst) {
				1564	DEBUG({ SWaitInst->print(dbgs() << '\n'); });
				1565	}
				1566	DEBUG({
				1567	Inst.print(dbgs());
				1568	ScoreBrackets->dump();
				1569	});
				1570
				1571	// Check to see if this is a GWS instruction. If so, and if this is CI or
				1572	// VI, then the generated code sequence will include an S_WAITCNT 0.
				1573	// TODO: Are these the only GWS instructions?
				1574	if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT \|\|
				1575	Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V \|\|
				1576	Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR \|\|
				1577	Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P \|\|
				1578	Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
				1579	// TODO: && context->target_info->GwsRequiresMemViolTest() ) {
				1580	ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
				1581	ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
				1582	ScoreBrackets->updateByWait(LGKM_CNT,
				1583	ScoreBrackets->getScoreUB(LGKM_CNT));
				1584	}
				1585
				1586	// TODO: Remove this work-around after fixing the scheduler and enable the
				1587	// assert above.
				1588	if (VCCZBugWorkAround) {
				1589	// Restore the vccz bit. Any time a value is written to vcc, the vcc
				1590	// bit is updated, so we can restore the bit by reading the value of
				1591	// vcc and then writing it back to the register.
				1592	BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
				1593	AMDGPU::VCC)
				1594	.addReg(AMDGPU::VCC);
				1595	VCCZBugHandledSet.insert(&Inst);
				1596	}
				1597
				1598	if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
				1599
				1600	// This avoids a s_nop after a waitcnt has just been inserted.
				1601	if (!SWaitInst && InsertNOP) {
				1602	BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
				1603	}
				1604	InsertNOP = false;
				1605
				1606	// Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
				1607	// or SMEM clause, respectively.
				1608	//
				1609	// The temporary workaround is to break the clauses with S_NOP.
				1610	//
				1611	// The proper solution would be to allocate registers such that all source
				1612	// and destination registers don't overlap, e.g. this is illegal:
				1613	// r0 = load r2
				1614	// r2 = load r0
				1615	bool IsSMEM = false;
				1616	bool IsVMEM = false;
				1617	if (TII->isSMRD(Inst))
				1618	IsSMEM = true;
				1619	else if (TII->usesVM_CNT(Inst))
				1620	IsVMEM = true;
				1621
				1622	++Iter;
				1623	if (Iter == E)
				1624	break;
				1625
				1626	MachineInstr &Next = *Iter;
				1627
				1628	// TODO: How about consecutive SMEM instructions?
				1629	// The comments above says break the clause but the code does not.
				1630	// if ((TII->isSMRD(next) && isSMEM) \|\|
				1631	if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
				1632	// TODO: Enable this check when hasSoftClause is upstreamed.
				1633	// ST->hasSoftClauses() &&
				1634	ST->isXNACKEnabled()) {
				1635	// Insert a NOP to break the clause.
				1636	InsertNOP = true;
				1637	continue;
				1638	}
				1639
				1640	// There must be "S_NOP 0" between an instruction writing M0 and
				1641	// S_SENDMSG.
				1642	if ((Next.getOpcode() == AMDGPU::S_SENDMSG \|\|
				1643	Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
				1644	Inst.definesRegister(AMDGPU::M0))
				1645	InsertNOP = true;
				1646
				1647	continue;
				1648	}
				1649
				1650	++Iter;
				1651	}
				1652
				1653	// Check if we need to force convergence at loop footer.
				1654	MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
				1655	if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
				1656	LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
				1657	WaitcntData->print();
				1658	DEBUG(dbgs() << '\n';);
				1659
				1660	// The iterative waitcnt insertion algorithm aims for optimal waitcnt
				1661	// placement and doesn't always guarantee convergence for a loop. Each
				1662	// loop should take at most 2 iterations for it to converge naturally.
				1663	// When this max is reached and result doesn't converge, we force
				1664	// convergence by inserting a s_waitcnt at the end of loop footer.
				1665	if (WaitcntData->getIterCnt() > 2) {
				1666	// To ensure convergence, need to make wait events at loop footer be no
				1667	// more than those from the previous iteration.
				1668	// As a simplification, Instead of tracking individual scores and
				1669	// generate the precise wait count, just wait on 0.
				1670	bool HasPending = false;
				1671	MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
				1672	for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
				1673	T = (enum InstCounterType)(T + 1)) {
				1674	if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
				1675	ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
				1676	HasPending = true;
				1677	}
				1678	}
				1679
				1680	if (HasPending) {
				1681	if (!SWaitInst) {
				1682	SWaitInst = Block.getParent()->CreateMachineInstr(
				1683	TII->get(AMDGPU::S_WAITCNT), DebugLoc());
				1684	CompilerGeneratedWaitcntSet.insert(SWaitInst);
				1685	const MachineOperand &Op = MachineOperand::CreateImm(0);
				1686	SWaitInst->addOperand(MF, Op);
				1687	#if 0 // TODO: Format the debug output
				1688	OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
				1689	OutputTransformAdd(SWaitInst, context);
				1690	#endif
				1691	}
				1692	#if 0 // TODO: ??
				1693	_DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
				1694	#endif
				1695	}
				1696
				1697	if (SWaitInst) {
				1698	DEBUG({
				1699	SWaitInst->print(dbgs());
				1700	dbgs() << "\nAdjusted score board:";
				1701	ScoreBrackets->dump();
				1702	});
				1703
				1704	// Add this waitcnt to the block. It is either newly created or
				1705	// created in previous iterations and added back since block traversal
				1706	// always remove waitcnt.
				1707	insertWaitcntBeforeCF(Block, SWaitInst);
				1708	WaitcntData->setWaitcnt(SWaitInst);
				1709	}
				1710	}
				1711	}
				1712	}
				1713
				1714	bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
				1715	ST = &MF.getSubtarget<SISubtarget>();
				1716	TII = ST->getInstrInfo();
				1717	TRI = &TII->getRegisterInfo();
				1718	MRI = &MF.getRegInfo();
				1719	MLI = &getAnalysis<MachineLoopInfo>();
				1720	IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
Mark Searles	11d0a04	2017-05-31 16:44:23 +0000	[diff] [blame^]	1721	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	1722	AMDGPUASI = ST->getAMDGPUAS();
				1723
				1724	HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
				1725	HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
				1726	HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
				1727
				1728	HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
				1729	HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
				1730	assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
				1731	assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
				1732
				1733	RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
				1734	RegisterEncoding.VGPRL =
				1735	RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
				1736	RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
				1737	RegisterEncoding.SGPRL =
				1738	RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
				1739
				1740	// Walk over the blocks in reverse post-dominator order, inserting
				1741	// s_waitcnt where needed.
				1742	ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
				1743	bool Modified = false;
				1744	for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
				1745	I = RPOT.begin(),
				1746	E = RPOT.end(), J = RPOT.begin();
				1747	I != E;) {
				1748	MachineBasicBlock &MBB = **I;
				1749
				1750	BlockVisitedSet.insert(&MBB);
				1751
				1752	BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
				1753	if (!ScoreBrackets) {
				1754	BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>();
				1755	ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
				1756	}
				1757	ScoreBrackets->setPostOrder(MBB.getNumber());
				1758	MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
				1759	if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
				1760	LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>();
				1761
				1762	// If we are walking into the block from before the loop, then guarantee
				1763	// at least 1 re-walk over the loop to propagate the information, even if
				1764	// no S_WAITCNT instructions were generated.
Kannan Narayanan	5e73b04	2017-05-05 21:10:17 +0000	[diff] [blame]	1765	if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	1766	(BlockWaitcntProcessedSet.find(&MBB) ==
				1767	BlockWaitcntProcessedSet.end())) {
				1768	BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
				1769	DEBUG(dbgs() << "set-revisit: block"
Kannan Narayanan	5e73b04	2017-05-05 21:10:17 +0000	[diff] [blame]	1770	<< ContainingLoop->getHeader()->getNumber() << '\n';);
Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	1771	}
				1772
				1773	// Walk over the instructions.
				1774	insertWaitcntInBlock(MF, MBB);
				1775
				1776	// Flag that waitcnts have been processed at least once.
				1777	BlockWaitcntProcessedSet.insert(&MBB);
				1778
				1779	// See if we want to revisit the loop.
				1780	if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
Kannan Narayanan	5e73b04	2017-05-05 21:10:17 +0000	[diff] [blame]	1781	MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	1782	BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
				1783	if (EntrySB && EntrySB->getRevisitLoop()) {
				1784	EntrySB->setRevisitLoop(false);
				1785	J = I;
				1786	int32_t PostOrder = EntrySB->getPostOrder();
				1787	// TODO: Avoid this loop. Find another way to set I.
				1788	for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
				1789	X = RPOT.begin(),
				1790	Y = RPOT.end();
				1791	X != Y; ++X) {
				1792	MachineBasicBlock &MBBX = **X;
				1793	if (MBBX.getNumber() == PostOrder) {
				1794	I = X;
				1795	break;
				1796	}
				1797	}
				1798	LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
				1799	WaitcntData->incIterCnt();
				1800	DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
				1801	continue;
				1802	} else {
				1803	LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
				1804	// Loop converged, reset iteration count. If this loop gets revisited,
				1805	// it must be from an outer loop, the counter will restart, this will
				1806	// ensure we don't force convergence on such revisits.
				1807	WaitcntData->resetIterCnt();
				1808	}
				1809	}
				1810
				1811	J = I;
				1812	++I;
				1813	}
				1814
				1815	SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
				1816
				1817	bool HaveScalarStores = false;
				1818
				1819	for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
				1820	++BI) {
				1821
				1822	MachineBasicBlock &MBB = *BI;
				1823
				1824	for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
				1825	++I) {
				1826
				1827	if (!HaveScalarStores && TII->isScalarStore(*I))
				1828	HaveScalarStores = true;
				1829
				1830	if (I->getOpcode() == AMDGPU::S_ENDPGM \|\|
				1831	I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
				1832	EndPgmBlocks.push_back(&MBB);
				1833	}
				1834	}
				1835
				1836	if (HaveScalarStores) {
				1837	// If scalar writes are used, the cache must be flushed or else the next
				1838	// wave to reuse the same scratch memory can be clobbered.
				1839	//
				1840	// Insert s_dcache_wb at wave termination points if there were any scalar
				1841	// stores, and only if the cache hasn't already been flushed. This could be
				1842	// improved by looking across blocks for flushes in postdominating blocks
				1843	// from the stores but an explicitly requested flush is probably very rare.
				1844	for (MachineBasicBlock *MBB : EndPgmBlocks) {
				1845	bool SeenDCacheWB = false;
				1846
				1847	for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
				1848	++I) {
				1849
				1850	if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
				1851	SeenDCacheWB = true;
				1852	else if (TII->isScalarStore(*I))
				1853	SeenDCacheWB = false;
				1854
				1855	// FIXME: It would be better to insert this before a waitcnt if any.
				1856	if ((I->getOpcode() == AMDGPU::S_ENDPGM \|\|
				1857	I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
				1858	!SeenDCacheWB) {
				1859	Modified = true;
				1860	BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
				1861	}
				1862	}
				1863	}
				1864	}
				1865
Mark Searles	11d0a04	2017-05-31 16:44:23 +0000	[diff] [blame^]	1866	if (!MFI->isEntryFunction()) {
				1867	// Wait for any outstanding memory operations that the input registers may
				1868	// depend on. We can't track them and it's better to to the wait after the
				1869	// costly call sequence.
				1870
				1871	// TODO: Could insert earlier and schedule more liberally with operations
				1872	// that only use caller preserved registers.
				1873	MachineBasicBlock &EntryBB = MF.front();
				1874	BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
				1875	.addImm(0);
				1876
				1877	Modified = true;
				1878	}
				1879
Kannan Narayanan	acb089e	2017-04-12 03:25:12 +0000	[diff] [blame]	1880	return Modified;
				1881	}