//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);

namespace {

template <typename EnumT>
class enum_iterator
    : public iterator_facade_base<enum_iterator<EnumT>,
                                  std::forward_iterator_tag, const EnumT> {
  EnumT Value;
public:
  enum_iterator() = default;
  enum_iterator(EnumT Value) : Value(Value) {}

  enum_iterator &operator++() {
    Value = static_cast<EnumT>(Value + 1);
    return *this;
  }

  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }

  EnumT operator*() const { return Value; }
};

// Class of object that encapsulates the latest instruction counter score
// associated with the operand. Used for determining whether
// an s_waitcnt instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
  return make_range(enum_iterator<InstCounterType>(VM_CNT),
                    enum_iterator<InstCounterType>(NUM_INST_CNTS));
}

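// inst_counter_types() lets the pass iterate the hardware counters with a
// range-based for loop, e.g.:
//
//   for (auto T : inst_counter_types())
//     ScoreLBs[T] = 0;
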
using RegInterval = std::pair<signed, signed>;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
    (1 << VMEM_ACCESS),
    (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
        (1 << SQ_MESSAGE),
    (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
        (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
};

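// WaitEventMaskForInst is indexed by counter: VMEM_ACCESS is the only event
// counted by vmcnt; the SMEM, LDS, GDS, and message events are counted by
// lgkmcnt; the export and GPR-lock events are counted by expcnt. For example,
// LDS_ACCESS is an lgkmcnt event because
//
//   (WaitEventMaskForInst[LGKM_CNT] & (1 << LDS_ACCESS)) != 0
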
// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};

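// Under this mapping, for example, VGPR v5 occupies scoreboard slot 5, the
// reserved LDS token occupies slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS == 256,
// and SGPR s2 occupies slot NUM_ALL_VGPRS + 2 == 259.
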
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  switch (T) {
  case VM_CNT:
    Wait.VmCnt = std::min(Wait.VmCnt, Count);
    break;
  case EXP_CNT:
    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
    break;
  case LGKM_CNT:
    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
    break;
  default:
    llvm_unreachable("bad InstCounterType");
  }
}

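// Counts combine by taking the minimum, because a smaller s_waitcnt operand
// is a stricter wait. For example, assuming a fresh AMDGPU::Waitcnt whose
// counts are all ~0u ("no wait"), addWait(Wait, VM_CNT, 3) followed by
// addWait(Wait, VM_CNT, 1) leaves Wait.VmCnt == 1.
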
// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple event types happen within the brackets, the
// wait count may get decreased out of order, so we need to put in an
// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    for (auto T : inst_counter_types())
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
  }

  ~BlockWaitcntBrackets() = default;

  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }

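  // The EXP_CNT clamp in setScoreUB above models the limited depth of the
  // hardware counter: once more than getWaitCountMax(EXP_CNT) events are
  // outstanding, the oldest scores can no longer be distinguished. E.g. with
  // an expcnt limit of 7 (a typical value), raising ScoreUBs[EXP_CNT] to 10
  // lifts the lower bound to at least 10 - 7 == 3.
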
  int32_t getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  int32_t getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    if (E == VMEM_ACCESS)
      return VM_CNT;
    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
      return LGKM_CNT;
    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
    return EXP_CNT;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == LGKM_CNT);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    PendingEvents = 0;
    memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
    for (auto T : inst_counter_types())
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  bool counterOutOfOrder(InstCounterType T) const;
  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, int ScoreToWait,
                     AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  bool hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }

  void mergePendingEvents(const BlockWaitcntBrackets &Other);

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const GCNSubtarget *ST = nullptr;
  bool WaitAtBeginning = false;
  bool RevisitLoop = false;
  int32_t PostOrder = 0;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  uint32_t PendingEvents = 0;
  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};

// This is a per-loop-region object that records the waitcnt status at the end
// of the loop footer from the previous iteration. We also maintain an
// iteration count to track the number of times the loop has been visited.
// When it doesn't converge naturally, we force convergence by inserting
// s_waitcnt 0 at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }

private:
  // The s_waitcnt added at the end of the loop footer to stabilize wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of times the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaVersion IV;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  // ForceEmitZeroWaitcnts: force all waitcnt instructions to be emitted as
  // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool isForceEmitWaitcnt() const {
    for (auto T : inst_counter_types())
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
};

} // end anonymous namespace

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a partial-write (PW) operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}

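// getRegInterval returns a half-open range [first, second) of scoreboard
// slots. For example, a 64-bit VGPR operand whose low register encodes as
// v4 yields {4, 6}, covering Size / 32 == 2 consecutive slots.
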
void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  LLVM_DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not.
  // Examples include vm_cnt for buffer-store or lgkm_cnt for send-message.
  if (!hasPendingEvent(E)) {
    if (PendingEvents & WaitEventMaskForInst[T])
      MixedPendingEvents[T] = true;
    PendingEvents |= 1 << E;
  }
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (auto T : inst_counter_types()) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
bool BlockWaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
         simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
}

bool BlockWaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                           unsigned &Count) const {
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if (Count < (unsigned)UB && UB - (int32_t)Count > LB)
    return true;

  Count = ~0u;
  return false;
}

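// For example, with a bracket of LB == 2 and UB == 5:
//   Count == 1: 1 < 5 and 5 - 1 > 2, so the wait is still needed and kept;
//   Count == 3: 5 - 3 == 2 does not exceed LB, so the bracket already
//   guarantees the wait; Count is reset to ~0u, meaning "no wait".
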
void BlockWaitcntBrackets::determineWait(InstCounterType T, int ScoreToWait,
                                         AMDGPU::Waitcnt &Wait) const {
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_wait counter with a
    // conservative value of 0 for the counter.
    addWait(Wait, T, 0);
    return;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out-of-order when there
      // are multiple event types in the bracket. Also emit an s_wait counter
      // with a conservative value of 0 for the counter.
      addWait(Wait, T, 0);
    } else {
      addWait(Wait, T, UB - ScoreToWait);
    }
  }
}

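// In the in-order case, the emitted count is the number of still-younger
// operations that may remain in flight. For example, with LB == 0 and
// UB == 5, waiting on a score of 3 produces UB - ScoreToWait == 2, i.e.
// an s_waitcnt that allows the two younger operations to stay outstanding.
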
void BlockWaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(VM_CNT, Wait.VmCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
}

void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const int32_t UB = getScoreUB(T);
  if (Count >= (unsigned)UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - (int32_t)Count));
  } else {
    setScoreLB(T, UB);
    MixedPendingEvents[T] = false;
    PendingEvents &= ~WaitEventMaskForInst[T];
  }
}

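// For example, applyWaitcnt(VM_CNT, 0) (i.e. an s_waitcnt vmcnt(0)) collapses
// the bracket by raising the lower bound to the upper bound and clears all
// pending vmcnt events, whereas applyWaitcnt(VM_CNT, 2) only raises the lower
// bound to UB - 2, and is skipped entirely if the counter may decrement out
// of order.
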
void BlockWaitcntBrackets::mergePendingEvents(const BlockWaitcntBrackets &Other) {
  for (auto T : inst_counter_types()) {
    uint32_t Old = PendingEvents & WaitEventMaskForInst[T];
    uint32_t New = Other.PendingEvents & WaitEventMaskForInst[T];
    if (Other.MixedPendingEvents[T] || (Old && New && Old != New))
      MixedPendingEvents[T] = true;
    PendingEvents |= New;
  }
}

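// For example, if this block already has LDS_ACCESS pending and the incoming
// predecessor has only GDS_ACCESS pending, both lgkmcnt events survive the
// merge and LGKM_CNT is marked mixed, so it must be treated as potentially
// decrementing out of order.
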
// Where there are multiple event types in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always return out of order.
  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
    return true;
  return MixedPendingEvents[T];
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// Generate the s_waitcnt instruction to be placed before \p MI.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// whether a memory access requires an s_waitcnt
/// and, if so, what the value of each counter should be.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets,
    MachineInstr *OldWaitcntInstr) {
  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr())
    return;

  AMDGPU::Waitcnt Wait;

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    Wait = AMDGPU::Waitcnt::allZero();
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    Wait.VmCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  // with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    Wait = AMDGPU::Waitcnt::allZero();
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    Wait.VmCnt = 0;
  }
#if 0 // TODO: enable the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                               ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets->hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets->hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets->hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets->hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
          ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      ScoreBrackets->determineWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If a destination operand was used by a recent export/store
    // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        ScoreBrackets->determineWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        ScoreBrackets->determineWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
          ScoreBrackets->determineWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    Wait = AMDGPU::Waitcnt::allZero();
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
      Wait.LgkmCnt = 0;
    }
  }

  // Early-out if no wait is indicated.
  if (!ScoreBrackets->simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
    if (OldWaitcntInstr) {
      if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
        TrackedWaitcntSet.erase(OldWaitcntInstr);
        OldWaitcntInstr->eraseFromParent();
      } else {
        int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
        ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
      }
    }
    return;
  }

  if (ForceEmitZeroWaitcnts)
    Wait = AMDGPU::Waitcnt::allZero();

  if (ForceEmitWaitcnt[VM_CNT])
    Wait.VmCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[LGKM_CNT])
    Wait.LgkmCnt = 0;

  ScoreBrackets->applyWaitcnt(Wait);

  AMDGPU::Waitcnt OldWait;
  if (OldWaitcntInstr) {
    OldWait =
        AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
  }
  if (OldWait.dominates(Wait))
    return;

  MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
  if (ContainingLoop) {
    MachineBasicBlock *TBB = ContainingLoop->getHeader();
    BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    if (!ScoreBracket) {
      assert(!BlockVisitedSet.count(TBB));
      BlockWaitcntBracketsMap[TBB] =
          llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    }
    ScoreBracket->setRevisitLoop(true);
    LLVM_DEBUG(dbgs() << "set-revisit2: Block"
                      << ContainingLoop->getHeader()->getNumber() << '\n';);
  }

  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
    Wait = Wait.combined(OldWait);

  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
  if (OldWaitcntInstr) {
    OldWaitcntInstr->getOperand(0).setImm(Enc);

    LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *OldWaitcntInstr << '\n');
  } else {
    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                         .addImm(Enc);
    TrackedWaitcntSet.insert(SWaitInst);

    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *SWaitInst << '\n');
  }
}

void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }
}

// This is a flat memory operation. Check to see if it has memory
// tokens for both LDS and Memory, and if so, mark it as a flat access.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}

void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}
1246
// Merge the score brackets of the Block's predecessors; this merged score
// bracket is used when adding waitcnts to the Block.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001249void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1250 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1251 int32_t MaxPending[NUM_INST_CNTS] = {0};
1252 int32_t MaxFlat[NUM_INST_CNTS] = {0};
Kannan Narayananacb089e2017-04-12 03:25:12 +00001253
  // For a single-basic-block loop, we need to retain the Block's
  // score bracket to have accurate Pred info. So, make a copy of the Block's
  // score bracket, clear() it (which retains several important bits of info),
  // populate it, and then replace en masse. For all other blocks, just clear
  // the Block's current score bracket and repopulate in-place.
  std::unique_ptr<BlockWaitcntBrackets> S;

  bool IsSelfPred = std::find(Block.pred_begin(), Block.pred_end(), &Block) !=
                    Block.pred_end();
1264 if (IsSelfPred) {
1265 S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1266 ScoreBrackets = S.get();
1267 }
1268
Kannan Narayananacb089e2017-04-12 03:25:12 +00001269 ScoreBrackets->clear();
1270
Kannan Narayananacb089e2017-04-12 03:25:12 +00001271 // See if there are any uninitialized predecessors. If so, emit an
1272 // s_waitcnt 0 at the beginning of the block.
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001273 for (MachineBasicBlock *Pred : Block.predecessors()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001274 BlockWaitcntBrackets *PredScoreBrackets =
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001275 BlockWaitcntBracketsMap[Pred].get();
1276 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001277 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001278 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001279 }
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001280 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001281 int span =
1282 PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1283 MaxPending[T] = std::max(MaxPending[T], span);
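      // E.g. a predecessor arriving with LB = 1 and UB = 4 for this counter
      // contributes a span of 3 still-pending events to MaxPending[T].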
1284 span =
1285 PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1286 MaxFlat[T] = std::max(MaxFlat[T], span);
1287 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001288 }
1289
Kannan Narayananacb089e2017-04-12 03:25:12 +00001290#if 0
  // The LC, unlike this pass, does not add a waitcnt at the beginning.
  // Leaving this block as a marker.
1292 // TODO: how does LC distinguish between function entry and main entry?
1293 // If this is the entry to a function, force a wait.
1294 MachineBasicBlock &Entry = Block.getParent()->front();
1295 if (Entry.getNumber() == Block.getNumber()) {
1296 ScoreBrackets->setWaitAtBeginning();
1297 return;
1298 }
1299#endif
1300
1301 // Now set the current Block's brackets to the largest ending bracket.
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001302 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001303 ScoreBrackets->setScoreUB(T, MaxPending[T]);
1304 ScoreBrackets->setScoreLB(T, 0);
1305 ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1306 }
1307
Kannan Narayananacb089e2017-04-12 03:25:12 +00001308 // Set the register scoreboard.
1309 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001310 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001311 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001312 }
1313
1314 BlockWaitcntBrackets *PredScoreBrackets =
1315 BlockWaitcntBracketsMap[Pred].get();
1316
1317 // Now merge the gpr_reg_score information
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001318 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001319 int PredLB = PredScoreBrackets->getScoreLB(T);
1320 int PredUB = PredScoreBrackets->getScoreUB(T);
1321 if (PredLB < PredUB) {
1322 int PredScale = MaxPending[T] - PredUB;
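        // Worked example: with MaxPending[T] = 4 and a predecessor ending at
        // LB = 2, UB = 5, PredScale = 4 - 5 = -1; a predecessor register
        // score of 4 (one event below its UB) rebases to -1 + 4 = 3, which
        // is again one event below the merged UB of 4.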
1323 // Merge vgpr scores.
1324 for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1325 int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1326 if (PredRegScore <= PredLB)
1327 continue;
1328 int NewRegScore = PredScale + PredRegScore;
1329 ScoreBrackets->setRegScore(
1330 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1331 }
1332 // Also need to merge sgpr scores for lgkm_cnt.
1333 if (T == LGKM_CNT) {
1334 for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1335 int PredRegScore =
1336 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1337 if (PredRegScore <= PredLB)
1338 continue;
1339 int NewRegScore = PredScale + PredRegScore;
1340 ScoreBrackets->setRegScore(
1341 J + NUM_ALL_VGPRS, LGKM_CNT,
1342 std::max(
1343 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1344 NewRegScore));
1345 }
1346 }
1347 }
1348 }
1349
Nicolai Haehnled1f45da2018-11-29 11:06:14 +00001350 ScoreBrackets->mergePendingEvents(*PredScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001351 }
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001352
  // If this is a single-block loop, replace the Block's score brackets with
  // the merged copy. This is not needed for other blocks, where the merge was
  // done in-place.
  if (IsSelfPred) {
    BlockWaitcntBracketsMap[&Block] =
        llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
  }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001358}
1359
Mark Searles10545412018-05-30 15:47:45 +00001360/// Return true if the given basic block is a "bottom" block of a loop.
1361/// This works even if the loop is discontiguous. This also handles
1362/// multiple back-edges for the same "header" block of a loop.
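/// For example, in a loop with header H and two latch blocks B1 and B2 that
/// both branch back to H, both B1 and B2 are "bottom" blocks of the loop.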
Mark Searles1bc6e712018-04-19 15:42:30 +00001363bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1364 const MachineBasicBlock *Block) {
1365 for (MachineBasicBlock *MBB : Loop->blocks()) {
1366 if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1367 return true;
1368 }
1369 }
1370 return false;
1371}
1372
1373/// Count the number of "bottom" basic blocks of a loop.
1374unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1375 unsigned Count = 0;
1376 for (MachineBasicBlock *MBB : Loop->blocks()) {
1377 if (MBB->isSuccessor(Loop->getHeader())) {
1378 Count++;
1379 }
1380 }
1381 return Count;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001382}
1383
1384// Generate s_waitcnt instructions where needed.
1385void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1386 MachineBasicBlock &Block) {
1387 // Initialize the state information.
1388 mergeInputScoreBrackets(Block);
1389
1390 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1391
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001392 LLVM_DEBUG({
Mark Searlesec581832018-04-25 19:21:26 +00001393 dbgs() << "*** Block" << Block.getNumber() << " ***";
Kannan Narayananacb089e2017-04-12 03:25:12 +00001394 ScoreBrackets->dump();
1395 });
1396
Kannan Narayananacb089e2017-04-12 03:25:12 +00001397 // Walk over the instructions.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001398 MachineInstr *OldWaitcntInstr = nullptr;
1399
Kannan Narayananacb089e2017-04-12 03:25:12 +00001400 for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1401 Iter != E;) {
1402 MachineInstr &Inst = *Iter;
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001403
Kannan Narayananacb089e2017-04-12 03:25:12 +00001404 // Remove any previously existing waitcnts.
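    // Summary of the three cases handled below: a waitcnt previously inserted
    // by this pass is erased in favor of the current one; two pre-existing
    // waitcnts are both kept, with the older one's counts applied to the
    // score brackets; a pass-inserted waitcnt that follows a pre-existing one
    // is erased.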
1405 if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001406 if (OldWaitcntInstr) {
1407 if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
1408 TrackedWaitcntSet.erase(OldWaitcntInstr);
1409 OldWaitcntInstr->eraseFromParent();
1410 OldWaitcntInstr = nullptr;
1411 } else if (!TrackedWaitcntSet.count(&Inst)) {
          // Two successive s_waitcnts, both of which are pre-existing and
          // are therefore preserved.
1414 int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
1415 ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1416 } else {
1417 ++Iter;
1418 Inst.eraseFromParent();
1419 continue;
1420 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001421 }
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001422
1423 OldWaitcntInstr = &Inst;
1424 ++Iter;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001425 continue;
1426 }
1427
Kannan Narayananacb089e2017-04-12 03:25:12 +00001428 bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) && !VCCZBugHandledSet.count(&Inst)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001431 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1432 ScoreBrackets->getScoreUB(LGKM_CNT) &&
Nicolai Haehnled1f45da2018-11-29 11:06:14 +00001433 ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
Tom Stellardc5a154d2018-06-28 23:47:12 +00001434 if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
Kannan Narayananacb089e2017-04-12 03:25:12 +00001435 VCCZBugWorkAround = true;
1436 }
1437 }
1438
    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001441 generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1442 OldWaitcntInstr = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001443
Mark Searles70901b92018-04-24 15:59:59 +00001444 updateEventWaitcntAfter(Inst, ScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001445
1446#if 0 // TODO: implement resource type check controlled by options with ub = LB.
1447 // If this instruction generates a S_SETVSKIP because it is an
1448 // indexed resource, and we are on Tahiti, then it will also force
1449 // an S_WAITCNT vmcnt(0)
1450 if (RequireCheckResourceType(Inst, context)) {
1451 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1452 ScoreBrackets->setScoreLB(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +00001453 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +00001454 }
1455#endif
1456
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001457 LLVM_DEBUG({
Mark Searles94ae3b22018-01-30 17:17:06 +00001458 Inst.print(dbgs());
Kannan Narayananacb089e2017-04-12 03:25:12 +00001459 ScoreBrackets->dump();
1460 });
1461
1462 // Check to see if this is a GWS instruction. If so, and if this is CI or
1463 // VI, then the generated code sequence will include an S_WAITCNT 0.
1464 // TODO: Are these the only GWS instructions?
1465 if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1466 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1467 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1468 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1469 Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1470 // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001471 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero());
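      // This models the S_WAITCNT 0 included in the hardware-expanded GWS
      // sequence, i.e. "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)".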
Kannan Narayananacb089e2017-04-12 03:25:12 +00001472 }
1473
1474 // TODO: Remove this work-around after fixing the scheduler and enable the
1475 // assert above.
1476 if (VCCZBugWorkAround) {
      // Restore the vccz bit. Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
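      // The resulting workaround is the self-move
      //   s_mov_b64 vcc, vcc
      // whose write to vcc refreshes vccz.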
1480 BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1481 AMDGPU::VCC)
1482 .addReg(AMDGPU::VCC);
1483 VCCZBugHandledSet.insert(&Inst);
1484 }
1485
Kannan Narayananacb089e2017-04-12 03:25:12 +00001486 ++Iter;
1487 }
1488
  // Check if we need to force convergence at the loop footer.
1490 MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
Mark Searles1bc6e712018-04-19 15:42:30 +00001491 if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001492 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1493 WaitcntData->print();
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001494 LLVM_DEBUG(dbgs() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001495
    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't guarantee convergence for a loop. Each loop
    // should take at most (n+1) iterations to converge naturally, where n is
    // the number of bottom blocks. If this threshold is reached and the
    // result hasn't converged, then we force convergence by inserting an
    // s_waitcnt at the end of the loop footer.
1502 if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001503 // To ensure convergence, need to make wait events at loop footer be no
1504 // more than those from the previous iteration.
Mark Searles65207922018-02-19 19:19:59 +00001505 // As a simplification, instead of tracking individual scores and
1506 // generating the precise wait count, just wait on 0.
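      // "Wait on 0" is the most conservative wait:
      //   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
      // i.e. drain every outstanding memory event before the back-edge.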
Kannan Narayananacb089e2017-04-12 03:25:12 +00001507 bool HasPending = false;
1508 MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001509 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001510 if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1511 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1512 HasPending = true;
Mark Searles10545412018-05-30 15:47:45 +00001513 break;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001514 }
1515 }
1516
1517 if (HasPending) {
1518 if (!SWaitInst) {
Mark Searles10545412018-05-30 15:47:45 +00001519 SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
1520 DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1521 .addImm(0);
Mark Searles24c92ee2018-02-07 02:21:21 +00001522 TrackedWaitcntSet.insert(SWaitInst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001523#if 0 // TODO: Format the debug output
1524 OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1525 OutputTransformAdd(SWaitInst, context);
1526#endif
1527 }
1528#if 0 // TODO: ??
1529 _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1530#endif
1531 }
1532
1533 if (SWaitInst) {
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001534 LLVM_DEBUG({
Kannan Narayananacb089e2017-04-12 03:25:12 +00001535 SWaitInst->print(dbgs());
1536 dbgs() << "\nAdjusted score board:";
1537 ScoreBrackets->dump();
1538 });
1539
1540 // Add this waitcnt to the block. It is either newly created or
1541 // created in previous iterations and added back since block traversal
Mark Searles65207922018-02-19 19:19:59 +00001542 // always removes waitcnts.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001543 insertWaitcntBeforeCF(Block, SWaitInst);
1544 WaitcntData->setWaitcnt(SWaitInst);
1545 }
1546 }
1547 }
1548}
1549
1550bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00001551 ST = &MF.getSubtarget<GCNSubtarget>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001552 TII = ST->getInstrInfo();
1553 TRI = &TII->getRegisterInfo();
1554 MRI = &MF.getRegInfo();
1555 MLI = &getAnalysis<MachineLoopInfo>();
Konstantin Zhuravlyov71e43ee2018-09-12 18:50:47 +00001556 IV = AMDGPU::getIsaVersion(ST->getCPU());
Mark Searles11d0a042017-05-31 16:44:23 +00001557 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001558
Mark Searles4a0f2c52018-05-07 14:43:28 +00001559 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001560 for (auto T : inst_counter_types())
Mark Searlesec581832018-04-25 19:21:26 +00001561 ForceEmitWaitcnt[T] = false;
1562
Kannan Narayananacb089e2017-04-12 03:25:12 +00001563 HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1564 HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1565 HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
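  // E.g. on gfx9 these bitmasks yield VmcntMax = 63 (a 6-bit field split
  // across the immediate), ExpcntMax = 7, and LgkmcntMax = 15; earlier
  // targets only have a 4-bit vmcnt, so VmcntMax = 15 there.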
1566
1567 HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1568 HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1569 assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1570 assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1571
1572 RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1573 RegisterEncoding.VGPRL =
1574 RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1575 RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1576 RegisterEncoding.SGPRL =
1577 RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1578
Mark Searles24c92ee2018-02-07 02:21:21 +00001579 TrackedWaitcntSet.clear();
1580 BlockVisitedSet.clear();
1581 VCCZBugHandledSet.clear();
Mark Searles1bc6e712018-04-19 15:42:30 +00001582 LoopWaitcntDataMap.clear();
Scott Linder5792dd02018-06-21 18:48:48 +00001583 BlockWaitcntProcessedSet.clear();
Mark Searles24c92ee2018-02-07 02:21:21 +00001584
Nicolai Haehnle0ab31c92018-11-07 21:53:29 +00001585 // Walk over the blocks in reverse post order, inserting
Kannan Narayananacb089e2017-04-12 03:25:12 +00001586 // s_waitcnt where needed.
1587 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1588 bool Modified = false;
1589 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1590 I = RPOT.begin(),
1591 E = RPOT.end(), J = RPOT.begin();
1592 I != E;) {
1593 MachineBasicBlock &MBB = **I;
1594
1595 BlockVisitedSet.insert(&MBB);
1596
1597 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1598 if (!ScoreBrackets) {
Mark Searlesf0b93f12018-06-04 16:51:59 +00001599 BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001600 ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1601 }
1602 ScoreBrackets->setPostOrder(MBB.getNumber());
1603 MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1604 if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
Eugene Zelenko59e12822017-08-08 00:47:13 +00001605 LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001606
1607 // If we are walking into the block from before the loop, then guarantee
1608 // at least 1 re-walk over the loop to propagate the information, even if
1609 // no S_WAITCNT instructions were generated.
Mark Searles1bc6e712018-04-19 15:42:30 +00001610 if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1611 unsigned Count = countNumBottomBlocks(ContainingLoop);
1612
1613 // If the loop has multiple back-edges, and so more than one "bottom"
      // basic block, we have to guarantee a re-walk over every block.
1615 if ((std::count(BlockWaitcntProcessedSet.begin(),
Mark Searlesf4e70252018-07-16 10:21:36 +00001616 BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
Mark Searles1bc6e712018-04-19 15:42:30 +00001617 BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
Mark Searles10545412018-05-30 15:47:45 +00001618 LLVM_DEBUG(dbgs() << "set-revisit1: Block"
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001619 << ContainingLoop->getHeader()->getNumber() << '\n';);
Mark Searles1bc6e712018-04-19 15:42:30 +00001620 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001621 }
1622
1623 // Walk over the instructions.
1624 insertWaitcntInBlock(MF, MBB);
1625
Mark Searles10545412018-05-30 15:47:45 +00001626 // Record that waitcnts have been processed at least once for this block.
Mark Searles1bc6e712018-04-19 15:42:30 +00001627 BlockWaitcntProcessedSet.push_back(&MBB);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001628
Mark Searles1bc6e712018-04-19 15:42:30 +00001629 // See if we want to revisit the loop. If a loop has multiple back-edges,
1630 // we shouldn't revisit the same "bottom" basic block.
1631 if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1632 std::count(BlockWaitcntProcessedSet.begin(),
1633 BlockWaitcntProcessedSet.end(), &MBB) == 1) {
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001634 MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001635 BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1636 if (EntrySB && EntrySB->getRevisitLoop()) {
1637 EntrySB->setRevisitLoop(false);
1638 J = I;
1639 int32_t PostOrder = EntrySB->getPostOrder();
1640 // TODO: Avoid this loop. Find another way to set I.
1641 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1642 X = RPOT.begin(),
1643 Y = RPOT.end();
1644 X != Y; ++X) {
1645 MachineBasicBlock &MBBX = **X;
1646 if (MBBX.getNumber() == PostOrder) {
1647 I = X;
1648 break;
1649 }
1650 }
1651 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1652 WaitcntData->incIterCnt();
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001653 LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001654 continue;
1655 } else {
1656 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // The loop converged, so reset the iteration count. If this loop is
        // revisited, it must be from an outer loop; the counter will restart,
        // which ensures we don't force convergence on such revisits.
1660 WaitcntData->resetIterCnt();
1661 }
1662 }
1663
1664 J = I;
1665 ++I;
1666 }
1667
1668 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1669
1670 bool HaveScalarStores = false;
1671
1672 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1673 ++BI) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001674 MachineBasicBlock &MBB = *BI;
1675
1676 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1677 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001678 if (!HaveScalarStores && TII->isScalarStore(*I))
1679 HaveScalarStores = true;
1680
1681 if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1682 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1683 EndPgmBlocks.push_back(&MBB);
1684 }
1685 }
1686
1687 if (HaveScalarStores) {
1688 // If scalar writes are used, the cache must be flushed or else the next
1689 // wave to reuse the same scratch memory can be clobbered.
1690 //
1691 // Insert s_dcache_wb at wave termination points if there were any scalar
1692 // stores, and only if the cache hasn't already been flushed. This could be
1693 // improved by looking across blocks for flushes in postdominating blocks
    // from the stores, but an explicitly requested flush is probably very rare.
1695 for (MachineBasicBlock *MBB : EndPgmBlocks) {
1696 bool SeenDCacheWB = false;
1697
1698 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1699 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001700 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1701 SeenDCacheWB = true;
1702 else if (TII->isScalarStore(*I))
1703 SeenDCacheWB = false;
1704
1705 // FIXME: It would be better to insert this before a waitcnt if any.
1706 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1707 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1708 !SeenDCacheWB) {
1709 Modified = true;
1710 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1711 }
1712 }
1713 }
1714 }
1715
Mark Searles11d0a042017-05-31 16:44:23 +00001716 if (!MFI->isEntryFunction()) {
1717 // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them, and it's better to do the wait after the
Mark Searles11d0a042017-05-31 16:44:23 +00001719 // costly call sequence.
1720
1721 // TODO: Could insert earlier and schedule more liberally with operations
1722 // that only use caller preserved registers.
1723 MachineBasicBlock &EntryBB = MF.front();
Mark Searlesed54ff12018-05-30 16:27:57 +00001724 BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1725 .addImm(0);
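    // The immediate 0 decodes as vmcnt(0) expcnt(0) lgkmcnt(0), i.e. wait
    // for all outstanding memory operations before the function body runs.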
Mark Searles11d0a042017-05-31 16:44:23 +00001726
1727 Modified = true;
1728 }
1729
Kannan Narayananacb089e2017-04-12 03:25:12 +00001730 return Modified;
1731}