//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);

namespace {

template <typename EnumT>
class enum_iterator
    : public iterator_facade_base<enum_iterator<EnumT>,
                                  std::forward_iterator_tag, const EnumT> {
  EnumT Value;
public:
  enum_iterator() = default;
  enum_iterator(EnumT Value) : Value(Value) {}

  enum_iterator &operator++() {
    Value = static_cast<EnumT>(Value + 1);
    return *this;
  }

  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }

  EnumT operator*() const { return Value; }
};

// Class of object that encapsulates the latest instruction counter score
// associated with the operand. Used for determining whether
// an s_waitcnt instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
  return make_range(enum_iterator<InstCounterType>(VM_CNT),
                    enum_iterator<InstCounterType>(NUM_INST_CNTS));
}

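// inst_counter_types() lets the pass iterate the hardware counters with a
// range-based for loop, e.g.:
//
//   for (auto T : inst_counter_types())
//     ScoreLBs[T] = 0;
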
using RegInterval = std::pair<signed, signed>;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
    (1 << VMEM_ACCESS),
    (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
        (1 << SQ_MESSAGE),
    (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
        (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
};

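// WaitEventMaskForInst is indexed by counter: VMEM_ACCESS is the only event
// counted by vmcnt; the SMEM, LDS, GDS, and message events are counted by
// lgkmcnt; the export and GPR-lock events are counted by expcnt. For example,
// LDS_ACCESS is an lgkmcnt event because
//
//   (WaitEventMaskForInst[LGKM_CNT] & (1 << LDS_ACCESS)) != 0
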
// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};

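// Under this mapping, for example, VGPR v5 occupies scoreboard slot 5, the
// reserved LDS token occupies slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS == 256,
// and SGPR s2 occupies slot NUM_ALL_VGPRS + 2 == 259.
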
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  switch (T) {
  case VM_CNT:
    Wait.VmCnt = std::min(Wait.VmCnt, Count);
    break;
  case EXP_CNT:
    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
    break;
  case LGKM_CNT:
    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
    break;
  default:
    llvm_unreachable("bad InstCounterType");
  }
}

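// Counts combine by taking the minimum, because a smaller s_waitcnt operand
// is a stricter wait. For example, assuming a fresh AMDGPU::Waitcnt whose
// counts are all ~0u ("no wait"), addWait(Wait, VM_CNT, 3) followed by
// addWait(Wait, VM_CNT, 1) leaves Wait.VmCnt == 1.
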
// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple event types happen within the brackets, the
// wait count may get decreased out of order, so we need to put in an
// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    for (auto T : inst_counter_types())
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
  }

  ~BlockWaitcntBrackets() = default;

  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }

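  // The EXP_CNT clamp in setScoreUB above models the limited depth of the
  // hardware counter: once more than getWaitCountMax(EXP_CNT) events are
  // outstanding, the oldest scores can no longer be distinguished. E.g. with
  // an expcnt limit of 7 (a typical value), raising ScoreUBs[EXP_CNT] to 10
  // lifts the lower bound to at least 10 - 7 == 3.
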
  int32_t getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  int32_t getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    if (E == VMEM_ACCESS)
      return VM_CNT;
    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
      return LGKM_CNT;
    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
    return EXP_CNT;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == LGKM_CNT);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    PendingEvents = 0;
    memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
    for (auto T : inst_counter_types())
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  bool counterOutOfOrder(InstCounterType T) const;
  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, int ScoreToWait,
                     AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  bool hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }

  void mergePendingEvents(const BlockWaitcntBrackets &Other);

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const GCNSubtarget *ST = nullptr;
  bool WaitAtBeginning = false;
  bool RevisitLoop = false;
  int32_t PostOrder = 0;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  uint32_t PendingEvents = 0;
  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};

// This is a per-loop-region object that records the waitcnt status at the end
// of the loop footer from the previous iteration. We also maintain an
// iteration count to track the number of times the loop has been visited.
// When it doesn't converge naturally, we force convergence by inserting
// s_waitcnt 0 at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }

private:
  // The s_waitcnt added at the end of the loop footer to stabilize wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of times the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaVersion IV;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  // ForceEmitZeroWaitcnts: force all waitcnt instructions to be emitted as
  // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool isForceEmitWaitcnt() const {
    for (auto T : inst_counter_types())
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
};

} // end anonymous namespace

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a partial-write (PW) operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}

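// getRegInterval returns a half-open range [first, second) of scoreboard
// slots. For example, a 64-bit VGPR operand whose low register encodes as
// v4 yields {4, 6}, covering Size / 32 == 2 consecutive slots.
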
void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  LLVM_DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not.
  // Examples include vm_cnt for buffer-store or lgkm_cnt for send-message.
  if (!hasPendingEvent(E)) {
    if (PendingEvents & WaitEventMaskForInst[T])
      MixedPendingEvents[T] = true;
    PendingEvents |= 1 << E;
  }
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (auto T : inst_counter_types()) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
bool BlockWaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
         simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
}

bool BlockWaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                           unsigned &Count) const {
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if (Count < (unsigned)UB && UB - (int32_t)Count > LB)
    return true;

  Count = ~0u;
  return false;
}

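// For example, with a bracket of LB == 2 and UB == 5:
//   Count == 1: 1 < 5 and 5 - 1 > 2, so the wait is still needed and kept;
//   Count == 3: 5 - 3 == 2 does not exceed LB, so the bracket already
//   guarantees the wait; Count is reset to ~0u, meaning "no wait".
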
void BlockWaitcntBrackets::determineWait(InstCounterType T, int ScoreToWait,
                                         AMDGPU::Waitcnt &Wait) const {
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_wait counter with a
    // conservative value of 0 for the counter.
    addWait(Wait, T, 0);
    return;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out-of-order when there
      // are multiple event types in the bracket. Also emit an s_wait counter
      // with a conservative value of 0 for the counter.
      addWait(Wait, T, 0);
    } else {
      addWait(Wait, T, UB - ScoreToWait);
    }
  }
}

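// In the in-order case, the emitted count is the number of still-younger
// operations that may remain in flight. For example, with LB == 0 and
// UB == 5, waiting on a score of 3 produces UB - ScoreToWait == 2, i.e.
// an s_waitcnt that allows the two younger operations to stay outstanding.
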
void BlockWaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(VM_CNT, Wait.VmCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
}

void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const int32_t UB = getScoreUB(T);
  if (Count >= (unsigned)UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - (int32_t)Count));
  } else {
    setScoreLB(T, UB);
    MixedPendingEvents[T] = false;
    PendingEvents &= ~WaitEventMaskForInst[T];
  }
}

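// For example, applyWaitcnt(VM_CNT, 0) (i.e. an s_waitcnt vmcnt(0)) collapses
// the bracket by raising the lower bound to the upper bound and clears all
// pending vmcnt events, whereas applyWaitcnt(VM_CNT, 2) only raises the lower
// bound to UB - 2, and is skipped entirely if the counter may decrement out
// of order.
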
void BlockWaitcntBrackets::mergePendingEvents(const BlockWaitcntBrackets &Other) {
  for (auto T : inst_counter_types()) {
    uint32_t Old = PendingEvents & WaitEventMaskForInst[T];
    uint32_t New = Other.PendingEvents & WaitEventMaskForInst[T];
    if (Other.MixedPendingEvents[T] || (Old && New && Old != New))
      MixedPendingEvents[T] = true;
    PendingEvents |= New;
  }
}

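// For example, if this block already has LDS_ACCESS pending and the incoming
// predecessor has only GDS_ACCESS pending, both lgkmcnt events survive the
// merge and LGKM_CNT is marked mixed, so it must be treated as potentially
// decrementing out of order.
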
// Where there are multiple event types in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always return out of order.
  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
    return true;
  return MixedPendingEvents[T];
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// Generate the s_waitcnt instruction to be placed before \p MI.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// whether a memory access requires an s_waitcnt
/// and, if so, what the value of each counter should be.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets,
    MachineInstr *OldWaitcntInstr) {
  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr())
    return;

  AMDGPU::Waitcnt Wait;

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    Wait = AMDGPU::Waitcnt::allZero();
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    Wait.VmCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  // with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    Wait = AMDGPU::Waitcnt::allZero();
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    Wait.VmCnt = 0;
  }
#if 0 // TODO: enable the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                               ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets->hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets->hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets->hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets->hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
          ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      ScoreBrackets->determineWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If a destination operand was used by a recent export/store
    // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        ScoreBrackets->determineWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        ScoreBrackets->determineWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
          ScoreBrackets->determineWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    Wait = AMDGPU::Waitcnt::allZero();
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
      Wait.LgkmCnt = 0;
    }
  }

  // Early-out if no wait is indicated.
  if (!ScoreBrackets->simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
    if (OldWaitcntInstr) {
      if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
        TrackedWaitcntSet.erase(OldWaitcntInstr);
        OldWaitcntInstr->eraseFromParent();
      } else {
        int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
        ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
      }
    }
    return;
  }

  if (ForceEmitZeroWaitcnts)
    Wait = AMDGPU::Waitcnt::allZero();

  if (ForceEmitWaitcnt[VM_CNT])
    Wait.VmCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[LGKM_CNT])
    Wait.LgkmCnt = 0;

  ScoreBrackets->applyWaitcnt(Wait);

  AMDGPU::Waitcnt OldWait;
  if (OldWaitcntInstr) {
    OldWait =
        AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
  }
  if (OldWait.dominates(Wait))
    return;

  MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
  if (ContainingLoop) {
    MachineBasicBlock *TBB = ContainingLoop->getHeader();
    BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    if (!ScoreBracket) {
      assert(!BlockVisitedSet.count(TBB));
      BlockWaitcntBracketsMap[TBB] =
          llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    }
    ScoreBracket->setRevisitLoop(true);
    LLVM_DEBUG(dbgs() << "set-revisit2: Block"
                      << ContainingLoop->getHeader()->getNumber() << '\n';);
  }

  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
    Wait = Wait.combined(OldWait);

  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
  if (OldWaitcntInstr) {
    OldWaitcntInstr->getOperand(0).setImm(Enc);

    LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *OldWaitcntInstr << '\n');
  } else {
    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                         .addImm(Enc);
    TrackedWaitcntSet.insert(SWaitInst);

    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *SWaitInst << '\n');
  }
}

void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }
}

// This is a flat memory operation. Check to see if it has memory
// tokens for both LDS and Memory, and if so, mark it as a flat access.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}

void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}
1246
// Merge the score brackets of the Block's predecessors; this merged score
// bracket is used when adding waitcnts to the Block.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001249void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1250 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1251 int32_t MaxPending[NUM_INST_CNTS] = {0};
1252 int32_t MaxFlat[NUM_INST_CNTS] = {0};
Kannan Narayananacb089e2017-04-12 03:25:12 +00001253
  // For a single-basic-block loop, we need to retain the Block's
  // score bracket to have accurate Pred info. So, make a copy of the Block's
  // score bracket, clear() it (which retains several important bits of info),
  // populate it, and then replace en masse. For all other blocks, just clear
  // the Block's current score bracket and repopulate in-place.
  std::unique_ptr<BlockWaitcntBrackets> S;

  bool IsSelfPred = std::find(Block.pred_begin(), Block.pred_end(), &Block) !=
                    Block.pred_end();
1264 if (IsSelfPred) {
1265 S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1266 ScoreBrackets = S.get();
1267 }
1268
Kannan Narayananacb089e2017-04-12 03:25:12 +00001269 ScoreBrackets->clear();
1270
Kannan Narayananacb089e2017-04-12 03:25:12 +00001271 // See if there are any uninitialized predecessors. If so, emit an
1272 // s_waitcnt 0 at the beginning of the block.
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001273 for (MachineBasicBlock *Pred : Block.predecessors()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001274 BlockWaitcntBrackets *PredScoreBrackets =
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001275 BlockWaitcntBracketsMap[Pred].get();
1276 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001277 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001278 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001279 }
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001280 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001281 int span =
1282 PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1283 MaxPending[T] = std::max(MaxPending[T], span);
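      // E.g. a predecessor arriving with LB = 1 and UB = 4 for this counter
      // contributes a span of 3 still-pending events to MaxPending[T].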
1284 span =
1285 PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1286 MaxFlat[T] = std::max(MaxFlat[T], span);
1287 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001288 }
1289
Kannan Narayananacb089e2017-04-12 03:25:12 +00001290#if 0
  // The LC, unlike this pass, does not add a waitcnt at the beginning.
  // Leaving this block as a marker.
1292 // TODO: how does LC distinguish between function entry and main entry?
1293 // If this is the entry to a function, force a wait.
1294 MachineBasicBlock &Entry = Block.getParent()->front();
1295 if (Entry.getNumber() == Block.getNumber()) {
1296 ScoreBrackets->setWaitAtBeginning();
1297 return;
1298 }
1299#endif
1300
1301 // Now set the current Block's brackets to the largest ending bracket.
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001302 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001303 ScoreBrackets->setScoreUB(T, MaxPending[T]);
1304 ScoreBrackets->setScoreLB(T, 0);
1305 ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1306 }
1307
Kannan Narayananacb089e2017-04-12 03:25:12 +00001308 // Set the register scoreboard.
1309 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001310 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001311 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001312 }
1313
1314 BlockWaitcntBrackets *PredScoreBrackets =
1315 BlockWaitcntBracketsMap[Pred].get();
1316
1317 // Now merge the gpr_reg_score information
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001318 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001319 int PredLB = PredScoreBrackets->getScoreLB(T);
1320 int PredUB = PredScoreBrackets->getScoreUB(T);
1321 if (PredLB < PredUB) {
1322 int PredScale = MaxPending[T] - PredUB;
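        // Worked example: with MaxPending[T] = 4 and a predecessor ending at
        // LB = 2, UB = 5, PredScale = 4 - 5 = -1; a predecessor register
        // score of 4 (one event below its UB) rebases to -1 + 4 = 3, which
        // is again one event below the merged UB of 4.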
1323 // Merge vgpr scores.
1324 for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1325 int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1326 if (PredRegScore <= PredLB)
1327 continue;
1328 int NewRegScore = PredScale + PredRegScore;
1329 ScoreBrackets->setRegScore(
1330 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1331 }
1332 // Also need to merge sgpr scores for lgkm_cnt.
1333 if (T == LGKM_CNT) {
1334 for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1335 int PredRegScore =
1336 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1337 if (PredRegScore <= PredLB)
1338 continue;
1339 int NewRegScore = PredScale + PredRegScore;
1340 ScoreBrackets->setRegScore(
1341 J + NUM_ALL_VGPRS, LGKM_CNT,
1342 std::max(
1343 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1344 NewRegScore));
1345 }
1346 }
1347 }
1348 }
1349
Nicolai Haehnled1f45da2018-11-29 11:06:14 +00001350 ScoreBrackets->mergePendingEvents(*PredScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001351 }
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001352
  // If this is a single-block loop, replace the Block's score brackets with
  // the merged copy. This is not needed for other blocks, where the merge was
  // done in-place.
  if (IsSelfPred) {
    BlockWaitcntBracketsMap[&Block] =
        llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
  }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001358}
1359
Mark Searles10545412018-05-30 15:47:45 +00001360/// Return true if the given basic block is a "bottom" block of a loop.
1361/// This works even if the loop is discontiguous. This also handles
1362/// multiple back-edges for the same "header" block of a loop.
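/// For example, in a loop with header H and two latch blocks B1 and B2 that
/// both branch back to H, both B1 and B2 are "bottom" blocks of the loop.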
Mark Searles1bc6e712018-04-19 15:42:30 +00001363bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1364 const MachineBasicBlock *Block) {
1365 for (MachineBasicBlock *MBB : Loop->blocks()) {
1366 if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1367 return true;
1368 }
1369 }
1370 return false;
1371}
1372
1373/// Count the number of "bottom" basic blocks of a loop.
1374unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1375 unsigned Count = 0;
1376 for (MachineBasicBlock *MBB : Loop->blocks()) {
1377 if (MBB->isSuccessor(Loop->getHeader())) {
1378 Count++;
1379 }
1380 }
1381 return Count;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001382}
1383
1384// Generate s_waitcnt instructions where needed.
1385void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1386 MachineBasicBlock &Block) {
1387 // Initialize the state information.
1388 mergeInputScoreBrackets(Block);
1389
1390 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1391
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001392 LLVM_DEBUG({
Mark Searlesec581832018-04-25 19:21:26 +00001393 dbgs() << "*** Block" << Block.getNumber() << " ***";
Kannan Narayananacb089e2017-04-12 03:25:12 +00001394 ScoreBrackets->dump();
1395 });
1396
Kannan Narayananacb089e2017-04-12 03:25:12 +00001397 // Walk over the instructions.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001398 MachineInstr *OldWaitcntInstr = nullptr;
1399
Kannan Narayananacb089e2017-04-12 03:25:12 +00001400 for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1401 Iter != E;) {
1402 MachineInstr &Inst = *Iter;
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001403
Kannan Narayananacb089e2017-04-12 03:25:12 +00001404 // Remove any previously existing waitcnts.
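    // Summary of the three cases handled below: a waitcnt previously inserted
    // by this pass is erased in favor of the current one; two pre-existing
    // waitcnts are both kept, with the older one's counts applied to the
    // score brackets; a pass-inserted waitcnt that follows a pre-existing one
    // is erased.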
1405 if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001406 if (OldWaitcntInstr) {
1407 if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
1408 TrackedWaitcntSet.erase(OldWaitcntInstr);
1409 OldWaitcntInstr->eraseFromParent();
1410 OldWaitcntInstr = nullptr;
1411 } else if (!TrackedWaitcntSet.count(&Inst)) {
          // Two successive s_waitcnts, both of which are pre-existing and
          // are therefore preserved.
1414 int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
1415 ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1416 } else {
1417 ++Iter;
1418 Inst.eraseFromParent();
1419 continue;
1420 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001421 }
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001422
1423 OldWaitcntInstr = &Inst;
1424 ++Iter;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001425 continue;
1426 }
1427
Kannan Narayananacb089e2017-04-12 03:25:12 +00001428 bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) && !VCCZBugHandledSet.count(&Inst)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001431 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1432 ScoreBrackets->getScoreUB(LGKM_CNT) &&
Nicolai Haehnled1f45da2018-11-29 11:06:14 +00001433 ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
Tom Stellardc5a154d2018-06-28 23:47:12 +00001434 if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
Kannan Narayananacb089e2017-04-12 03:25:12 +00001435 VCCZBugWorkAround = true;
1436 }
1437 }
1438
    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001441 generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1442 OldWaitcntInstr = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001443
Mark Searles70901b92018-04-24 15:59:59 +00001444 updateEventWaitcntAfter(Inst, ScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001445
1446#if 0 // TODO: implement resource type check controlled by options with ub = LB.
1447 // If this instruction generates a S_SETVSKIP because it is an
1448 // indexed resource, and we are on Tahiti, then it will also force
1449 // an S_WAITCNT vmcnt(0)
1450 if (RequireCheckResourceType(Inst, context)) {
1451 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1452 ScoreBrackets->setScoreLB(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +00001453 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +00001454 }
1455#endif
1456
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001457 LLVM_DEBUG({
Mark Searles94ae3b22018-01-30 17:17:06 +00001458 Inst.print(dbgs());
Kannan Narayananacb089e2017-04-12 03:25:12 +00001459 ScoreBrackets->dump();
1460 });
1461
1462 // Check to see if this is a GWS instruction. If so, and if this is CI or
1463 // VI, then the generated code sequence will include an S_WAITCNT 0.
1464 // TODO: Are these the only GWS instructions?
1465 if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1466 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1467 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1468 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1469 Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1470 // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001471 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero());
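      // This models the S_WAITCNT 0 included in the hardware-expanded GWS
      // sequence, i.e. "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)".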
Kannan Narayananacb089e2017-04-12 03:25:12 +00001472 }
1473
1474 // TODO: Remove this work-around after fixing the scheduler and enable the
1475 // assert above.
1476 if (VCCZBugWorkAround) {
      // Restore the vccz bit. Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
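      // The resulting workaround is the self-move
      //   s_mov_b64 vcc, vcc
      // whose write to vcc refreshes vccz.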
1480 BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1481 AMDGPU::VCC)
1482 .addReg(AMDGPU::VCC);
1483 VCCZBugHandledSet.insert(&Inst);
1484 }
1485
Kannan Narayananacb089e2017-04-12 03:25:12 +00001486 ++Iter;
1487 }
1488
  // Check if we need to force convergence at the loop footer.
1490 MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
Mark Searles1bc6e712018-04-19 15:42:30 +00001491 if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001492 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1493 WaitcntData->print();
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001494 LLVM_DEBUG(dbgs() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001495
    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't guarantee convergence for a loop. Each loop
    // should take at most (n+1) iterations to converge naturally, where n is
    // the number of bottom blocks. If this threshold is reached and the
    // result hasn't converged, then we force convergence by inserting an
    // s_waitcnt at the end of the loop footer.
1502 if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001503 // To ensure convergence, need to make wait events at loop footer be no
1504 // more than those from the previous iteration.
Mark Searles65207922018-02-19 19:19:59 +00001505 // As a simplification, instead of tracking individual scores and
1506 // generating the precise wait count, just wait on 0.
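      // "Wait on 0" is the most conservative wait:
      //   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
      // i.e. drain every outstanding memory event before the back-edge.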
Kannan Narayananacb089e2017-04-12 03:25:12 +00001507 bool HasPending = false;
1508 MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001509 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001510 if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1511 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1512 HasPending = true;
Mark Searles10545412018-05-30 15:47:45 +00001513 break;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001514 }
1515 }
1516
1517 if (HasPending) {
1518 if (!SWaitInst) {
Mark Searles10545412018-05-30 15:47:45 +00001519 SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
1520 DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1521 .addImm(0);
Mark Searles24c92ee2018-02-07 02:21:21 +00001522 TrackedWaitcntSet.insert(SWaitInst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001523#if 0 // TODO: Format the debug output
1524 OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1525 OutputTransformAdd(SWaitInst, context);
1526#endif
1527 }
1528#if 0 // TODO: ??
1529 _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1530#endif
1531 }
1532
1533 if (SWaitInst) {
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001534 LLVM_DEBUG({
Kannan Narayananacb089e2017-04-12 03:25:12 +00001535 SWaitInst->print(dbgs());
1536 dbgs() << "\nAdjusted score board:";
1537 ScoreBrackets->dump();
1538 });
1539
1540 // Add this waitcnt to the block. It is either newly created or
1541 // created in previous iterations and added back since block traversal
Mark Searles65207922018-02-19 19:19:59 +00001542 // always removes waitcnts.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001543 insertWaitcntBeforeCF(Block, SWaitInst);
1544 WaitcntData->setWaitcnt(SWaitInst);
1545 }
1546 }
1547 }
1548}
1549
1550bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00001551 ST = &MF.getSubtarget<GCNSubtarget>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001552 TII = ST->getInstrInfo();
1553 TRI = &TII->getRegisterInfo();
1554 MRI = &MF.getRegInfo();
1555 MLI = &getAnalysis<MachineLoopInfo>();
Konstantin Zhuravlyov71e43ee2018-09-12 18:50:47 +00001556 IV = AMDGPU::getIsaVersion(ST->getCPU());
Mark Searles11d0a042017-05-31 16:44:23 +00001557 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001558
Mark Searles4a0f2c52018-05-07 14:43:28 +00001559 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001560 for (auto T : inst_counter_types())
Mark Searlesec581832018-04-25 19:21:26 +00001561 ForceEmitWaitcnt[T] = false;
1562
Kannan Narayananacb089e2017-04-12 03:25:12 +00001563 HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1564 HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1565 HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
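  // E.g. on gfx9 these bitmasks yield VmcntMax = 63 (a 6-bit field split
  // across the immediate), ExpcntMax = 7, and LgkmcntMax = 15; earlier
  // targets only have a 4-bit vmcnt, so VmcntMax = 15 there.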
1566
1567 HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1568 HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1569 assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1570 assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1571
1572 RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1573 RegisterEncoding.VGPRL =
1574 RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1575 RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1576 RegisterEncoding.SGPRL =
1577 RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1578
Mark Searles24c92ee2018-02-07 02:21:21 +00001579 TrackedWaitcntSet.clear();
1580 BlockVisitedSet.clear();
1581 VCCZBugHandledSet.clear();
Mark Searles1bc6e712018-04-19 15:42:30 +00001582 LoopWaitcntDataMap.clear();
Scott Linder5792dd02018-06-21 18:48:48 +00001583 BlockWaitcntProcessedSet.clear();
Mark Searles24c92ee2018-02-07 02:21:21 +00001584
Nicolai Haehnle0ab31c92018-11-07 21:53:29 +00001585 // Walk over the blocks in reverse post order, inserting
Kannan Narayananacb089e2017-04-12 03:25:12 +00001586 // s_waitcnt where needed.
1587 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1588 bool Modified = false;
1589 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1590 I = RPOT.begin(),
1591 E = RPOT.end(), J = RPOT.begin();
1592 I != E;) {
1593 MachineBasicBlock &MBB = **I;
1594
1595 BlockVisitedSet.insert(&MBB);
1596
1597 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1598 if (!ScoreBrackets) {
Mark Searlesf0b93f12018-06-04 16:51:59 +00001599 BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001600 ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1601 }
1602 ScoreBrackets->setPostOrder(MBB.getNumber());
1603 MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1604 if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
Eugene Zelenko59e12822017-08-08 00:47:13 +00001605 LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001606
1607 // If we are walking into the block from before the loop, then guarantee
1608 // at least 1 re-walk over the loop to propagate the information, even if
1609 // no S_WAITCNT instructions were generated.
Mark Searles1bc6e712018-04-19 15:42:30 +00001610 if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1611 unsigned Count = countNumBottomBlocks(ContainingLoop);
1612
1613 // If the loop has multiple back-edges, and so more than one "bottom"
      // basic block, we have to guarantee a re-walk over every block.
1615 if ((std::count(BlockWaitcntProcessedSet.begin(),
Mark Searlesf4e70252018-07-16 10:21:36 +00001616 BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
Mark Searles1bc6e712018-04-19 15:42:30 +00001617 BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
Mark Searles10545412018-05-30 15:47:45 +00001618 LLVM_DEBUG(dbgs() << "set-revisit1: Block"
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001619 << ContainingLoop->getHeader()->getNumber() << '\n';);
Mark Searles1bc6e712018-04-19 15:42:30 +00001620 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001621 }
1622
1623 // Walk over the instructions.
1624 insertWaitcntInBlock(MF, MBB);
1625
Mark Searles10545412018-05-30 15:47:45 +00001626 // Record that waitcnts have been processed at least once for this block.
Mark Searles1bc6e712018-04-19 15:42:30 +00001627 BlockWaitcntProcessedSet.push_back(&MBB);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001628
Mark Searles1bc6e712018-04-19 15:42:30 +00001629 // See if we want to revisit the loop. If a loop has multiple back-edges,
1630 // we shouldn't revisit the same "bottom" basic block.
1631 if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1632 std::count(BlockWaitcntProcessedSet.begin(),
1633 BlockWaitcntProcessedSet.end(), &MBB) == 1) {
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001634 MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001635 BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1636 if (EntrySB && EntrySB->getRevisitLoop()) {
1637 EntrySB->setRevisitLoop(false);
1638 J = I;
1639 int32_t PostOrder = EntrySB->getPostOrder();
1640 // TODO: Avoid this loop. Find another way to set I.
1641 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1642 X = RPOT.begin(),
1643 Y = RPOT.end();
1644 X != Y; ++X) {
1645 MachineBasicBlock &MBBX = **X;
1646 if (MBBX.getNumber() == PostOrder) {
1647 I = X;
1648 break;
1649 }
1650 }
1651 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1652 WaitcntData->incIterCnt();
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001653 LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001654 continue;
1655 } else {
1656 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // The loop converged, so reset the iteration count. If this loop is
        // revisited, it must be from an outer loop; the counter will restart,
        // which ensures we don't force convergence on such revisits.
1660 WaitcntData->resetIterCnt();
1661 }
1662 }
1663
1664 J = I;
1665 ++I;
1666 }
1667
1668 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1669
1670 bool HaveScalarStores = false;
1671
1672 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1673 ++BI) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001674 MachineBasicBlock &MBB = *BI;
1675
1676 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1677 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001678 if (!HaveScalarStores && TII->isScalarStore(*I))
1679 HaveScalarStores = true;
1680
1681 if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1682 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1683 EndPgmBlocks.push_back(&MBB);
1684 }
1685 }
1686
1687 if (HaveScalarStores) {
1688 // If scalar writes are used, the cache must be flushed or else the next
1689 // wave to reuse the same scratch memory can be clobbered.
1690 //
1691 // Insert s_dcache_wb at wave termination points if there were any scalar
1692 // stores, and only if the cache hasn't already been flushed. This could be
1693 // improved by looking across blocks for flushes in postdominating blocks
    // from the stores, but an explicitly requested flush is probably very rare.
1695 for (MachineBasicBlock *MBB : EndPgmBlocks) {
1696 bool SeenDCacheWB = false;
1697
1698 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1699 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001700 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1701 SeenDCacheWB = true;
1702 else if (TII->isScalarStore(*I))
1703 SeenDCacheWB = false;
1704
1705 // FIXME: It would be better to insert this before a waitcnt if any.
1706 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1707 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1708 !SeenDCacheWB) {
1709 Modified = true;
1710 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1711 }
1712 }
1713 }
1714 }
1715
Mark Searles11d0a042017-05-31 16:44:23 +00001716 if (!MFI->isEntryFunction()) {
1717 // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them, and it's better to do the wait after the
Mark Searles11d0a042017-05-31 16:44:23 +00001719 // costly call sequence.
1720
1721 // TODO: Could insert earlier and schedule more liberally with operations
1722 // that only use caller preserved registers.
1723 MachineBasicBlock &EntryBB = MF.front();
Mark Searlesed54ff12018-05-30 16:27:57 +00001724 BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1725 .addImm(0);
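    // The immediate 0 decodes as vmcnt(0) expcnt(0) lgkmcnt(0), i.e. wait
    // for all outstanding memory operations before the function body runs.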
Mark Searles11d0a042017-05-31 16:44:23 +00001726
1727 Modified = true;
1728 }
1729
Kannan Narayananacb089e2017-04-12 03:25:12 +00001730 return Modified;
1731}