blob: 3d37d97c677fed51b499e5e5a1bcc70ee1c3e9be [file] [log] [blame]
Eugene Zelenko59e12822017-08-08 00:47:13 +00001//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
Kannan Narayananacb089e2017-04-12 03:25:12 +00002//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
Adrian Prantl5f8f34e42018-05-01 15:54:18 +000011/// Insert wait instructions for memory reads and writes.
Kannan Narayananacb089e2017-04-12 03:25:12 +000012///
13/// Memory reads and writes are issued asynchronously, so we need to insert
14/// S_WAITCNT instructions when we want to access any of their results or
15/// overwrite any register that's used asynchronously.
16//
17//===----------------------------------------------------------------------===//
18
19#include "AMDGPU.h"
20#include "AMDGPUSubtarget.h"
21#include "SIDefines.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000024#include "SIRegisterInfo.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000025#include "Utils/AMDGPUBaseInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000026#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000028#include "llvm/ADT/PostOrderIterator.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000029#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/CodeGen/MachineBasicBlock.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000032#include "llvm/CodeGen/MachineFunction.h"
33#include "llvm/CodeGen/MachineFunctionPass.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000034#include "llvm/CodeGen/MachineInstr.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000035#include "llvm/CodeGen/MachineInstrBuilder.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000036#include "llvm/CodeGen/MachineLoopInfo.h"
37#include "llvm/CodeGen/MachineMemOperand.h"
38#include "llvm/CodeGen/MachineOperand.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000039#include "llvm/CodeGen/MachineRegisterInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000040#include "llvm/IR/DebugLoc.h"
41#include "llvm/Pass.h"
42#include "llvm/Support/Debug.h"
Mark Searlesec581832018-04-25 19:21:26 +000043#include "llvm/Support/DebugCounter.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000044#include "llvm/Support/ErrorHandling.h"
45#include "llvm/Support/raw_ostream.h"
46#include <algorithm>
47#include <cassert>
48#include <cstdint>
49#include <cstring>
50#include <memory>
51#include <utility>
52#include <vector>
Kannan Narayananacb089e2017-04-12 03:25:12 +000053
Mark Searlesec581832018-04-25 19:21:26 +000054using namespace llvm;
55
Kannan Narayananacb089e2017-04-12 03:25:12 +000056#define DEBUG_TYPE "si-insert-waitcnts"
57
Mark Searlesec581832018-04-25 19:21:26 +000058DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
59 "Force emit s_waitcnt expcnt(0) instrs");
60DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
61 "Force emit s_waitcnt lgkmcnt(0) instrs");
62DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
63 "Force emit s_waitcnt vmcnt(0) instrs");
64
65static cl::opt<unsigned> ForceEmitZeroFlag(
66 "amdgpu-waitcnt-forcezero",
67 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
68 cl::init(0), cl::Hidden);
Kannan Narayananacb089e2017-04-12 03:25:12 +000069
70namespace {
71
Nicolai Haehnleae369d72018-11-29 11:06:11 +000072template <typename EnumT>
73class enum_iterator
74 : public iterator_facade_base<enum_iterator<EnumT>,
75 std::forward_iterator_tag, const EnumT> {
76 EnumT Value;
77public:
78 enum_iterator() = default;
79 enum_iterator(EnumT Value) : Value(Value) {}
80
81 enum_iterator &operator++() {
82 Value = static_cast<EnumT>(Value + 1);
83 return *this;
84 }
85
86 bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
87
88 EnumT operator*() const { return Value; }
89};
90
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
// s_waitcnt instruction needs to be emitted.
94
95#define CNT_MASK(t) (1u << (t))
96
97enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
98
Nicolai Haehnleae369d72018-11-29 11:06:11 +000099iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
100 return make_range(enum_iterator<InstCounterType>(VM_CNT),
101 enum_iterator<InstCounterType>(NUM_INST_CNTS));
102}
103
// Half-open range [first, second) of scoreboard slots covered by a register
// operand. getRegInterval() returns {-1, -1} for operands it does not track.
using RegInterval = std::pair<int, int>;

// Per-target limits. The counter maxima are read by
// BlockWaitcntBrackets::getWaitCountMax(); the values are filled in elsewhere
// in this file (not visible in this part).
struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

// Encoding values of the first/last VGPR and SGPR. getRegInterval() uses
// VGPR0/SGPR0 as the base when turning an encoding into a scoreboard slot and
// VGPRL as an upper bound in its assertions.
struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;
120
121enum WaitEventType {
122 VMEM_ACCESS, // vector-memory read & write
123 LDS_ACCESS, // lds read & write
124 GDS_ACCESS, // gds read & write
125 SQ_MESSAGE, // send message
126 SMEM_ACCESS, // scalar-memory read & write
127 EXP_GPR_LOCK, // export holding on its data src
128 GDS_GPR_LOCK, // GDS holding on its data and addr src
129 EXP_POS_ACCESS, // write to export position
130 EXP_PARAM_ACCESS, // write to export parameter
131 VMW_GPR_LOCK, // vector-memory write holding on its data src
132 NUM_WAIT_EVENTS,
133};
134
Nicolai Haehnleae369d72018-11-29 11:06:11 +0000135iterator_range<enum_iterator<WaitEventType>> wait_event_types() {
136 return make_range(enum_iterator<WaitEventType>(VMEM_ACCESS),
137 enum_iterator<WaitEventType>(NUM_WAIT_EVENTS));
138}
139
// Layout of the scoreboard slot index space:
//   0 .. SQ_MAX_PGM_VGPRS-1                            real VGPRs
//   SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                extra VGPR-like slots
//   NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1  real SGPRs
// A fixed number of VGPR-like slots is reserved in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_VGPRS = 256,
  // Maximum programmable SGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256,
  // A reserved slot for DS.
  NUM_EXTRA_VGPRS = 1,
  // This is a placeholder the Shader algorithm uses.
  EXTRA_VGPR_LDS = 0,
  // Where SGPR starts.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS,
};
153
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000154void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
155 switch (T) {
156 case VM_CNT:
157 Wait.VmCnt = std::min(Wait.VmCnt, Count);
158 break;
159 case EXP_CNT:
160 Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
161 break;
162 case LGKM_CNT:
163 Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
164 break;
165 default:
166 llvm_unreachable("bad InstCounterType");
167 }
168}
169
Kannan Narayananacb089e2017-04-12 03:25:12 +0000170// This is a per-basic-block object that maintains current score brackets
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000171// of each wait counter, and a per-register scoreboard for each wait counter.
Kannan Narayananacb089e2017-04-12 03:25:12 +0000172// We also maintain the latest score for every event type that can change the
173// waitcnt in order to know if there are multiple types of events within
174// the brackets. When multiple types of event happen in the bracket,
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000175// wait count may get decreased out of order, therefore we need to put in
Kannan Narayananacb089e2017-04-12 03:25:12 +0000176// "s_waitcnt 0" before use.
177class BlockWaitcntBrackets {
178public:
Tom Stellard5bfbae52018-07-11 20:59:01 +0000179 BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
Nicolai Haehnleae369d72018-11-29 11:06:11 +0000180 for (auto T : inst_counter_types())
Eugene Zelenko59e12822017-08-08 00:47:13 +0000181 memset(VgprScores[T], 0, sizeof(VgprScores[T]));
Eugene Zelenko59e12822017-08-08 00:47:13 +0000182 }
183
184 ~BlockWaitcntBrackets() = default;
185
Kannan Narayananacb089e2017-04-12 03:25:12 +0000186 static int32_t getWaitCountMax(InstCounterType T) {
187 switch (T) {
188 case VM_CNT:
189 return HardwareLimits.VmcntMax;
190 case LGKM_CNT:
191 return HardwareLimits.LgkmcntMax;
192 case EXP_CNT:
193 return HardwareLimits.ExpcntMax;
194 default:
195 break;
196 }
197 return 0;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000198 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000199
200 void setScoreLB(InstCounterType T, int32_t Val) {
201 assert(T < NUM_INST_CNTS);
202 if (T >= NUM_INST_CNTS)
203 return;
204 ScoreLBs[T] = Val;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000205 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000206
207 void setScoreUB(InstCounterType T, int32_t Val) {
208 assert(T < NUM_INST_CNTS);
209 if (T >= NUM_INST_CNTS)
210 return;
211 ScoreUBs[T] = Val;
212 if (T == EXP_CNT) {
213 int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
214 if (ScoreLBs[T] < UB)
215 ScoreLBs[T] = UB;
216 }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000217 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000218
Nicolai Haehnlec548d912018-11-19 12:03:11 +0000219 int32_t getScoreLB(InstCounterType T) const {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000220 assert(T < NUM_INST_CNTS);
221 if (T >= NUM_INST_CNTS)
222 return 0;
223 return ScoreLBs[T];
Eugene Zelenko59e12822017-08-08 00:47:13 +0000224 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000225
Nicolai Haehnlec548d912018-11-19 12:03:11 +0000226 int32_t getScoreUB(InstCounterType T) const {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000227 assert(T < NUM_INST_CNTS);
228 if (T >= NUM_INST_CNTS)
229 return 0;
230 return ScoreUBs[T];
Eugene Zelenko59e12822017-08-08 00:47:13 +0000231 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000232
233 // Mapping from event to counter.
234 InstCounterType eventCounter(WaitEventType E) {
235 switch (E) {
236 case VMEM_ACCESS:
237 return VM_CNT;
238 case LDS_ACCESS:
239 case GDS_ACCESS:
240 case SQ_MESSAGE:
241 case SMEM_ACCESS:
242 return LGKM_CNT;
243 case EXP_GPR_LOCK:
244 case GDS_GPR_LOCK:
245 case VMW_GPR_LOCK:
246 case EXP_POS_ACCESS:
247 case EXP_PARAM_ACCESS:
248 return EXP_CNT;
249 default:
250 llvm_unreachable("unhandled event type");
251 }
252 return NUM_INST_CNTS;
253 }
254
255 void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
256 if (GprNo < NUM_ALL_VGPRS) {
257 if (GprNo > VgprUB) {
258 VgprUB = GprNo;
259 }
260 VgprScores[T][GprNo] = Val;
261 } else {
262 assert(T == LGKM_CNT);
263 if (GprNo - NUM_ALL_VGPRS > SgprUB) {
264 SgprUB = GprNo - NUM_ALL_VGPRS;
265 }
266 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
267 }
268 }
269
270 int32_t getRegScore(int GprNo, InstCounterType T) {
271 if (GprNo < NUM_ALL_VGPRS) {
272 return VgprScores[T][GprNo];
273 }
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000274 assert(T == LGKM_CNT);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000275 return SgprScores[GprNo - NUM_ALL_VGPRS];
276 }
277
278 void clear() {
279 memset(ScoreLBs, 0, sizeof(ScoreLBs));
280 memset(ScoreUBs, 0, sizeof(ScoreUBs));
281 memset(EventUBs, 0, sizeof(EventUBs));
Nicolai Haehnleae369d72018-11-29 11:06:11 +0000282 for (auto T : inst_counter_types())
Kannan Narayananacb089e2017-04-12 03:25:12 +0000283 memset(VgprScores[T], 0, sizeof(VgprScores[T]));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000284 memset(SgprScores, 0, sizeof(SgprScores));
285 }
286
287 RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
288 const MachineRegisterInfo *MRI,
289 const SIRegisterInfo *TRI, unsigned OpNo,
290 bool Def) const;
291
292 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
293 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
294 unsigned OpNo, int32_t Val);
295
296 void setWaitAtBeginning() { WaitAtBeginning = true; }
297 void clearWaitAtBeginning() { WaitAtBeginning = false; }
298 bool getWaitAtBeginning() const { return WaitAtBeginning; }
299 void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
300 int32_t getMaxVGPR() const { return VgprUB; }
301 int32_t getMaxSGPR() const { return SgprUB; }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000302
Kannan Narayananacb089e2017-04-12 03:25:12 +0000303 int32_t getEventUB(enum WaitEventType W) const {
304 assert(W < NUM_WAIT_EVENTS);
305 return EventUBs[W];
306 }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000307
Nicolai Haehnlec548d912018-11-19 12:03:11 +0000308 bool counterOutOfOrder(InstCounterType T) const;
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000309 bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
310 bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
311 void determineWait(InstCounterType T, int ScoreToWait,
312 AMDGPU::Waitcnt &Wait) const;
313 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
314 void applyWaitcnt(InstCounterType T, unsigned Count);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000315 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
316 const MachineRegisterInfo *MRI, WaitEventType E,
317 MachineInstr &MI);
318
Kannan Narayananacb089e2017-04-12 03:25:12 +0000319 bool hasPendingSMEM() const {
320 return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
321 EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
322 }
323
324 bool hasPendingFlat() const {
325 return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
326 LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
327 (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
328 LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
329 }
330
331 void setPendingFlat() {
332 LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
333 LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
334 }
335
336 int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
337
338 void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
339
340 bool getRevisitLoop() const { return RevisitLoop; }
341 void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
342
343 void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
344 int32_t getPostOrder() const { return PostOrder; }
345
Kannan Narayananacb089e2017-04-12 03:25:12 +0000346 bool mixedExpTypes() const { return MixedExpTypes; }
347 void setMixedExpTypes(bool MixedExpTypesIn) {
348 MixedExpTypes = MixedExpTypesIn;
349 }
350
351 void print(raw_ostream &);
352 void dump() { print(dbgs()); }
353
354private:
Tom Stellard5bfbae52018-07-11 20:59:01 +0000355 const GCNSubtarget *ST = nullptr;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000356 bool WaitAtBeginning = false;
357 bool RevisitLoop = false;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000358 bool MixedExpTypes = false;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000359 int32_t PostOrder = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000360 int32_t ScoreLBs[NUM_INST_CNTS] = {0};
361 int32_t ScoreUBs[NUM_INST_CNTS] = {0};
362 int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
363 // Remember the last flat memory operation.
364 int32_t LastFlat[NUM_INST_CNTS] = {0};
365 // wait_cnt scores for every vgpr.
366 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000367 int32_t VgprUB = 0;
368 int32_t SgprUB = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000369 int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
370 // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
371 int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
372};
373
374// This is a per-loop-region object that records waitcnt status at the end of
375// loop footer from the previous iteration. We also maintain an iteration
376// count to track the number of times the loop has been visited. When it
377// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
378// at the end of the loop footer.
379class LoopWaitcntData {
380public:
Eugene Zelenko59e12822017-08-08 00:47:13 +0000381 LoopWaitcntData() = default;
382 ~LoopWaitcntData() = default;
383
Kannan Narayananacb089e2017-04-12 03:25:12 +0000384 void incIterCnt() { IterCnt++; }
385 void resetIterCnt() { IterCnt = 0; }
Mark Searles10545412018-05-30 15:47:45 +0000386 unsigned getIterCnt() { return IterCnt; }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000387
Kannan Narayananacb089e2017-04-12 03:25:12 +0000388 void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
389 MachineInstr *getWaitcnt() const { return LfWaitcnt; }
390
Nicola Zaghend34e60c2018-05-14 12:53:11 +0000391 void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000392
393private:
394 // s_waitcnt added at the end of loop footer to stablize wait scores
395 // at the end of the loop footer.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000396 MachineInstr *LfWaitcnt = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000397 // Number of iterations the loop has been visited, not including the initial
398 // walk over.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000399 int32_t IterCnt = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000400};
401
402class SIInsertWaitcnts : public MachineFunctionPass {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000403private:
Tom Stellard5bfbae52018-07-11 20:59:01 +0000404 const GCNSubtarget *ST = nullptr;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000405 const SIInstrInfo *TII = nullptr;
406 const SIRegisterInfo *TRI = nullptr;
407 const MachineRegisterInfo *MRI = nullptr;
408 const MachineLoopInfo *MLI = nullptr;
Konstantin Zhuravlyov71e43ee2018-09-12 18:50:47 +0000409 AMDGPU::IsaVersion IV;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000410
411 DenseSet<MachineBasicBlock *> BlockVisitedSet;
Mark Searles24c92ee2018-02-07 02:21:21 +0000412 DenseSet<MachineInstr *> TrackedWaitcntSet;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000413 DenseSet<MachineInstr *> VCCZBugHandledSet;
414
415 DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
416 BlockWaitcntBracketsMap;
417
Mark Searles1bc6e712018-04-19 15:42:30 +0000418 std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000419
420 DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
421
Mark Searles4a0f2c52018-05-07 14:43:28 +0000422 // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
423 // because of amdgpu-waitcnt-forcezero flag
424 bool ForceEmitZeroWaitcnts;
Mark Searlesec581832018-04-25 19:21:26 +0000425 bool ForceEmitWaitcnt[NUM_INST_CNTS];
426
Kannan Narayananacb089e2017-04-12 03:25:12 +0000427public:
428 static char ID;
429
Konstantin Zhuravlyov77747772018-06-26 21:33:38 +0000430 SIInsertWaitcnts() : MachineFunctionPass(ID) {
431 (void)ForceExpCounter;
432 (void)ForceLgkmCounter;
433 (void)ForceVMCounter;
434 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000435
436 bool runOnMachineFunction(MachineFunction &MF) override;
437
438 StringRef getPassName() const override {
439 return "SI insert wait instructions";
440 }
441
442 void getAnalysisUsage(AnalysisUsage &AU) const override {
443 AU.setPreservesCFG();
444 AU.addRequired<MachineLoopInfo>();
445 MachineFunctionPass::getAnalysisUsage(AU);
446 }
447
Mark Searlesec581832018-04-25 19:21:26 +0000448 bool isForceEmitWaitcnt() const {
Nicolai Haehnleae369d72018-11-29 11:06:11 +0000449 for (auto T : inst_counter_types())
Mark Searlesec581832018-04-25 19:21:26 +0000450 if (ForceEmitWaitcnt[T])
451 return true;
452 return false;
453 }
454
455 void setForceEmitWaitcnt() {
456// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
457// For debug builds, get the debug counter info and adjust if need be
458#ifndef NDEBUG
459 if (DebugCounter::isCounterSet(ForceExpCounter) &&
460 DebugCounter::shouldExecute(ForceExpCounter)) {
461 ForceEmitWaitcnt[EXP_CNT] = true;
462 } else {
463 ForceEmitWaitcnt[EXP_CNT] = false;
464 }
465
466 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
467 DebugCounter::shouldExecute(ForceLgkmCounter)) {
468 ForceEmitWaitcnt[LGKM_CNT] = true;
469 } else {
470 ForceEmitWaitcnt[LGKM_CNT] = false;
471 }
472
473 if (DebugCounter::isCounterSet(ForceVMCounter) &&
474 DebugCounter::shouldExecute(ForceVMCounter)) {
475 ForceEmitWaitcnt[VM_CNT] = true;
476 } else {
477 ForceEmitWaitcnt[VM_CNT] = false;
478 }
479#endif // NDEBUG
480 }
481
Matt Arsenault0ed39d32017-07-21 18:54:54 +0000482 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
Mark Searles70901b92018-04-24 15:59:59 +0000483 void generateWaitcntInstBefore(MachineInstr &MI,
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000484 BlockWaitcntBrackets *ScoreBrackets,
485 MachineInstr *OldWaitcntInstr);
Mark Searles70901b92018-04-24 15:59:59 +0000486 void updateEventWaitcntAfter(MachineInstr &Inst,
Kannan Narayananacb089e2017-04-12 03:25:12 +0000487 BlockWaitcntBrackets *ScoreBrackets);
488 void mergeInputScoreBrackets(MachineBasicBlock &Block);
Mark Searles1bc6e712018-04-19 15:42:30 +0000489 bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
490 unsigned countNumBottomBlocks(const MachineLoop *Loop);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000491 void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
492 void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
493};
494
Eugene Zelenko59e12822017-08-08 00:47:13 +0000495} // end anonymous namespace
Kannan Narayananacb089e2017-04-12 03:25:12 +0000496
497RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
498 const SIInstrInfo *TII,
499 const MachineRegisterInfo *MRI,
500 const SIRegisterInfo *TRI,
501 unsigned OpNo,
502 bool Def) const {
503 const MachineOperand &Op = MI->getOperand(OpNo);
504 if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
505 (Def && !Op.isDef()))
506 return {-1, -1};
507
508 // A use via a PW operand does not need a waitcnt.
509 // A partial write is not a WAW.
510 assert(!Op.getSubReg() || !Op.isUndef());
511
512 RegInterval Result;
513 const MachineRegisterInfo &MRIA = *MRI;
514
515 unsigned Reg = TRI->getEncodingValue(Op.getReg());
516
517 if (TRI->isVGPR(MRIA, Op.getReg())) {
518 assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
519 Result.first = Reg - RegisterEncoding.VGPR0;
520 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
521 } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
522 assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
523 Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
524 assert(Result.first >= NUM_ALL_VGPRS &&
525 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
526 }
527 // TODO: Handle TTMP
528 // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
529 else
530 return {-1, -1};
531
532 const MachineInstr &MIA = *MI;
533 const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +0000534 unsigned Size = TRI->getRegSizeInBits(*RC);
535 Result.second = Result.first + (Size / 32);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000536
537 return Result;
538}
539
540void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
541 const SIInstrInfo *TII,
542 const SIRegisterInfo *TRI,
543 const MachineRegisterInfo *MRI,
544 unsigned OpNo, int32_t Val) {
545 RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
Nicola Zaghend34e60c2018-05-14 12:53:11 +0000546 LLVM_DEBUG({
Kannan Narayananacb089e2017-04-12 03:25:12 +0000547 const MachineOperand &Opnd = MI->getOperand(OpNo);
548 assert(TRI->isVGPR(*MRI, Opnd.getReg()));
549 });
550 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
551 setRegScore(RegNo, EXP_CNT, Val);
552 }
553}
554
555void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
556 const SIRegisterInfo *TRI,
557 const MachineRegisterInfo *MRI,
558 WaitEventType E, MachineInstr &Inst) {
559 const MachineRegisterInfo &MRIA = *MRI;
560 InstCounterType T = eventCounter(E);
561 int32_t CurrScore = getScoreUB(T) + 1;
562 // EventUB and ScoreUB need to be update regardless if this event changes
563 // the score of a register or not.
564 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
565 EventUBs[E] = CurrScore;
566 setScoreUB(T, CurrScore);
567
568 if (T == EXP_CNT) {
569 // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
570 // is required.
571 if (!MixedExpTypes) {
572 MixedExpTypes = counterOutOfOrder(EXP_CNT);
573 }
574
575 // Put score on the source vgprs. If this is a store, just use those
576 // specific register(s).
577 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
578 // All GDS operations must protect their address register (same as
579 // export.)
580 if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
581 Inst.getOpcode() != AMDGPU::DS_CONSUME) {
582 setExpScore(
583 &Inst, TII, TRI, MRI,
584 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
585 CurrScore);
586 }
587 if (Inst.mayStore()) {
588 setExpScore(
589 &Inst, TII, TRI, MRI,
590 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
591 CurrScore);
592 if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
593 AMDGPU::OpName::data1) != -1) {
594 setExpScore(&Inst, TII, TRI, MRI,
595 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
596 AMDGPU::OpName::data1),
597 CurrScore);
598 }
599 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
600 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
601 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
602 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
603 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
604 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
605 Inst.getOpcode() != AMDGPU::DS_APPEND &&
606 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
607 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
608 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
609 const MachineOperand &Op = Inst.getOperand(I);
610 if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
611 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
612 }
613 }
614 }
615 } else if (TII->isFLAT(Inst)) {
616 if (Inst.mayStore()) {
617 setExpScore(
618 &Inst, TII, TRI, MRI,
619 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
620 CurrScore);
621 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
622 setExpScore(
623 &Inst, TII, TRI, MRI,
624 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
625 CurrScore);
626 }
627 } else if (TII->isMIMG(Inst)) {
628 if (Inst.mayStore()) {
629 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
630 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
631 setExpScore(
632 &Inst, TII, TRI, MRI,
633 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
634 CurrScore);
635 }
636 } else if (TII->isMTBUF(Inst)) {
637 if (Inst.mayStore()) {
638 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
639 }
640 } else if (TII->isMUBUF(Inst)) {
641 if (Inst.mayStore()) {
642 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
643 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
644 setExpScore(
645 &Inst, TII, TRI, MRI,
646 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
647 CurrScore);
648 }
649 } else {
650 if (TII->isEXP(Inst)) {
651 // For export the destination registers are really temps that
652 // can be used as the actual source after export patching, so
653 // we need to treat them like sources and set the EXP_CNT
654 // score.
655 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
656 MachineOperand &DefMO = Inst.getOperand(I);
657 if (DefMO.isReg() && DefMO.isDef() &&
658 TRI->isVGPR(MRIA, DefMO.getReg())) {
659 setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
660 CurrScore);
661 }
662 }
663 }
664 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
665 MachineOperand &MO = Inst.getOperand(I);
666 if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
667 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
668 }
669 }
670 }
671#if 0 // TODO: check if this is handled by MUBUF code above.
672 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
Evgeny Mankovbf975172017-08-16 16:47:29 +0000673 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
674 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000675 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
676 unsigned OpNo;//TODO: find the OpNo for this operand;
677 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
678 for (signed RegNo = Interval.first; RegNo < Interval.second;
Evgeny Mankovbf975172017-08-16 16:47:29 +0000679 ++RegNo) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000680 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
681 }
682#endif
683 } else {
684 // Match the score to the destination registers.
685 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
686 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
687 if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
688 continue;
689 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
690 setRegScore(RegNo, T, CurrScore);
691 }
692 }
693 if (TII->isDS(Inst) && Inst.mayStore()) {
694 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
695 }
696 }
697}
698
699void BlockWaitcntBrackets::print(raw_ostream &OS) {
700 OS << '\n';
Nicolai Haehnleae369d72018-11-29 11:06:11 +0000701 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000702 int LB = getScoreLB(T);
703 int UB = getScoreUB(T);
704
705 switch (T) {
706 case VM_CNT:
707 OS << " VM_CNT(" << UB - LB << "): ";
708 break;
709 case LGKM_CNT:
710 OS << " LGKM_CNT(" << UB - LB << "): ";
711 break;
712 case EXP_CNT:
713 OS << " EXP_CNT(" << UB - LB << "): ";
714 break;
715 default:
716 OS << " UNKNOWN(" << UB - LB << "): ";
717 break;
718 }
719
720 if (LB < UB) {
721 // Print vgpr scores.
722 for (int J = 0; J <= getMaxVGPR(); J++) {
723 int RegScore = getRegScore(J, T);
724 if (RegScore <= LB)
725 continue;
726 int RelScore = RegScore - LB - 1;
727 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
728 OS << RelScore << ":v" << J << " ";
729 } else {
730 OS << RelScore << ":ds ";
731 }
732 }
733 // Also need to print sgpr scores for lgkm_cnt.
734 if (T == LGKM_CNT) {
735 for (int J = 0; J <= getMaxSGPR(); J++) {
736 int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
737 if (RegScore <= LB)
738 continue;
739 int RelScore = RegScore - LB - 1;
740 OS << RelScore << ":s" << J << " ";
741 }
742 }
743 }
744 OS << '\n';
745 }
746 OS << '\n';
Kannan Narayananacb089e2017-04-12 03:25:12 +0000747}
748
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000749/// Simplify the waitcnt, in the sense of removing redundant counts, and return
750/// whether a waitcnt instruction is needed at all.
751bool BlockWaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
752 return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
753 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
754 simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
755}
756
757bool BlockWaitcntBrackets::simplifyWaitcnt(InstCounterType T,
758 unsigned &Count) const {
759 const int32_t LB = getScoreLB(T);
760 const int32_t UB = getScoreUB(T);
761 if (Count < (unsigned)UB && UB - (int32_t)Count > LB)
762 return true;
763
764 Count = ~0u;
765 return false;
766}
767
768void BlockWaitcntBrackets::determineWait(InstCounterType T, int ScoreToWait,
769 AMDGPU::Waitcnt &Wait) const {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000770 if (ScoreToWait == -1) {
771 // The score to wait is unknown. This implies that it was not encountered
772 // during the path of the CFG walk done during the current traversal but
773 // may be seen on a different path. Emit an s_wait counter with a
774 // conservative value of 0 for the counter.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000775 addWait(Wait, T, 0);
776 return;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000777 }
778
779 // If the score of src_operand falls within the bracket, we need an
780 // s_waitcnt instruction.
781 const int32_t LB = getScoreLB(T);
782 const int32_t UB = getScoreUB(T);
783 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
Mark Searlesf0b93f12018-06-04 16:51:59 +0000784 if ((T == VM_CNT || T == LGKM_CNT) &&
785 hasPendingFlat() &&
786 !ST->hasFlatLgkmVMemCountInOrder()) {
787 // If there is a pending FLAT operation, and this is a VMem or LGKM
788 // waitcnt and the target can report early completion, then we need
789 // to force a waitcnt 0.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000790 addWait(Wait, T, 0);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000791 } else if (counterOutOfOrder(T)) {
792 // Counter can get decremented out-of-order when there
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000793 // are multiple types event in the bracket. Also emit an s_wait counter
Kannan Narayananacb089e2017-04-12 03:25:12 +0000794 // with a conservative value of 0 for the counter.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000795 addWait(Wait, T, 0);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000796 } else {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000797 addWait(Wait, T, UB - ScoreToWait);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000798 }
799 }
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000800}
Kannan Narayananacb089e2017-04-12 03:25:12 +0000801
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000802void BlockWaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
803 applyWaitcnt(VM_CNT, Wait.VmCnt);
804 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
805 applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
806
807 if (Wait.ExpCnt == 0)
808 setMixedExpTypes(false);
809}
810
811void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
812 const int32_t UB = getScoreUB(T);
813 if (Count >= (unsigned)UB)
814 return;
815 if (Count != 0) {
816 if (counterOutOfOrder(T))
817 return;
818 setScoreLB(T, std::max(getScoreLB(T), UB - (int32_t)Count));
819 } else {
820 setScoreLB(T, UB);
821 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000822}
823
824// Where there are multiple types of event in the bracket of a counter,
825// the decrement may go out of order.
Nicolai Haehnlec548d912018-11-19 12:03:11 +0000826bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000827 switch (T) {
828 case VM_CNT:
829 return false;
830 case LGKM_CNT: {
831 if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
832 EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
833 // Scalar memory read always can go out of order.
834 return true;
835 }
836 int NumEventTypes = 0;
837 if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
838 EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
839 NumEventTypes++;
840 }
841 if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
842 EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
843 NumEventTypes++;
844 }
845 if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
846 EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
847 NumEventTypes++;
848 }
849 if (NumEventTypes <= 1) {
850 return false;
851 }
852 break;
853 }
854 case EXP_CNT: {
855 // If there has been a mixture of export types, then a waitcnt exp(0) is
856 // required.
857 if (MixedExpTypes)
858 return true;
859 int NumEventTypes = 0;
860 if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
861 EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
862 NumEventTypes++;
863 }
864 if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
865 EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
866 NumEventTypes++;
867 }
868 if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
869 EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
870 NumEventTypes++;
871 }
872 if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
873 EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
874 NumEventTypes++;
875 }
876
877 if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
878 EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
879 NumEventTypes++;
880 }
881
882 if (NumEventTypes <= 1) {
883 return false;
884 }
885 break;
886 }
887 default:
888 break;
889 }
890 return true;
891}
892
// Register SIInsertWaitcnts with the pass registry under DEBUG_TYPE with the
// display name "SI Insert Waitcnts". No dependencies are declared between the
// BEGIN and END macros. The two trailing 'false' arguments are the CFG-only
// and is-analysis flags of LLVM's pass-registration macros.
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

// Storage for the pass identifier.
char SIInsertWaitcnts::ID = 0;

// Expose the pass identifier in the llvm namespace as an alias of the
// class-static ID above.
char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

/// Create a new, heap-allocated instance of the pass. The raw pointer is
/// returned to the caller (presumably the pass manager takes ownership).
FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}
905
906static bool readsVCCZ(const MachineInstr &MI) {
907 unsigned Opc = MI.getOpcode();
908 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
909 !MI.getOperand(1).isUndef();
910}
911
Adrian Prantl5f8f34e42018-05-01 15:54:18 +0000912/// Generate s_waitcnt instruction to be placed before cur_Inst.
Kannan Narayananacb089e2017-04-12 03:25:12 +0000913/// Instructions of a given type are returned in order,
914/// but instructions of different types can complete out of order.
915/// We rely on this in-order completion
916/// and simply assign a score to the memory access instructions.
917/// We keep track of the active "score bracket" to determine
918/// if an access of a memory read requires an s_waitcnt
919/// and if so what the value of each counter is.
920/// The "score bracket" is bound by the lower bound and upper bound
921/// scores (*_score_LB and *_score_ub respectively).
Mark Searles70901b92018-04-24 15:59:59 +0000922void SIInsertWaitcnts::generateWaitcntInstBefore(
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000923 MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets,
924 MachineInstr *OldWaitcntInstr) {
Mark Searles4a0f2c52018-05-07 14:43:28 +0000925 setForceEmitWaitcnt();
Mark Searlesec581832018-04-25 19:21:26 +0000926 bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
927
Nicolai Haehnle61396ff2018-11-07 21:53:36 +0000928 if (MI.isDebugInstr())
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +0000929 return;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000930
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000931 AMDGPU::Waitcnt Wait;
932
Kannan Narayananacb089e2017-04-12 03:25:12 +0000933 // See if an s_waitcnt is forced at block entry, or is needed at
934 // program end.
935 if (ScoreBrackets->getWaitAtBeginning()) {
936 // Note that we have already cleared the state, so we don't need to update
937 // it.
938 ScoreBrackets->clearWaitAtBeginning();
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000939 Wait = AMDGPU::Waitcnt::allZero();
Kannan Narayananacb089e2017-04-12 03:25:12 +0000940 }
941
942 // See if this instruction has a forced S_WAITCNT VM.
943 // TODO: Handle other cases of NeedsWaitcntVmBefore()
944 else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
945 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
946 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000947 Wait.VmCnt = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000948 }
949
950 // All waits must be resolved at call return.
951 // NOTE: this could be improved with knowledge of all call sites or
952 // with knowledge of the called routines.
Tom Stellardc5a154d2018-06-28 23:47:12 +0000953 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
Mark Searles11d0a042017-05-31 16:44:23 +0000954 MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000955 Wait = AMDGPU::Waitcnt::allZero();
Kannan Narayananacb089e2017-04-12 03:25:12 +0000956 }
957 // Resolve vm waits before gs-done.
958 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
959 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
960 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
961 AMDGPU::SendMsg::ID_GS_DONE)) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000962 Wait.VmCnt = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000963 }
964#if 0 // TODO: the following blocks of logic when we have fence.
965 else if (MI.getOpcode() == SC_FENCE) {
966 const unsigned int group_size =
967 context->shader_info->GetMaxThreadGroupSize();
968 // group_size == 0 means thread group size is unknown at compile time
969 const bool group_is_multi_wave =
970 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
971 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
972
973 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
974 SCRegType src_type = Inst->GetSrcType(i);
975 switch (src_type) {
976 case SCMEM_LDS:
977 if (group_is_multi_wave ||
Evgeny Mankovbf975172017-08-16 16:47:29 +0000978 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
Mark Searles70901b92018-04-24 15:59:59 +0000979 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
Kannan Narayananacb089e2017-04-12 03:25:12 +0000980 ScoreBrackets->getScoreUB(LGKM_CNT));
981 // LDS may have to wait for VM_CNT after buffer load to LDS
982 if (target_info->HasBufferLoadToLDS()) {
Mark Searles70901b92018-04-24 15:59:59 +0000983 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
Kannan Narayananacb089e2017-04-12 03:25:12 +0000984 ScoreBrackets->getScoreUB(VM_CNT));
985 }
986 }
987 break;
988
989 case SCMEM_GDS:
990 if (group_is_multi_wave || fence_is_global) {
Mark Searles70901b92018-04-24 15:59:59 +0000991 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000992 ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +0000993 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000994 ScoreBrackets->getScoreUB(LGKM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000995 }
996 break;
997
998 case SCMEM_UAV:
999 case SCMEM_TFBUF:
1000 case SCMEM_RING:
1001 case SCMEM_SCATTER:
1002 if (group_is_multi_wave || fence_is_global) {
Mark Searles70901b92018-04-24 15:59:59 +00001003 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +00001004 ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001005 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +00001006 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +00001007 }
1008 break;
1009
1010 case SCMEM_SCRATCH:
1011 default:
1012 break;
1013 }
1014 }
1015 }
1016#endif
1017
1018 // Export & GDS instructions do not read the EXEC mask until after the export
1019 // is granted (which can occur well after the instruction is issued).
1020 // The shader program must flush all EXP operations on the export-count
1021 // before overwriting the EXEC mask.
1022 else {
1023 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1024 // Export and GDS are tracked individually, either may trigger a waitcnt
1025 // for EXEC.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001026 ScoreBrackets->determineWait(
1027 EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK), Wait);
1028 ScoreBrackets->determineWait(
1029 EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS), Wait);
1030 ScoreBrackets->determineWait(
1031 EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS), Wait);
1032 ScoreBrackets->determineWait(
1033 EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK), Wait);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001034 }
1035
1036#if 0 // TODO: the following code to handle CALL.
1037 // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
1038 // However, there is a problem with EXP_CNT, because the call cannot
1039 // easily tell if a register is used in the function, and if it did, then
1040 // the referring instruction would have to have an S_WAITCNT, which is
1041 // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
1042 // before the call.
1043 if (MI.getOpcode() == SC_CALL) {
1044 if (ScoreBrackets->getScoreUB(EXP_CNT) >
Evgeny Mankovbf975172017-08-16 16:47:29 +00001045 ScoreBrackets->getScoreLB(EXP_CNT)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001046 ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001047 EmitWaitcnt |= CNT_MASK(EXP_CNT);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001048 }
1049 }
1050#endif
1051
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001052 // FIXME: Should not be relying on memoperands.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001053 // Look at the source operands of every instruction to see if
1054 // any of them results from a previous memory operation that affects
1055 // its current usage. If so, an s_waitcnt instruction needs to be
1056 // emitted.
1057 // If the source operand was defined by a load, add the s_waitcnt
1058 // instruction.
1059 for (const MachineMemOperand *Memop : MI.memoperands()) {
1060 unsigned AS = Memop->getAddrSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00001061 if (AS != AMDGPUAS::LOCAL_ADDRESS)
Kannan Narayananacb089e2017-04-12 03:25:12 +00001062 continue;
1063 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1064 // VM_CNT is only relevant to vgpr or LDS.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001065 ScoreBrackets->determineWait(
1066 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001067 }
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001068
Kannan Narayananacb089e2017-04-12 03:25:12 +00001069 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1070 const MachineOperand &Op = MI.getOperand(I);
1071 const MachineRegisterInfo &MRIA = *MRI;
1072 RegInterval Interval =
1073 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
1074 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1075 if (TRI->isVGPR(MRIA, Op.getReg())) {
1076 // VM_CNT is only relevant to vgpr or LDS.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001077 ScoreBrackets->determineWait(
1078 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001079 }
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001080 ScoreBrackets->determineWait(
1081 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001082 }
1083 }
1084 // End of for loop that looks at all source operands to decide vm_wait_cnt
1085 // and lgk_wait_cnt.
1086
1087 // Two cases are handled for destination operands:
1088 // 1) If the destination operand was defined by a load, add the s_waitcnt
1089 // instruction to guarantee the right WAW order.
1090 // 2) If a destination operand that was used by a recent export/store ins,
1091 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1092 if (MI.mayStore()) {
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001093 // FIXME: Should not be relying on memoperands.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001094 for (const MachineMemOperand *Memop : MI.memoperands()) {
1095 unsigned AS = Memop->getAddrSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00001096 if (AS != AMDGPUAS::LOCAL_ADDRESS)
Kannan Narayananacb089e2017-04-12 03:25:12 +00001097 continue;
1098 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001099 ScoreBrackets->determineWait(
1100 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
1101 ScoreBrackets->determineWait(
1102 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001103 }
1104 }
1105 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1106 MachineOperand &Def = MI.getOperand(I);
1107 const MachineRegisterInfo &MRIA = *MRI;
1108 RegInterval Interval =
1109 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
1110 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1111 if (TRI->isVGPR(MRIA, Def.getReg())) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001112 ScoreBrackets->determineWait(
1113 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
1114 ScoreBrackets->determineWait(
1115 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001116 }
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001117 ScoreBrackets->determineWait(
1118 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001119 }
1120 } // End of for loop that looks at all dest operands.
1121 }
1122
Kannan Narayananacb089e2017-04-12 03:25:12 +00001123 // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1124 // occurs before the instruction. Doing it here prevents any additional
1125 // S_WAITCNTs from being emitted if the instruction was marked as
1126 // requiring a WAITCNT beforehand.
Konstantin Zhuravlyovbe6c0ca2017-06-02 17:40:26 +00001127 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1128 !ST->hasAutoWaitcntBeforeBarrier()) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001129 Wait = AMDGPU::Waitcnt::allZero();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001130 }
1131
1132 // TODO: Remove this work-around, enable the assert for Bug 457939
1133 // after fixing the scheduler. Also, the Shader Compiler code is
1134 // independent of target.
Tom Stellardc5a154d2018-06-28 23:47:12 +00001135 if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001136 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1137 ScoreBrackets->getScoreUB(LGKM_CNT) &&
1138 ScoreBrackets->hasPendingSMEM()) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001139 Wait.LgkmCnt = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001140 }
1141 }
1142
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001143 // Early-out if no wait is indicated.
1144 if (!ScoreBrackets->simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
1145 if (OldWaitcntInstr) {
1146 if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
1147 TrackedWaitcntSet.erase(OldWaitcntInstr);
1148 OldWaitcntInstr->eraseFromParent();
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001149 } else {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001150 int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
1151 ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001152 }
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001153 }
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001154 return;
1155 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001156
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001157 if (ForceEmitZeroWaitcnts)
1158 Wait = AMDGPU::Waitcnt::allZero();
1159
1160 if (ForceEmitWaitcnt[VM_CNT])
1161 Wait.VmCnt = 0;
1162 if (ForceEmitWaitcnt[EXP_CNT])
1163 Wait.ExpCnt = 0;
1164 if (ForceEmitWaitcnt[LGKM_CNT])
1165 Wait.LgkmCnt = 0;
1166
1167 ScoreBrackets->applyWaitcnt(Wait);
1168
1169 AMDGPU::Waitcnt OldWait;
1170 if (OldWaitcntInstr) {
1171 OldWait =
1172 AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
1173 }
1174 if (OldWait.dominates(Wait))
1175 return;
1176
1177 MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
1178 if (ContainingLoop) {
1179 MachineBasicBlock *TBB = ContainingLoop->getHeader();
1180 BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
1181 if (!ScoreBracket) {
1182 assert(!BlockVisitedSet.count(TBB));
1183 BlockWaitcntBracketsMap[TBB] =
1184 llvm::make_unique<BlockWaitcntBrackets>(ST);
1185 ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001186 }
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001187 ScoreBracket->setRevisitLoop(true);
1188 LLVM_DEBUG(dbgs() << "set-revisit2: Block"
1189 << ContainingLoop->getHeader()->getNumber() << '\n';);
1190 }
1191
1192 if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
1193 Wait = Wait.combined(OldWait);
1194
1195 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1196 if (OldWaitcntInstr) {
1197 OldWaitcntInstr->getOperand(0).setImm(Enc);
1198
1199 LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
1200 << "Old Instr: " << MI << '\n'
1201 << "New Instr: " << *OldWaitcntInstr << '\n');
1202 } else {
1203 auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1204 MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1205 .addImm(Enc);
1206 TrackedWaitcntSet.insert(SWaitInst);
1207
1208 LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1209 << "Old Instr: " << MI << '\n'
1210 << "New Instr: " << *SWaitInst << '\n');
Kannan Narayananacb089e2017-04-12 03:25:12 +00001211 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001212}
1213
1214void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
1215 MachineInstr *Waitcnt) {
1216 if (MBB.empty()) {
1217 MBB.push_back(Waitcnt);
1218 return;
1219 }
1220
1221 MachineBasicBlock::iterator It = MBB.end();
1222 MachineInstr *MI = &*(--It);
1223 if (MI->isBranch()) {
1224 MBB.insert(It, Waitcnt);
1225 } else {
1226 MBB.push_back(Waitcnt);
1227 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001228}
1229
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001230// This is a flat memory operation. Check to see if it has memory
1231// tokens for both LDS and Memory, and if so mark it as a flat.
1232bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1233 if (MI.memoperands_empty())
1234 return true;
1235
1236 for (const MachineMemOperand *Memop : MI.memoperands()) {
1237 unsigned AS = Memop->getAddrSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00001238 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001239 return true;
1240 }
1241
1242 return false;
1243}
1244
Mark Searles70901b92018-04-24 15:59:59 +00001245void SIInsertWaitcnts::updateEventWaitcntAfter(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001246 MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
1247 // Now look at the instruction opcode. If it is a memory access
1248 // instruction, update the upper-bound of the appropriate counter's
1249 // bracket and the destination operand scores.
1250 // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001251 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001252 if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001253 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1254 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1255 } else {
1256 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1257 }
1258 } else if (TII->isFLAT(Inst)) {
1259 assert(Inst.mayLoad() || Inst.mayStore());
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001260
1261 if (TII->usesVM_CNT(Inst))
1262 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1263
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001264 if (TII->usesLGKM_CNT(Inst)) {
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001265 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001266
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001267 // This is a flat memory operation, so note it - it will require
1268 // that both the VM and LGKM be flushed to zero if it is pending when
1269 // a VM or LGKM dependency occurs.
1270 if (mayAccessLDSThroughFlat(Inst))
1271 ScoreBrackets->setPendingFlat();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001272 }
1273 } else if (SIInstrInfo::isVMEM(Inst) &&
1274 // TODO: get a better carve out.
1275 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1276 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1277 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1278 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
Mark Searles2a19af62018-04-26 16:11:19 +00001279 if (ST->vmemWriteNeedsExpWaitcnt() &&
Mark Searles11d0a042017-05-31 16:44:23 +00001280 (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001281 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1282 }
1283 } else if (TII->isSMRD(Inst)) {
1284 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1285 } else {
1286 switch (Inst.getOpcode()) {
1287 case AMDGPU::S_SENDMSG:
1288 case AMDGPU::S_SENDMSGHALT:
1289 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1290 break;
1291 case AMDGPU::EXP:
1292 case AMDGPU::EXP_DONE: {
1293 int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1294 if (Imm >= 32 && Imm <= 63)
1295 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1296 else if (Imm >= 12 && Imm <= 15)
1297 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1298 else
1299 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1300 break;
1301 }
1302 case AMDGPU::S_MEMTIME:
1303 case AMDGPU::S_MEMREALTIME:
1304 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1305 break;
1306 default:
1307 break;
1308 }
1309 }
1310}
1311
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001312// Merge the score brackets of the Block's predecessors;
1313// this merged score bracket is used when adding waitcnts to the Block
Kannan Narayananacb089e2017-04-12 03:25:12 +00001314void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1315 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1316 int32_t MaxPending[NUM_INST_CNTS] = {0};
1317 int32_t MaxFlat[NUM_INST_CNTS] = {0};
1318 bool MixedExpTypes = false;
1319
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001320 // For single basic block loops, we need to retain the Block's
1321 // score bracket to have accurate Pred info. So, make a copy of Block's
1322 // score bracket, clear() it (which retains several important bits of info),
1323 // populate, and then replace en masse. For non-single basic block loops,
1324 // just clear Block's current score bracket and repopulate in-place.
1325 bool IsSelfPred;
1326 std::unique_ptr<BlockWaitcntBrackets> S;
1327
1328 IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
1329 != Block.pred_end();
1330 if (IsSelfPred) {
1331 S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1332 ScoreBrackets = S.get();
1333 }
1334
Kannan Narayananacb089e2017-04-12 03:25:12 +00001335 ScoreBrackets->clear();
1336
Kannan Narayananacb089e2017-04-12 03:25:12 +00001337 // See if there are any uninitialized predecessors. If so, emit an
1338 // s_waitcnt 0 at the beginning of the block.
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001339 for (MachineBasicBlock *Pred : Block.predecessors()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001340 BlockWaitcntBrackets *PredScoreBrackets =
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001341 BlockWaitcntBracketsMap[Pred].get();
1342 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001343 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001344 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001345 }
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001346 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001347 int span =
1348 PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1349 MaxPending[T] = std::max(MaxPending[T], span);
1350 span =
1351 PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1352 MaxFlat[T] = std::max(MaxFlat[T], span);
1353 }
1354
1355 MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
1356 }
1357
Kannan Narayananacb089e2017-04-12 03:25:12 +00001358 // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
1359 for (MachineBasicBlock *Pred : Block.predecessors()) {
1360 BlockWaitcntBrackets *PredScoreBrackets =
1361 BlockWaitcntBracketsMap[Pred].get();
Mark Searles24c92ee2018-02-07 02:21:21 +00001362 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001363 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001364 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001365 }
1366
1367 int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
1368 PredScoreBrackets->getScoreLB(EXP_CNT);
1369 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1370 int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
1371 PredScoreBrackets->getScoreLB(EXP_CNT);
1372 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1373 }
1374
Kannan Narayananacb089e2017-04-12 03:25:12 +00001375#if 0
1376 // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
1377 // TODO: how does LC distinguish between function entry and main entry?
1378 // If this is the entry to a function, force a wait.
1379 MachineBasicBlock &Entry = Block.getParent()->front();
1380 if (Entry.getNumber() == Block.getNumber()) {
1381 ScoreBrackets->setWaitAtBeginning();
1382 return;
1383 }
1384#endif
1385
1386 // Now set the current Block's brackets to the largest ending bracket.
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001387 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001388 ScoreBrackets->setScoreUB(T, MaxPending[T]);
1389 ScoreBrackets->setScoreLB(T, 0);
1390 ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1391 }
1392
1393 ScoreBrackets->setMixedExpTypes(MixedExpTypes);
1394
1395 // Set the register scoreboard.
1396 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001397 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001398 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001399 }
1400
1401 BlockWaitcntBrackets *PredScoreBrackets =
1402 BlockWaitcntBracketsMap[Pred].get();
1403
1404 // Now merge the gpr_reg_score information
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001405 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001406 int PredLB = PredScoreBrackets->getScoreLB(T);
1407 int PredUB = PredScoreBrackets->getScoreUB(T);
1408 if (PredLB < PredUB) {
1409 int PredScale = MaxPending[T] - PredUB;
1410 // Merge vgpr scores.
1411 for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1412 int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1413 if (PredRegScore <= PredLB)
1414 continue;
1415 int NewRegScore = PredScale + PredRegScore;
1416 ScoreBrackets->setRegScore(
1417 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1418 }
1419 // Also need to merge sgpr scores for lgkm_cnt.
1420 if (T == LGKM_CNT) {
1421 for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1422 int PredRegScore =
1423 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1424 if (PredRegScore <= PredLB)
1425 continue;
1426 int NewRegScore = PredScale + PredRegScore;
1427 ScoreBrackets->setRegScore(
1428 J + NUM_ALL_VGPRS, LGKM_CNT,
1429 std::max(
1430 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1431 NewRegScore));
1432 }
1433 }
1434 }
1435 }
1436
1437 // Also merge the WaitEvent information.
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001438 for (auto W : wait_event_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001439 enum InstCounterType T = PredScoreBrackets->eventCounter(W);
1440 int PredEventUB = PredScoreBrackets->getEventUB(W);
1441 if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
1442 int NewEventUB =
1443 MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
1444 if (NewEventUB > 0) {
1445 ScoreBrackets->setEventUB(
1446 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1447 }
1448 }
1449 }
1450 }
1451
Kannan Narayananacb089e2017-04-12 03:25:12 +00001452 // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
1453 // sequencing predecessors, because changes to EXEC require waitcnts due to
1454 // the delayed nature of these operations.
1455 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001456 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001457 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001458 }
1459
1460 BlockWaitcntBrackets *PredScoreBrackets =
1461 BlockWaitcntBracketsMap[Pred].get();
1462
1463 int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
1464 if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1465 int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
1466 PredScoreBrackets->getScoreUB(EXP_CNT);
1467 if (new_gds_ub > 0) {
1468 ScoreBrackets->setEventUB(
1469 GDS_GPR_LOCK,
1470 std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
1471 }
1472 }
1473 int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
1474 if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1475 int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
1476 PredScoreBrackets->getScoreUB(EXP_CNT);
1477 if (new_exp_ub > 0) {
1478 ScoreBrackets->setEventUB(
1479 EXP_GPR_LOCK,
1480 std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
1481 }
1482 }
1483 }
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001484
1485 // if a single block loop, update the score brackets. Not needed for other
1486 // blocks, as we did this in-place
1487 if (IsSelfPred) {
1488 BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1489 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001490}
1491
Mark Searles10545412018-05-30 15:47:45 +00001492/// Return true if the given basic block is a "bottom" block of a loop.
1493/// This works even if the loop is discontiguous. This also handles
1494/// multiple back-edges for the same "header" block of a loop.
Mark Searles1bc6e712018-04-19 15:42:30 +00001495bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1496 const MachineBasicBlock *Block) {
1497 for (MachineBasicBlock *MBB : Loop->blocks()) {
1498 if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1499 return true;
1500 }
1501 }
1502 return false;
1503}
1504
1505/// Count the number of "bottom" basic blocks of a loop.
1506unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1507 unsigned Count = 0;
1508 for (MachineBasicBlock *MBB : Loop->blocks()) {
1509 if (MBB->isSuccessor(Loop->getHeader())) {
1510 Count++;
1511 }
1512 }
1513 return Count;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001514}
1515
1516// Generate s_waitcnt instructions where needed.
1517void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1518 MachineBasicBlock &Block) {
1519 // Initialize the state information.
1520 mergeInputScoreBrackets(Block);
1521
1522 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1523
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001524 LLVM_DEBUG({
Mark Searlesec581832018-04-25 19:21:26 +00001525 dbgs() << "*** Block" << Block.getNumber() << " ***";
Kannan Narayananacb089e2017-04-12 03:25:12 +00001526 ScoreBrackets->dump();
1527 });
1528
Kannan Narayananacb089e2017-04-12 03:25:12 +00001529 // Walk over the instructions.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001530 MachineInstr *OldWaitcntInstr = nullptr;
1531
Kannan Narayananacb089e2017-04-12 03:25:12 +00001532 for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1533 Iter != E;) {
1534 MachineInstr &Inst = *Iter;
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001535
Kannan Narayananacb089e2017-04-12 03:25:12 +00001536 // Remove any previously existing waitcnts.
1537 if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001538 if (OldWaitcntInstr) {
1539 if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
1540 TrackedWaitcntSet.erase(OldWaitcntInstr);
1541 OldWaitcntInstr->eraseFromParent();
1542 OldWaitcntInstr = nullptr;
1543 } else if (!TrackedWaitcntSet.count(&Inst)) {
1544 // Two successive s_waitcnt's, both of which are pre-existing and
1545 // are therefore preserved.
1546 int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
1547 ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1548 } else {
1549 ++Iter;
1550 Inst.eraseFromParent();
1551 continue;
1552 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001553 }
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001554
1555 OldWaitcntInstr = &Inst;
1556 ++Iter;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001557 continue;
1558 }
1559
Kannan Narayananacb089e2017-04-12 03:25:12 +00001560 bool VCCZBugWorkAround = false;
1561 if (readsVCCZ(Inst) &&
Mark Searles24c92ee2018-02-07 02:21:21 +00001562 (!VCCZBugHandledSet.count(&Inst))) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001563 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1564 ScoreBrackets->getScoreUB(LGKM_CNT) &&
1565 ScoreBrackets->hasPendingSMEM()) {
Tom Stellardc5a154d2018-06-28 23:47:12 +00001566 if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
Kannan Narayananacb089e2017-04-12 03:25:12 +00001567 VCCZBugWorkAround = true;
1568 }
1569 }
1570
1571 // Generate an s_waitcnt instruction to be placed before
1572 // cur_Inst, if needed.
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001573 generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1574 OldWaitcntInstr = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001575
Mark Searles70901b92018-04-24 15:59:59 +00001576 updateEventWaitcntAfter(Inst, ScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001577
1578#if 0 // TODO: implement resource type check controlled by options with ub = LB.
1579 // If this instruction generates a S_SETVSKIP because it is an
1580 // indexed resource, and we are on Tahiti, then it will also force
1581 // an S_WAITCNT vmcnt(0)
1582 if (RequireCheckResourceType(Inst, context)) {
1583 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1584 ScoreBrackets->setScoreLB(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +00001585 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +00001586 }
1587#endif
1588
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001589 LLVM_DEBUG({
Mark Searles94ae3b22018-01-30 17:17:06 +00001590 Inst.print(dbgs());
Kannan Narayananacb089e2017-04-12 03:25:12 +00001591 ScoreBrackets->dump();
1592 });
1593
1594 // Check to see if this is a GWS instruction. If so, and if this is CI or
1595 // VI, then the generated code sequence will include an S_WAITCNT 0.
1596 // TODO: Are these the only GWS instructions?
1597 if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1598 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1599 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1600 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1601 Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1602 // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +00001603 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero());
Kannan Narayananacb089e2017-04-12 03:25:12 +00001604 }
1605
1606 // TODO: Remove this work-around after fixing the scheduler and enable the
1607 // assert above.
1608 if (VCCZBugWorkAround) {
1609 // Restore the vccz bit. Any time a value is written to vcc, the vcc
1610 // bit is updated, so we can restore the bit by reading the value of
1611 // vcc and then writing it back to the register.
1612 BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1613 AMDGPU::VCC)
1614 .addReg(AMDGPU::VCC);
1615 VCCZBugHandledSet.insert(&Inst);
1616 }
1617
Kannan Narayananacb089e2017-04-12 03:25:12 +00001618 ++Iter;
1619 }
1620
1621 // Check if we need to force convergence at loop footer.
1622 MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
Mark Searles1bc6e712018-04-19 15:42:30 +00001623 if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001624 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1625 WaitcntData->print();
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001626 LLVM_DEBUG(dbgs() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001627
1628 // The iterative waitcnt insertion algorithm aims for optimal waitcnt
Mark Searles10545412018-05-30 15:47:45 +00001629 // placement, but doesn't guarantee convergence for a loop. Each
1630 // loop should take at most (n+1) iterations for it to converge naturally,
1631 // where n is the number of bottom blocks. If this threshold is reached and
1632 // the result hasn't converged, then we force convergence by inserting
1633 // a s_waitcnt at the end of loop footer.
1634 if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001635 // To ensure convergence, need to make wait events at loop footer be no
1636 // more than those from the previous iteration.
Mark Searles65207922018-02-19 19:19:59 +00001637 // As a simplification, instead of tracking individual scores and
1638 // generating the precise wait count, just wait on 0.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001639 bool HasPending = false;
1640 MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001641 for (auto T : inst_counter_types()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001642 if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1643 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1644 HasPending = true;
Mark Searles10545412018-05-30 15:47:45 +00001645 break;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001646 }
1647 }
1648
1649 if (HasPending) {
1650 if (!SWaitInst) {
Mark Searles10545412018-05-30 15:47:45 +00001651 SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
1652 DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1653 .addImm(0);
Mark Searles24c92ee2018-02-07 02:21:21 +00001654 TrackedWaitcntSet.insert(SWaitInst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001655#if 0 // TODO: Format the debug output
1656 OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1657 OutputTransformAdd(SWaitInst, context);
1658#endif
1659 }
1660#if 0 // TODO: ??
1661 _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1662#endif
1663 }
1664
1665 if (SWaitInst) {
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001666 LLVM_DEBUG({
Kannan Narayananacb089e2017-04-12 03:25:12 +00001667 SWaitInst->print(dbgs());
1668 dbgs() << "\nAdjusted score board:";
1669 ScoreBrackets->dump();
1670 });
1671
1672 // Add this waitcnt to the block. It is either newly created or
1673 // created in previous iterations and added back since block traversal
Mark Searles65207922018-02-19 19:19:59 +00001674 // always removes waitcnts.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001675 insertWaitcntBeforeCF(Block, SWaitInst);
1676 WaitcntData->setWaitcnt(SWaitInst);
1677 }
1678 }
1679 }
1680}
1681
1682bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00001683 ST = &MF.getSubtarget<GCNSubtarget>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001684 TII = ST->getInstrInfo();
1685 TRI = &TII->getRegisterInfo();
1686 MRI = &MF.getRegInfo();
1687 MLI = &getAnalysis<MachineLoopInfo>();
Konstantin Zhuravlyov71e43ee2018-09-12 18:50:47 +00001688 IV = AMDGPU::getIsaVersion(ST->getCPU());
Mark Searles11d0a042017-05-31 16:44:23 +00001689 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001690
Mark Searles4a0f2c52018-05-07 14:43:28 +00001691 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
Nicolai Haehnleae369d72018-11-29 11:06:11 +00001692 for (auto T : inst_counter_types())
Mark Searlesec581832018-04-25 19:21:26 +00001693 ForceEmitWaitcnt[T] = false;
1694
Kannan Narayananacb089e2017-04-12 03:25:12 +00001695 HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1696 HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1697 HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1698
1699 HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1700 HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1701 assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1702 assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1703
1704 RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1705 RegisterEncoding.VGPRL =
1706 RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1707 RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1708 RegisterEncoding.SGPRL =
1709 RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1710
Mark Searles24c92ee2018-02-07 02:21:21 +00001711 TrackedWaitcntSet.clear();
1712 BlockVisitedSet.clear();
1713 VCCZBugHandledSet.clear();
Mark Searles1bc6e712018-04-19 15:42:30 +00001714 LoopWaitcntDataMap.clear();
Scott Linder5792dd02018-06-21 18:48:48 +00001715 BlockWaitcntProcessedSet.clear();
Mark Searles24c92ee2018-02-07 02:21:21 +00001716
Nicolai Haehnle0ab31c92018-11-07 21:53:29 +00001717 // Walk over the blocks in reverse post order, inserting
Kannan Narayananacb089e2017-04-12 03:25:12 +00001718 // s_waitcnt where needed.
1719 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1720 bool Modified = false;
1721 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1722 I = RPOT.begin(),
1723 E = RPOT.end(), J = RPOT.begin();
1724 I != E;) {
1725 MachineBasicBlock &MBB = **I;
1726
1727 BlockVisitedSet.insert(&MBB);
1728
1729 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1730 if (!ScoreBrackets) {
Mark Searlesf0b93f12018-06-04 16:51:59 +00001731 BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001732 ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1733 }
1734 ScoreBrackets->setPostOrder(MBB.getNumber());
1735 MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1736 if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
Eugene Zelenko59e12822017-08-08 00:47:13 +00001737 LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001738
1739 // If we are walking into the block from before the loop, then guarantee
1740 // at least 1 re-walk over the loop to propagate the information, even if
1741 // no S_WAITCNT instructions were generated.
Mark Searles1bc6e712018-04-19 15:42:30 +00001742 if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1743 unsigned Count = countNumBottomBlocks(ContainingLoop);
1744
1745 // If the loop has multiple back-edges, and so more than one "bottom"
1746 // basic block, we have to guarantee a re-walk over every blocks.
1747 if ((std::count(BlockWaitcntProcessedSet.begin(),
Mark Searlesf4e70252018-07-16 10:21:36 +00001748 BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
Mark Searles1bc6e712018-04-19 15:42:30 +00001749 BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
Mark Searles10545412018-05-30 15:47:45 +00001750 LLVM_DEBUG(dbgs() << "set-revisit1: Block"
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001751 << ContainingLoop->getHeader()->getNumber() << '\n';);
Mark Searles1bc6e712018-04-19 15:42:30 +00001752 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001753 }
1754
1755 // Walk over the instructions.
1756 insertWaitcntInBlock(MF, MBB);
1757
Mark Searles10545412018-05-30 15:47:45 +00001758 // Record that waitcnts have been processed at least once for this block.
Mark Searles1bc6e712018-04-19 15:42:30 +00001759 BlockWaitcntProcessedSet.push_back(&MBB);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001760
Mark Searles1bc6e712018-04-19 15:42:30 +00001761 // See if we want to revisit the loop. If a loop has multiple back-edges,
1762 // we shouldn't revisit the same "bottom" basic block.
1763 if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1764 std::count(BlockWaitcntProcessedSet.begin(),
1765 BlockWaitcntProcessedSet.end(), &MBB) == 1) {
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001766 MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001767 BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1768 if (EntrySB && EntrySB->getRevisitLoop()) {
1769 EntrySB->setRevisitLoop(false);
1770 J = I;
1771 int32_t PostOrder = EntrySB->getPostOrder();
1772 // TODO: Avoid this loop. Find another way to set I.
1773 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1774 X = RPOT.begin(),
1775 Y = RPOT.end();
1776 X != Y; ++X) {
1777 MachineBasicBlock &MBBX = **X;
1778 if (MBBX.getNumber() == PostOrder) {
1779 I = X;
1780 break;
1781 }
1782 }
1783 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1784 WaitcntData->incIterCnt();
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001785 LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001786 continue;
1787 } else {
1788 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1789 // Loop converged, reset iteration count. If this loop gets revisited,
1790 // it must be from an outer loop, the counter will restart, this will
1791 // ensure we don't force convergence on such revisits.
1792 WaitcntData->resetIterCnt();
1793 }
1794 }
1795
1796 J = I;
1797 ++I;
1798 }
1799
1800 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1801
1802 bool HaveScalarStores = false;
1803
1804 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1805 ++BI) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001806 MachineBasicBlock &MBB = *BI;
1807
1808 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1809 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001810 if (!HaveScalarStores && TII->isScalarStore(*I))
1811 HaveScalarStores = true;
1812
1813 if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1814 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1815 EndPgmBlocks.push_back(&MBB);
1816 }
1817 }
1818
1819 if (HaveScalarStores) {
1820 // If scalar writes are used, the cache must be flushed or else the next
1821 // wave to reuse the same scratch memory can be clobbered.
1822 //
1823 // Insert s_dcache_wb at wave termination points if there were any scalar
1824 // stores, and only if the cache hasn't already been flushed. This could be
1825 // improved by looking across blocks for flushes in postdominating blocks
1826 // from the stores but an explicitly requested flush is probably very rare.
1827 for (MachineBasicBlock *MBB : EndPgmBlocks) {
1828 bool SeenDCacheWB = false;
1829
1830 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1831 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001832 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1833 SeenDCacheWB = true;
1834 else if (TII->isScalarStore(*I))
1835 SeenDCacheWB = false;
1836
1837 // FIXME: It would be better to insert this before a waitcnt if any.
1838 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1839 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1840 !SeenDCacheWB) {
1841 Modified = true;
1842 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1843 }
1844 }
1845 }
1846 }
1847
Mark Searles11d0a042017-05-31 16:44:23 +00001848 if (!MFI->isEntryFunction()) {
1849 // Wait for any outstanding memory operations that the input registers may
Hiroshi Inouec8e92452018-01-29 05:17:03 +00001850 // depend on. We can't track them and it's better to the wait after the
Mark Searles11d0a042017-05-31 16:44:23 +00001851 // costly call sequence.
1852
1853 // TODO: Could insert earlier and schedule more liberally with operations
1854 // that only use caller preserved registers.
1855 MachineBasicBlock &EntryBB = MF.front();
Mark Searlesed54ff12018-05-30 16:27:57 +00001856 BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1857 .addImm(0);
Mark Searles11d0a042017-05-31 16:44:23 +00001858
1859 Modified = true;
1860 }
1861
Kannan Narayananacb089e2017-04-12 03:25:12 +00001862 return Modified;
1863}