//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);

namespace {

// Class of object that encapsulates the latest instruction counter score
// associated with an operand. Used for determining whether an
// s_waitcnt instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

using RegInterval = std::pair<signed, signed>;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
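
// Illustrative example of the mapping above: with the default values, VGPR7
// occupies scoreboard slot 7, the EXTRA_VGPR_LDS token occupies slot 256
// (SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS), and SGPR5 occupies slot
// NUM_ALL_VGPRS + 5 = 262 in the flattened index space.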

#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))

void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  switch (T) {
  case VM_CNT:
    Wait.VmCnt = std::min(Wait.VmCnt, Count);
    break;
  case EXP_CNT:
    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
    break;
  case LGKM_CNT:
    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
    break;
  default:
    llvm_unreachable("bad InstCounterType");
  }
}
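
// Illustrative usage: merging a required vmcnt(1) into a Wait that already
// demands vmcnt(3) yields vmcnt(1), since the smaller count is the stricter
// wait (fewer outstanding operations are allowed to remain).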

// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of events happen within the brackets,
// the wait count may get decremented out of order; therefore we need to put
// in an "s_waitcnt 0" before use.
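//
// Illustrative example: after two buffer loads, the VM_CNT bracket is
// (LB=0, UB=2); a use of a register scored 1 then needs at most
// "s_waitcnt vmcnt(1)", because UB - score = 1 operation may still be
// outstanding, and completing that wait raises the lower bound accordingly.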
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
  }

  ~BlockWaitcntBrackets() = default;

  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }

  int32_t getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  int32_t getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    switch (E) {
    case VMEM_ACCESS:
      return VM_CNT;
    case LDS_ACCESS:
    case GDS_ACCESS:
    case SQ_MESSAGE:
    case SMEM_ACCESS:
      return LGKM_CNT;
    case EXP_GPR_LOCK:
    case GDS_GPR_LOCK:
    case VMW_GPR_LOCK:
    case EXP_POS_ACCESS:
    case EXP_PARAM_ACCESS:
      return EXP_CNT;
    default:
      llvm_unreachable("unhandled event type");
    }
    return NUM_INST_CNTS;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == LGKM_CNT);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    memset(EventUBs, 0, sizeof(EventUBs));
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  int32_t getEventUB(enum WaitEventType W) const {
    assert(W < NUM_WAIT_EVENTS);
    return EventUBs[W];
  }

  bool counterOutOfOrder(InstCounterType T) const;
  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, int ScoreToWait,
                     AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  bool hasPendingSMEM() const {
    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  bool mixedExpTypes() const { return MixedExpTypes; }
  void setMixedExpTypes(bool MixedExpTypesIn) {
    MixedExpTypes = MixedExpTypesIn;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const GCNSubtarget *ST = nullptr;
  bool WaitAtBeginning = false;
  bool RevisitLoop = false;
  bool MixedExpTypes = false;
  int32_t PostOrder = 0;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};

// This is a per-loop-region object that records the waitcnt status at the end
// of the loop footer from the previous iteration. We also maintain an
// iteration count to track the number of times the loop has been visited.
// When it doesn't converge naturally, we force convergence by inserting
// s_waitcnt 0 at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }

private:
  // The s_waitcnt added at the end of the loop footer to stabilize the wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of times the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaVersion IV;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  // ForceEmitZeroWaitcnts: force all waitcnt instrs to be emitted as
  // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool isForceEmitWaitcnt() const {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1))
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
};

} // end anonymous namespace

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a PW (partial-write) operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}
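
// Illustrative example: for a 64-bit VGPR pair starting at v4, Size is 64,
// so the returned half-open interval is {4, 6}; callers iterating
// [first, second) cover the two 32-bit slots v4 and v5.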

void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  LLVM_DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not.
  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo; // TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
bool BlockWaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
         simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
}

bool BlockWaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                           unsigned &Count) const {
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if (Count < (unsigned)UB && UB - (int32_t)Count > LB)
    return true;

  Count = ~0u;
  return false;
}
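
// Illustrative example: with a VM_CNT bracket of (LB=2, UB=5), a requested
// vmcnt(1) is kept (5 - 1 = 4 > 2: it waits for something the bracket has not
// already guaranteed), while vmcnt(3) is redundant (5 - 3 = 2 <= LB) and is
// reset to ~0u, meaning "no wait needed" for that counter.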

void BlockWaitcntBrackets::determineWait(InstCounterType T, int ScoreToWait,
                                         AMDGPU::Waitcnt &Wait) const {
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_waitcnt with a
    // conservative value of 0 for the counter.
    addWait(Wait, T, 0);
    return;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are multiple
      // types of events in the bracket. Also emit an s_waitcnt with a
      // conservative value of 0 for the counter.
      addWait(Wait, T, 0);
    } else {
      addWait(Wait, T, UB - ScoreToWait);
    }
  }
}

void BlockWaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(VM_CNT, Wait.VmCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);

  if (Wait.ExpCnt == 0)
    setMixedExpTypes(false);
}

void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const int32_t UB = getScoreUB(T);
  if (Count >= (unsigned)UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - (int32_t)Count));
  } else {
    setScoreLB(T, UB);
  }
}

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  switch (T) {
  case VM_CNT:
    return false;
  case LGKM_CNT: {
    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      // Scalar memory reads can always go out of order.
      return true;
    }
    int NumEventTypes = 0;
    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  case EXP_CNT: {
    // If there has been a mixture of export types, then a waitcnt exp(0) is
    // required.
    if (MixedExpTypes)
      return true;
    int NumEventTypes = 0;
    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  default:
    break;
  }
  return true;
}
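
// Illustrative example: if the current LGKM_CNT bracket contains both a
// pending LDS access and a pending GDS access (two event types), lgkmcnt
// decrements may not match program order, so callers fall back to a
// conservative "s_waitcnt lgkmcnt(0)".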

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}
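
// Illustrative example: readsVCCZ matches a branch such as
// "s_cbranch_vccz BB0_2", which reads the VCCZ status bit. On older targets
// the code below forces lgkmcnt(0) before such branches while an SMEM access
// is pending, since VCCZ may be stale (see the SEA_ISLANDS work-around in
// generateWaitcntInstBefore).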

/// Generate s_waitcnt instruction to be placed before \p MI.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_UB respectively).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets,
    MachineInstr *OldWaitcntInstr) {
  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr())
    return;

  AMDGPU::Waitcnt Wait;

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    Wait = AMDGPU::Waitcnt::allZero();
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    Wait.VmCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  // with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    Wait = AMDGPU::Waitcnt::allZero();
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    Wait.VmCnt = 0;
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                               ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      ScoreBrackets->determineWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK), Wait);
      ScoreBrackets->determineWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS), Wait);
      ScoreBrackets->determineWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS), Wait);
      ScoreBrackets->determineWait(
          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK), Wait);
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
          ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      ScoreBrackets->determineWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgkm_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If a destination operand was used by a recent export/store
    // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        ScoreBrackets->determineWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        ScoreBrackets->determineWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
          ScoreBrackets->determineWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    Wait = AMDGPU::Waitcnt::allZero();
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingSMEM()) {
      Wait.LgkmCnt = 0;
    }
  }

  // Early-out if no wait is indicated.
  if (!ScoreBrackets->simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
    if (OldWaitcntInstr) {
      if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
        TrackedWaitcntSet.erase(OldWaitcntInstr);
        OldWaitcntInstr->eraseFromParent();
      } else {
        int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
        ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
      }
    }
    return;
  }

  if (ForceEmitZeroWaitcnts)
    Wait = AMDGPU::Waitcnt::allZero();

  if (ForceEmitWaitcnt[VM_CNT])
    Wait.VmCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[LGKM_CNT])
    Wait.LgkmCnt = 0;

  ScoreBrackets->applyWaitcnt(Wait);

  AMDGPU::Waitcnt OldWait;
  if (OldWaitcntInstr) {
    OldWait =
        AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
  }
  if (OldWait.dominates(Wait))
    return;

  MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
  if (ContainingLoop) {
    MachineBasicBlock *TBB = ContainingLoop->getHeader();
    BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    if (!ScoreBracket) {
      assert(!BlockVisitedSet.count(TBB));
      BlockWaitcntBracketsMap[TBB] =
          llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    }
    ScoreBracket->setRevisitLoop(true);
    LLVM_DEBUG(dbgs() << "set-revisit2: Block"
                      << ContainingLoop->getHeader()->getNumber() << '\n';);
  }

  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
    Wait = Wait.combined(OldWait);

  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
  if (OldWaitcntInstr) {
    OldWaitcntInstr->getOperand(0).setImm(Enc);

    LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *OldWaitcntInstr << '\n');
  } else {
    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                         .addImm(Enc);
    TrackedWaitcntSet.insert(SWaitInst);

    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *SWaitInst << '\n');
  }
}

void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }
}
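
// Note: the waitcnt is placed ahead of a trailing branch so that the wait
// executes regardless of which way the branch goes; for a block that ends in
// a fall-through, appending at the end is equivalent.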

// This is a flat memory operation. Check to see if it could access LDS:
// either it has an LDS or generic (flat) memory operand, or it has no
// memory operands at all, in which case we conservatively assume it may.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}
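
// Illustrative consequence: a flat store whose only memoperand is known to be
// in the global address space returns false here, so it does not set the
// pending-flat state that would otherwise force conservative VM/LGKM flushes
// (see the setPendingFlat() call below).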

void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}
1293
// Merge the score brackets of the Block's predecessors;
// this merged score bracket is used when adding waitcnts to the Block.
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
  int32_t MaxPending[NUM_INST_CNTS] = {0};
  int32_t MaxFlat[NUM_INST_CNTS] = {0};
  bool MixedExpTypes = false;

  // For single basic block loops, we need to retain the Block's
  // score bracket to have accurate Pred info. So, make a copy of Block's
  // score bracket, clear() it (which retains several important bits of info),
  // populate it, and then replace en masse. For non-single basic block loops,
  // just clear Block's current score bracket and repopulate in-place.
  bool IsSelfPred;
  std::unique_ptr<BlockWaitcntBrackets> S;

  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
    != Block.pred_end();
  if (IsSelfPred) {
    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    ScoreBrackets = S.get();
  }

  ScoreBrackets->clear();

  // Merge the pending-event spans of every visited predecessor. Predecessors
  // that are unvisited, or that wait at their beginning, are skipped.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int span =
          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
      MaxPending[T] = std::max(MaxPending[T], span);
      span =
          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
      MaxFlat[T] = std::max(MaxFlat[T], span);
    }

    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
  }
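
  // Informally, a counter's "pending span" is getScoreUB(T) - getScoreLB(T):
  // the number of events issued but not yet waited on. For example, a
  // predecessor with LB = 2 and UB = 5 still has three outstanding events,
  // so the merged bracket must be at least that wide.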

  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
      continue;
    }

    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
                  PredScoreBrackets->getScoreLB(EXP_CNT);
    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
  }

#if 0
  // LC does not (unlike this pass) add a waitcnt at the beginning of a block.
  // Left here as a marker.
  // TODO: how does LC distinguish between function entry and main entry?
  // If this is the entry to a function, force a wait.
  MachineBasicBlock &Entry = Block.getParent()->front();
  if (Entry.getNumber() == Block.getNumber()) {
    ScoreBrackets->setWaitAtBeginning();
    return;
  }
#endif

  // Now set the current Block's brackets to the largest ending bracket.
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    ScoreBrackets->setScoreUB(T, MaxPending[T]);
    ScoreBrackets->setScoreLB(T, 0);
    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
  }

  ScoreBrackets->setMixedExpTypes(MixedExpTypes);

  // Set the register scoreboard.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    // Now merge the gpr_reg_score information.
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      int PredLB = PredScoreBrackets->getScoreLB(T);
      int PredUB = PredScoreBrackets->getScoreUB(T);
      if (PredLB < PredUB) {
        int PredScale = MaxPending[T] - PredUB;
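        // The merged bracket for T spans [0, MaxPending[T]], so rebase each
        // predecessor score S to S + (MaxPending[T] - PredUB); a score at the
        // predecessor's UB then lands exactly on the merged UB.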
        // Merge vgpr scores.
        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
          int PredRegScore = PredScoreBrackets->getRegScore(J, T);
          if (PredRegScore <= PredLB)
            continue;
          int NewRegScore = PredScale + PredRegScore;
          ScoreBrackets->setRegScore(
              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
        }
        // Also need to merge sgpr scores for lgkm_cnt.
        if (T == LGKM_CNT) {
          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
            int PredRegScore =
                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J + NUM_ALL_VGPRS, LGKM_CNT,
                std::max(
                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                    NewRegScore));
          }
        }
      }
    }

    // Also merge the WaitEvent information.
    ForAllWaitEventType(W) {
      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
      int PredEventUB = PredScoreBrackets->getEventUB(W);
      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
        int NewEventUB =
            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
        if (NewEventUB > 0) {
          ScoreBrackets->setEventUB(
              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
        }
      }
    }
  }

  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
  // sequencing predecessors, because changes to EXEC require waitcnts due to
  // the delayed nature of these operations.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
    if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_gds_ub > 0) {
        ScoreBrackets->setEventUB(
            GDS_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
      }
    }
    int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
    if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
      int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
                       PredScoreBrackets->getScoreUB(EXP_CNT);
      if (new_exp_ub > 0) {
        ScoreBrackets->setEventUB(
            EXP_GPR_LOCK,
            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
      }
    }
  }

  // If a single-block loop, update the stored score brackets. Not needed for
  // other blocks, as we did this in-place.
  if (IsSelfPred) {
    BlockWaitcntBracketsMap[&Block] =
        llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
  }
}

/// Return true if the given basic block is a "bottom" block of a loop.
/// This works even if the loop is discontiguous. This also handles
/// multiple back-edges for the same "header" block of a loop.
bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
                                    const MachineBasicBlock *Block) {
  for (MachineBasicBlock *MBB : Loop->blocks()) {
    if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
      return true;
    }
  }
  return false;
}

/// Count the number of "bottom" basic blocks of a loop.
unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
  unsigned Count = 0;
  for (MachineBasicBlock *MBB : Loop->blocks()) {
    if (MBB->isSuccessor(Loop->getHeader())) {
      Count++;
    }
  }
  return Count;
}
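
// Example (informal): in a loop over blocks {H, A, B} where both A and B
// branch back to the header H, A and B are each a "bottom" block, so
// countNumBottomBlocks() returns 2 and the natural convergence threshold
// used below becomes 2 + 1 = 3 iterations.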

// Generate s_waitcnt instructions where needed.
void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block) {
  // Initialize the state information.
  mergeInputScoreBrackets(Block);

  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets->dump();
  });

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Handle previously existing waitcnts: waitcnts generated by this pass
    // on an earlier iteration are removed and regenerated, while pre-existing
    // waitcnts are preserved and folded into the score brackets.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      if (OldWaitcntInstr) {
        if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
          TrackedWaitcntSet.erase(OldWaitcntInstr);
          OldWaitcntInstr->eraseFromParent();
          OldWaitcntInstr = nullptr;
        } else if (!TrackedWaitcntSet.count(&Inst)) {
          // Two successive s_waitcnt's, both of which are pre-existing and
          // are therefore preserved.
          int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
          ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
        } else {
          ++Iter;
          Inst.eraseFromParent();
          continue;
        }
      }

      OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
        (!VCCZBugHandledSet.count(&Inst))) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingSMEM()) {
        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }
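
    // A hedged note on the hazard: on SI/CI an outstanding scalar memory
    // load may still write VCC, so the vccz bit can be stale at the point of
    // this read; the VCCZBugWorkAround flag set above triggers the repair
    // sequence emitted further below.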

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
    OldWaitcntInstr = nullptr;

    updateEventWaitcntAfter(Inst, ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0).
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets->dump();
    });

    // Check to see if this is a GWS instruction. If so, and if this is CI or
    // VI, then the generated code sequence will include an S_WAITCNT 0.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero());
    }

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit. Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    ++Iter;
  }

  // Check if we need to force convergence at the loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    LLVM_DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't guarantee convergence for a loop. Each loop
    // should take at most (n+1) iterations to converge naturally, where n is
    // the number of bottom blocks. If this threshold is reached and the
    // result hasn't converged, then we force convergence by inserting an
    // s_waitcnt at the end of the loop footer.
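    // Forcing convergence caps the brackets: once the footer waits on 0,
    // later iterations cannot observe a larger pending span, so the merge
    // reaches a fixed point (an informal argument, not a proof).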
    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
      // To ensure convergence, need to make wait events at loop footer be no
      // more than those from the previous iteration.
      // As a simplification, instead of tracking individual scores and
      // generating the precise wait count, just wait on 0.
      bool HasPending = false;
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
          break;
        }
      }
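      // A single pending counter is enough to require the forced wait, hence
      // the early break above; the s_waitcnt 0 emitted below waits on every
      // counter anyway.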

      if (HasPending) {
        if (!SWaitInst) {
          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                          .addImm(0);
          TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        LLVM_DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or
        // created in previous iterations and added back since block traversal
        // always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1))
    ForceEmitWaitcnt[T] = false;

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
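
  // The scoreboard indexes VGPRs at [0, NumVGPRsMax) and SGPRs at an offset
  // of NUM_ALL_VGPRS; the encoding bounds above map physical registers into
  // that flat index space.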

  TrackedWaitcntSet.clear();
  BlockVisitedSet.clear();
  VCCZBugHandledSet.clear();
  LoopWaitcntDataMap.clear();
  BlockWaitcntProcessedSet.clear();

  // Walk over the blocks in reverse post order, inserting
  // s_waitcnt where needed.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
           I = RPOT.begin(),
           E = RPOT.end(), J = RPOT.begin();
       I != E;) {
    MachineBasicBlock &MBB = **I;

    BlockVisitedSet.insert(&MBB);

    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    if (!ScoreBrackets) {
      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    }
    ScoreBrackets->setPostOrder(MBB.getNumber());
    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();

    // If we are walking into the block from before the loop, then guarantee
    // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
      unsigned Count = countNumBottomBlocks(ContainingLoop);

      // If the loop has multiple back-edges, and so more than one "bottom"
      // basic block, we have to guarantee a re-walk over every block.
      if ((std::count(BlockWaitcntProcessedSet.begin(),
                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
      }
    }

    // Walk over the instructions.
    insertWaitcntInBlock(MF, MBB);

    // Record that waitcnts have been processed at least once for this block.
    BlockWaitcntProcessedSet.push_back(&MBB);

    // See if we want to revisit the loop. If a loop has multiple back-edges,
    // we shouldn't revisit the same "bottom" basic block.
    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
        std::count(BlockWaitcntProcessedSet.begin(),
                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
      if (EntrySB && EntrySB->getRevisitLoop()) {
        EntrySB->setRevisitLoop(false);
        J = I;
        int32_t PostOrder = EntrySB->getPostOrder();
        // TODO: Avoid this loop. Find another way to set I.
        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
                 X = RPOT.begin(),
                 Y = RPOT.end();
             X != Y; ++X) {
          MachineBasicBlock &MBBX = **X;
          if (MBBX.getNumber() == PostOrder) {
            I = X;
            break;
          }
        }
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        WaitcntData->incIterCnt();
        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
        continue;
      } else {
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // Loop converged, so reset the iteration count. If this loop gets
        // revisited, it must be from an outer loop; the counter will restart,
        // which ensures we don't force convergence on such revisits.
        WaitcntData->resetIterCnt();
      }
    }

    J = I;
    ++I;
  }

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores, but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
           ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them, and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
            TII->get(AMDGPU::S_WAITCNT))
        .addImm(0);

    Modified = true;
  }

  return Modified;
}