blob: 7dc3b68e01dd80ebd5829bea524139afd25c3ed7 [file] [log] [blame]
//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//
18
19#include "AMDGPU.h"
20#include "AMDGPUSubtarget.h"
21#include "SIDefines.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000024#include "SIRegisterInfo.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000025#include "Utils/AMDGPUBaseInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000026#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000028#include "llvm/ADT/PostOrderIterator.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000029#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/CodeGen/MachineBasicBlock.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000032#include "llvm/CodeGen/MachineFunction.h"
33#include "llvm/CodeGen/MachineFunctionPass.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000034#include "llvm/CodeGen/MachineInstr.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000035#include "llvm/CodeGen/MachineInstrBuilder.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000036#include "llvm/CodeGen/MachineLoopInfo.h"
37#include "llvm/CodeGen/MachineMemOperand.h"
38#include "llvm/CodeGen/MachineOperand.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000039#include "llvm/CodeGen/MachineRegisterInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000040#include "llvm/IR/DebugLoc.h"
41#include "llvm/Pass.h"
42#include "llvm/Support/Debug.h"
Mark Searlesec581832018-04-25 19:21:26 +000043#include "llvm/Support/DebugCounter.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000044#include "llvm/Support/ErrorHandling.h"
45#include "llvm/Support/raw_ostream.h"
46#include <algorithm>
47#include <cassert>
48#include <cstdint>
49#include <cstring>
50#include <memory>
51#include <utility>
52#include <vector>
Kannan Narayananacb089e2017-04-12 03:25:12 +000053
Mark Searlesec581832018-04-25 19:21:26 +000054using namespace llvm;
55
Kannan Narayananacb089e2017-04-12 03:25:12 +000056#define DEBUG_TYPE "si-insert-waitcnts"
57
Mark Searlesec581832018-04-25 19:21:26 +000058DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
59 "Force emit s_waitcnt expcnt(0) instrs");
60DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
61 "Force emit s_waitcnt lgkmcnt(0) instrs");
62DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
63 "Force emit s_waitcnt vmcnt(0) instrs");
64
65static cl::opt<unsigned> ForceEmitZeroFlag(
66 "amdgpu-waitcnt-forcezero",
67 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
68 cl::init(0), cl::Hidden);
Kannan Narayananacb089e2017-04-12 03:25:12 +000069
70namespace {
71
// Class of object that encapsulates the latest instruction counter score
// associated with an operand. It is used to determine whether an
// s_waitcnt instruction needs to be emitted.
75
// Bit mask with only the bit for counter type `t` set.
#define CNT_MASK(t) (1u << (t))

// The hardware wait counters tracked by this pass. NUM_INST_CNTS is the
// number of real counters and is used to size per-counter arrays.
enum InstCounterType {
  VM_CNT = 0,
  LGKM_CNT = 1,
  EXP_CNT = 2,
  NUM_INST_CNTS = 3
};

// A half-open range [first, second) of scoreboard slots; {-1, -1} is used
// for "no register".
using RegInterval = std::pair<signed, signed>;
Kannan Narayananacb089e2017-04-12 03:25:12 +000081
// Per-target limits. VmcntMax, LgkmcntMax and ExpcntMax are returned by
// BlockWaitcntBrackets::getWaitCountMax; nothing in this part of the file
// assigns them, so they are presumably filled in when the pass starts
// running on a function -- confirm against runOnMachineFunction.
struct {
  int32_t VmcntMax;    // Maximum value of the VM counter.
  int32_t ExpcntMax;   // Maximum value of the EXP counter.
  int32_t LgkmcntMax;  // Maximum value of the LGKM counter.
  int32_t NumVGPRsMax; // Presumably the VGPR budget of the target.
  int32_t NumSGPRsMax; // Presumably the SGPR budget of the target.
} HardwareLimits;

// Hardware encoding values bounding the VGPR and SGPR ranges. getRegInterval
// subtracts VGPR0 / SGPR0 from a register's encoding to obtain its
// scoreboard index.
struct {
  unsigned VGPR0; // Encoding of the first VGPR.
  unsigned VGPRL; // Encoding of the last VGPR.
  unsigned SGPR0; // Encoding of the first SGPR.
  unsigned SGPRL; // Encoding of the last SGPR.
} RegisterEncoding;
96
// Kinds of events that bump a wait counter. The event-to-counter mapping is
// implemented by BlockWaitcntBrackets::eventCounter. NUM_WAIT_EVENTS must
// stay last; it sizes the per-event arrays.
enum WaitEventType {
  VMEM_ACCESS = 0,  // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS
};
110
// Layout of the scoreboard index space:
//   0 .. SQ_MAX_PGM_VGPRS-1                            real VGPRs
//   SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                extra VGPR-like slots
//   NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1  real SGPRs
// A fixed number of VGPR slots is reserved in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_VGPRS = 256,
  // Maximum programmable SGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256,
  // A reserved slot for DS.
  NUM_EXTRA_VGPRS = 1,
  // This is a placeholder the Shader algorithm uses.
  EXTRA_VGPR_LDS = 0,
  // Where SGPR starts.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS
};
124
// Loop header that declares `w` and steps it through every WaitEventType
// value in declaration order, stopping before NUM_WAIT_EVENTS.
#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))
129
130// This is a per-basic-block object that maintains current score brackets
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000131// of each wait counter, and a per-register scoreboard for each wait counter.
Kannan Narayananacb089e2017-04-12 03:25:12 +0000132// We also maintain the latest score for every event type that can change the
133// waitcnt in order to know if there are multiple types of events within
134// the brackets. When multiple types of event happen in the bracket,
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000135// wait count may get decreased out of order, therefore we need to put in
Kannan Narayananacb089e2017-04-12 03:25:12 +0000136// "s_waitcnt 0" before use.
137class BlockWaitcntBrackets {
138public:
Eugene Zelenko59e12822017-08-08 00:47:13 +0000139 BlockWaitcntBrackets() {
140 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
141 T = (enum InstCounterType)(T + 1)) {
142 memset(VgprScores[T], 0, sizeof(VgprScores[T]));
143 }
144 }
145
146 ~BlockWaitcntBrackets() = default;
147
Kannan Narayananacb089e2017-04-12 03:25:12 +0000148 static int32_t getWaitCountMax(InstCounterType T) {
149 switch (T) {
150 case VM_CNT:
151 return HardwareLimits.VmcntMax;
152 case LGKM_CNT:
153 return HardwareLimits.LgkmcntMax;
154 case EXP_CNT:
155 return HardwareLimits.ExpcntMax;
156 default:
157 break;
158 }
159 return 0;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000160 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000161
162 void setScoreLB(InstCounterType T, int32_t Val) {
163 assert(T < NUM_INST_CNTS);
164 if (T >= NUM_INST_CNTS)
165 return;
166 ScoreLBs[T] = Val;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000167 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000168
169 void setScoreUB(InstCounterType T, int32_t Val) {
170 assert(T < NUM_INST_CNTS);
171 if (T >= NUM_INST_CNTS)
172 return;
173 ScoreUBs[T] = Val;
174 if (T == EXP_CNT) {
175 int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
176 if (ScoreLBs[T] < UB)
177 ScoreLBs[T] = UB;
178 }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000179 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000180
181 int32_t getScoreLB(InstCounterType T) {
182 assert(T < NUM_INST_CNTS);
183 if (T >= NUM_INST_CNTS)
184 return 0;
185 return ScoreLBs[T];
Eugene Zelenko59e12822017-08-08 00:47:13 +0000186 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000187
188 int32_t getScoreUB(InstCounterType T) {
189 assert(T < NUM_INST_CNTS);
190 if (T >= NUM_INST_CNTS)
191 return 0;
192 return ScoreUBs[T];
Eugene Zelenko59e12822017-08-08 00:47:13 +0000193 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000194
195 // Mapping from event to counter.
196 InstCounterType eventCounter(WaitEventType E) {
197 switch (E) {
198 case VMEM_ACCESS:
199 return VM_CNT;
200 case LDS_ACCESS:
201 case GDS_ACCESS:
202 case SQ_MESSAGE:
203 case SMEM_ACCESS:
204 return LGKM_CNT;
205 case EXP_GPR_LOCK:
206 case GDS_GPR_LOCK:
207 case VMW_GPR_LOCK:
208 case EXP_POS_ACCESS:
209 case EXP_PARAM_ACCESS:
210 return EXP_CNT;
211 default:
212 llvm_unreachable("unhandled event type");
213 }
214 return NUM_INST_CNTS;
215 }
216
217 void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
218 if (GprNo < NUM_ALL_VGPRS) {
219 if (GprNo > VgprUB) {
220 VgprUB = GprNo;
221 }
222 VgprScores[T][GprNo] = Val;
223 } else {
224 assert(T == LGKM_CNT);
225 if (GprNo - NUM_ALL_VGPRS > SgprUB) {
226 SgprUB = GprNo - NUM_ALL_VGPRS;
227 }
228 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
229 }
230 }
231
232 int32_t getRegScore(int GprNo, InstCounterType T) {
233 if (GprNo < NUM_ALL_VGPRS) {
234 return VgprScores[T][GprNo];
235 }
236 return SgprScores[GprNo - NUM_ALL_VGPRS];
237 }
238
239 void clear() {
240 memset(ScoreLBs, 0, sizeof(ScoreLBs));
241 memset(ScoreUBs, 0, sizeof(ScoreUBs));
242 memset(EventUBs, 0, sizeof(EventUBs));
243 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
244 T = (enum InstCounterType)(T + 1)) {
245 memset(VgprScores[T], 0, sizeof(VgprScores[T]));
246 }
247 memset(SgprScores, 0, sizeof(SgprScores));
248 }
249
250 RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
251 const MachineRegisterInfo *MRI,
252 const SIRegisterInfo *TRI, unsigned OpNo,
253 bool Def) const;
254
255 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
256 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
257 unsigned OpNo, int32_t Val);
258
259 void setWaitAtBeginning() { WaitAtBeginning = true; }
260 void clearWaitAtBeginning() { WaitAtBeginning = false; }
261 bool getWaitAtBeginning() const { return WaitAtBeginning; }
262 void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
263 int32_t getMaxVGPR() const { return VgprUB; }
264 int32_t getMaxSGPR() const { return SgprUB; }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000265
Kannan Narayananacb089e2017-04-12 03:25:12 +0000266 int32_t getEventUB(enum WaitEventType W) const {
267 assert(W < NUM_WAIT_EVENTS);
268 return EventUBs[W];
269 }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000270
Kannan Narayananacb089e2017-04-12 03:25:12 +0000271 bool counterOutOfOrder(InstCounterType T);
272 unsigned int updateByWait(InstCounterType T, int ScoreToWait);
273 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
274 const MachineRegisterInfo *MRI, WaitEventType E,
275 MachineInstr &MI);
276
Kannan Narayananacb089e2017-04-12 03:25:12 +0000277 bool hasPendingSMEM() const {
278 return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
279 EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
280 }
281
282 bool hasPendingFlat() const {
283 return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
284 LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
285 (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
286 LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
287 }
288
289 void setPendingFlat() {
290 LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
291 LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
292 }
293
294 int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
295
296 void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
297
298 bool getRevisitLoop() const { return RevisitLoop; }
299 void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
300
301 void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
302 int32_t getPostOrder() const { return PostOrder; }
303
304 void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000305 void clearWaitcnt() { Waitcnt = nullptr; }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000306 MachineInstr *getWaitcnt() const { return Waitcnt; }
307
308 bool mixedExpTypes() const { return MixedExpTypes; }
309 void setMixedExpTypes(bool MixedExpTypesIn) {
310 MixedExpTypes = MixedExpTypesIn;
311 }
312
313 void print(raw_ostream &);
314 void dump() { print(dbgs()); }
315
316private:
Eugene Zelenko59e12822017-08-08 00:47:13 +0000317 bool WaitAtBeginning = false;
318 bool RevisitLoop = false;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000319 bool MixedExpTypes = false;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000320 int32_t PostOrder = 0;
321 MachineInstr *Waitcnt = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000322 int32_t ScoreLBs[NUM_INST_CNTS] = {0};
323 int32_t ScoreUBs[NUM_INST_CNTS] = {0};
324 int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
325 // Remember the last flat memory operation.
326 int32_t LastFlat[NUM_INST_CNTS] = {0};
327 // wait_cnt scores for every vgpr.
328 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000329 int32_t VgprUB = 0;
330 int32_t SgprUB = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000331 int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
332 // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
333 int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
334};
335
336// This is a per-loop-region object that records waitcnt status at the end of
337// loop footer from the previous iteration. We also maintain an iteration
338// count to track the number of times the loop has been visited. When it
339// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
340// at the end of the loop footer.
341class LoopWaitcntData {
342public:
Eugene Zelenko59e12822017-08-08 00:47:13 +0000343 LoopWaitcntData() = default;
344 ~LoopWaitcntData() = default;
345
Kannan Narayananacb089e2017-04-12 03:25:12 +0000346 void incIterCnt() { IterCnt++; }
347 void resetIterCnt() { IterCnt = 0; }
348 int32_t getIterCnt() { return IterCnt; }
349
Kannan Narayananacb089e2017-04-12 03:25:12 +0000350 void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
351 MachineInstr *getWaitcnt() const { return LfWaitcnt; }
352
353 void print() {
354 DEBUG(dbgs() << " iteration " << IterCnt << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000355 }
356
357private:
358 // s_waitcnt added at the end of loop footer to stablize wait scores
359 // at the end of the loop footer.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000360 MachineInstr *LfWaitcnt = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000361 // Number of iterations the loop has been visited, not including the initial
362 // walk over.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000363 int32_t IterCnt = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000364};
365
366class SIInsertWaitcnts : public MachineFunctionPass {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000367private:
Eugene Zelenko59e12822017-08-08 00:47:13 +0000368 const SISubtarget *ST = nullptr;
369 const SIInstrInfo *TII = nullptr;
370 const SIRegisterInfo *TRI = nullptr;
371 const MachineRegisterInfo *MRI = nullptr;
372 const MachineLoopInfo *MLI = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000373 AMDGPU::IsaInfo::IsaVersion IV;
374 AMDGPUAS AMDGPUASI;
375
376 DenseSet<MachineBasicBlock *> BlockVisitedSet;
Mark Searles24c92ee2018-02-07 02:21:21 +0000377 DenseSet<MachineInstr *> TrackedWaitcntSet;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000378 DenseSet<MachineInstr *> VCCZBugHandledSet;
379
380 DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
381 BlockWaitcntBracketsMap;
382
Mark Searles1bc6e712018-04-19 15:42:30 +0000383 std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000384
385 DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
386
387 std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
388
Mark Searles4a0f2c52018-05-07 14:43:28 +0000389 // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
390 // because of amdgpu-waitcnt-forcezero flag
391 bool ForceEmitZeroWaitcnts;
Mark Searlesec581832018-04-25 19:21:26 +0000392 bool ForceEmitWaitcnt[NUM_INST_CNTS];
393
Kannan Narayananacb089e2017-04-12 03:25:12 +0000394public:
395 static char ID;
396
Eugene Zelenko59e12822017-08-08 00:47:13 +0000397 SIInsertWaitcnts() : MachineFunctionPass(ID) {}
Kannan Narayananacb089e2017-04-12 03:25:12 +0000398
399 bool runOnMachineFunction(MachineFunction &MF) override;
400
401 StringRef getPassName() const override {
402 return "SI insert wait instructions";
403 }
404
405 void getAnalysisUsage(AnalysisUsage &AU) const override {
406 AU.setPreservesCFG();
407 AU.addRequired<MachineLoopInfo>();
408 MachineFunctionPass::getAnalysisUsage(AU);
409 }
410
411 void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
412 // The waitcnt information is copied because it changes as the block is
413 // traversed.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000414 KillWaitBrackets.push_back(
415 llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000416 }
417
Mark Searlesec581832018-04-25 19:21:26 +0000418 bool isForceEmitWaitcnt() const {
419 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
420 T = (enum InstCounterType)(T + 1))
421 if (ForceEmitWaitcnt[T])
422 return true;
423 return false;
424 }
425
426 void setForceEmitWaitcnt() {
427// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
428// For debug builds, get the debug counter info and adjust if need be
429#ifndef NDEBUG
430 if (DebugCounter::isCounterSet(ForceExpCounter) &&
431 DebugCounter::shouldExecute(ForceExpCounter)) {
432 ForceEmitWaitcnt[EXP_CNT] = true;
433 } else {
434 ForceEmitWaitcnt[EXP_CNT] = false;
435 }
436
437 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
438 DebugCounter::shouldExecute(ForceLgkmCounter)) {
439 ForceEmitWaitcnt[LGKM_CNT] = true;
440 } else {
441 ForceEmitWaitcnt[LGKM_CNT] = false;
442 }
443
444 if (DebugCounter::isCounterSet(ForceVMCounter) &&
445 DebugCounter::shouldExecute(ForceVMCounter)) {
446 ForceEmitWaitcnt[VM_CNT] = true;
447 } else {
448 ForceEmitWaitcnt[VM_CNT] = false;
449 }
450#endif // NDEBUG
451 }
452
Matt Arsenault0ed39d32017-07-21 18:54:54 +0000453 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
Mark Searles70901b92018-04-24 15:59:59 +0000454 void generateWaitcntInstBefore(MachineInstr &MI,
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +0000455 BlockWaitcntBrackets *ScoreBrackets);
Mark Searles70901b92018-04-24 15:59:59 +0000456 void updateEventWaitcntAfter(MachineInstr &Inst,
Kannan Narayananacb089e2017-04-12 03:25:12 +0000457 BlockWaitcntBrackets *ScoreBrackets);
458 void mergeInputScoreBrackets(MachineBasicBlock &Block);
Mark Searles1bc6e712018-04-19 15:42:30 +0000459 bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
460 unsigned countNumBottomBlocks(const MachineLoop *Loop);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000461 void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
462 void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +0000463 bool isWaitcntStronger(unsigned LHS, unsigned RHS);
464 unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000465};
466
Eugene Zelenko59e12822017-08-08 00:47:13 +0000467} // end anonymous namespace
Kannan Narayananacb089e2017-04-12 03:25:12 +0000468
469RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
470 const SIInstrInfo *TII,
471 const MachineRegisterInfo *MRI,
472 const SIRegisterInfo *TRI,
473 unsigned OpNo,
474 bool Def) const {
475 const MachineOperand &Op = MI->getOperand(OpNo);
476 if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
477 (Def && !Op.isDef()))
478 return {-1, -1};
479
480 // A use via a PW operand does not need a waitcnt.
481 // A partial write is not a WAW.
482 assert(!Op.getSubReg() || !Op.isUndef());
483
484 RegInterval Result;
485 const MachineRegisterInfo &MRIA = *MRI;
486
487 unsigned Reg = TRI->getEncodingValue(Op.getReg());
488
489 if (TRI->isVGPR(MRIA, Op.getReg())) {
490 assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
491 Result.first = Reg - RegisterEncoding.VGPR0;
492 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
493 } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
494 assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
495 Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
496 assert(Result.first >= NUM_ALL_VGPRS &&
497 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
498 }
499 // TODO: Handle TTMP
500 // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
501 else
502 return {-1, -1};
503
504 const MachineInstr &MIA = *MI;
505 const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +0000506 unsigned Size = TRI->getRegSizeInBits(*RC);
507 Result.second = Result.first + (Size / 32);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000508
509 return Result;
510}
511
512void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
513 const SIInstrInfo *TII,
514 const SIRegisterInfo *TRI,
515 const MachineRegisterInfo *MRI,
516 unsigned OpNo, int32_t Val) {
517 RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
518 DEBUG({
519 const MachineOperand &Opnd = MI->getOperand(OpNo);
520 assert(TRI->isVGPR(*MRI, Opnd.getReg()));
521 });
522 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
523 setRegScore(RegNo, EXP_CNT, Val);
524 }
525}
526
527void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
528 const SIRegisterInfo *TRI,
529 const MachineRegisterInfo *MRI,
530 WaitEventType E, MachineInstr &Inst) {
531 const MachineRegisterInfo &MRIA = *MRI;
532 InstCounterType T = eventCounter(E);
533 int32_t CurrScore = getScoreUB(T) + 1;
534 // EventUB and ScoreUB need to be update regardless if this event changes
535 // the score of a register or not.
536 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
537 EventUBs[E] = CurrScore;
538 setScoreUB(T, CurrScore);
539
540 if (T == EXP_CNT) {
541 // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
542 // is required.
543 if (!MixedExpTypes) {
544 MixedExpTypes = counterOutOfOrder(EXP_CNT);
545 }
546
547 // Put score on the source vgprs. If this is a store, just use those
548 // specific register(s).
549 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
550 // All GDS operations must protect their address register (same as
551 // export.)
552 if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
553 Inst.getOpcode() != AMDGPU::DS_CONSUME) {
554 setExpScore(
555 &Inst, TII, TRI, MRI,
556 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
557 CurrScore);
558 }
559 if (Inst.mayStore()) {
560 setExpScore(
561 &Inst, TII, TRI, MRI,
562 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
563 CurrScore);
564 if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
565 AMDGPU::OpName::data1) != -1) {
566 setExpScore(&Inst, TII, TRI, MRI,
567 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
568 AMDGPU::OpName::data1),
569 CurrScore);
570 }
571 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
572 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
573 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
574 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
575 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
576 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
577 Inst.getOpcode() != AMDGPU::DS_APPEND &&
578 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
579 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
580 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
581 const MachineOperand &Op = Inst.getOperand(I);
582 if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
583 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
584 }
585 }
586 }
587 } else if (TII->isFLAT(Inst)) {
588 if (Inst.mayStore()) {
589 setExpScore(
590 &Inst, TII, TRI, MRI,
591 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
592 CurrScore);
593 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
594 setExpScore(
595 &Inst, TII, TRI, MRI,
596 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
597 CurrScore);
598 }
599 } else if (TII->isMIMG(Inst)) {
600 if (Inst.mayStore()) {
601 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
602 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
603 setExpScore(
604 &Inst, TII, TRI, MRI,
605 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
606 CurrScore);
607 }
608 } else if (TII->isMTBUF(Inst)) {
609 if (Inst.mayStore()) {
610 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
611 }
612 } else if (TII->isMUBUF(Inst)) {
613 if (Inst.mayStore()) {
614 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
615 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
616 setExpScore(
617 &Inst, TII, TRI, MRI,
618 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
619 CurrScore);
620 }
621 } else {
622 if (TII->isEXP(Inst)) {
623 // For export the destination registers are really temps that
624 // can be used as the actual source after export patching, so
625 // we need to treat them like sources and set the EXP_CNT
626 // score.
627 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
628 MachineOperand &DefMO = Inst.getOperand(I);
629 if (DefMO.isReg() && DefMO.isDef() &&
630 TRI->isVGPR(MRIA, DefMO.getReg())) {
631 setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
632 CurrScore);
633 }
634 }
635 }
636 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
637 MachineOperand &MO = Inst.getOperand(I);
638 if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
639 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
640 }
641 }
642 }
643#if 0 // TODO: check if this is handled by MUBUF code above.
644 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
Evgeny Mankovbf975172017-08-16 16:47:29 +0000645 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
646 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000647 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
648 unsigned OpNo;//TODO: find the OpNo for this operand;
649 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
650 for (signed RegNo = Interval.first; RegNo < Interval.second;
Evgeny Mankovbf975172017-08-16 16:47:29 +0000651 ++RegNo) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000652 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
653 }
654#endif
655 } else {
656 // Match the score to the destination registers.
657 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
658 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
659 if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
660 continue;
661 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
662 setRegScore(RegNo, T, CurrScore);
663 }
664 }
665 if (TII->isDS(Inst) && Inst.mayStore()) {
666 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
667 }
668 }
669}
670
671void BlockWaitcntBrackets::print(raw_ostream &OS) {
672 OS << '\n';
673 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
674 T = (enum InstCounterType)(T + 1)) {
675 int LB = getScoreLB(T);
676 int UB = getScoreUB(T);
677
678 switch (T) {
679 case VM_CNT:
680 OS << " VM_CNT(" << UB - LB << "): ";
681 break;
682 case LGKM_CNT:
683 OS << " LGKM_CNT(" << UB - LB << "): ";
684 break;
685 case EXP_CNT:
686 OS << " EXP_CNT(" << UB - LB << "): ";
687 break;
688 default:
689 OS << " UNKNOWN(" << UB - LB << "): ";
690 break;
691 }
692
693 if (LB < UB) {
694 // Print vgpr scores.
695 for (int J = 0; J <= getMaxVGPR(); J++) {
696 int RegScore = getRegScore(J, T);
697 if (RegScore <= LB)
698 continue;
699 int RelScore = RegScore - LB - 1;
700 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
701 OS << RelScore << ":v" << J << " ";
702 } else {
703 OS << RelScore << ":ds ";
704 }
705 }
706 // Also need to print sgpr scores for lgkm_cnt.
707 if (T == LGKM_CNT) {
708 for (int J = 0; J <= getMaxSGPR(); J++) {
709 int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
710 if (RegScore <= LB)
711 continue;
712 int RelScore = RegScore - LB - 1;
713 OS << RelScore << ":s" << J << " ";
714 }
715 }
716 }
717 OS << '\n';
718 }
719 OS << '\n';
Kannan Narayananacb089e2017-04-12 03:25:12 +0000720}
721
722unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
723 int ScoreToWait) {
724 unsigned int NeedWait = 0;
725 if (ScoreToWait == -1) {
726 // The score to wait is unknown. This implies that it was not encountered
727 // during the path of the CFG walk done during the current traversal but
728 // may be seen on a different path. Emit an s_wait counter with a
729 // conservative value of 0 for the counter.
730 NeedWait = CNT_MASK(T);
731 setScoreLB(T, getScoreUB(T));
732 return NeedWait;
733 }
734
735 // If the score of src_operand falls within the bracket, we need an
736 // s_waitcnt instruction.
737 const int32_t LB = getScoreLB(T);
738 const int32_t UB = getScoreUB(T);
739 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
740 if (T == VM_CNT && hasPendingFlat()) {
741 // If there is a pending FLAT operation, and this is a VM waitcnt,
742 // then we need to force a waitcnt 0 for VM.
743 NeedWait = CNT_MASK(T);
744 setScoreLB(T, getScoreUB(T));
745 } else if (counterOutOfOrder(T)) {
746 // Counter can get decremented out-of-order when there
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000747 // are multiple types event in the bracket. Also emit an s_wait counter
Kannan Narayananacb089e2017-04-12 03:25:12 +0000748 // with a conservative value of 0 for the counter.
749 NeedWait = CNT_MASK(T);
750 setScoreLB(T, getScoreUB(T));
751 } else {
752 NeedWait = CNT_MASK(T);
753 setScoreLB(T, ScoreToWait);
754 }
755 }
756
757 return NeedWait;
758}
759
760// Where there are multiple types of event in the bracket of a counter,
761// the decrement may go out of order.
762bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
763 switch (T) {
764 case VM_CNT:
765 return false;
766 case LGKM_CNT: {
767 if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
768 EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
769 // Scalar memory read always can go out of order.
770 return true;
771 }
772 int NumEventTypes = 0;
773 if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
774 EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
775 NumEventTypes++;
776 }
777 if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
778 EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
779 NumEventTypes++;
780 }
781 if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
782 EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
783 NumEventTypes++;
784 }
785 if (NumEventTypes <= 1) {
786 return false;
787 }
788 break;
789 }
790 case EXP_CNT: {
791 // If there has been a mixture of export types, then a waitcnt exp(0) is
792 // required.
793 if (MixedExpTypes)
794 return true;
795 int NumEventTypes = 0;
796 if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
797 EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
798 NumEventTypes++;
799 }
800 if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
801 EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
802 NumEventTypes++;
803 }
804 if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
805 EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
806 NumEventTypes++;
807 }
808 if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
809 EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
810 NumEventTypes++;
811 }
812
813 if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
814 EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
815 NumEventTypes++;
816 }
817
818 if (NumEventTypes <= 1) {
819 return false;
820 }
821 break;
822 }
823 default:
824 break;
825 }
826 return true;
827}
828
829INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
830 false)
831INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
832 false)
833
834char SIInsertWaitcnts::ID = 0;
835
836char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
837
838FunctionPass *llvm::createSIInsertWaitcntsPass() {
839 return new SIInsertWaitcnts();
840}
841
842static bool readsVCCZ(const MachineInstr &MI) {
843 unsigned Opc = MI.getOpcode();
844 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
845 !MI.getOperand(1).isUndef();
846}
847
Adrian Prantl5f8f34e42018-05-01 15:54:18 +0000848/// Given wait count encodings checks if LHS is stronger than RHS.
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +0000849bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
850 if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
851 return false;
852 if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
853 return false;
854 if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
855 return false;
856 return true;
857}
858
Adrian Prantl5f8f34e42018-05-01 15:54:18 +0000859/// Given wait count encodings create a new encoding which is stronger
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +0000860/// or equal to both.
861unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
862 unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
863 AMDGPU::decodeVmcnt(IV, RHS));
864 unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
865 AMDGPU::decodeLgkmcnt(IV, RHS));
866 unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
867 AMDGPU::decodeExpcnt(IV, RHS));
868 return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
869}
870
Adrian Prantl5f8f34e42018-05-01 15:54:18 +0000871/// Generate s_waitcnt instruction to be placed before cur_Inst.
Kannan Narayananacb089e2017-04-12 03:25:12 +0000872/// Instructions of a given type are returned in order,
873/// but instructions of different types can complete out of order.
874/// We rely on this in-order completion
875/// and simply assign a score to the memory access instructions.
876/// We keep track of the active "score bracket" to determine
877/// if an access of a memory read requires an s_waitcnt
878/// and if so what the value of each counter is.
879/// The "score bracket" is bound by the lower bound and upper bound
880/// scores (*_score_LB and *_score_ub respectively).
Mark Searles70901b92018-04-24 15:59:59 +0000881void SIInsertWaitcnts::generateWaitcntInstBefore(
Kannan Narayananacb089e2017-04-12 03:25:12 +0000882 MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
883 // To emit, or not to emit - that's the question!
884 // Start with an assumption that there is no need to emit.
Mark Searles70901b92018-04-24 15:59:59 +0000885 unsigned int EmitWaitcnt = 0;
Mark Searles4a0f2c52018-05-07 14:43:28 +0000886
Kannan Narayananacb089e2017-04-12 03:25:12 +0000887 // No need to wait before phi. If a phi-move exists, then the wait should
888 // has been inserted before the move. If a phi-move does not exist, then
889 // wait should be inserted before the real use. The same is true for
890 // sc-merge. It is not a coincident that all these cases correspond to the
891 // instructions that are skipped in the assembling loop.
892 bool NeedLineMapping = false; // TODO: Check on this.
Mark Searlesec581832018-04-25 19:21:26 +0000893
Mark Searles4a0f2c52018-05-07 14:43:28 +0000894 // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
895 bool ForceEmitZeroWaitcnt = false;
896
897 setForceEmitWaitcnt();
Mark Searlesec581832018-04-25 19:21:26 +0000898 bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
899
Kannan Narayananacb089e2017-04-12 03:25:12 +0000900 if (MI.isDebugValue() &&
901 // TODO: any other opcode?
902 !NeedLineMapping) {
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +0000903 return;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000904 }
905
906 // See if an s_waitcnt is forced at block entry, or is needed at
907 // program end.
908 if (ScoreBrackets->getWaitAtBeginning()) {
909 // Note that we have already cleared the state, so we don't need to update
910 // it.
911 ScoreBrackets->clearWaitAtBeginning();
912 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
913 T = (enum InstCounterType)(T + 1)) {
Mark Searles70901b92018-04-24 15:59:59 +0000914 EmitWaitcnt |= CNT_MASK(T);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000915 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
916 }
917 }
918
919 // See if this instruction has a forced S_WAITCNT VM.
920 // TODO: Handle other cases of NeedsWaitcntVmBefore()
921 else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
922 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
923 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
Mark Searles70901b92018-04-24 15:59:59 +0000924 EmitWaitcnt |=
Kannan Narayananacb089e2017-04-12 03:25:12 +0000925 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
926 }
927
928 // All waits must be resolved at call return.
929 // NOTE: this could be improved with knowledge of all call sites or
930 // with knowledge of the called routines.
931 if (MI.getOpcode() == AMDGPU::RETURN ||
Mark Searles11d0a042017-05-31 16:44:23 +0000932 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
933 MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000934 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
935 T = (enum InstCounterType)(T + 1)) {
936 if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
937 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
Mark Searles70901b92018-04-24 15:59:59 +0000938 EmitWaitcnt |= CNT_MASK(T);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000939 }
940 }
941 }
942 // Resolve vm waits before gs-done.
943 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
944 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
945 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
946 AMDGPU::SendMsg::ID_GS_DONE)) {
947 if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
948 ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +0000949 EmitWaitcnt |= CNT_MASK(VM_CNT);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000950 }
951 }
952#if 0 // TODO: the following blocks of logic when we have fence.
953 else if (MI.getOpcode() == SC_FENCE) {
954 const unsigned int group_size =
955 context->shader_info->GetMaxThreadGroupSize();
956 // group_size == 0 means thread group size is unknown at compile time
957 const bool group_is_multi_wave =
958 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
959 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
960
961 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
962 SCRegType src_type = Inst->GetSrcType(i);
963 switch (src_type) {
964 case SCMEM_LDS:
965 if (group_is_multi_wave ||
Evgeny Mankovbf975172017-08-16 16:47:29 +0000966 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
Mark Searles70901b92018-04-24 15:59:59 +0000967 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
Kannan Narayananacb089e2017-04-12 03:25:12 +0000968 ScoreBrackets->getScoreUB(LGKM_CNT));
969 // LDS may have to wait for VM_CNT after buffer load to LDS
970 if (target_info->HasBufferLoadToLDS()) {
Mark Searles70901b92018-04-24 15:59:59 +0000971 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
Kannan Narayananacb089e2017-04-12 03:25:12 +0000972 ScoreBrackets->getScoreUB(VM_CNT));
973 }
974 }
975 break;
976
977 case SCMEM_GDS:
978 if (group_is_multi_wave || fence_is_global) {
Mark Searles70901b92018-04-24 15:59:59 +0000979 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000980 ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +0000981 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000982 ScoreBrackets->getScoreUB(LGKM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000983 }
984 break;
985
986 case SCMEM_UAV:
987 case SCMEM_TFBUF:
988 case SCMEM_RING:
989 case SCMEM_SCATTER:
990 if (group_is_multi_wave || fence_is_global) {
Mark Searles70901b92018-04-24 15:59:59 +0000991 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000992 ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +0000993 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000994 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000995 }
996 break;
997
998 case SCMEM_SCRATCH:
999 default:
1000 break;
1001 }
1002 }
1003 }
1004#endif
1005
1006 // Export & GDS instructions do not read the EXEC mask until after the export
1007 // is granted (which can occur well after the instruction is issued).
1008 // The shader program must flush all EXP operations on the export-count
1009 // before overwriting the EXEC mask.
1010 else {
1011 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1012 // Export and GDS are tracked individually, either may trigger a waitcnt
1013 // for EXEC.
Mark Searles70901b92018-04-24 15:59:59 +00001014 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001015 EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
Mark Searles70901b92018-04-24 15:59:59 +00001016 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001017 EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
Mark Searles70901b92018-04-24 15:59:59 +00001018 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001019 EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
Mark Searles70901b92018-04-24 15:59:59 +00001020 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001021 EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
1022 }
1023
1024#if 0 // TODO: the following code to handle CALL.
1025 // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
1026 // However, there is a problem with EXP_CNT, because the call cannot
1027 // easily tell if a register is used in the function, and if it did, then
1028 // the referring instruction would have to have an S_WAITCNT, which is
1029 // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
1030 // before the call.
1031 if (MI.getOpcode() == SC_CALL) {
1032 if (ScoreBrackets->getScoreUB(EXP_CNT) >
Evgeny Mankovbf975172017-08-16 16:47:29 +00001033 ScoreBrackets->getScoreLB(EXP_CNT)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001034 ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001035 EmitWaitcnt |= CNT_MASK(EXP_CNT);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001036 }
1037 }
1038#endif
1039
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001040 // FIXME: Should not be relying on memoperands.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001041 // Look at the source operands of every instruction to see if
1042 // any of them results from a previous memory operation that affects
1043 // its current usage. If so, an s_waitcnt instruction needs to be
1044 // emitted.
1045 // If the source operand was defined by a load, add the s_waitcnt
1046 // instruction.
1047 for (const MachineMemOperand *Memop : MI.memoperands()) {
1048 unsigned AS = Memop->getAddrSpace();
1049 if (AS != AMDGPUASI.LOCAL_ADDRESS)
1050 continue;
1051 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1052 // VM_CNT is only relevant to vgpr or LDS.
Mark Searles70901b92018-04-24 15:59:59 +00001053 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001054 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1055 }
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001056
Kannan Narayananacb089e2017-04-12 03:25:12 +00001057 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1058 const MachineOperand &Op = MI.getOperand(I);
1059 const MachineRegisterInfo &MRIA = *MRI;
1060 RegInterval Interval =
1061 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
1062 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1063 if (TRI->isVGPR(MRIA, Op.getReg())) {
1064 // VM_CNT is only relevant to vgpr or LDS.
Mark Searles70901b92018-04-24 15:59:59 +00001065 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001066 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1067 }
Mark Searles70901b92018-04-24 15:59:59 +00001068 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001069 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1070 }
1071 }
1072 // End of for loop that looks at all source operands to decide vm_wait_cnt
1073 // and lgk_wait_cnt.
1074
1075 // Two cases are handled for destination operands:
1076 // 1) If the destination operand was defined by a load, add the s_waitcnt
1077 // instruction to guarantee the right WAW order.
1078 // 2) If a destination operand that was used by a recent export/store ins,
1079 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1080 if (MI.mayStore()) {
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001081 // FIXME: Should not be relying on memoperands.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001082 for (const MachineMemOperand *Memop : MI.memoperands()) {
1083 unsigned AS = Memop->getAddrSpace();
1084 if (AS != AMDGPUASI.LOCAL_ADDRESS)
1085 continue;
1086 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
Mark Searles70901b92018-04-24 15:59:59 +00001087 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001088 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001089 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001090 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1091 }
1092 }
1093 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1094 MachineOperand &Def = MI.getOperand(I);
1095 const MachineRegisterInfo &MRIA = *MRI;
1096 RegInterval Interval =
1097 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
1098 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1099 if (TRI->isVGPR(MRIA, Def.getReg())) {
Mark Searles70901b92018-04-24 15:59:59 +00001100 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001101 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001102 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001103 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1104 }
Mark Searles70901b92018-04-24 15:59:59 +00001105 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001106 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1107 }
1108 } // End of for loop that looks at all dest operands.
1109 }
1110
Kannan Narayananacb089e2017-04-12 03:25:12 +00001111 // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1112 // occurs before the instruction. Doing it here prevents any additional
1113 // S_WAITCNTs from being emitted if the instruction was marked as
1114 // requiring a WAITCNT beforehand.
Konstantin Zhuravlyovbe6c0ca2017-06-02 17:40:26 +00001115 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1116 !ST->hasAutoWaitcntBeforeBarrier()) {
Mark Searles70901b92018-04-24 15:59:59 +00001117 EmitWaitcnt |=
Kannan Narayananacb089e2017-04-12 03:25:12 +00001118 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001119 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001120 EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001121 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001122 LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
1123 }
1124
1125 // TODO: Remove this work-around, enable the assert for Bug 457939
1126 // after fixing the scheduler. Also, the Shader Compiler code is
1127 // independent of target.
1128 if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
1129 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1130 ScoreBrackets->getScoreUB(LGKM_CNT) &&
1131 ScoreBrackets->hasPendingSMEM()) {
1132 // Wait on everything, not just LGKM. vccz reads usually come from
1133 // terminators, and we always wait on everything at the end of the
1134 // block, so if we only wait on LGKM here, we might end up with
1135 // another s_waitcnt inserted right after this if there are non-LGKM
1136 // instructions still outstanding.
Mark Searles4a0f2c52018-05-07 14:43:28 +00001137 // FIXME: this is too conservative / the comment is wrong.
1138 // We don't wait on everything at the end of the block and we combine
1139 // waitcnts so we should never have back-to-back waitcnts.
Mark Searlesec581832018-04-25 19:21:26 +00001140 ForceEmitZeroWaitcnt = true;
Mark Searles70901b92018-04-24 15:59:59 +00001141 EmitWaitcnt = true;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001142 }
1143 }
1144
1145 // Does this operand processing indicate s_wait counter update?
Mark Searlesec581832018-04-25 19:21:26 +00001146 if (EmitWaitcnt || IsForceEmitWaitcnt) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001147 int CntVal[NUM_INST_CNTS];
1148
1149 bool UseDefaultWaitcntStrategy = true;
Mark Searles4a0f2c52018-05-07 14:43:28 +00001150 if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001151 // Force all waitcnts to 0.
1152 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1153 T = (enum InstCounterType)(T + 1)) {
1154 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1155 }
1156 CntVal[VM_CNT] = 0;
1157 CntVal[EXP_CNT] = 0;
1158 CntVal[LGKM_CNT] = 0;
1159 UseDefaultWaitcntStrategy = false;
1160 }
1161
1162 if (UseDefaultWaitcntStrategy) {
1163 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1164 T = (enum InstCounterType)(T + 1)) {
Mark Searles70901b92018-04-24 15:59:59 +00001165 if (EmitWaitcnt & CNT_MASK(T)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001166 int Delta =
1167 ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
1168 int MaxDelta = ScoreBrackets->getWaitCountMax(T);
1169 if (Delta >= MaxDelta) {
1170 Delta = -1;
1171 if (T != EXP_CNT) {
1172 ScoreBrackets->setScoreLB(
1173 T, ScoreBrackets->getScoreUB(T) - MaxDelta);
1174 }
Mark Searles70901b92018-04-24 15:59:59 +00001175 EmitWaitcnt &= ~CNT_MASK(T);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001176 }
1177 CntVal[T] = Delta;
1178 } else {
1179 // If we are not waiting for a particular counter then encode
1180 // it as -1 which means "don't care."
1181 CntVal[T] = -1;
1182 }
1183 }
1184 }
1185
1186 // If we are not waiting on any counter we can skip the wait altogether.
Mark Searlesec581832018-04-25 19:21:26 +00001187 if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001188 MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
1189 int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
Mark Searles65207922018-02-19 19:19:59 +00001190 if (!OldWaitcnt ||
1191 (AMDGPU::decodeVmcnt(IV, Imm) !=
Kannan Narayananacb089e2017-04-12 03:25:12 +00001192 (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
1193 (AMDGPU::decodeExpcnt(IV, Imm) !=
1194 (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
1195 (AMDGPU::decodeLgkmcnt(IV, Imm) !=
1196 (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
1197 MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
1198 if (ContainingLoop) {
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001199 MachineBasicBlock *TBB = ContainingLoop->getHeader();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001200 BlockWaitcntBrackets *ScoreBracket =
1201 BlockWaitcntBracketsMap[TBB].get();
1202 if (!ScoreBracket) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001203 assert(!BlockVisitedSet.count(TBB));
Eugene Zelenko59e12822017-08-08 00:47:13 +00001204 BlockWaitcntBracketsMap[TBB] =
1205 llvm::make_unique<BlockWaitcntBrackets>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001206 ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
1207 }
1208 ScoreBracket->setRevisitLoop(true);
Mark Searles65207922018-02-19 19:19:59 +00001209 DEBUG(dbgs() << "set-revisit: Block"
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001210 << ContainingLoop->getHeader()->getNumber() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001211 }
1212 }
1213
1214 // Update an existing waitcount, or make a new one.
Mark Searlesec581832018-04-25 19:21:26 +00001215 unsigned Enc = AMDGPU::encodeWaitcnt(IV,
1216 ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
1217 ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
1218 ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
Mark Searles65207922018-02-19 19:19:59 +00001219 // We don't remove waitcnts that existed prior to the waitcnt
1220 // pass. Check if the waitcnt to-be-inserted can be avoided
1221 // or if the prev waitcnt can be updated.
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001222 bool insertSWaitInst = true;
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001223 for (MachineBasicBlock::iterator I = MI.getIterator(),
1224 B = MI.getParent()->begin();
1225 insertSWaitInst && I != B; --I) {
Mark Searles65207922018-02-19 19:19:59 +00001226 if (I == MI.getIterator())
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001227 continue;
1228
1229 switch (I->getOpcode()) {
1230 case AMDGPU::S_WAITCNT:
1231 if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
1232 insertSWaitInst = false;
1233 else if (!OldWaitcnt) {
1234 OldWaitcnt = &*I;
1235 Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
1236 }
1237 break;
1238 // TODO: skip over instructions which never require wait.
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001239 }
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001240 break;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001241 }
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001242 if (insertSWaitInst) {
1243 if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
Mark Searles4a0f2c52018-05-07 14:43:28 +00001244 if (ForceEmitZeroWaitcnts)
Mark Searlesec581832018-04-25 19:21:26 +00001245 DEBUG(dbgs() << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
1246 if (IsForceEmitWaitcnt)
1247 DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n");
1248
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001249 OldWaitcnt->getOperand(0).setImm(Enc);
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001250 if (!OldWaitcnt->getParent())
1251 MI.getParent()->insert(MI, OldWaitcnt);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001252
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001253 DEBUG(dbgs() << "updateWaitcntInBlock\n"
1254 << "Old Instr: " << MI << '\n'
1255 << "New Instr: " << *OldWaitcnt << '\n');
1256 } else {
1257 auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1258 MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1259 .addImm(Enc);
1260 TrackedWaitcntSet.insert(SWaitInst);
1261
1262 DEBUG(dbgs() << "insertWaitcntInBlock\n"
1263 << "Old Instr: " << MI << '\n'
1264 << "New Instr: " << *SWaitInst << '\n');
1265 }
1266 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001267
1268 if (CntVal[EXP_CNT] == 0) {
1269 ScoreBrackets->setMixedExpTypes(false);
1270 }
1271 }
1272 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001273}
1274
1275void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
1276 MachineInstr *Waitcnt) {
1277 if (MBB.empty()) {
1278 MBB.push_back(Waitcnt);
1279 return;
1280 }
1281
1282 MachineBasicBlock::iterator It = MBB.end();
1283 MachineInstr *MI = &*(--It);
1284 if (MI->isBranch()) {
1285 MBB.insert(It, Waitcnt);
1286 } else {
1287 MBB.push_back(Waitcnt);
1288 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001289}
1290
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001291// This is a flat memory operation. Check to see if it has memory
1292// tokens for both LDS and Memory, and if so mark it as a flat.
1293bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1294 if (MI.memoperands_empty())
1295 return true;
1296
1297 for (const MachineMemOperand *Memop : MI.memoperands()) {
1298 unsigned AS = Memop->getAddrSpace();
1299 if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
1300 return true;
1301 }
1302
1303 return false;
1304}
1305
Mark Searles70901b92018-04-24 15:59:59 +00001306void SIInsertWaitcnts::updateEventWaitcntAfter(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001307 MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
1308 // Now look at the instruction opcode. If it is a memory access
1309 // instruction, update the upper-bound of the appropriate counter's
1310 // bracket and the destination operand scores.
1311 // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001312 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001313 if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001314 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1315 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1316 } else {
1317 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1318 }
1319 } else if (TII->isFLAT(Inst)) {
1320 assert(Inst.mayLoad() || Inst.mayStore());
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001321
1322 if (TII->usesVM_CNT(Inst))
1323 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1324
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001325 if (TII->usesLGKM_CNT(Inst)) {
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001326 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001327
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001328 // This is a flat memory operation, so note it - it will require
1329 // that both the VM and LGKM be flushed to zero if it is pending when
1330 // a VM or LGKM dependency occurs.
1331 if (mayAccessLDSThroughFlat(Inst))
1332 ScoreBrackets->setPendingFlat();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001333 }
1334 } else if (SIInstrInfo::isVMEM(Inst) &&
1335 // TODO: get a better carve out.
1336 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1337 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1338 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1339 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
Mark Searles2a19af62018-04-26 16:11:19 +00001340 if (ST->vmemWriteNeedsExpWaitcnt() &&
Mark Searles11d0a042017-05-31 16:44:23 +00001341 (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001342 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1343 }
1344 } else if (TII->isSMRD(Inst)) {
1345 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1346 } else {
1347 switch (Inst.getOpcode()) {
1348 case AMDGPU::S_SENDMSG:
1349 case AMDGPU::S_SENDMSGHALT:
1350 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1351 break;
1352 case AMDGPU::EXP:
1353 case AMDGPU::EXP_DONE: {
1354 int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1355 if (Imm >= 32 && Imm <= 63)
1356 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1357 else if (Imm >= 12 && Imm <= 15)
1358 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1359 else
1360 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1361 break;
1362 }
1363 case AMDGPU::S_MEMTIME:
1364 case AMDGPU::S_MEMREALTIME:
1365 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1366 break;
1367 default:
1368 break;
1369 }
1370 }
1371}
1372
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001373// Merge the score brackets of the Block's predecessors;
1374// this merged score bracket is used when adding waitcnts to the Block
Kannan Narayananacb089e2017-04-12 03:25:12 +00001375void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1376 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1377 int32_t MaxPending[NUM_INST_CNTS] = {0};
1378 int32_t MaxFlat[NUM_INST_CNTS] = {0};
1379 bool MixedExpTypes = false;
1380
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001381 // For single basic block loops, we need to retain the Block's
1382 // score bracket to have accurate Pred info. So, make a copy of Block's
1383 // score bracket, clear() it (which retains several important bits of info),
1384 // populate, and then replace en masse. For non-single basic block loops,
1385 // just clear Block's current score bracket and repopulate in-place.
1386 bool IsSelfPred;
1387 std::unique_ptr<BlockWaitcntBrackets> S;
1388
1389 IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
1390 != Block.pred_end();
1391 if (IsSelfPred) {
1392 S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1393 ScoreBrackets = S.get();
1394 }
1395
Kannan Narayananacb089e2017-04-12 03:25:12 +00001396 ScoreBrackets->clear();
1397
Kannan Narayananacb089e2017-04-12 03:25:12 +00001398 // See if there are any uninitialized predecessors. If so, emit an
1399 // s_waitcnt 0 at the beginning of the block.
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001400 for (MachineBasicBlock *Pred : Block.predecessors()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001401 BlockWaitcntBrackets *PredScoreBrackets =
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001402 BlockWaitcntBracketsMap[Pred].get();
1403 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001404 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001405 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001406 }
1407 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1408 T = (enum InstCounterType)(T + 1)) {
1409 int span =
1410 PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1411 MaxPending[T] = std::max(MaxPending[T], span);
1412 span =
1413 PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1414 MaxFlat[T] = std::max(MaxFlat[T], span);
1415 }
1416
1417 MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
1418 }
1419
1420 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1421 // Also handle kills for exit block.
1422 if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1423 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1424 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1425 T = (enum InstCounterType)(T + 1)) {
1426 int Span = KillWaitBrackets[I]->getScoreUB(T) -
1427 KillWaitBrackets[I]->getScoreLB(T);
1428 MaxPending[T] = std::max(MaxPending[T], Span);
1429 Span = KillWaitBrackets[I]->pendingFlat(T) -
1430 KillWaitBrackets[I]->getScoreLB(T);
1431 MaxFlat[T] = std::max(MaxFlat[T], Span);
1432 }
1433
1434 MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
1435 }
1436 }
1437
1438 // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
1439 for (MachineBasicBlock *Pred : Block.predecessors()) {
1440 BlockWaitcntBrackets *PredScoreBrackets =
1441 BlockWaitcntBracketsMap[Pred].get();
Mark Searles24c92ee2018-02-07 02:21:21 +00001442 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001443 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001444 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001445 }
1446
1447 int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
1448 PredScoreBrackets->getScoreLB(EXP_CNT);
1449 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1450 int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
1451 PredScoreBrackets->getScoreLB(EXP_CNT);
1452 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1453 }
1454
1455 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1456 if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1457 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1458 int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
1459 KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1460 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1461 int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
1462 KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1463 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1464 }
1465 }
1466
1467#if 0
1468 // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
1469 // TODO: how does LC distinguish between function entry and main entry?
1470 // If this is the entry to a function, force a wait.
1471 MachineBasicBlock &Entry = Block.getParent()->front();
1472 if (Entry.getNumber() == Block.getNumber()) {
1473 ScoreBrackets->setWaitAtBeginning();
1474 return;
1475 }
1476#endif
1477
1478 // Now set the current Block's brackets to the largest ending bracket.
1479 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1480 T = (enum InstCounterType)(T + 1)) {
1481 ScoreBrackets->setScoreUB(T, MaxPending[T]);
1482 ScoreBrackets->setScoreLB(T, 0);
1483 ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1484 }
1485
1486 ScoreBrackets->setMixedExpTypes(MixedExpTypes);
1487
1488 // Set the register scoreboard.
1489 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001490 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001491 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001492 }
1493
1494 BlockWaitcntBrackets *PredScoreBrackets =
1495 BlockWaitcntBracketsMap[Pred].get();
1496
1497 // Now merge the gpr_reg_score information
1498 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1499 T = (enum InstCounterType)(T + 1)) {
1500 int PredLB = PredScoreBrackets->getScoreLB(T);
1501 int PredUB = PredScoreBrackets->getScoreUB(T);
1502 if (PredLB < PredUB) {
1503 int PredScale = MaxPending[T] - PredUB;
1504 // Merge vgpr scores.
1505 for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1506 int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1507 if (PredRegScore <= PredLB)
1508 continue;
1509 int NewRegScore = PredScale + PredRegScore;
1510 ScoreBrackets->setRegScore(
1511 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1512 }
1513 // Also need to merge sgpr scores for lgkm_cnt.
1514 if (T == LGKM_CNT) {
1515 for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1516 int PredRegScore =
1517 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1518 if (PredRegScore <= PredLB)
1519 continue;
1520 int NewRegScore = PredScale + PredRegScore;
1521 ScoreBrackets->setRegScore(
1522 J + NUM_ALL_VGPRS, LGKM_CNT,
1523 std::max(
1524 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1525 NewRegScore));
1526 }
1527 }
1528 }
1529 }
1530
1531 // Also merge the WaitEvent information.
1532 ForAllWaitEventType(W) {
1533 enum InstCounterType T = PredScoreBrackets->eventCounter(W);
1534 int PredEventUB = PredScoreBrackets->getEventUB(W);
1535 if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
1536 int NewEventUB =
1537 MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
1538 if (NewEventUB > 0) {
1539 ScoreBrackets->setEventUB(
1540 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1541 }
1542 }
1543 }
1544 }
1545
1546 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1547 // Set the register scoreboard.
1548 if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1549 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1550 // Now merge the gpr_reg_score information.
1551 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1552 T = (enum InstCounterType)(T + 1)) {
1553 int PredLB = KillWaitBrackets[I]->getScoreLB(T);
1554 int PredUB = KillWaitBrackets[I]->getScoreUB(T);
1555 if (PredLB < PredUB) {
1556 int PredScale = MaxPending[T] - PredUB;
1557 // Merge vgpr scores.
1558 for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
1559 int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
1560 if (PredRegScore <= PredLB)
1561 continue;
1562 int NewRegScore = PredScale + PredRegScore;
1563 ScoreBrackets->setRegScore(
1564 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1565 }
1566 // Also need to merge sgpr scores for lgkm_cnt.
1567 if (T == LGKM_CNT) {
1568 for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
1569 int PredRegScore =
1570 KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1571 if (PredRegScore <= PredLB)
1572 continue;
1573 int NewRegScore = PredScale + PredRegScore;
1574 ScoreBrackets->setRegScore(
1575 J + NUM_ALL_VGPRS, LGKM_CNT,
1576 std::max(
1577 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1578 NewRegScore));
1579 }
1580 }
1581 }
1582 }
1583
1584 // Also merge the WaitEvent information.
1585 ForAllWaitEventType(W) {
1586 enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
1587 int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
1588 if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
1589 int NewEventUB =
1590 MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
1591 if (NewEventUB > 0) {
1592 ScoreBrackets->setEventUB(
1593 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1594 }
1595 }
1596 }
1597 }
1598 }
1599
1600 // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
1601 // sequencing predecessors, because changes to EXEC require waitcnts due to
1602 // the delayed nature of these operations.
1603 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001604 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001605 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001606 }
1607
1608 BlockWaitcntBrackets *PredScoreBrackets =
1609 BlockWaitcntBracketsMap[Pred].get();
1610
1611 int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
1612 if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1613 int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
1614 PredScoreBrackets->getScoreUB(EXP_CNT);
1615 if (new_gds_ub > 0) {
1616 ScoreBrackets->setEventUB(
1617 GDS_GPR_LOCK,
1618 std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
1619 }
1620 }
1621 int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
1622 if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1623 int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
1624 PredScoreBrackets->getScoreUB(EXP_CNT);
1625 if (new_exp_ub > 0) {
1626 ScoreBrackets->setEventUB(
1627 EXP_GPR_LOCK,
1628 std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
1629 }
1630 }
1631 }
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001632
1633 // if a single block loop, update the score brackets. Not needed for other
1634 // blocks, as we did this in-place
1635 if (IsSelfPred) {
1636 BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1637 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001638}
1639
Mark Searles1bc6e712018-04-19 15:42:30 +00001640/// Return true if the given basic block is a "bottom" block of a loop. This
1641/// differs from MachineLoop::getBottomBlock in that it works even if the loop
1642/// is discontiguous. This also handles multiple back-edges for the same
1643/// "header" block of a loop.
1644bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1645 const MachineBasicBlock *Block) {
1646 for (MachineBasicBlock *MBB : Loop->blocks()) {
1647 if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1648 return true;
1649 }
1650 }
1651 return false;
1652}
1653
1654/// Count the number of "bottom" basic blocks of a loop.
1655unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1656 unsigned Count = 0;
1657 for (MachineBasicBlock *MBB : Loop->blocks()) {
1658 if (MBB->isSuccessor(Loop->getHeader())) {
1659 Count++;
1660 }
1661 }
1662 return Count;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001663}
1664
1665// Generate s_waitcnt instructions where needed.
1666void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1667 MachineBasicBlock &Block) {
1668 // Initialize the state information.
1669 mergeInputScoreBrackets(Block);
1670
1671 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1672
1673 DEBUG({
Mark Searlesec581832018-04-25 19:21:26 +00001674 dbgs() << "*** Block" << Block.getNumber() << " ***";
Kannan Narayananacb089e2017-04-12 03:25:12 +00001675 ScoreBrackets->dump();
1676 });
1677
Kannan Narayananacb089e2017-04-12 03:25:12 +00001678 // Walk over the instructions.
1679 for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1680 Iter != E;) {
1681 MachineInstr &Inst = *Iter;
1682 // Remove any previously existing waitcnts.
1683 if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
Mark Searles65207922018-02-19 19:19:59 +00001684 // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
1685 // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
1686 // as needed.
Mark Searles24c92ee2018-02-07 02:21:21 +00001687 if (!TrackedWaitcntSet.count(&Inst))
Kannan Narayananacb089e2017-04-12 03:25:12 +00001688 ++Iter;
1689 else {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001690 ++Iter;
1691 Inst.removeFromParent();
1692 }
Mark Searles65207922018-02-19 19:19:59 +00001693 ScoreBrackets->setWaitcnt(&Inst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001694 continue;
1695 }
1696
1697 // Kill instructions generate a conditional branch to the endmain block.
1698 // Merge the current waitcnt state into the endmain block information.
1699 // TODO: Are there other flavors of KILL instruction?
1700 if (Inst.getOpcode() == AMDGPU::KILL) {
1701 addKillWaitBracket(ScoreBrackets);
1702 }
1703
1704 bool VCCZBugWorkAround = false;
1705 if (readsVCCZ(Inst) &&
Mark Searles24c92ee2018-02-07 02:21:21 +00001706 (!VCCZBugHandledSet.count(&Inst))) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001707 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1708 ScoreBrackets->getScoreUB(LGKM_CNT) &&
1709 ScoreBrackets->hasPendingSMEM()) {
1710 if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
1711 VCCZBugWorkAround = true;
1712 }
1713 }
1714
1715 // Generate an s_waitcnt instruction to be placed before
1716 // cur_Inst, if needed.
Mark Searles70901b92018-04-24 15:59:59 +00001717 generateWaitcntInstBefore(Inst, ScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001718
Mark Searles70901b92018-04-24 15:59:59 +00001719 updateEventWaitcntAfter(Inst, ScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001720
1721#if 0 // TODO: implement resource type check controlled by options with ub = LB.
1722 // If this instruction generates a S_SETVSKIP because it is an
1723 // indexed resource, and we are on Tahiti, then it will also force
1724 // an S_WAITCNT vmcnt(0)
1725 if (RequireCheckResourceType(Inst, context)) {
1726 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1727 ScoreBrackets->setScoreLB(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +00001728 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +00001729 }
1730#endif
1731
1732 ScoreBrackets->clearWaitcnt();
1733
Kannan Narayananacb089e2017-04-12 03:25:12 +00001734 DEBUG({
Mark Searles94ae3b22018-01-30 17:17:06 +00001735 Inst.print(dbgs());
Kannan Narayananacb089e2017-04-12 03:25:12 +00001736 ScoreBrackets->dump();
1737 });
1738
1739 // Check to see if this is a GWS instruction. If so, and if this is CI or
1740 // VI, then the generated code sequence will include an S_WAITCNT 0.
1741 // TODO: Are these the only GWS instructions?
1742 if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1743 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1744 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1745 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1746 Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1747 // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1748 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1749 ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1750 ScoreBrackets->updateByWait(LGKM_CNT,
1751 ScoreBrackets->getScoreUB(LGKM_CNT));
1752 }
1753
1754 // TODO: Remove this work-around after fixing the scheduler and enable the
1755 // assert above.
1756 if (VCCZBugWorkAround) {
1757 // Restore the vccz bit. Any time a value is written to vcc, the vcc
1758 // bit is updated, so we can restore the bit by reading the value of
1759 // vcc and then writing it back to the register.
1760 BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1761 AMDGPU::VCC)
1762 .addReg(AMDGPU::VCC);
1763 VCCZBugHandledSet.insert(&Inst);
1764 }
1765
Kannan Narayananacb089e2017-04-12 03:25:12 +00001766 ++Iter;
1767 }
1768
1769 // Check if we need to force convergence at loop footer.
1770 MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
Mark Searles1bc6e712018-04-19 15:42:30 +00001771 if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001772 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1773 WaitcntData->print();
1774 DEBUG(dbgs() << '\n';);
1775
1776 // The iterative waitcnt insertion algorithm aims for optimal waitcnt
1777 // placement and doesn't always guarantee convergence for a loop. Each
1778 // loop should take at most 2 iterations for it to converge naturally.
1779 // When this max is reached and result doesn't converge, we force
1780 // convergence by inserting a s_waitcnt at the end of loop footer.
1781 if (WaitcntData->getIterCnt() > 2) {
1782 // To ensure convergence, need to make wait events at loop footer be no
1783 // more than those from the previous iteration.
Mark Searles65207922018-02-19 19:19:59 +00001784 // As a simplification, instead of tracking individual scores and
1785 // generating the precise wait count, just wait on 0.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001786 bool HasPending = false;
1787 MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
1788 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1789 T = (enum InstCounterType)(T + 1)) {
1790 if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1791 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1792 HasPending = true;
1793 }
1794 }
1795
1796 if (HasPending) {
1797 if (!SWaitInst) {
1798 SWaitInst = Block.getParent()->CreateMachineInstr(
1799 TII->get(AMDGPU::S_WAITCNT), DebugLoc());
Mark Searles24c92ee2018-02-07 02:21:21 +00001800 TrackedWaitcntSet.insert(SWaitInst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001801 const MachineOperand &Op = MachineOperand::CreateImm(0);
1802 SWaitInst->addOperand(MF, Op);
1803#if 0 // TODO: Format the debug output
1804 OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1805 OutputTransformAdd(SWaitInst, context);
1806#endif
1807 }
1808#if 0 // TODO: ??
1809 _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1810#endif
1811 }
1812
1813 if (SWaitInst) {
1814 DEBUG({
1815 SWaitInst->print(dbgs());
1816 dbgs() << "\nAdjusted score board:";
1817 ScoreBrackets->dump();
1818 });
1819
1820 // Add this waitcnt to the block. It is either newly created or
1821 // created in previous iterations and added back since block traversal
Mark Searles65207922018-02-19 19:19:59 +00001822 // always removes waitcnts.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001823 insertWaitcntBeforeCF(Block, SWaitInst);
1824 WaitcntData->setWaitcnt(SWaitInst);
1825 }
1826 }
1827 }
1828}
1829
1830bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1831 ST = &MF.getSubtarget<SISubtarget>();
1832 TII = ST->getInstrInfo();
1833 TRI = &TII->getRegisterInfo();
1834 MRI = &MF.getRegInfo();
1835 MLI = &getAnalysis<MachineLoopInfo>();
1836 IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
Mark Searles11d0a042017-05-31 16:44:23 +00001837 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001838 AMDGPUASI = ST->getAMDGPUAS();
1839
Mark Searles4a0f2c52018-05-07 14:43:28 +00001840 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
Mark Searlesec581832018-04-25 19:21:26 +00001841 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1842 T = (enum InstCounterType)(T + 1))
1843 ForceEmitWaitcnt[T] = false;
1844
Kannan Narayananacb089e2017-04-12 03:25:12 +00001845 HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1846 HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1847 HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1848
1849 HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1850 HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1851 assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1852 assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1853
1854 RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1855 RegisterEncoding.VGPRL =
1856 RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1857 RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1858 RegisterEncoding.SGPRL =
1859 RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1860
Mark Searles24c92ee2018-02-07 02:21:21 +00001861 TrackedWaitcntSet.clear();
1862 BlockVisitedSet.clear();
1863 VCCZBugHandledSet.clear();
Mark Searles1bc6e712018-04-19 15:42:30 +00001864 LoopWaitcntDataMap.clear();
Mark Searles24c92ee2018-02-07 02:21:21 +00001865
Kannan Narayananacb089e2017-04-12 03:25:12 +00001866 // Walk over the blocks in reverse post-dominator order, inserting
1867 // s_waitcnt where needed.
1868 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1869 bool Modified = false;
1870 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1871 I = RPOT.begin(),
1872 E = RPOT.end(), J = RPOT.begin();
1873 I != E;) {
1874 MachineBasicBlock &MBB = **I;
1875
1876 BlockVisitedSet.insert(&MBB);
1877
1878 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1879 if (!ScoreBrackets) {
Eugene Zelenko59e12822017-08-08 00:47:13 +00001880 BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001881 ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1882 }
1883 ScoreBrackets->setPostOrder(MBB.getNumber());
1884 MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1885 if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
Eugene Zelenko59e12822017-08-08 00:47:13 +00001886 LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001887
1888 // If we are walking into the block from before the loop, then guarantee
1889 // at least 1 re-walk over the loop to propagate the information, even if
1890 // no S_WAITCNT instructions were generated.
Mark Searles1bc6e712018-04-19 15:42:30 +00001891 if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1892 unsigned Count = countNumBottomBlocks(ContainingLoop);
1893
1894 // If the loop has multiple back-edges, and so more than one "bottom"
1895 // basic block, we have to guarantee a re-walk over every blocks.
1896 if ((std::count(BlockWaitcntProcessedSet.begin(),
1897 BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
1898 BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
1899 DEBUG(dbgs() << "set-revisit: Block"
1900 << ContainingLoop->getHeader()->getNumber() << '\n';);
1901 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001902 }
1903
1904 // Walk over the instructions.
1905 insertWaitcntInBlock(MF, MBB);
1906
1907 // Flag that waitcnts have been processed at least once.
Mark Searles1bc6e712018-04-19 15:42:30 +00001908 BlockWaitcntProcessedSet.push_back(&MBB);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001909
Mark Searles1bc6e712018-04-19 15:42:30 +00001910 // See if we want to revisit the loop. If a loop has multiple back-edges,
1911 // we shouldn't revisit the same "bottom" basic block.
1912 if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1913 std::count(BlockWaitcntProcessedSet.begin(),
1914 BlockWaitcntProcessedSet.end(), &MBB) == 1) {
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001915 MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001916 BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1917 if (EntrySB && EntrySB->getRevisitLoop()) {
1918 EntrySB->setRevisitLoop(false);
1919 J = I;
1920 int32_t PostOrder = EntrySB->getPostOrder();
1921 // TODO: Avoid this loop. Find another way to set I.
1922 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1923 X = RPOT.begin(),
1924 Y = RPOT.end();
1925 X != Y; ++X) {
1926 MachineBasicBlock &MBBX = **X;
1927 if (MBBX.getNumber() == PostOrder) {
1928 I = X;
1929 break;
1930 }
1931 }
1932 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1933 WaitcntData->incIterCnt();
Mark Searles65207922018-02-19 19:19:59 +00001934 DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001935 continue;
1936 } else {
1937 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1938 // Loop converged, reset iteration count. If this loop gets revisited,
1939 // it must be from an outer loop, the counter will restart, this will
1940 // ensure we don't force convergence on such revisits.
1941 WaitcntData->resetIterCnt();
1942 }
1943 }
1944
1945 J = I;
1946 ++I;
1947 }
1948
1949 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1950
1951 bool HaveScalarStores = false;
1952
1953 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1954 ++BI) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001955 MachineBasicBlock &MBB = *BI;
1956
1957 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1958 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001959 if (!HaveScalarStores && TII->isScalarStore(*I))
1960 HaveScalarStores = true;
1961
1962 if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1963 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1964 EndPgmBlocks.push_back(&MBB);
1965 }
1966 }
1967
1968 if (HaveScalarStores) {
1969 // If scalar writes are used, the cache must be flushed or else the next
1970 // wave to reuse the same scratch memory can be clobbered.
1971 //
1972 // Insert s_dcache_wb at wave termination points if there were any scalar
1973 // stores, and only if the cache hasn't already been flushed. This could be
1974 // improved by looking across blocks for flushes in postdominating blocks
1975 // from the stores but an explicitly requested flush is probably very rare.
1976 for (MachineBasicBlock *MBB : EndPgmBlocks) {
1977 bool SeenDCacheWB = false;
1978
1979 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1980 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001981 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1982 SeenDCacheWB = true;
1983 else if (TII->isScalarStore(*I))
1984 SeenDCacheWB = false;
1985
1986 // FIXME: It would be better to insert this before a waitcnt if any.
1987 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1988 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1989 !SeenDCacheWB) {
1990 Modified = true;
1991 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1992 }
1993 }
1994 }
1995 }
1996
Mark Searles11d0a042017-05-31 16:44:23 +00001997 if (!MFI->isEntryFunction()) {
1998 // Wait for any outstanding memory operations that the input registers may
Hiroshi Inouec8e92452018-01-29 05:17:03 +00001999 // depend on. We can't track them and it's better to the wait after the
Mark Searles11d0a042017-05-31 16:44:23 +00002000 // costly call sequence.
2001
2002 // TODO: Could insert earlier and schedule more liberally with operations
2003 // that only use caller preserved registers.
2004 MachineBasicBlock &EntryBB = MF.front();
2005 BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
2006 .addImm(0);
2007
2008 Modified = true;
2009 }
2010
Kannan Narayananacb089e2017-04-12 03:25:12 +00002011 return Modified;
2012}