//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);

namespace {

// Class of object that encapsulates the latest instruction counter score
// associated with an operand. Used for determining whether an
// s_waitcnt instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

using RegInterval = std::pair<signed, signed>;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};
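
// Each wait event is tracked against exactly one of the three hardware
// counters; eventCounter() below gives the mapping, e.g. VMEM_ACCESS is
// scored against VM_CNT, SMEM_ACCESS against LGKM_CNT, and EXP_POS_ACCESS
// against EXP_CNT.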

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPRs start.
};
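
// For example, with the values above: VGPR v17 occupies slot 17, the LDS
// token occupies slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS == 256, and SGPR s5
// occupies slot NUM_ALL_VGPRS + 5 == 262 in the scoring tables.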

#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))

// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of events happen in the bracket,
// the wait count may get decremented out of order, therefore we need to put
// in an "s_waitcnt 0" before use.
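// For example (illustrative): after two buffer loads issue, the VM_CNT
// bracket is (LB=0, UB=2) and the loads' destination VGPRs carry scores 1
// and 2; a use of the first destination then needs s_waitcnt vmcnt(1),
// while a use of the second needs vmcnt(0).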
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const SISubtarget *SubTarget) : ST(SubTarget) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
  }

  ~BlockWaitcntBrackets() = default;

  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
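    // For EXP_CNT, clamp the lower bound so the bracket never grows wider
    // than the hardware maximum; e.g. (illustrative) with ExpcntMax == 7,
    // raising the upper bound to 20 forces the lower bound up to at least 13.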
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }

  int32_t getScoreLB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  int32_t getScoreUB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    switch (E) {
    case VMEM_ACCESS:
      return VM_CNT;
    case LDS_ACCESS:
    case GDS_ACCESS:
    case SQ_MESSAGE:
    case SMEM_ACCESS:
      return LGKM_CNT;
    case EXP_GPR_LOCK:
    case GDS_GPR_LOCK:
    case VMW_GPR_LOCK:
    case EXP_POS_ACCESS:
    case EXP_PARAM_ACCESS:
      return EXP_CNT;
    default:
      llvm_unreachable("unhandled event type");
    }
    return NUM_INST_CNTS;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    memset(EventUBs, 0, sizeof(EventUBs));
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  int32_t getEventUB(enum WaitEventType W) const {
    assert(W < NUM_WAIT_EVENTS);
    return EventUBs[W];
  }

  bool counterOutOfOrder(InstCounterType T);
  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  bool hasPendingSMEM() const {
    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
  void clearWaitcnt() { Waitcnt = nullptr; }
  MachineInstr *getWaitcnt() const { return Waitcnt; }

  bool mixedExpTypes() const { return MixedExpTypes; }
  void setMixedExpTypes(bool MixedExpTypesIn) {
    MixedExpTypes = MixedExpTypesIn;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const SISubtarget *ST = nullptr;
  bool WaitAtBeginning = false;
  bool RevisitLoop = false;
  bool MixedExpTypes = false;
  int32_t PostOrder = 0;
  MachineInstr *Waitcnt = nullptr;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of VgprUB and SgprUB to make the merge at a join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr; only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};

// This is a per-loop-region object that records the waitcnt status at the end
// of the loop footer from the previous iteration. We also maintain an
// iteration count to track the number of times the loop has been visited.
// When it doesn't converge naturally, we force convergence by inserting
// s_waitcnt 0 at the end of the loop footer.
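// For example (illustrative): if a revisit of the loop body observes a new
// outstanding memory operation that the previous merge did not account for,
// the brackets at the loop bottom keep shifting; forcing s_waitcnt 0 pins
// the state so the next visit can converge.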
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }

private:
  // The s_waitcnt added at the end of the loop footer to stabilize the wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of iterations the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const SISubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaInfo::IsaVersion IV;
  AMDGPUAS AMDGPUASI;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;

  // ForceEmitZeroWaitcnts: force all waitcnt instrs to be s_waitcnt 0
  // because of the amdgpu-waitcnt-forcezero flag.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
    // The waitcnt information is copied because it changes as the block is
    // traversed.
    KillWaitBrackets.push_back(
        llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
  }

  bool isForceEmitWaitcnt() const {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1))
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
  bool isWaitcntStronger(unsigned LHS, unsigned RHS);
  unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
};

} // end anonymous namespace

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a PW operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}

void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  LLVM_DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not.
  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo; // TODO: find the OpNo for this operand.
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
                                                int ScoreToWait) {
  unsigned int NeedWait = 0;
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_wait counter with a
    // conservative value of 0 for the counter.
    NeedWait = CNT_MASK(T);
    setScoreLB(T, getScoreUB(T));
    return NeedWait;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are multiple
      // types of events in the bracket. Also emit an s_wait counter with a
      // conservative value of 0 for the counter.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else {
      NeedWait = CNT_MASK(T);
      setScoreLB(T, ScoreToWait);
    }
  }

  return NeedWait;
}

// Where there are multiple types of events in the bracket of a counter,
// the decrement may go out of order.
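// For example (illustrative): with both an LDS access and an SMEM load
// outstanding on LGKM_CNT, the SMEM load may complete first even though it
// issued later, so waiting for a specific nonzero count is unreliable and a
// conservative lgkmcnt(0) is used instead.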
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
  switch (T) {
  case VM_CNT:
    return false;
  case LGKM_CNT: {
    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      // Scalar memory reads can always go out of order.
      return true;
    }
    int NumEventTypes = 0;
    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  case EXP_CNT: {
    // If there has been a mixture of export types, then a waitcnt exp(0) is
    // required.
    if (MixedExpTypes)
      return true;
    int NumEventTypes = 0;
    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  default:
    break;
  }
  return true;
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// Given wait count encodings, check if LHS is stronger than RHS.
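/// A wait is stronger when every field waits on an equal or smaller count;
/// e.g. (illustrative) vmcnt(0) expcnt(7) lgkmcnt(0) is stronger than
/// vmcnt(2) expcnt(7) lgkmcnt(3), since smaller counts are harder waits.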
bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
  if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
    return false;
  return true;
}

/// Given wait count encodings, create a new encoding which is stronger
/// than or equal to both.
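/// This takes the per-field minimum; e.g. (illustrative) combining
/// vmcnt(1) lgkmcnt(4) with vmcnt(3) lgkmcnt(2) yields vmcnt(1) lgkmcnt(2).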
unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
  unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
                            AMDGPU::decodeVmcnt(IV, RHS));
  unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
                              AMDGPU::decodeLgkmcnt(IV, RHS));
  unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
                             AMDGPU::decodeExpcnt(IV, RHS));
  return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
}

/// Generate the s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
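/// For example (illustrative): with a VM_CNT bracket of LB=3, UB=7, a use
/// of a register whose score is 5 requires s_waitcnt vmcnt(7 - 5), i.e.
/// vmcnt(2), while a register whose score is <= 3 requires no wait.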
Mark Searles70901b92018-04-24 15:59:59 +0000883void SIInsertWaitcnts::generateWaitcntInstBefore(
Kannan Narayananacb089e2017-04-12 03:25:12 +0000884 MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
885 // To emit, or not to emit - that's the question!
886 // Start with an assumption that there is no need to emit.
Mark Searles70901b92018-04-24 15:59:59 +0000887 unsigned int EmitWaitcnt = 0;
Mark Searles4a0f2c52018-05-07 14:43:28 +0000888
Kannan Narayananacb089e2017-04-12 03:25:12 +0000889 // No need to wait before phi. If a phi-move exists, then the wait should
890 // has been inserted before the move. If a phi-move does not exist, then
891 // wait should be inserted before the real use. The same is true for
892 // sc-merge. It is not a coincident that all these cases correspond to the
893 // instructions that are skipped in the assembling loop.
894 bool NeedLineMapping = false; // TODO: Check on this.
Mark Searlesec581832018-04-25 19:21:26 +0000895
Mark Searles4a0f2c52018-05-07 14:43:28 +0000896 // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
897 bool ForceEmitZeroWaitcnt = false;
898
899 setForceEmitWaitcnt();
Mark Searlesec581832018-04-25 19:21:26 +0000900 bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
901
Shiva Chen801bf7e2018-05-09 02:42:00 +0000902 if (MI.isDebugInstr() &&
Kannan Narayananacb089e2017-04-12 03:25:12 +0000903 // TODO: any other opcode?
904 !NeedLineMapping) {
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +0000905 return;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000906 }
907
908 // See if an s_waitcnt is forced at block entry, or is needed at
909 // program end.
910 if (ScoreBrackets->getWaitAtBeginning()) {
911 // Note that we have already cleared the state, so we don't need to update
912 // it.
913 ScoreBrackets->clearWaitAtBeginning();
914 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
915 T = (enum InstCounterType)(T + 1)) {
Mark Searles70901b92018-04-24 15:59:59 +0000916 EmitWaitcnt |= CNT_MASK(T);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000917 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
918 }
919 }
920
921 // See if this instruction has a forced S_WAITCNT VM.
922 // TODO: Handle other cases of NeedsWaitcntVmBefore()
923 else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
924 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
925 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
Mark Searles70901b92018-04-24 15:59:59 +0000926 EmitWaitcnt |=
Kannan Narayananacb089e2017-04-12 03:25:12 +0000927 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
928 }
929
930 // All waits must be resolved at call return.
931 // NOTE: this could be improved with knowledge of all call sites or
932 // with knowledge of the called routines.
933 if (MI.getOpcode() == AMDGPU::RETURN ||
Mark Searles11d0a042017-05-31 16:44:23 +0000934 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
935 MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000936 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
937 T = (enum InstCounterType)(T + 1)) {
938 if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
939 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
Mark Searles70901b92018-04-24 15:59:59 +0000940 EmitWaitcnt |= CNT_MASK(T);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000941 }
942 }
943 }
944 // Resolve vm waits before gs-done.
945 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
946 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
947 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
948 AMDGPU::SendMsg::ID_GS_DONE)) {
949 if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
950 ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +0000951 EmitWaitcnt |= CNT_MASK(VM_CNT);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000952 }
953 }
954#if 0 // TODO: the following blocks of logic when we have fence.
955 else if (MI.getOpcode() == SC_FENCE) {
956 const unsigned int group_size =
957 context->shader_info->GetMaxThreadGroupSize();
958 // group_size == 0 means thread group size is unknown at compile time
959 const bool group_is_multi_wave =
960 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
961 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
962
963 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
964 SCRegType src_type = Inst->GetSrcType(i);
965 switch (src_type) {
966 case SCMEM_LDS:
967 if (group_is_multi_wave ||
Evgeny Mankovbf975172017-08-16 16:47:29 +0000968 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
Mark Searles70901b92018-04-24 15:59:59 +0000969 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
Kannan Narayananacb089e2017-04-12 03:25:12 +0000970 ScoreBrackets->getScoreUB(LGKM_CNT));
971 // LDS may have to wait for VM_CNT after buffer load to LDS
972 if (target_info->HasBufferLoadToLDS()) {
Mark Searles70901b92018-04-24 15:59:59 +0000973 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
Kannan Narayananacb089e2017-04-12 03:25:12 +0000974 ScoreBrackets->getScoreUB(VM_CNT));
975 }
976 }
977 break;
978
979 case SCMEM_GDS:
980 if (group_is_multi_wave || fence_is_global) {
Mark Searles70901b92018-04-24 15:59:59 +0000981 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000982 ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +0000983 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000984 ScoreBrackets->getScoreUB(LGKM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000985 }
986 break;
987
988 case SCMEM_UAV:
989 case SCMEM_TFBUF:
990 case SCMEM_RING:
991 case SCMEM_SCATTER:
992 if (group_is_multi_wave || fence_is_global) {
Mark Searles70901b92018-04-24 15:59:59 +0000993 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000994 ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +0000995 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000996 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000997 }
998 break;
999
1000 case SCMEM_SCRATCH:
1001 default:
1002 break;
1003 }
1004 }
1005 }
1006#endif
1007
1008 // Export & GDS instructions do not read the EXEC mask until after the export
1009 // is granted (which can occur well after the instruction is issued).
1010 // The shader program must flush all EXP operations on the export-count
1011 // before overwriting the EXEC mask.
1012 else {
1013 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1014 // Export and GDS are tracked individually, either may trigger a waitcnt
1015 // for EXEC.
Mark Searles70901b92018-04-24 15:59:59 +00001016 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001017 EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
Mark Searles70901b92018-04-24 15:59:59 +00001018 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001019 EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
Mark Searles70901b92018-04-24 15:59:59 +00001020 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001021 EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
Mark Searles70901b92018-04-24 15:59:59 +00001022 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001023 EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
1024 }
1025
1026#if 0 // TODO: the following code to handle CALL.
1027 // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
1028 // However, there is a problem with EXP_CNT, because the call cannot
1029 // easily tell if a register is used in the function, and if it did, then
1030 // the referring instruction would have to have an S_WAITCNT, which is
1031 // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
1032 // before the call.
1033 if (MI.getOpcode() == SC_CALL) {
1034 if (ScoreBrackets->getScoreUB(EXP_CNT) >
Evgeny Mankovbf975172017-08-16 16:47:29 +00001035 ScoreBrackets->getScoreLB(EXP_CNT)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001036 ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001037 EmitWaitcnt |= CNT_MASK(EXP_CNT);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001038 }
1039 }
1040#endif
1041
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001042 // FIXME: Should not be relying on memoperands.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001043 // Look at the source operands of every instruction to see if
1044 // any of them results from a previous memory operation that affects
1045 // its current usage. If so, an s_waitcnt instruction needs to be
1046 // emitted.
1047 // If the source operand was defined by a load, add the s_waitcnt
1048 // instruction.
1049 for (const MachineMemOperand *Memop : MI.memoperands()) {
1050 unsigned AS = Memop->getAddrSpace();
1051 if (AS != AMDGPUASI.LOCAL_ADDRESS)
1052 continue;
1053 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1054 // VM_CNT is only relevant to vgpr or LDS.
Mark Searles70901b92018-04-24 15:59:59 +00001055 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001056 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1057 }
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001058
Kannan Narayananacb089e2017-04-12 03:25:12 +00001059 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1060 const MachineOperand &Op = MI.getOperand(I);
1061 const MachineRegisterInfo &MRIA = *MRI;
1062 RegInterval Interval =
1063 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
1064 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1065 if (TRI->isVGPR(MRIA, Op.getReg())) {
1066 // VM_CNT is only relevant to vgpr or LDS.
Mark Searles70901b92018-04-24 15:59:59 +00001067 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001068 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1069 }
Mark Searles70901b92018-04-24 15:59:59 +00001070 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001071 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1072 }
1073 }
1074 // End of for loop that looks at all source operands to decide vm_wait_cnt
1075 // and lgk_wait_cnt.
1076
1077 // Two cases are handled for destination operands:
1078 // 1) If the destination operand was defined by a load, add the s_waitcnt
1079 // instruction to guarantee the right WAW order.
1080 // 2) If a destination operand that was used by a recent export/store ins,
1081 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1082 if (MI.mayStore()) {
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001083 // FIXME: Should not be relying on memoperands.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001084 for (const MachineMemOperand *Memop : MI.memoperands()) {
1085 unsigned AS = Memop->getAddrSpace();
1086 if (AS != AMDGPUASI.LOCAL_ADDRESS)
1087 continue;
1088 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
Mark Searles70901b92018-04-24 15:59:59 +00001089 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001090 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001091 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001092 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1093 }
1094 }
1095 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1096 MachineOperand &Def = MI.getOperand(I);
1097 const MachineRegisterInfo &MRIA = *MRI;
1098 RegInterval Interval =
1099 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
1100 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1101 if (TRI->isVGPR(MRIA, Def.getReg())) {
Mark Searles70901b92018-04-24 15:59:59 +00001102 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001103 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001104 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001105 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1106 }
Mark Searles70901b92018-04-24 15:59:59 +00001107 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001108 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1109 }
1110 } // End of for loop that looks at all dest operands.
1111 }
1112
Kannan Narayananacb089e2017-04-12 03:25:12 +00001113 // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1114 // occurs before the instruction. Doing it here prevents any additional
1115 // S_WAITCNTs from being emitted if the instruction was marked as
1116 // requiring a WAITCNT beforehand.
Konstantin Zhuravlyovbe6c0ca2017-06-02 17:40:26 +00001117 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1118 !ST->hasAutoWaitcntBeforeBarrier()) {
Mark Searles70901b92018-04-24 15:59:59 +00001119 EmitWaitcnt |=
Kannan Narayananacb089e2017-04-12 03:25:12 +00001120 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001121 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001122 EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001123 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001124 LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
1125 }
1126
1127 // TODO: Remove this work-around, enable the assert for Bug 457939
1128 // after fixing the scheduler. Also, the Shader Compiler code is
1129 // independent of target.
1130 if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
1131 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1132 ScoreBrackets->getScoreUB(LGKM_CNT) &&
1133 ScoreBrackets->hasPendingSMEM()) {
1134 // Wait on everything, not just LGKM. vccz reads usually come from
1135 // terminators, and we always wait on everything at the end of the
1136 // block, so if we only wait on LGKM here, we might end up with
1137 // another s_waitcnt inserted right after this if there are non-LGKM
1138 // instructions still outstanding.
Mark Searles4a0f2c52018-05-07 14:43:28 +00001139 // FIXME: this is too conservative / the comment is wrong.
1140 // We don't wait on everything at the end of the block and we combine
1141 // waitcnts so we should never have back-to-back waitcnts.
Mark Searlesec581832018-04-25 19:21:26 +00001142 ForceEmitZeroWaitcnt = true;
Mark Searles70901b92018-04-24 15:59:59 +00001143 EmitWaitcnt = true;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001144 }
1145 }
1146
1147 // Does this operand processing indicate s_wait counter update?
Mark Searlesec581832018-04-25 19:21:26 +00001148 if (EmitWaitcnt || IsForceEmitWaitcnt) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001149 int CntVal[NUM_INST_CNTS];
1150
1151 bool UseDefaultWaitcntStrategy = true;
Mark Searles4a0f2c52018-05-07 14:43:28 +00001152 if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001153 // Force all waitcnts to 0.
1154 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1155 T = (enum InstCounterType)(T + 1)) {
1156 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1157 }
1158 CntVal[VM_CNT] = 0;
1159 CntVal[EXP_CNT] = 0;
1160 CntVal[LGKM_CNT] = 0;
1161 UseDefaultWaitcntStrategy = false;
1162 }
1163
1164 if (UseDefaultWaitcntStrategy) {
1165 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1166 T = (enum InstCounterType)(T + 1)) {
Mark Searles70901b92018-04-24 15:59:59 +00001167 if (EmitWaitcnt & CNT_MASK(T)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001168 int Delta =
1169 ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
1170 int MaxDelta = ScoreBrackets->getWaitCountMax(T);
1171 if (Delta >= MaxDelta) {
1172 Delta = -1;
1173 if (T != EXP_CNT) {
1174 ScoreBrackets->setScoreLB(
1175 T, ScoreBrackets->getScoreUB(T) - MaxDelta);
1176 }
Mark Searles70901b92018-04-24 15:59:59 +00001177 EmitWaitcnt &= ~CNT_MASK(T);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001178 }
1179 CntVal[T] = Delta;
1180 } else {
1181 // If we are not waiting for a particular counter then encode
1182 // it as -1 which means "don't care."
1183 CntVal[T] = -1;
1184 }
1185 }
1186 }
1187
1188 // If we are not waiting on any counter we can skip the wait altogether.
Mark Searlesec581832018-04-25 19:21:26 +00001189 if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001190 MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
1191 int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
Mark Searles65207922018-02-19 19:19:59 +00001192 if (!OldWaitcnt ||
1193 (AMDGPU::decodeVmcnt(IV, Imm) !=
Kannan Narayananacb089e2017-04-12 03:25:12 +00001194 (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
1195 (AMDGPU::decodeExpcnt(IV, Imm) !=
1196 (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
1197 (AMDGPU::decodeLgkmcnt(IV, Imm) !=
1198 (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
1199 MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
1200 if (ContainingLoop) {
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001201 MachineBasicBlock *TBB = ContainingLoop->getHeader();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001202 BlockWaitcntBrackets *ScoreBracket =
1203 BlockWaitcntBracketsMap[TBB].get();
1204 if (!ScoreBracket) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001205 assert(!BlockVisitedSet.count(TBB));
Eugene Zelenko59e12822017-08-08 00:47:13 +00001206 BlockWaitcntBracketsMap[TBB] =
Mark Searlesf0b93f12018-06-04 16:51:59 +00001207 llvm::make_unique<BlockWaitcntBrackets>(ST);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001208 ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
1209 }
1210 ScoreBracket->setRevisitLoop(true);
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001211 LLVM_DEBUG(dbgs()
Mark Searles10545412018-05-30 15:47:45 +00001212 << "set-revisit2: Block"
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001213 << ContainingLoop->getHeader()->getNumber() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001214 }
1215 }
1216
1217 // Update an existing waitcount, or make a new one.
Mark Searlesec581832018-04-25 19:21:26 +00001218 unsigned Enc = AMDGPU::encodeWaitcnt(IV,
1219 ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
1220 ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
1221 ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
Mark Searles65207922018-02-19 19:19:59 +00001222 // We don't remove waitcnts that existed prior to the waitcnt
1223 // pass. Check if the waitcnt to-be-inserted can be avoided
1224 // or if the prev waitcnt can be updated.
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001225 bool insertSWaitInst = true;
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001226 for (MachineBasicBlock::iterator I = MI.getIterator(),
1227 B = MI.getParent()->begin();
1228 insertSWaitInst && I != B; --I) {
Mark Searles65207922018-02-19 19:19:59 +00001229 if (I == MI.getIterator())
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001230 continue;
1231
1232 switch (I->getOpcode()) {
1233 case AMDGPU::S_WAITCNT:
1234 if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
1235 insertSWaitInst = false;
1236 else if (!OldWaitcnt) {
1237 OldWaitcnt = &*I;
1238 Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
1239 }
1240 break;
1241 // TODO: skip over instructions which never require wait.
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001242 }
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001243 break;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001244 }
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001245 if (insertSWaitInst) {
1246 if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
Mark Searles4a0f2c52018-05-07 14:43:28 +00001247 if (ForceEmitZeroWaitcnts)
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001248 LLVM_DEBUG(
1249 dbgs()
1250 << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
Mark Searlesec581832018-04-25 19:21:26 +00001251 if (IsForceEmitWaitcnt)
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001252 LLVM_DEBUG(dbgs()
1253 << "Force emit a s_waitcnt due to debug counter\n");
Mark Searlesec581832018-04-25 19:21:26 +00001254
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001255 OldWaitcnt->getOperand(0).setImm(Enc);
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001256 if (!OldWaitcnt->getParent())
1257 MI.getParent()->insert(MI, OldWaitcnt);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001258
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001259 LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
1260 << "Old Instr: " << MI << '\n'
1261 << "New Instr: " << *OldWaitcnt << '\n');
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001262 } else {
1263 auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1264 MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1265 .addImm(Enc);
1266 TrackedWaitcntSet.insert(SWaitInst);
1267
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001268 LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1269 << "Old Instr: " << MI << '\n'
1270 << "New Instr: " << *SWaitInst << '\n');
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001271 }
1272 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001273
1274 if (CntVal[EXP_CNT] == 0) {
1275 ScoreBrackets->setMixedExpTypes(false);
1276 }
1277 }
1278 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001279}
1280
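/// Insert \p Waitcnt at the bottom of \p MBB, hoisting it above a trailing
/// branch (if present) so the wait executes on every path out of the block.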
1281void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
1282 MachineInstr *Waitcnt) {
1283 if (MBB.empty()) {
1284 MBB.push_back(Waitcnt);
1285 return;
1286 }
1287
1288 MachineBasicBlock::iterator It = MBB.end();
1289 MachineInstr *MI = &*(--It);
1290 if (MI->isBranch()) {
1291 MBB.insert(It, Waitcnt);
1292 } else {
1293 MBB.push_back(Waitcnt);
1294 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001295}
1296
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001297// This is a flat memory operation. Check to see if it has memory tokens
1298// for LDS, and if so report that the access may touch LDS.
1299bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
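  // No memory operands attached: conservatively assume the flat access may
  // touch LDS.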
1300 if (MI.memoperands_empty())
1301 return true;
1302
1303 for (const MachineMemOperand *Memop : MI.memoperands()) {
1304 unsigned AS = Memop->getAddrSpace();
1305 if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
1306 return true;
1307 }
1308
1309 return false;
1310}
1311
Mark Searles70901b92018-04-24 15:59:59 +00001312void SIInsertWaitcnts::updateEventWaitcntAfter(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001313 MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
1314 // Now look at the instruction opcode. If it is a memory access
1315 // instruction, update the upper-bound of the appropriate counter's
1316 // bracket and the destination operand scores.
1317 // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001318 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001319 if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001320 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1321 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1322 } else {
1323 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1324 }
1325 } else if (TII->isFLAT(Inst)) {
1326 assert(Inst.mayLoad() || Inst.mayStore());
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001327
1328 if (TII->usesVM_CNT(Inst))
1329 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1330
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001331 if (TII->usesLGKM_CNT(Inst)) {
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001332 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001333
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001334 // This is a flat memory operation, so note it - it will require
1335 // that both the VM and LGKM counters be flushed to zero if it is
1336 // pending when a VM or LGKM dependency occurs.
1337 if (mayAccessLDSThroughFlat(Inst))
1338 ScoreBrackets->setPendingFlat();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001339 }
1340 } else if (SIInstrInfo::isVMEM(Inst) &&
1341 // TODO: get a better carve out.
1342 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1343 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1344 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1345 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
Mark Searles2a19af62018-04-26 16:11:19 +00001346 if (ST->vmemWriteNeedsExpWaitcnt() &&
Mark Searles11d0a042017-05-31 16:44:23 +00001347 (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001348 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1349 }
1350 } else if (TII->isSMRD(Inst)) {
1351 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1352 } else {
1353 switch (Inst.getOpcode()) {
1354 case AMDGPU::S_SENDMSG:
1355 case AMDGPU::S_SENDMSGHALT:
1356 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1357 break;
1358 case AMDGPU::EXP:
1359 case AMDGPU::EXP_DONE: {
1360 int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
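      // Export target ranges in the EXP encoding: 12-15 are the position
      // exports and 32-63 are the parameter exports; all other targets
      // (e.g. MRT color/Z) take the generic EXP_GPR_LOCK path.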
1361 if (Imm >= 32 && Imm <= 63)
1362 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1363 else if (Imm >= 12 && Imm <= 15)
1364 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1365 else
1366 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1367 break;
1368 }
1369 case AMDGPU::S_MEMTIME:
1370 case AMDGPU::S_MEMREALTIME:
1371 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1372 break;
1373 default:
1374 break;
1375 }
1376 }
1377}
1378
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001379// Merge the score brackets of the Block's predecessors;
1380// this merged score bracket is used when adding waitcnts to the Block.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001381void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1382 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1383 int32_t MaxPending[NUM_INST_CNTS] = {0};
1384 int32_t MaxFlat[NUM_INST_CNTS] = {0};
1385 bool MixedExpTypes = false;
1386
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001387 // For single basic block loops, we need to retain the Block's
1388 // score bracket to have accurate Pred info. So, make a copy of Block's
1389 // score bracket, clear() it (which retains several important bits of info),
1390 // populate, and then replace en masse. For non-single basic block loops,
1391 // just clear Block's current score bracket and repopulate in-place.
1392 bool IsSelfPred;
1393 std::unique_ptr<BlockWaitcntBrackets> S;
1394
1395 IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
1396 != Block.pred_end();
1397 if (IsSelfPred) {
1398 S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1399 ScoreBrackets = S.get();
1400 }
1401
Kannan Narayananacb089e2017-04-12 03:25:12 +00001402 ScoreBrackets->clear();
1403
Kannan Narayananacb089e2017-04-12 03:25:12 +00001404 // Compute the maximum pending-event span over all predecessors;
1405 // those not yet visited (or that wait at their beginning) are skipped.
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001406 for (MachineBasicBlock *Pred : Block.predecessors()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001407 BlockWaitcntBrackets *PredScoreBrackets =
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001408 BlockWaitcntBracketsMap[Pred].get();
1409 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001410 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001411 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001412 }
1413 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1414 T = (enum InstCounterType)(T + 1)) {
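      // The span (score UB - LB) is the number of events of type T still
      // outstanding at the predecessor's exit.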
1415 int Span =
1416 PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1417 MaxPending[T] = std::max(MaxPending[T], Span);
1418 Span =
1419 PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1420 MaxFlat[T] = std::max(MaxFlat[T], Span);
1421 }
1422
1423 MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
1424 }
1425
1426 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1427 // Also handle kills for exit block.
1428 if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1429 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1430 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1431 T = (enum InstCounterType)(T + 1)) {
1432 int Span = KillWaitBrackets[I]->getScoreUB(T) -
1433 KillWaitBrackets[I]->getScoreLB(T);
1434 MaxPending[T] = std::max(MaxPending[T], Span);
1435 Span = KillWaitBrackets[I]->pendingFlat(T) -
1436 KillWaitBrackets[I]->getScoreLB(T);
1437 MaxFlat[T] = std::max(MaxFlat[T], Span);
1438 }
1439
1440 MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
1441 }
1442 }
1443
1444 // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
1445 for (MachineBasicBlock *Pred : Block.predecessors()) {
1446 BlockWaitcntBrackets *PredScoreBrackets =
1447 BlockWaitcntBracketsMap[Pred].get();
Mark Searles24c92ee2018-02-07 02:21:21 +00001448 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001449 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001450 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001451 }
1452
1453 int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
1454 PredScoreBrackets->getScoreLB(EXP_CNT);
1455 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1456 int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
1457 PredScoreBrackets->getScoreLB(EXP_CNT);
1458 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1459 }
1460
1461 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1462 if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1463 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1464 int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
1465 KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1466 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1467 int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
1468 KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1469 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1470 }
1471 }
1472
1473#if 0
1474 // LC does not (unlike SC) add a waitcnt at beginning. Leaving it as a marker.
1475 // TODO: how does LC distinguish between function entry and main entry?
1476 // If this is the entry to a function, force a wait.
1477 MachineBasicBlock &Entry = Block.getParent()->front();
1478 if (Entry.getNumber() == Block.getNumber()) {
1479 ScoreBrackets->setWaitAtBeginning();
1480 return;
1481 }
1482#endif
1483
1484 // Now set the current Block's brackets to the largest ending bracket.
1485 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1486 T = (enum InstCounterType)(T + 1)) {
1487 ScoreBrackets->setScoreUB(T, MaxPending[T]);
1488 ScoreBrackets->setScoreLB(T, 0);
1489 ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1490 }
1491
1492 ScoreBrackets->setMixedExpTypes(MixedExpTypes);
1493
1494 // Set the register scoreboard.
1495 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001496 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001497 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001498 }
1499
1500 BlockWaitcntBrackets *PredScoreBrackets =
1501 BlockWaitcntBracketsMap[Pred].get();
1502
1503 // Now merge the gpr_reg_score information
1504 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1505 T = (enum InstCounterType)(T + 1)) {
1506 int PredLB = PredScoreBrackets->getScoreLB(T);
1507 int PredUB = PredScoreBrackets->getScoreUB(T);
1508 if (PredLB < PredUB) {
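          // Rebase the predecessor's scores into this block's frame so that
          // the predecessor's upper bound coincides with MaxPending[T].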
1509 int PredScale = MaxPending[T] - PredUB;
1510 // Merge vgpr scores.
1511 for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1512 int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1513 if (PredRegScore <= PredLB)
1514 continue;
1515 int NewRegScore = PredScale + PredRegScore;
1516 ScoreBrackets->setRegScore(
1517 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1518 }
1519 // Also need to merge sgpr scores for lgkm_cnt.
1520 if (T == LGKM_CNT) {
1521 for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1522 int PredRegScore =
1523 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1524 if (PredRegScore <= PredLB)
1525 continue;
1526 int NewRegScore = PredScale + PredRegScore;
1527 ScoreBrackets->setRegScore(
1528 J + NUM_ALL_VGPRS, LGKM_CNT,
1529 std::max(
1530 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1531 NewRegScore));
1532 }
1533 }
1534 }
1535 }
1536
1537 // Also merge the WaitEvent information.
1538 ForAllWaitEventType(W) {
1539 enum InstCounterType T = PredScoreBrackets->eventCounter(W);
1540 int PredEventUB = PredScoreBrackets->getEventUB(W);
1541 if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
1542 int NewEventUB =
1543 MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
1544 if (NewEventUB > 0) {
1545 ScoreBrackets->setEventUB(
1546 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1547 }
1548 }
1549 }
1550 }
1551
1552 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1553 // Set the register scoreboard.
1554 if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1555 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1556 // Now merge the gpr_reg_score information.
1557 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1558 T = (enum InstCounterType)(T + 1)) {
1559 int PredLB = KillWaitBrackets[I]->getScoreLB(T);
1560 int PredUB = KillWaitBrackets[I]->getScoreUB(T);
1561 if (PredLB < PredUB) {
1562 int PredScale = MaxPending[T] - PredUB;
1563 // Merge vgpr scores.
1564 for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
1565 int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
1566 if (PredRegScore <= PredLB)
1567 continue;
1568 int NewRegScore = PredScale + PredRegScore;
1569 ScoreBrackets->setRegScore(
1570 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1571 }
1572 // Also need to merge sgpr scores for lgkm_cnt.
1573 if (T == LGKM_CNT) {
1574 for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
1575 int PredRegScore =
1576 KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1577 if (PredRegScore <= PredLB)
1578 continue;
1579 int NewRegScore = PredScale + PredRegScore;
1580 ScoreBrackets->setRegScore(
1581 J + NUM_ALL_VGPRS, LGKM_CNT,
1582 std::max(
1583 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1584 NewRegScore));
1585 }
1586 }
1587 }
1588 }
1589
1590 // Also merge the WaitEvent information.
1591 ForAllWaitEventType(W) {
1592 enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
1593 int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
1594 if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
1595 int NewEventUB =
1596 MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
1597 if (NewEventUB > 0) {
1598 ScoreBrackets->setEventUB(
1599 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1600 }
1601 }
1602 }
1603 }
1604 }
1605
1606 // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
1607 // sequencing predecessors, because changes to EXEC require waitcnts due to
1608 // the delayed nature of these operations.
1609 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001610 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001611 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001612 }
1613
1614 BlockWaitcntBrackets *PredScoreBrackets =
1615 BlockWaitcntBracketsMap[Pred].get();
1616
1617 int PredGDSUB = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
1618 if (PredGDSUB > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1619 int NewGDSUB = MaxPending[EXP_CNT] + PredGDSUB -
1620 PredScoreBrackets->getScoreUB(EXP_CNT);
1621 if (NewGDSUB > 0) {
1622 ScoreBrackets->setEventUB(
1623 GDS_GPR_LOCK,
1624 std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), NewGDSUB));
1625 }
1626 }
1627 int PredEXPUB = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
1628 if (PredEXPUB > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1629 int NewEXPUB = MaxPending[EXP_CNT] + PredEXPUB -
1630 PredScoreBrackets->getScoreUB(EXP_CNT);
1631 if (NewEXPUB > 0) {
1632 ScoreBrackets->setEventUB(
1633 EXP_GPR_LOCK,
1634 std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), NewEXPUB));
1635 }
1636 }
1637 }
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001638
1639 // If a single-block loop, update the score brackets. Not needed for other
1640 // blocks, as we did this in-place.
1641 if (IsSelfPred) {
1642 BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1643 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001644}
1645
Mark Searles10545412018-05-30 15:47:45 +00001646/// Return true if the given basic block is a "bottom" block of a loop.
1647/// This works even if the loop is discontiguous. This also handles
1648/// multiple back-edges for the same "header" block of a loop.
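/// Illustrative example (hypothetical numbering): for a loop with header BB2
/// and blocks {BB2, BB3, BB5}, BB5 is a "bottom" block iff it branches back
/// to BB2, even though BB4 (outside the loop) lies between them in layout.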
Mark Searles1bc6e712018-04-19 15:42:30 +00001649bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1650 const MachineBasicBlock *Block) {
1651 for (MachineBasicBlock *MBB : Loop->blocks()) {
1652 if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1653 return true;
1654 }
1655 }
1656 return false;
1657}
1658
1659/// Count the number of "bottom" basic blocks of a loop.
1660unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1661 unsigned Count = 0;
1662 for (MachineBasicBlock *MBB : Loop->blocks()) {
1663 if (MBB->isSuccessor(Loop->getHeader())) {
1664 Count++;
1665 }
1666 }
1667 return Count;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001668}
1669
1670// Generate s_waitcnt instructions where needed.
1671void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1672 MachineBasicBlock &Block) {
1673 // Initialize the state information.
1674 mergeInputScoreBrackets(Block);
1675
1676 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1677
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001678 LLVM_DEBUG({
Mark Searlesec581832018-04-25 19:21:26 +00001679 dbgs() << "*** Block" << Block.getNumber() << " ***";
Kannan Narayananacb089e2017-04-12 03:25:12 +00001680 ScoreBrackets->dump();
1681 });
1682
Kannan Narayananacb089e2017-04-12 03:25:12 +00001683 // Walk over the instructions.
1684 for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1685 Iter != E;) {
1686 MachineInstr &Inst = *Iter;
1687 // Handle any already-present waitcnt.
1688 if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
Mark Searles65207922018-02-19 19:19:59 +00001689 // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
1690 // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
1691 // as needed.
Mark Searles24c92ee2018-02-07 02:21:21 +00001692 if (!TrackedWaitcntSet.count(&Inst))
Kannan Narayananacb089e2017-04-12 03:25:12 +00001693 ++Iter;
1694 else {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001695 ++Iter;
1696 Inst.removeFromParent();
1697 }
Mark Searles65207922018-02-19 19:19:59 +00001698 ScoreBrackets->setWaitcnt(&Inst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001699 continue;
1700 }
1701
1702 // Kill instructions generate a conditional branch to the endmain block.
1703 // Merge the current waitcnt state into the endmain block information.
1704 // TODO: Are there other flavors of KILL instruction?
1705 if (Inst.getOpcode() == AMDGPU::KILL) {
1706 addKillWaitBracket(ScoreBrackets);
1707 }
1708
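    // On SI/CI the vccz bit can go stale while an SMEM load into vcc is still
    // outstanding (lgkmcnt pending), so a read of vccz here could observe a
    // stale value; detect that case so the bit can be restored below.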
1709 bool VCCZBugWorkAround = false;
1710 if (readsVCCZ(Inst) &&
Mark Searles24c92ee2018-02-07 02:21:21 +00001711 (!VCCZBugHandledSet.count(&Inst))) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001712 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1713 ScoreBrackets->getScoreUB(LGKM_CNT) &&
1714 ScoreBrackets->hasPendingSMEM()) {
1715 if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
1716 VCCZBugWorkAround = true;
1717 }
1718 }
1719
1720 // Generate an s_waitcnt instruction to be placed before
1721 // cur_Inst, if needed.
Mark Searles70901b92018-04-24 15:59:59 +00001722 generateWaitcntInstBefore(Inst, ScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001723
Mark Searles70901b92018-04-24 15:59:59 +00001724 updateEventWaitcntAfter(Inst, ScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001725
1726#if 0 // TODO: implement resource type check controlled by options with ub = LB.
1727 // If this instruction generates a S_SETVSKIP because it is an
1728 // indexed resource, and we are on Tahiti, then it will also force
1729 // an S_WAITCNT vmcnt(0)
1730 if (RequireCheckResourceType(Inst, context)) {
1731 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1732 ScoreBrackets->setScoreLB(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +00001733 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +00001734 }
1735#endif
1736
1737 ScoreBrackets->clearWaitcnt();
1738
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001739 LLVM_DEBUG({
Mark Searles94ae3b22018-01-30 17:17:06 +00001740 Inst.print(dbgs());
Kannan Narayananacb089e2017-04-12 03:25:12 +00001741 ScoreBrackets->dump();
1742 });
1743
1744 // Check to see if this is a GWS instruction. If so, and if this is CI or
1745 // VI, then the generated code sequence will include an S_WAITCNT 0.
1746 // TODO: Are these the only GWS instructions?
1747 if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1748 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1749 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1750 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1751 Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1752 // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1753 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1754 ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1755 ScoreBrackets->updateByWait(LGKM_CNT,
1756 ScoreBrackets->getScoreUB(LGKM_CNT));
1757 }
1758
1759 // TODO: Remove this work-around after fixing the scheduler and enable the
1760 // assert above.
1761 if (VCCZBugWorkAround) {
1762 // Restore the vccz bit. Any time a value is written to vcc, the vccz
1763 // bit is updated, so we can restore the bit by reading the value of
1764 // vcc and then writing it back to the register.
1765 BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1766 AMDGPU::VCC)
1767 .addReg(AMDGPU::VCC);
1768 VCCZBugHandledSet.insert(&Inst);
1769 }
1770
Kannan Narayananacb089e2017-04-12 03:25:12 +00001771 ++Iter;
1772 }
1773
1774 // Check if we need to force convergence at loop footer.
1775 MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
Mark Searles1bc6e712018-04-19 15:42:30 +00001776 if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001777 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1778 WaitcntData->print();
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001779 LLVM_DEBUG(dbgs() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001780
1781 // The iterative waitcnt insertion algorithm aims for optimal waitcnt
Mark Searles10545412018-05-30 15:47:45 +00001782 // placement, but doesn't guarantee convergence for a loop. Each
1783 // loop should take at most (n+1) iterations for it to converge naturally,
1784 // where n is the number of bottom blocks. If this threshold is reached and
1785 // the result hasn't converged, then we force convergence by inserting
1786 // an s_waitcnt at the end of the loop footer.
1787 if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001788 // To ensure convergence, we need to make the wait events at the loop
1789 // footer no more than those from the previous iteration.
Mark Searles65207922018-02-19 19:19:59 +00001790 // As a simplification, instead of tracking individual scores and
1791 // generating the precise wait count, just wait on 0.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001792 bool HasPending = false;
1793 MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
1794 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1795 T = (enum InstCounterType)(T + 1)) {
1796 if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1797 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1798 HasPending = true;
Mark Searles10545412018-05-30 15:47:45 +00001799 break;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001800 }
1801 }
1802
1803 if (HasPending) {
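        // An s_waitcnt immediate of 0 decodes as vmcnt(0) expcnt(0)
        // lgkmcnt(0), i.e. wait for everything outstanding before
        // leaving the loop.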
1804 if (!SWaitInst) {
Mark Searles10545412018-05-30 15:47:45 +00001805 SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
1806 DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1807 .addImm(0);
Mark Searles24c92ee2018-02-07 02:21:21 +00001808 TrackedWaitcntSet.insert(SWaitInst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001809#if 0 // TODO: Format the debug output
1810 OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1811 OutputTransformAdd(SWaitInst, context);
1812#endif
1813 }
1814#if 0 // TODO: ??
1815 _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1816#endif
1817 }
1818
1819 if (SWaitInst) {
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001820 LLVM_DEBUG({
Kannan Narayananacb089e2017-04-12 03:25:12 +00001821 SWaitInst->print(dbgs());
1822 dbgs() << "\nAdjusted score board:";
1823 ScoreBrackets->dump();
1824 });
1825
1826 // Add this waitcnt to the block. It is either newly created or
1827 // created in previous iterations and added back since block traversal
Mark Searles65207922018-02-19 19:19:59 +00001828 // always removes waitcnts.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001829 insertWaitcntBeforeCF(Block, SWaitInst);
1830 WaitcntData->setWaitcnt(SWaitInst);
1831 }
1832 }
1833 }
1834}
1835
1836bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1837 ST = &MF.getSubtarget<SISubtarget>();
1838 TII = ST->getInstrInfo();
1839 TRI = &TII->getRegisterInfo();
1840 MRI = &MF.getRegInfo();
1841 MLI = &getAnalysis<MachineLoopInfo>();
1842 IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
Mark Searles11d0a042017-05-31 16:44:23 +00001843 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001844 AMDGPUASI = ST->getAMDGPUAS();
1845
Mark Searles4a0f2c52018-05-07 14:43:28 +00001846 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
Mark Searlesec581832018-04-25 19:21:26 +00001847 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1848 T = (enum InstCounterType)(T + 1))
1849 ForceEmitWaitcnt[T] = false;
1850
Kannan Narayananacb089e2017-04-12 03:25:12 +00001851 HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1852 HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1853 HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1854
1855 HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1856 HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1857 assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1858 assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1859
1860 RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1861 RegisterEncoding.VGPRL =
1862 RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1863 RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1864 RegisterEncoding.SGPRL =
1865 RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
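  // The scoreboard is indexed by hardware register encoding; [VGPR0, VGPRL]
  // and [SGPR0, SGPRL] bound the trackable register ranges.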
1866
Mark Searles24c92ee2018-02-07 02:21:21 +00001867 TrackedWaitcntSet.clear();
1868 BlockVisitedSet.clear();
1869 VCCZBugHandledSet.clear();
Mark Searles1bc6e712018-04-19 15:42:30 +00001870 LoopWaitcntDataMap.clear();
Scott Linder5792dd02018-06-21 18:48:48 +00001871 BlockWaitcntProcessedSet.clear();
Mark Searles24c92ee2018-02-07 02:21:21 +00001872
Kannan Narayananacb089e2017-04-12 03:25:12 +00001873 // Walk over the blocks in reverse post-order, inserting
1874 // s_waitcnt where needed.
1875 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1876 bool Modified = false;
1877 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1878 I = RPOT.begin(),
1879 E = RPOT.end(), J = RPOT.begin();
1880 I != E;) {
1881 MachineBasicBlock &MBB = **I;
1882
1883 BlockVisitedSet.insert(&MBB);
1884
1885 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1886 if (!ScoreBrackets) {
Mark Searlesf0b93f12018-06-04 16:51:59 +00001887 BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001888 ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1889 }
1890 ScoreBrackets->setPostOrder(MBB.getNumber());
1891 MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1892 if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
Eugene Zelenko59e12822017-08-08 00:47:13 +00001893 LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001894
1895 // If we are walking into the block from before the loop, then guarantee
1896 // at least 1 re-walk over the loop to propagate the information, even if
1897 // no S_WAITCNT instructions were generated.
Mark Searles1bc6e712018-04-19 15:42:30 +00001898 if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1899 unsigned Count = countNumBottomBlocks(ContainingLoop);
1900
1901 // If the loop has multiple back-edges, and so more than one "bottom"
1902 // basic block, we have to guarantee a re-walk over every block.
1903 if ((std::count(BlockWaitcntProcessedSet.begin(),
1904 BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
1905 BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
Mark Searles10545412018-05-30 15:47:45 +00001906 LLVM_DEBUG(dbgs() << "set-revisit1: Block"
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001907 << ContainingLoop->getHeader()->getNumber() << '\n';);
Mark Searles1bc6e712018-04-19 15:42:30 +00001908 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001909 }
1910
1911 // Walk over the instructions.
1912 insertWaitcntInBlock(MF, MBB);
1913
Mark Searles10545412018-05-30 15:47:45 +00001914 // Record that waitcnts have been processed at least once for this block.
Mark Searles1bc6e712018-04-19 15:42:30 +00001915 BlockWaitcntProcessedSet.push_back(&MBB);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001916
Mark Searles1bc6e712018-04-19 15:42:30 +00001917 // See if we want to revisit the loop. If a loop has multiple back-edges,
1918 // we shouldn't revisit the same "bottom" basic block.
1919 if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1920 std::count(BlockWaitcntProcessedSet.begin(),
1921 BlockWaitcntProcessedSet.end(), &MBB) == 1) {
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001922 MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001923 BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1924 if (EntrySB && EntrySB->getRevisitLoop()) {
1925 EntrySB->setRevisitLoop(false);
1926 J = I;
1927 int32_t PostOrder = EntrySB->getPostOrder();
1928 // TODO: Avoid this loop. Find another way to set I.
1929 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1930 X = RPOT.begin(),
1931 Y = RPOT.end();
1932 X != Y; ++X) {
1933 MachineBasicBlock &MBBX = **X;
1934 if (MBBX.getNumber() == PostOrder) {
1935 I = X;
1936 break;
1937 }
1938 }
1939 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1940 WaitcntData->incIterCnt();
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001941 LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001942 continue;
1943 } else {
1944 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1945 // Loop converged, reset iteration count. If this loop gets revisited,
1946 // it must be from an outer loop; the counter will restart, which will
1947 // ensure we don't force convergence on such revisits.
1948 WaitcntData->resetIterCnt();
1949 }
1950 }
1951
1952 J = I;
1953 ++I;
1954 }
1955
1956 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1957
1958 bool HaveScalarStores = false;
1959
1960 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1961 ++BI) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001962 MachineBasicBlock &MBB = *BI;
1963
1964 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1965 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001966 if (!HaveScalarStores && TII->isScalarStore(*I))
1967 HaveScalarStores = true;
1968
1969 if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1970 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1971 EndPgmBlocks.push_back(&MBB);
1972 }
1973 }
1974
1975 if (HaveScalarStores) {
1976 // If scalar writes are used, the cache must be flushed or else the next
1977 // wave to reuse the same scratch memory can be clobbered.
1978 //
1979 // Insert s_dcache_wb at wave termination points if there were any scalar
1980 // stores, and only if the cache hasn't already been flushed. This could be
1981 // improved by looking across blocks for flushes in postdominating blocks
1982 // from the stores but an explicitly requested flush is probably very rare.
1983 for (MachineBasicBlock *MBB : EndPgmBlocks) {
1984 bool SeenDCacheWB = false;
1985
1986 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1987 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001988 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1989 SeenDCacheWB = true;
1990 else if (TII->isScalarStore(*I))
1991 SeenDCacheWB = false;
1992
1993 // FIXME: It would be better to insert this before a waitcnt if any.
1994 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1995 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1996 !SeenDCacheWB) {
1997 Modified = true;
1998 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1999 }
2000 }
2001 }
2002 }
2003
Mark Searles11d0a042017-05-31 16:44:23 +00002004 if (!MFI->isEntryFunction()) {
2005 // Wait for any outstanding memory operations that the input registers may
Hiroshi Inouec8e92452018-01-29 05:17:03 +00002006 // depend on. We can't track them, and it's better to do the wait after the
Mark Searles11d0a042017-05-31 16:44:23 +00002007 // costly call sequence.
2008
2009 // TODO: Could insert earlier and schedule more liberally with operations
2010 // that only use caller preserved registers.
2011 MachineBasicBlock &EntryBB = MF.front();
Mark Searlesed54ff12018-05-30 16:27:57 +00002012 BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
2013 .addImm(0);
Mark Searles11d0a042017-05-31 16:44:23 +00002014
2015 Modified = true;
2016 }
2017
Kannan Narayananacb089e2017-04-12 03:25:12 +00002018 return Modified;
2019}