blob: eb39984f795910e38acf036c3856bd5c968d7456 [file] [log] [blame]
Eugene Zelenko59e12822017-08-08 00:47:13 +00001//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
Kannan Narayananacb089e2017-04-12 03:25:12 +00002//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
Adrian Prantl5f8f34e42018-05-01 15:54:18 +000011/// Insert wait instructions for memory reads and writes.
Kannan Narayananacb089e2017-04-12 03:25:12 +000012///
13/// Memory reads and writes are issued asynchronously, so we need to insert
14/// S_WAITCNT instructions when we want to access any of their results or
15/// overwrite any register that's used asynchronously.
16//
17//===----------------------------------------------------------------------===//
18
19#include "AMDGPU.h"
20#include "AMDGPUSubtarget.h"
21#include "SIDefines.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000024#include "SIRegisterInfo.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000025#include "Utils/AMDGPUBaseInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000026#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000028#include "llvm/ADT/PostOrderIterator.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000029#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/CodeGen/MachineBasicBlock.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000032#include "llvm/CodeGen/MachineFunction.h"
33#include "llvm/CodeGen/MachineFunctionPass.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000034#include "llvm/CodeGen/MachineInstr.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000035#include "llvm/CodeGen/MachineInstrBuilder.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000036#include "llvm/CodeGen/MachineLoopInfo.h"
37#include "llvm/CodeGen/MachineMemOperand.h"
38#include "llvm/CodeGen/MachineOperand.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000039#include "llvm/CodeGen/MachineRegisterInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000040#include "llvm/IR/DebugLoc.h"
41#include "llvm/Pass.h"
42#include "llvm/Support/Debug.h"
Mark Searlesec581832018-04-25 19:21:26 +000043#include "llvm/Support/DebugCounter.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000044#include "llvm/Support/ErrorHandling.h"
45#include "llvm/Support/raw_ostream.h"
46#include <algorithm>
47#include <cassert>
48#include <cstdint>
49#include <cstring>
50#include <memory>
51#include <utility>
52#include <vector>
Kannan Narayananacb089e2017-04-12 03:25:12 +000053
Mark Searlesec581832018-04-25 19:21:26 +000054using namespace llvm;
55
Kannan Narayananacb089e2017-04-12 03:25:12 +000056#define DEBUG_TYPE "si-insert-waitcnts"
57
Mark Searlesec581832018-04-25 19:21:26 +000058DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
59 "Force emit s_waitcnt expcnt(0) instrs");
60DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
61 "Force emit s_waitcnt lgkmcnt(0) instrs");
62DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
63 "Force emit s_waitcnt vmcnt(0) instrs");
64
65static cl::opt<unsigned> ForceEmitZeroFlag(
66 "amdgpu-waitcnt-forcezero",
67 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
68 cl::init(0), cl::Hidden);
Kannan Narayananacb089e2017-04-12 03:25:12 +000069
70namespace {
71
// Single-bit mask for counter `t`; updateByWait() returns such a mask to say
// which counter needs a wait.
#define CNT_MASK(t) (1u << (t))

// The wait counters tracked by this pass. NUM_INST_CNTS is the number of
// counters and is used to size the per-counter arrays.
enum InstCounterType {
  VM_CNT = 0,
  LGKM_CNT,
  EXP_CNT,
  NUM_INST_CNTS
};

// Half-open range [first, second) of scoreboard slots covered by an operand.
// getRegInterval() returns {-1, -1} when the operand is not tracked.
using RegInterval = std::pair<signed, signed>;
Kannan Narayananacb089e2017-04-12 03:25:12 +000081
// Per-target maxima. getWaitCountMax() reads the three *cntMax fields; the
// values are filled in outside the code visible here.
struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

// Hardware encoding values bounding the VGPR and SGPR ranges.
// getRegInterval() subtracts VGPR0 / SGPR0 from an operand's encoding to get
// a zero-based scoreboard index.
struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;
96
// Kinds of event that advance a wait counter. The event-to-counter mapping
// lives in BlockWaitcntBrackets::eventCounter().
enum WaitEventType {
  // vector-memory read & write
  VMEM_ACCESS,
  // lds read & write
  LDS_ACCESS,
  // gds read & write
  GDS_ACCESS,
  // send message
  SQ_MESSAGE,
  // scalar-memory read & write
  SMEM_ACCESS,
  // export holding on its data src
  EXP_GPR_LOCK,
  // GDS holding on its data and addr src
  GDS_GPR_LOCK,
  // write to export position
  EXP_POS_ACCESS,
  // write to export parameter
  EXP_PARAM_ACCESS,
  // vector-memory write holding on its data src
  VMW_GPR_LOCK,
  // Number of event kinds; sizes the per-event arrays.
  NUM_WAIT_EVENTS,
};
110
// Layout of the scoreboard index space:
//   0 .. SQ_MAX_PGM_VGPRS-1                            real VGPRs
//   SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                extra VGPR-like slots
//   NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1  real SGPRs
// A fixed number of VGPR-like slots is reserved in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_VGPRS = 256,
  // Maximum programmable SGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256,
  // A reserved slot for DS.
  NUM_EXTRA_VGPRS = 1,
  // This is a placeholder the Shader algorithm uses.
  EXTRA_VGPR_LDS = 0,
  // Where SGPR starts.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS,
};
124
// Loop header that declares `w` and steps it through every WaitEventType
// value from 0 up to, but not including, NUM_WAIT_EVENTS.
#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = static_cast<enum WaitEventType>(0);              \
       (w) < static_cast<enum WaitEventType>(NUM_WAIT_EVENTS);                 \
       (w) = static_cast<enum WaitEventType>((w) + 1))
129
130// This is a per-basic-block object that maintains current score brackets
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000131// of each wait counter, and a per-register scoreboard for each wait counter.
Kannan Narayananacb089e2017-04-12 03:25:12 +0000132// We also maintain the latest score for every event type that can change the
133// waitcnt in order to know if there are multiple types of events within
134// the brackets. When multiple types of event happen in the bracket,
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000135// wait count may get decreased out of order, therefore we need to put in
Kannan Narayananacb089e2017-04-12 03:25:12 +0000136// "s_waitcnt 0" before use.
137class BlockWaitcntBrackets {
138public:
Tom Stellard5bfbae52018-07-11 20:59:01 +0000139 BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
Eugene Zelenko59e12822017-08-08 00:47:13 +0000140 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
141 T = (enum InstCounterType)(T + 1)) {
142 memset(VgprScores[T], 0, sizeof(VgprScores[T]));
143 }
144 }
145
146 ~BlockWaitcntBrackets() = default;
147
Kannan Narayananacb089e2017-04-12 03:25:12 +0000148 static int32_t getWaitCountMax(InstCounterType T) {
149 switch (T) {
150 case VM_CNT:
151 return HardwareLimits.VmcntMax;
152 case LGKM_CNT:
153 return HardwareLimits.LgkmcntMax;
154 case EXP_CNT:
155 return HardwareLimits.ExpcntMax;
156 default:
157 break;
158 }
159 return 0;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000160 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000161
162 void setScoreLB(InstCounterType T, int32_t Val) {
163 assert(T < NUM_INST_CNTS);
164 if (T >= NUM_INST_CNTS)
165 return;
166 ScoreLBs[T] = Val;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000167 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000168
169 void setScoreUB(InstCounterType T, int32_t Val) {
170 assert(T < NUM_INST_CNTS);
171 if (T >= NUM_INST_CNTS)
172 return;
173 ScoreUBs[T] = Val;
174 if (T == EXP_CNT) {
175 int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
176 if (ScoreLBs[T] < UB)
177 ScoreLBs[T] = UB;
178 }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000179 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000180
181 int32_t getScoreLB(InstCounterType T) {
182 assert(T < NUM_INST_CNTS);
183 if (T >= NUM_INST_CNTS)
184 return 0;
185 return ScoreLBs[T];
Eugene Zelenko59e12822017-08-08 00:47:13 +0000186 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000187
188 int32_t getScoreUB(InstCounterType T) {
189 assert(T < NUM_INST_CNTS);
190 if (T >= NUM_INST_CNTS)
191 return 0;
192 return ScoreUBs[T];
Eugene Zelenko59e12822017-08-08 00:47:13 +0000193 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000194
195 // Mapping from event to counter.
196 InstCounterType eventCounter(WaitEventType E) {
197 switch (E) {
198 case VMEM_ACCESS:
199 return VM_CNT;
200 case LDS_ACCESS:
201 case GDS_ACCESS:
202 case SQ_MESSAGE:
203 case SMEM_ACCESS:
204 return LGKM_CNT;
205 case EXP_GPR_LOCK:
206 case GDS_GPR_LOCK:
207 case VMW_GPR_LOCK:
208 case EXP_POS_ACCESS:
209 case EXP_PARAM_ACCESS:
210 return EXP_CNT;
211 default:
212 llvm_unreachable("unhandled event type");
213 }
214 return NUM_INST_CNTS;
215 }
216
217 void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
218 if (GprNo < NUM_ALL_VGPRS) {
219 if (GprNo > VgprUB) {
220 VgprUB = GprNo;
221 }
222 VgprScores[T][GprNo] = Val;
223 } else {
224 assert(T == LGKM_CNT);
225 if (GprNo - NUM_ALL_VGPRS > SgprUB) {
226 SgprUB = GprNo - NUM_ALL_VGPRS;
227 }
228 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
229 }
230 }
231
232 int32_t getRegScore(int GprNo, InstCounterType T) {
233 if (GprNo < NUM_ALL_VGPRS) {
234 return VgprScores[T][GprNo];
235 }
236 return SgprScores[GprNo - NUM_ALL_VGPRS];
237 }
238
239 void clear() {
240 memset(ScoreLBs, 0, sizeof(ScoreLBs));
241 memset(ScoreUBs, 0, sizeof(ScoreUBs));
242 memset(EventUBs, 0, sizeof(EventUBs));
243 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
244 T = (enum InstCounterType)(T + 1)) {
245 memset(VgprScores[T], 0, sizeof(VgprScores[T]));
246 }
247 memset(SgprScores, 0, sizeof(SgprScores));
248 }
249
250 RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
251 const MachineRegisterInfo *MRI,
252 const SIRegisterInfo *TRI, unsigned OpNo,
253 bool Def) const;
254
255 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
256 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
257 unsigned OpNo, int32_t Val);
258
259 void setWaitAtBeginning() { WaitAtBeginning = true; }
260 void clearWaitAtBeginning() { WaitAtBeginning = false; }
261 bool getWaitAtBeginning() const { return WaitAtBeginning; }
262 void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
263 int32_t getMaxVGPR() const { return VgprUB; }
264 int32_t getMaxSGPR() const { return SgprUB; }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000265
Kannan Narayananacb089e2017-04-12 03:25:12 +0000266 int32_t getEventUB(enum WaitEventType W) const {
267 assert(W < NUM_WAIT_EVENTS);
268 return EventUBs[W];
269 }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000270
Kannan Narayananacb089e2017-04-12 03:25:12 +0000271 bool counterOutOfOrder(InstCounterType T);
272 unsigned int updateByWait(InstCounterType T, int ScoreToWait);
273 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
274 const MachineRegisterInfo *MRI, WaitEventType E,
275 MachineInstr &MI);
276
Kannan Narayananacb089e2017-04-12 03:25:12 +0000277 bool hasPendingSMEM() const {
278 return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
279 EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
280 }
281
282 bool hasPendingFlat() const {
283 return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
284 LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
285 (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
286 LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
287 }
288
289 void setPendingFlat() {
290 LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
291 LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
292 }
293
294 int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
295
296 void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
297
298 bool getRevisitLoop() const { return RevisitLoop; }
299 void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
300
301 void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
302 int32_t getPostOrder() const { return PostOrder; }
303
304 void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000305 void clearWaitcnt() { Waitcnt = nullptr; }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000306 MachineInstr *getWaitcnt() const { return Waitcnt; }
307
308 bool mixedExpTypes() const { return MixedExpTypes; }
309 void setMixedExpTypes(bool MixedExpTypesIn) {
310 MixedExpTypes = MixedExpTypesIn;
311 }
312
313 void print(raw_ostream &);
314 void dump() { print(dbgs()); }
315
316private:
Tom Stellard5bfbae52018-07-11 20:59:01 +0000317 const GCNSubtarget *ST = nullptr;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000318 bool WaitAtBeginning = false;
319 bool RevisitLoop = false;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000320 bool MixedExpTypes = false;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000321 int32_t PostOrder = 0;
322 MachineInstr *Waitcnt = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000323 int32_t ScoreLBs[NUM_INST_CNTS] = {0};
324 int32_t ScoreUBs[NUM_INST_CNTS] = {0};
325 int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
326 // Remember the last flat memory operation.
327 int32_t LastFlat[NUM_INST_CNTS] = {0};
328 // wait_cnt scores for every vgpr.
329 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000330 int32_t VgprUB = 0;
331 int32_t SgprUB = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000332 int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
333 // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
334 int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
335};
336
337// This is a per-loop-region object that records waitcnt status at the end of
338// loop footer from the previous iteration. We also maintain an iteration
339// count to track the number of times the loop has been visited. When it
340// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
341// at the end of the loop footer.
342class LoopWaitcntData {
343public:
Eugene Zelenko59e12822017-08-08 00:47:13 +0000344 LoopWaitcntData() = default;
345 ~LoopWaitcntData() = default;
346
Kannan Narayananacb089e2017-04-12 03:25:12 +0000347 void incIterCnt() { IterCnt++; }
348 void resetIterCnt() { IterCnt = 0; }
Mark Searles10545412018-05-30 15:47:45 +0000349 unsigned getIterCnt() { return IterCnt; }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000350
Kannan Narayananacb089e2017-04-12 03:25:12 +0000351 void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
352 MachineInstr *getWaitcnt() const { return LfWaitcnt; }
353
Nicola Zaghend34e60c2018-05-14 12:53:11 +0000354 void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000355
356private:
357 // s_waitcnt added at the end of loop footer to stablize wait scores
358 // at the end of the loop footer.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000359 MachineInstr *LfWaitcnt = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000360 // Number of iterations the loop has been visited, not including the initial
361 // walk over.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000362 int32_t IterCnt = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000363};
364
365class SIInsertWaitcnts : public MachineFunctionPass {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000366private:
Tom Stellard5bfbae52018-07-11 20:59:01 +0000367 const GCNSubtarget *ST = nullptr;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000368 const SIInstrInfo *TII = nullptr;
369 const SIRegisterInfo *TRI = nullptr;
370 const MachineRegisterInfo *MRI = nullptr;
371 const MachineLoopInfo *MLI = nullptr;
Konstantin Zhuravlyov71e43ee2018-09-12 18:50:47 +0000372 AMDGPU::IsaVersion IV;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000373
374 DenseSet<MachineBasicBlock *> BlockVisitedSet;
Mark Searles24c92ee2018-02-07 02:21:21 +0000375 DenseSet<MachineInstr *> TrackedWaitcntSet;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000376 DenseSet<MachineInstr *> VCCZBugHandledSet;
377
378 DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
379 BlockWaitcntBracketsMap;
380
Mark Searles1bc6e712018-04-19 15:42:30 +0000381 std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000382
383 DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
384
Mark Searles4a0f2c52018-05-07 14:43:28 +0000385 // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
386 // because of amdgpu-waitcnt-forcezero flag
387 bool ForceEmitZeroWaitcnts;
Mark Searlesec581832018-04-25 19:21:26 +0000388 bool ForceEmitWaitcnt[NUM_INST_CNTS];
389
Kannan Narayananacb089e2017-04-12 03:25:12 +0000390public:
391 static char ID;
392
Konstantin Zhuravlyov77747772018-06-26 21:33:38 +0000393 SIInsertWaitcnts() : MachineFunctionPass(ID) {
394 (void)ForceExpCounter;
395 (void)ForceLgkmCounter;
396 (void)ForceVMCounter;
397 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000398
399 bool runOnMachineFunction(MachineFunction &MF) override;
400
401 StringRef getPassName() const override {
402 return "SI insert wait instructions";
403 }
404
405 void getAnalysisUsage(AnalysisUsage &AU) const override {
406 AU.setPreservesCFG();
407 AU.addRequired<MachineLoopInfo>();
408 MachineFunctionPass::getAnalysisUsage(AU);
409 }
410
Mark Searlesec581832018-04-25 19:21:26 +0000411 bool isForceEmitWaitcnt() const {
412 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
413 T = (enum InstCounterType)(T + 1))
414 if (ForceEmitWaitcnt[T])
415 return true;
416 return false;
417 }
418
419 void setForceEmitWaitcnt() {
420// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
421// For debug builds, get the debug counter info and adjust if need be
422#ifndef NDEBUG
423 if (DebugCounter::isCounterSet(ForceExpCounter) &&
424 DebugCounter::shouldExecute(ForceExpCounter)) {
425 ForceEmitWaitcnt[EXP_CNT] = true;
426 } else {
427 ForceEmitWaitcnt[EXP_CNT] = false;
428 }
429
430 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
431 DebugCounter::shouldExecute(ForceLgkmCounter)) {
432 ForceEmitWaitcnt[LGKM_CNT] = true;
433 } else {
434 ForceEmitWaitcnt[LGKM_CNT] = false;
435 }
436
437 if (DebugCounter::isCounterSet(ForceVMCounter) &&
438 DebugCounter::shouldExecute(ForceVMCounter)) {
439 ForceEmitWaitcnt[VM_CNT] = true;
440 } else {
441 ForceEmitWaitcnt[VM_CNT] = false;
442 }
443#endif // NDEBUG
444 }
445
Matt Arsenault0ed39d32017-07-21 18:54:54 +0000446 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
Mark Searles70901b92018-04-24 15:59:59 +0000447 void generateWaitcntInstBefore(MachineInstr &MI,
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +0000448 BlockWaitcntBrackets *ScoreBrackets);
Mark Searles70901b92018-04-24 15:59:59 +0000449 void updateEventWaitcntAfter(MachineInstr &Inst,
Kannan Narayananacb089e2017-04-12 03:25:12 +0000450 BlockWaitcntBrackets *ScoreBrackets);
451 void mergeInputScoreBrackets(MachineBasicBlock &Block);
Mark Searles1bc6e712018-04-19 15:42:30 +0000452 bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
453 unsigned countNumBottomBlocks(const MachineLoop *Loop);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000454 void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
455 void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +0000456 bool isWaitcntStronger(unsigned LHS, unsigned RHS);
457 unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000458};
459
Eugene Zelenko59e12822017-08-08 00:47:13 +0000460} // end anonymous namespace
Kannan Narayananacb089e2017-04-12 03:25:12 +0000461
462RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
463 const SIInstrInfo *TII,
464 const MachineRegisterInfo *MRI,
465 const SIRegisterInfo *TRI,
466 unsigned OpNo,
467 bool Def) const {
468 const MachineOperand &Op = MI->getOperand(OpNo);
469 if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
470 (Def && !Op.isDef()))
471 return {-1, -1};
472
473 // A use via a PW operand does not need a waitcnt.
474 // A partial write is not a WAW.
475 assert(!Op.getSubReg() || !Op.isUndef());
476
477 RegInterval Result;
478 const MachineRegisterInfo &MRIA = *MRI;
479
480 unsigned Reg = TRI->getEncodingValue(Op.getReg());
481
482 if (TRI->isVGPR(MRIA, Op.getReg())) {
483 assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
484 Result.first = Reg - RegisterEncoding.VGPR0;
485 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
486 } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
487 assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
488 Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
489 assert(Result.first >= NUM_ALL_VGPRS &&
490 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
491 }
492 // TODO: Handle TTMP
493 // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
494 else
495 return {-1, -1};
496
497 const MachineInstr &MIA = *MI;
498 const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +0000499 unsigned Size = TRI->getRegSizeInBits(*RC);
500 Result.second = Result.first + (Size / 32);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000501
502 return Result;
503}
504
505void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
506 const SIInstrInfo *TII,
507 const SIRegisterInfo *TRI,
508 const MachineRegisterInfo *MRI,
509 unsigned OpNo, int32_t Val) {
510 RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
Nicola Zaghend34e60c2018-05-14 12:53:11 +0000511 LLVM_DEBUG({
Kannan Narayananacb089e2017-04-12 03:25:12 +0000512 const MachineOperand &Opnd = MI->getOperand(OpNo);
513 assert(TRI->isVGPR(*MRI, Opnd.getReg()));
514 });
515 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
516 setRegScore(RegNo, EXP_CNT, Val);
517 }
518}
519
520void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
521 const SIRegisterInfo *TRI,
522 const MachineRegisterInfo *MRI,
523 WaitEventType E, MachineInstr &Inst) {
524 const MachineRegisterInfo &MRIA = *MRI;
525 InstCounterType T = eventCounter(E);
526 int32_t CurrScore = getScoreUB(T) + 1;
527 // EventUB and ScoreUB need to be update regardless if this event changes
528 // the score of a register or not.
529 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
530 EventUBs[E] = CurrScore;
531 setScoreUB(T, CurrScore);
532
533 if (T == EXP_CNT) {
534 // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
535 // is required.
536 if (!MixedExpTypes) {
537 MixedExpTypes = counterOutOfOrder(EXP_CNT);
538 }
539
540 // Put score on the source vgprs. If this is a store, just use those
541 // specific register(s).
542 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
543 // All GDS operations must protect their address register (same as
544 // export.)
545 if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
546 Inst.getOpcode() != AMDGPU::DS_CONSUME) {
547 setExpScore(
548 &Inst, TII, TRI, MRI,
549 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
550 CurrScore);
551 }
552 if (Inst.mayStore()) {
553 setExpScore(
554 &Inst, TII, TRI, MRI,
555 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
556 CurrScore);
557 if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
558 AMDGPU::OpName::data1) != -1) {
559 setExpScore(&Inst, TII, TRI, MRI,
560 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
561 AMDGPU::OpName::data1),
562 CurrScore);
563 }
564 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
565 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
566 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
567 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
568 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
569 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
570 Inst.getOpcode() != AMDGPU::DS_APPEND &&
571 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
572 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
573 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
574 const MachineOperand &Op = Inst.getOperand(I);
575 if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
576 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
577 }
578 }
579 }
580 } else if (TII->isFLAT(Inst)) {
581 if (Inst.mayStore()) {
582 setExpScore(
583 &Inst, TII, TRI, MRI,
584 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
585 CurrScore);
586 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
587 setExpScore(
588 &Inst, TII, TRI, MRI,
589 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
590 CurrScore);
591 }
592 } else if (TII->isMIMG(Inst)) {
593 if (Inst.mayStore()) {
594 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
595 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
596 setExpScore(
597 &Inst, TII, TRI, MRI,
598 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
599 CurrScore);
600 }
601 } else if (TII->isMTBUF(Inst)) {
602 if (Inst.mayStore()) {
603 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
604 }
605 } else if (TII->isMUBUF(Inst)) {
606 if (Inst.mayStore()) {
607 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
608 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
609 setExpScore(
610 &Inst, TII, TRI, MRI,
611 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
612 CurrScore);
613 }
614 } else {
615 if (TII->isEXP(Inst)) {
616 // For export the destination registers are really temps that
617 // can be used as the actual source after export patching, so
618 // we need to treat them like sources and set the EXP_CNT
619 // score.
620 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
621 MachineOperand &DefMO = Inst.getOperand(I);
622 if (DefMO.isReg() && DefMO.isDef() &&
623 TRI->isVGPR(MRIA, DefMO.getReg())) {
624 setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
625 CurrScore);
626 }
627 }
628 }
629 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
630 MachineOperand &MO = Inst.getOperand(I);
631 if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
632 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
633 }
634 }
635 }
636#if 0 // TODO: check if this is handled by MUBUF code above.
637 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
Evgeny Mankovbf975172017-08-16 16:47:29 +0000638 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
639 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000640 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
641 unsigned OpNo;//TODO: find the OpNo for this operand;
642 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
643 for (signed RegNo = Interval.first; RegNo < Interval.second;
Evgeny Mankovbf975172017-08-16 16:47:29 +0000644 ++RegNo) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000645 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
646 }
647#endif
648 } else {
649 // Match the score to the destination registers.
650 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
651 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
652 if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
653 continue;
654 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
655 setRegScore(RegNo, T, CurrScore);
656 }
657 }
658 if (TII->isDS(Inst) && Inst.mayStore()) {
659 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
660 }
661 }
662}
663
664void BlockWaitcntBrackets::print(raw_ostream &OS) {
665 OS << '\n';
666 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
667 T = (enum InstCounterType)(T + 1)) {
668 int LB = getScoreLB(T);
669 int UB = getScoreUB(T);
670
671 switch (T) {
672 case VM_CNT:
673 OS << " VM_CNT(" << UB - LB << "): ";
674 break;
675 case LGKM_CNT:
676 OS << " LGKM_CNT(" << UB - LB << "): ";
677 break;
678 case EXP_CNT:
679 OS << " EXP_CNT(" << UB - LB << "): ";
680 break;
681 default:
682 OS << " UNKNOWN(" << UB - LB << "): ";
683 break;
684 }
685
686 if (LB < UB) {
687 // Print vgpr scores.
688 for (int J = 0; J <= getMaxVGPR(); J++) {
689 int RegScore = getRegScore(J, T);
690 if (RegScore <= LB)
691 continue;
692 int RelScore = RegScore - LB - 1;
693 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
694 OS << RelScore << ":v" << J << " ";
695 } else {
696 OS << RelScore << ":ds ";
697 }
698 }
699 // Also need to print sgpr scores for lgkm_cnt.
700 if (T == LGKM_CNT) {
701 for (int J = 0; J <= getMaxSGPR(); J++) {
702 int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
703 if (RegScore <= LB)
704 continue;
705 int RelScore = RegScore - LB - 1;
706 OS << RelScore << ":s" << J << " ";
707 }
708 }
709 }
710 OS << '\n';
711 }
712 OS << '\n';
Kannan Narayananacb089e2017-04-12 03:25:12 +0000713}
714
715unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
716 int ScoreToWait) {
717 unsigned int NeedWait = 0;
718 if (ScoreToWait == -1) {
719 // The score to wait is unknown. This implies that it was not encountered
720 // during the path of the CFG walk done during the current traversal but
721 // may be seen on a different path. Emit an s_wait counter with a
722 // conservative value of 0 for the counter.
723 NeedWait = CNT_MASK(T);
724 setScoreLB(T, getScoreUB(T));
725 return NeedWait;
726 }
727
728 // If the score of src_operand falls within the bracket, we need an
729 // s_waitcnt instruction.
730 const int32_t LB = getScoreLB(T);
731 const int32_t UB = getScoreUB(T);
732 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
Mark Searlesf0b93f12018-06-04 16:51:59 +0000733 if ((T == VM_CNT || T == LGKM_CNT) &&
734 hasPendingFlat() &&
735 !ST->hasFlatLgkmVMemCountInOrder()) {
736 // If there is a pending FLAT operation, and this is a VMem or LGKM
737 // waitcnt and the target can report early completion, then we need
738 // to force a waitcnt 0.
Kannan Narayananacb089e2017-04-12 03:25:12 +0000739 NeedWait = CNT_MASK(T);
740 setScoreLB(T, getScoreUB(T));
741 } else if (counterOutOfOrder(T)) {
742 // Counter can get decremented out-of-order when there
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000743 // are multiple types event in the bracket. Also emit an s_wait counter
Kannan Narayananacb089e2017-04-12 03:25:12 +0000744 // with a conservative value of 0 for the counter.
745 NeedWait = CNT_MASK(T);
746 setScoreLB(T, getScoreUB(T));
747 } else {
748 NeedWait = CNT_MASK(T);
749 setScoreLB(T, ScoreToWait);
750 }
751 }
752
753 return NeedWait;
754}
755
756// Where there are multiple types of event in the bracket of a counter,
757// the decrement may go out of order.
758bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
759 switch (T) {
760 case VM_CNT:
761 return false;
762 case LGKM_CNT: {
763 if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
764 EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
765 // Scalar memory read always can go out of order.
766 return true;
767 }
768 int NumEventTypes = 0;
769 if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
770 EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
771 NumEventTypes++;
772 }
773 if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
774 EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
775 NumEventTypes++;
776 }
777 if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
778 EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
779 NumEventTypes++;
780 }
781 if (NumEventTypes <= 1) {
782 return false;
783 }
784 break;
785 }
786 case EXP_CNT: {
787 // If there has been a mixture of export types, then a waitcnt exp(0) is
788 // required.
789 if (MixedExpTypes)
790 return true;
791 int NumEventTypes = 0;
792 if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
793 EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
794 NumEventTypes++;
795 }
796 if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
797 EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
798 NumEventTypes++;
799 }
800 if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
801 EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
802 NumEventTypes++;
803 }
804 if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
805 EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
806 NumEventTypes++;
807 }
808
809 if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
810 EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
811 NumEventTypes++;
812 }
813
814 if (NumEventTypes <= 1) {
815 return false;
816 }
817 break;
818 }
819 default:
820 break;
821 }
822 return true;
823}
824
// Pass registration for SIInsertWaitcnts under DEBUG_TYPE with the
// description "SI Insert Waitcnts". No INITIALIZE_PASS_DEPENDENCY entries
// appear between the BEGIN and END macros.
825INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
826 false)
827INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
828 false)
829
// Definition of the pass's static ID member, initialized to 0.
830char SIInsertWaitcnts::ID = 0;
831
// Reference to the pass ID, defined in the llvm namespace so that code
// outside this file can name the pass (presumably declared in a shared
// header -- confirm).
832char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
833
// Factory function: returns a newly heap-allocated SIInsertWaitcnts pass.
// NOTE(review): the caller is presumably expected to take ownership of the
// returned object (e.g. by handing it to a pass manager) -- confirm.
834FunctionPass *llvm::createSIInsertWaitcntsPass() {
835 return new SIInsertWaitcnts();
836}
837
838static bool readsVCCZ(const MachineInstr &MI) {
839 unsigned Opc = MI.getOpcode();
840 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
841 !MI.getOperand(1).isUndef();
842}
843
Adrian Prantl5f8f34e42018-05-01 15:54:18 +0000844/// Given wait count encodings checks if LHS is stronger than RHS.
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +0000845bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
846 if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
847 return false;
848 if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
849 return false;
850 if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
851 return false;
852 return true;
853}
854
Adrian Prantl5f8f34e42018-05-01 15:54:18 +0000855/// Given wait count encodings create a new encoding which is stronger
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +0000856/// or equal to both.
857unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
858 unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
859 AMDGPU::decodeVmcnt(IV, RHS));
860 unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
861 AMDGPU::decodeLgkmcnt(IV, RHS));
862 unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
863 AMDGPU::decodeExpcnt(IV, RHS));
864 return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
865}
866
// Overview (added in review): works out which counters MI has to wait on,
// accumulating one CNT_MASK(T) bit per counter in EmitWaitcnt and advancing
// the score-bracket lower bounds in ScoreBrackets as it goes. If any wait is
// needed (or one is forced), the counts are encoded and either an existing
// S_WAITCNT is updated in place or a new one is built before MI. Debug
// instructions are skipped entirely.
Adrian Prantl5f8f34e42018-05-01 15:54:18 +0000867/// Generate s_waitcnt instruction to be placed before cur_Inst.
Kannan Narayananacb089e2017-04-12 03:25:12 +0000868/// Instructions of a given type are returned in order,
869/// but instructions of different types can complete out of order.
870/// We rely on this in-order completion
871/// and simply assign a score to the memory access instructions.
872/// We keep track of the active "score bracket" to determine
873/// if an access of a memory read requires an s_waitcnt
874/// and if so what the value of each counter is.
875/// The "score bracket" is bound by the lower bound and upper bound
876/// scores (*_score_LB and *_score_ub respectively).
Mark Searles70901b92018-04-24 15:59:59 +0000877void SIInsertWaitcnts::generateWaitcntInstBefore(
Kannan Narayananacb089e2017-04-12 03:25:12 +0000878 MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
879 // To emit, or not to emit - that's the question!
880 // Start with an assumption that there is no need to emit.
Mark Searles70901b92018-04-24 15:59:59 +0000881 unsigned int EmitWaitcnt = 0;
Mark Searles4a0f2c52018-05-07 14:43:28 +0000882
Mark Searles4a0f2c52018-05-07 14:43:28 +0000883 // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
884 bool ForceEmitZeroWaitcnt = false;
885
886 setForceEmitWaitcnt();
Mark Searlesec581832018-04-25 19:21:26 +0000887 bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
888
Nicolai Haehnle61396ff2018-11-07 21:53:36 +0000889 if (MI.isDebugInstr())
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +0000890 return;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000891
892 // See if an s_waitcnt is forced at block entry, or is needed at
893 // program end.
894 if (ScoreBrackets->getWaitAtBeginning()) {
895 // Note that we have already cleared the state, so we don't need to update
896 // it.
897 ScoreBrackets->clearWaitAtBeginning();
898 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
899 T = (enum InstCounterType)(T + 1)) {
Mark Searles70901b92018-04-24 15:59:59 +0000900 EmitWaitcnt |= CNT_MASK(T);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000901 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
902 }
903 }
904
905 // See if this instruction has a forced S_WAITCNT VM.
906 // TODO: Handle other cases of NeedsWaitcntVmBefore()
907 else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
908 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
909 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
Mark Searles70901b92018-04-24 15:59:59 +0000910 EmitWaitcnt |=
Kannan Narayananacb089e2017-04-12 03:25:12 +0000911 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
912 }
913
// Note: the chain below is a separate if/else-if/else from the one above, so
// it is evaluated regardless of which branch (if any) was taken there.
914 // All waits must be resolved at call return.
915 // NOTE: this could be improved with knowledge of all call sites or
916 // with knowledge of the called routines.
Tom Stellardc5a154d2018-06-28 23:47:12 +0000917 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
Mark Searles11d0a042017-05-31 16:44:23 +0000918 MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000919 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
920 T = (enum InstCounterType)(T + 1)) {
921 if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
922 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
Mark Searles70901b92018-04-24 15:59:59 +0000923 EmitWaitcnt |= CNT_MASK(T);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000924 }
925 }
926 }
927 // Resolve vm waits before gs-done.
928 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
929 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
930 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
931 AMDGPU::SendMsg::ID_GS_DONE)) {
932 if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
933 ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +0000934 EmitWaitcnt |= CNT_MASK(VM_CNT);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000935 }
936 }
937#if 0 // TODO: the following blocks of logic when we have fence.
938 else if (MI.getOpcode() == SC_FENCE) {
939 const unsigned int group_size =
940 context->shader_info->GetMaxThreadGroupSize();
941 // group_size == 0 means thread group size is unknown at compile time
942 const bool group_is_multi_wave =
943 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
944 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
945
946 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
947 SCRegType src_type = Inst->GetSrcType(i);
948 switch (src_type) {
949 case SCMEM_LDS:
950 if (group_is_multi_wave ||
Evgeny Mankovbf975172017-08-16 16:47:29 +0000951 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
Mark Searles70901b92018-04-24 15:59:59 +0000952 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
Kannan Narayananacb089e2017-04-12 03:25:12 +0000953 ScoreBrackets->getScoreUB(LGKM_CNT));
954 // LDS may have to wait for VM_CNT after buffer load to LDS
955 if (target_info->HasBufferLoadToLDS()) {
Mark Searles70901b92018-04-24 15:59:59 +0000956 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
Kannan Narayananacb089e2017-04-12 03:25:12 +0000957 ScoreBrackets->getScoreUB(VM_CNT));
958 }
959 }
960 break;
961
962 case SCMEM_GDS:
963 if (group_is_multi_wave || fence_is_global) {
Mark Searles70901b92018-04-24 15:59:59 +0000964 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000965 ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +0000966 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000967 ScoreBrackets->getScoreUB(LGKM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000968 }
969 break;
970
971 case SCMEM_UAV:
972 case SCMEM_TFBUF:
973 case SCMEM_RING:
974 case SCMEM_SCATTER:
975 if (group_is_multi_wave || fence_is_global) {
Mark Searles70901b92018-04-24 15:59:59 +0000976 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000977 ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +0000978 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000979 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000980 }
981 break;
982
983 case SCMEM_SCRATCH:
984 default:
985 break;
986 }
987 }
988 }
989#endif
990
991 // Export & GDS instructions do not read the EXEC mask until after the export
992 // is granted (which can occur well after the instruction is issued).
993 // The shader program must flush all EXP operations on the export-count
994 // before overwriting the EXEC mask.
995 else {
996 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
997 // Export and GDS are tracked individually, either may trigger a waitcnt
998 // for EXEC.
Mark Searles70901b92018-04-24 15:59:59 +0000999 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001000 EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
Mark Searles70901b92018-04-24 15:59:59 +00001001 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001002 EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
Mark Searles70901b92018-04-24 15:59:59 +00001003 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001004 EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
Mark Searles70901b92018-04-24 15:59:59 +00001005 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001006 EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
1007 }
1008
1009#if 0 // TODO: the following code to handle CALL.
1010 // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
1011 // However, there is a problem with EXP_CNT, because the call cannot
1012 // easily tell if a register is used in the function, and if it did, then
1013 // the referring instruction would have to have an S_WAITCNT, which is
1014 // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
1015 // before the call.
1016 if (MI.getOpcode() == SC_CALL) {
1017 if (ScoreBrackets->getScoreUB(EXP_CNT) >
Evgeny Mankovbf975172017-08-16 16:47:29 +00001018 ScoreBrackets->getScoreLB(EXP_CNT)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001019 ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001020 EmitWaitcnt |= CNT_MASK(EXP_CNT);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001021 }
1022 }
1023#endif
1024
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001025 // FIXME: Should not be relying on memoperands.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001026 // Look at the source operands of every instruction to see if
1027 // any of them results from a previous memory operation that affects
1028 // its current usage. If so, an s_waitcnt instruction needs to be
1029 // emitted.
1030 // If the source operand was defined by a load, add the s_waitcnt
1031 // instruction.
1032 for (const MachineMemOperand *Memop : MI.memoperands()) {
1033 unsigned AS = Memop->getAddrSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00001034 if (AS != AMDGPUAS::LOCAL_ADDRESS)
Kannan Narayananacb089e2017-04-12 03:25:12 +00001035 continue;
// Local-address-space accesses are looked up in a single dedicated
// register-score slot placed after the program VGPRs.
1036 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1037 // VM_CNT is only relevant to vgpr or LDS.
Mark Searles70901b92018-04-24 15:59:59 +00001038 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001039 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1040 }
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001041
Kannan Narayananacb089e2017-04-12 03:25:12 +00001042 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1043 const MachineOperand &Op = MI.getOperand(I);
1044 const MachineRegisterInfo &MRIA = *MRI;
1045 RegInterval Interval =
1046 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
1047 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1048 if (TRI->isVGPR(MRIA, Op.getReg())) {
1049 // VM_CNT is only relevant to vgpr or LDS.
Mark Searles70901b92018-04-24 15:59:59 +00001050 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001051 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1052 }
Mark Searles70901b92018-04-24 15:59:59 +00001053 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001054 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1055 }
1056 }
1057 // End of for loop that looks at all source operands to decide vm_wait_cnt
1058 // and lgk_wait_cnt.
1059
1060 // Two cases are handled for destination operands:
1061 // 1) If the destination operand was defined by a load, add the s_waitcnt
1062 // instruction to guarantee the right WAW order.
1063 // 2) If a destination operand that was used by a recent export/store ins,
1064 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1065 if (MI.mayStore()) {
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001066 // FIXME: Should not be relying on memoperands.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001067 for (const MachineMemOperand *Memop : MI.memoperands()) {
1068 unsigned AS = Memop->getAddrSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00001069 if (AS != AMDGPUAS::LOCAL_ADDRESS)
Kannan Narayananacb089e2017-04-12 03:25:12 +00001070 continue;
1071 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
Mark Searles70901b92018-04-24 15:59:59 +00001072 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001073 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001074 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001075 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1076 }
1077 }
1078 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1079 MachineOperand &Def = MI.getOperand(I);
1080 const MachineRegisterInfo &MRIA = *MRI;
1081 RegInterval Interval =
1082 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
1083 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1084 if (TRI->isVGPR(MRIA, Def.getReg())) {
Mark Searles70901b92018-04-24 15:59:59 +00001085 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001086 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001087 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001088 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1089 }
Mark Searles70901b92018-04-24 15:59:59 +00001090 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001091 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1092 }
1093 } // End of for loop that looks at all dest operands.
1094 }
1095
Kannan Narayananacb089e2017-04-12 03:25:12 +00001096 // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1097 // occurs before the instruction. Doing it here prevents any additional
1098 // S_WAITCNTs from being emitted if the instruction was marked as
1099 // requiring a WAITCNT beforehand.
Konstantin Zhuravlyovbe6c0ca2017-06-02 17:40:26 +00001100 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1101 !ST->hasAutoWaitcntBeforeBarrier()) {
Mark Searles70901b92018-04-24 15:59:59 +00001102 EmitWaitcnt |=
Kannan Narayananacb089e2017-04-12 03:25:12 +00001103 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001104 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001105 EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
Mark Searles70901b92018-04-24 15:59:59 +00001106 EmitWaitcnt |= ScoreBrackets->updateByWait(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001107 LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
1108 }
1109
1110 // TODO: Remove this work-around, enable the assert for Bug 457939
1111 // after fixing the scheduler. Also, the Shader Compiler code is
1112 // independent of target.
Tom Stellardc5a154d2018-06-28 23:47:12 +00001113 if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001114 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1115 ScoreBrackets->getScoreUB(LGKM_CNT) &&
1116 ScoreBrackets->hasPendingSMEM()) {
1117 // Wait on everything, not just LGKM. vccz reads usually come from
1118 // terminators, and we always wait on everything at the end of the
1119 // block, so if we only wait on LGKM here, we might end up with
1120 // another s_waitcnt inserted right after this if there are non-LGKM
1121 // instructions still outstanding.
Mark Searles4a0f2c52018-05-07 14:43:28 +00001122 // FIXME: this is too conservative / the comment is wrong.
1123 // We don't wait on everything at the end of the block and we combine
1124 // waitcnts so we should never have back-to-back waitcnts.
Mark Searlesec581832018-04-25 19:21:26 +00001125 ForceEmitZeroWaitcnt = true;
// NOTE(review): EmitWaitcnt is used as a CNT_MASK bitmask everywhere else;
// assigning `true` replaces any accumulated bits with the value 1. This looks
// harmless because ForceEmitZeroWaitcnt makes the code below zero every
// counter without consulting the individual bits -- confirm.
Mark Searles70901b92018-04-24 15:59:59 +00001126 EmitWaitcnt = true;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001127 }
1128 }
1129
1130 // Does this operand processing indicate s_wait counter update?
Mark Searlesec581832018-04-25 19:21:26 +00001131 if (EmitWaitcnt || IsForceEmitWaitcnt) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001132 int CntVal[NUM_INST_CNTS];
1133
Mark Searles4a0f2c52018-05-07 14:43:28 +00001134 if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001135 // Force all waitcnts to 0.
1136 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1137 T = (enum InstCounterType)(T + 1)) {
1138 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1139 }
1140 CntVal[VM_CNT] = 0;
1141 CntVal[EXP_CNT] = 0;
1142 CntVal[LGKM_CNT] = 0;
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001143 } else {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001144 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1145 T = (enum InstCounterType)(T + 1)) {
Mark Searles70901b92018-04-24 15:59:59 +00001146 if (EmitWaitcnt & CNT_MASK(T)) {
// The count to wait for is the width of the remaining bracket. When that
// width reaches getWaitCountMax(T), the counter is encoded as -1 instead, its
// bit is dropped from EmitWaitcnt, and (except for EXP_CNT) the lower bound
// is reset to upper bound minus the maximum.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001147 int Delta =
1148 ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
1149 int MaxDelta = ScoreBrackets->getWaitCountMax(T);
1150 if (Delta >= MaxDelta) {
1151 Delta = -1;
1152 if (T != EXP_CNT) {
1153 ScoreBrackets->setScoreLB(
1154 T, ScoreBrackets->getScoreUB(T) - MaxDelta);
1155 }
Mark Searles70901b92018-04-24 15:59:59 +00001156 EmitWaitcnt &= ~CNT_MASK(T);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001157 }
1158 CntVal[T] = Delta;
1159 } else {
1160 // If we are not waiting for a particular counter then encode
1161 // it as -1 which means "don't care."
1162 CntVal[T] = -1;
1163 }
1164 }
1165 }
1166
// If there is no waitcnt tracked by ScoreBrackets, or its encoded counts
// differ from the ones just computed, flag the header block of the loop
// containing MI (if any) for revisiting, creating that block's bracket object
// on demand.
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001167 MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
1168 int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
1169 if (!OldWaitcnt ||
1170 (AMDGPU::decodeVmcnt(IV, Imm) !=
1171 (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
1172 (AMDGPU::decodeExpcnt(IV, Imm) !=
1173 (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
1174 (AMDGPU::decodeLgkmcnt(IV, Imm) !=
1175 (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
1176 MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
1177 if (ContainingLoop) {
1178 MachineBasicBlock *TBB = ContainingLoop->getHeader();
1179 BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
1180 if (!ScoreBracket) {
1181 assert(!BlockVisitedSet.count(TBB));
1182 BlockWaitcntBracketsMap[TBB] =
1183 llvm::make_unique<BlockWaitcntBrackets>(ST);
1184 ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001185 }
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001186 ScoreBracket->setRevisitLoop(true);
1187 LLVM_DEBUG(dbgs() << "set-revisit2: Block"
1188 << ContainingLoop->getHeader()->getNumber() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001189 }
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001190 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001191
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001192 // Update an existing waitcount, or make a new one.
1193 unsigned Enc = AMDGPU::encodeWaitcnt(IV,
Mark Searlesec581832018-04-25 19:21:26 +00001194 ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
1195 ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
1196 ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001197 // We don't remove waitcnts that existed prior to the waitcnt
1198 // pass. Check if the waitcnt to-be-inserted can be avoided
1199 // or if the prev waitcnt can be updated.
1200 bool insertSWaitInst = true;
// The first iteration only steps off MI (the `continue`), and the body ends
// in an unconditional `break`, so at most the instruction immediately before
// MI is examined.
// NOTE(review): because the loop condition stops once I reaches begin(), that
// preceding instruction is not examined when it is the first instruction of
// the block -- confirm whether this is intended.
1201 for (MachineBasicBlock::iterator I = MI.getIterator(),
1202 B = MI.getParent()->begin();
1203 insertSWaitInst && I != B; --I) {
1204 if (I == MI.getIterator())
1205 continue;
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001206
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001207 switch (I->getOpcode()) {
1208 case AMDGPU::S_WAITCNT:
1209 if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
1210 insertSWaitInst = false;
1211 else if (!OldWaitcnt) {
1212 OldWaitcnt = &*I;
1213 Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001214 }
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001215 break;
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001216 // TODO: skip over instructions which never require wait.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001217 }
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001218 break;
1219 }
1220 if (insertSWaitInst) {
1221 if (OldWaitcnt) {
1222 assert(OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT);
1223 if (ForceEmitZeroWaitcnts)
1224 LLVM_DEBUG(dbgs()
1225 << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
1226 if (IsForceEmitWaitcnt)
1227 LLVM_DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n");
Mark Searlesec581832018-04-25 19:21:26 +00001228
// Reuse the existing instruction: rewrite its immediate, and place it before
// MI if it is not currently attached to any block.
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001229 OldWaitcnt->getOperand(0).setImm(Enc);
1230 if (!OldWaitcnt->getParent())
1231 MI.getParent()->insert(MI, OldWaitcnt);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001232
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001233 LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
1234 << "Old Instr: " << MI << '\n'
1235 << "New Instr: " << *OldWaitcnt << '\n');
1236 } else {
// No instruction to reuse: build a fresh S_WAITCNT before MI and record it in
// TrackedWaitcntSet.
1237 auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1238 MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001239 .addImm(Enc);
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001240 TrackedWaitcntSet.insert(SWaitInst);
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001241
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001242 LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1243 << "Old Instr: " << MI << '\n'
1244 << "New Instr: " << *SWaitInst << '\n');
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001245 }
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001246 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001247
// A computed export count of zero clears the mixed-export-types flag.
Nicolai Haehnle61396ff2018-11-07 21:53:36 +00001248 if (CntVal[EXP_CNT] == 0) {
1249 ScoreBrackets->setMixedExpTypes(false);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001250 }
1251 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001252}
1253
1254void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
1255 MachineInstr *Waitcnt) {
1256 if (MBB.empty()) {
1257 MBB.push_back(Waitcnt);
1258 return;
1259 }
1260
1261 MachineBasicBlock::iterator It = MBB.end();
1262 MachineInstr *MI = &*(--It);
1263 if (MI->isBranch()) {
1264 MBB.insert(It, Waitcnt);
1265 } else {
1266 MBB.push_back(Waitcnt);
1267 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001268}
1269
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001270// This is a flat memory operation. Check to see if it has memory
1271// tokens for both LDS and Memory, and if so mark it as a flat.
1272bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1273 if (MI.memoperands_empty())
1274 return true;
1275
1276 for (const MachineMemOperand *Memop : MI.memoperands()) {
1277 unsigned AS = Memop->getAddrSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00001278 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001279 return true;
1280 }
1281
1282 return false;
1283}
1284
Mark Searles70901b92018-04-24 15:59:59 +00001285void SIInsertWaitcnts::updateEventWaitcntAfter(
Kannan Narayananacb089e2017-04-12 03:25:12 +00001286 MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
1287 // Now look at the instruction opcode. If it is a memory access
1288 // instruction, update the upper-bound of the appropriate counter's
1289 // bracket and the destination operand scores.
1290 // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001291 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001292 if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001293 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1294 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1295 } else {
1296 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1297 }
1298 } else if (TII->isFLAT(Inst)) {
1299 assert(Inst.mayLoad() || Inst.mayStore());
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001300
1301 if (TII->usesVM_CNT(Inst))
1302 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1303
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001304 if (TII->usesLGKM_CNT(Inst)) {
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001305 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001306
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001307 // This is a flat memory operation, so note it - it will require
1308 // that both the VM and LGKM be flushed to zero if it is pending when
1309 // a VM or LGKM dependency occurs.
1310 if (mayAccessLDSThroughFlat(Inst))
1311 ScoreBrackets->setPendingFlat();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001312 }
1313 } else if (SIInstrInfo::isVMEM(Inst) &&
1314 // TODO: get a better carve out.
1315 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1316 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1317 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1318 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
Mark Searles2a19af62018-04-26 16:11:19 +00001319 if (ST->vmemWriteNeedsExpWaitcnt() &&
Mark Searles11d0a042017-05-31 16:44:23 +00001320 (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001321 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1322 }
1323 } else if (TII->isSMRD(Inst)) {
1324 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1325 } else {
1326 switch (Inst.getOpcode()) {
1327 case AMDGPU::S_SENDMSG:
1328 case AMDGPU::S_SENDMSGHALT:
1329 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1330 break;
1331 case AMDGPU::EXP:
1332 case AMDGPU::EXP_DONE: {
1333 int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1334 if (Imm >= 32 && Imm <= 63)
1335 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1336 else if (Imm >= 12 && Imm <= 15)
1337 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1338 else
1339 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1340 break;
1341 }
1342 case AMDGPU::S_MEMTIME:
1343 case AMDGPU::S_MEMREALTIME:
1344 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1345 break;
1346 default:
1347 break;
1348 }
1349 }
1350}
1351
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001352// Merge the score brackets of the Block's predecessors;
1353// this merged score bracket is used when adding waitcnts to the Block
Kannan Narayananacb089e2017-04-12 03:25:12 +00001354void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1355 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1356 int32_t MaxPending[NUM_INST_CNTS] = {0};
1357 int32_t MaxFlat[NUM_INST_CNTS] = {0};
1358 bool MixedExpTypes = false;
1359
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001360 // For single basic block loops, we need to retain the Block's
1361 // score bracket to have accurate Pred info. So, make a copy of Block's
1362 // score bracket, clear() it (which retains several important bits of info),
1363 // populate, and then replace en masse. For non-single basic block loops,
1364 // just clear Block's current score bracket and repopulate in-place.
1365 bool IsSelfPred;
1366 std::unique_ptr<BlockWaitcntBrackets> S;
1367
1368 IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
1369 != Block.pred_end();
1370 if (IsSelfPred) {
1371 S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1372 ScoreBrackets = S.get();
1373 }
1374
Kannan Narayananacb089e2017-04-12 03:25:12 +00001375 ScoreBrackets->clear();
1376
Kannan Narayananacb089e2017-04-12 03:25:12 +00001377 // See if there are any uninitialized predecessors. If so, emit an
1378 // s_waitcnt 0 at the beginning of the block.
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001379 for (MachineBasicBlock *Pred : Block.predecessors()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001380 BlockWaitcntBrackets *PredScoreBrackets =
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001381 BlockWaitcntBracketsMap[Pred].get();
1382 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001383 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001384 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001385 }
1386 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1387 T = (enum InstCounterType)(T + 1)) {
1388 int span =
1389 PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1390 MaxPending[T] = std::max(MaxPending[T], span);
1391 span =
1392 PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1393 MaxFlat[T] = std::max(MaxFlat[T], span);
1394 }
1395
1396 MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
1397 }
1398
Kannan Narayananacb089e2017-04-12 03:25:12 +00001399 // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
1400 for (MachineBasicBlock *Pred : Block.predecessors()) {
1401 BlockWaitcntBrackets *PredScoreBrackets =
1402 BlockWaitcntBracketsMap[Pred].get();
Mark Searles24c92ee2018-02-07 02:21:21 +00001403 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001404 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001405 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001406 }
1407
1408 int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
1409 PredScoreBrackets->getScoreLB(EXP_CNT);
1410 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1411 int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
1412 PredScoreBrackets->getScoreLB(EXP_CNT);
1413 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1414 }
1415
Kannan Narayananacb089e2017-04-12 03:25:12 +00001416#if 0
1417 // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
1418 // TODO: how does LC distinguish between function entry and main entry?
1419 // If this is the entry to a function, force a wait.
1420 MachineBasicBlock &Entry = Block.getParent()->front();
1421 if (Entry.getNumber() == Block.getNumber()) {
1422 ScoreBrackets->setWaitAtBeginning();
1423 return;
1424 }
1425#endif
1426
1427 // Now set the current Block's brackets to the largest ending bracket.
1428 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1429 T = (enum InstCounterType)(T + 1)) {
1430 ScoreBrackets->setScoreUB(T, MaxPending[T]);
1431 ScoreBrackets->setScoreLB(T, 0);
1432 ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1433 }
1434
1435 ScoreBrackets->setMixedExpTypes(MixedExpTypes);
1436
1437 // Set the register scoreboard.
1438 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001439 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001440 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001441 }
1442
1443 BlockWaitcntBrackets *PredScoreBrackets =
1444 BlockWaitcntBracketsMap[Pred].get();
1445
1446 // Now merge the gpr_reg_score information
1447 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1448 T = (enum InstCounterType)(T + 1)) {
1449 int PredLB = PredScoreBrackets->getScoreLB(T);
1450 int PredUB = PredScoreBrackets->getScoreUB(T);
1451 if (PredLB < PredUB) {
1452 int PredScale = MaxPending[T] - PredUB;
1453 // Merge vgpr scores.
1454 for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1455 int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1456 if (PredRegScore <= PredLB)
1457 continue;
1458 int NewRegScore = PredScale + PredRegScore;
1459 ScoreBrackets->setRegScore(
1460 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1461 }
1462 // Also need to merge sgpr scores for lgkm_cnt.
1463 if (T == LGKM_CNT) {
1464 for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1465 int PredRegScore =
1466 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1467 if (PredRegScore <= PredLB)
1468 continue;
1469 int NewRegScore = PredScale + PredRegScore;
1470 ScoreBrackets->setRegScore(
1471 J + NUM_ALL_VGPRS, LGKM_CNT,
1472 std::max(
1473 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1474 NewRegScore));
1475 }
1476 }
1477 }
1478 }
1479
1480 // Also merge the WaitEvent information.
1481 ForAllWaitEventType(W) {
1482 enum InstCounterType T = PredScoreBrackets->eventCounter(W);
1483 int PredEventUB = PredScoreBrackets->getEventUB(W);
1484 if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
1485 int NewEventUB =
1486 MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
1487 if (NewEventUB > 0) {
1488 ScoreBrackets->setEventUB(
1489 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1490 }
1491 }
1492 }
1493 }
1494
Kannan Narayananacb089e2017-04-12 03:25:12 +00001495 // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
1496 // sequencing predecessors, because changes to EXEC require waitcnts due to
1497 // the delayed nature of these operations.
1498 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001499 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001500 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001501 }
1502
1503 BlockWaitcntBrackets *PredScoreBrackets =
1504 BlockWaitcntBracketsMap[Pred].get();
1505
1506 int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
1507 if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1508 int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
1509 PredScoreBrackets->getScoreUB(EXP_CNT);
1510 if (new_gds_ub > 0) {
1511 ScoreBrackets->setEventUB(
1512 GDS_GPR_LOCK,
1513 std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
1514 }
1515 }
1516 int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
1517 if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1518 int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
1519 PredScoreBrackets->getScoreUB(EXP_CNT);
1520 if (new_exp_ub > 0) {
1521 ScoreBrackets->setEventUB(
1522 EXP_GPR_LOCK,
1523 std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
1524 }
1525 }
1526 }
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001527
1528 // if a single block loop, update the score brackets. Not needed for other
1529 // blocks, as we did this in-place
1530 if (IsSelfPred) {
1531 BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1532 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001533}
1534
Mark Searles10545412018-05-30 15:47:45 +00001535/// Return true if the given basic block is a "bottom" block of a loop.
1536/// This works even if the loop is discontiguous. This also handles
1537/// multiple back-edges for the same "header" block of a loop.
Mark Searles1bc6e712018-04-19 15:42:30 +00001538bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1539 const MachineBasicBlock *Block) {
1540 for (MachineBasicBlock *MBB : Loop->blocks()) {
1541 if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1542 return true;
1543 }
1544 }
1545 return false;
1546}
1547
1548/// Count the number of "bottom" basic blocks of a loop.
1549unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1550 unsigned Count = 0;
1551 for (MachineBasicBlock *MBB : Loop->blocks()) {
1552 if (MBB->isSuccessor(Loop->getHeader())) {
1553 Count++;
1554 }
1555 }
1556 return Count;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001557}
1558
1559// Generate s_waitcnt instructions where needed.
1560void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1561 MachineBasicBlock &Block) {
1562 // Initialize the state information.
1563 mergeInputScoreBrackets(Block);
1564
1565 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1566
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001567 LLVM_DEBUG({
Mark Searlesec581832018-04-25 19:21:26 +00001568 dbgs() << "*** Block" << Block.getNumber() << " ***";
Kannan Narayananacb089e2017-04-12 03:25:12 +00001569 ScoreBrackets->dump();
1570 });
1571
Kannan Narayananacb089e2017-04-12 03:25:12 +00001572 // Walk over the instructions.
1573 for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1574 Iter != E;) {
1575 MachineInstr &Inst = *Iter;
1576 // Remove any previously existing waitcnts.
1577 if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
Mark Searles65207922018-02-19 19:19:59 +00001578 // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
1579 // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
1580 // as needed.
Mark Searles24c92ee2018-02-07 02:21:21 +00001581 if (!TrackedWaitcntSet.count(&Inst))
Kannan Narayananacb089e2017-04-12 03:25:12 +00001582 ++Iter;
1583 else {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001584 ++Iter;
1585 Inst.removeFromParent();
1586 }
Mark Searles65207922018-02-19 19:19:59 +00001587 ScoreBrackets->setWaitcnt(&Inst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001588 continue;
1589 }
1590
Kannan Narayananacb089e2017-04-12 03:25:12 +00001591 bool VCCZBugWorkAround = false;
1592 if (readsVCCZ(Inst) &&
Mark Searles24c92ee2018-02-07 02:21:21 +00001593 (!VCCZBugHandledSet.count(&Inst))) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001594 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1595 ScoreBrackets->getScoreUB(LGKM_CNT) &&
1596 ScoreBrackets->hasPendingSMEM()) {
Tom Stellardc5a154d2018-06-28 23:47:12 +00001597 if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
Kannan Narayananacb089e2017-04-12 03:25:12 +00001598 VCCZBugWorkAround = true;
1599 }
1600 }
1601
1602 // Generate an s_waitcnt instruction to be placed before
1603 // cur_Inst, if needed.
Mark Searles70901b92018-04-24 15:59:59 +00001604 generateWaitcntInstBefore(Inst, ScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001605
Mark Searles70901b92018-04-24 15:59:59 +00001606 updateEventWaitcntAfter(Inst, ScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001607
1608#if 0 // TODO: implement resource type check controlled by options with ub = LB.
1609 // If this instruction generates a S_SETVSKIP because it is an
1610 // indexed resource, and we are on Tahiti, then it will also force
1611 // an S_WAITCNT vmcnt(0)
1612 if (RequireCheckResourceType(Inst, context)) {
1613 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1614 ScoreBrackets->setScoreLB(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +00001615 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +00001616 }
1617#endif
1618
1619 ScoreBrackets->clearWaitcnt();
1620
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001621 LLVM_DEBUG({
Mark Searles94ae3b22018-01-30 17:17:06 +00001622 Inst.print(dbgs());
Kannan Narayananacb089e2017-04-12 03:25:12 +00001623 ScoreBrackets->dump();
1624 });
1625
1626 // Check to see if this is a GWS instruction. If so, and if this is CI or
1627 // VI, then the generated code sequence will include an S_WAITCNT 0.
1628 // TODO: Are these the only GWS instructions?
1629 if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1630 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1631 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1632 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1633 Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1634 // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1635 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1636 ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1637 ScoreBrackets->updateByWait(LGKM_CNT,
1638 ScoreBrackets->getScoreUB(LGKM_CNT));
1639 }
1640
1641 // TODO: Remove this work-around after fixing the scheduler and enable the
1642 // assert above.
1643 if (VCCZBugWorkAround) {
1644 // Restore the vccz bit. Any time a value is written to vcc, the vcc
1645 // bit is updated, so we can restore the bit by reading the value of
1646 // vcc and then writing it back to the register.
1647 BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1648 AMDGPU::VCC)
1649 .addReg(AMDGPU::VCC);
1650 VCCZBugHandledSet.insert(&Inst);
1651 }
1652
Kannan Narayananacb089e2017-04-12 03:25:12 +00001653 ++Iter;
1654 }
1655
1656 // Check if we need to force convergence at loop footer.
1657 MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
Mark Searles1bc6e712018-04-19 15:42:30 +00001658 if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001659 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1660 WaitcntData->print();
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001661 LLVM_DEBUG(dbgs() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001662
1663 // The iterative waitcnt insertion algorithm aims for optimal waitcnt
Mark Searles10545412018-05-30 15:47:45 +00001664 // placement, but doesn't guarantee convergence for a loop. Each
1665 // loop should take at most (n+1) iterations for it to converge naturally,
1666 // where n is the number of bottom blocks. If this threshold is reached and
1667 // the result hasn't converged, then we force convergence by inserting
1668 // a s_waitcnt at the end of loop footer.
1669 if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001670 // To ensure convergence, need to make wait events at loop footer be no
1671 // more than those from the previous iteration.
Mark Searles65207922018-02-19 19:19:59 +00001672 // As a simplification, instead of tracking individual scores and
1673 // generating the precise wait count, just wait on 0.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001674 bool HasPending = false;
1675 MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
1676 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1677 T = (enum InstCounterType)(T + 1)) {
1678 if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1679 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1680 HasPending = true;
Mark Searles10545412018-05-30 15:47:45 +00001681 break;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001682 }
1683 }
1684
1685 if (HasPending) {
1686 if (!SWaitInst) {
Mark Searles10545412018-05-30 15:47:45 +00001687 SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
1688 DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1689 .addImm(0);
Mark Searles24c92ee2018-02-07 02:21:21 +00001690 TrackedWaitcntSet.insert(SWaitInst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001691#if 0 // TODO: Format the debug output
1692 OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1693 OutputTransformAdd(SWaitInst, context);
1694#endif
1695 }
1696#if 0 // TODO: ??
1697 _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1698#endif
1699 }
1700
1701 if (SWaitInst) {
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001702 LLVM_DEBUG({
Kannan Narayananacb089e2017-04-12 03:25:12 +00001703 SWaitInst->print(dbgs());
1704 dbgs() << "\nAdjusted score board:";
1705 ScoreBrackets->dump();
1706 });
1707
1708 // Add this waitcnt to the block. It is either newly created or
1709 // created in previous iterations and added back since block traversal
Mark Searles65207922018-02-19 19:19:59 +00001710 // always removes waitcnts.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001711 insertWaitcntBeforeCF(Block, SWaitInst);
1712 WaitcntData->setWaitcnt(SWaitInst);
1713 }
1714 }
1715 }
1716}
1717
1718bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
Tom Stellard5bfbae52018-07-11 20:59:01 +00001719 ST = &MF.getSubtarget<GCNSubtarget>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001720 TII = ST->getInstrInfo();
1721 TRI = &TII->getRegisterInfo();
1722 MRI = &MF.getRegInfo();
1723 MLI = &getAnalysis<MachineLoopInfo>();
Konstantin Zhuravlyov71e43ee2018-09-12 18:50:47 +00001724 IV = AMDGPU::getIsaVersion(ST->getCPU());
Mark Searles11d0a042017-05-31 16:44:23 +00001725 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001726
Mark Searles4a0f2c52018-05-07 14:43:28 +00001727 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
Mark Searlesec581832018-04-25 19:21:26 +00001728 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1729 T = (enum InstCounterType)(T + 1))
1730 ForceEmitWaitcnt[T] = false;
1731
Kannan Narayananacb089e2017-04-12 03:25:12 +00001732 HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1733 HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1734 HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1735
1736 HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1737 HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1738 assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1739 assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1740
1741 RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1742 RegisterEncoding.VGPRL =
1743 RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1744 RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1745 RegisterEncoding.SGPRL =
1746 RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1747
Mark Searles24c92ee2018-02-07 02:21:21 +00001748 TrackedWaitcntSet.clear();
1749 BlockVisitedSet.clear();
1750 VCCZBugHandledSet.clear();
Mark Searles1bc6e712018-04-19 15:42:30 +00001751 LoopWaitcntDataMap.clear();
Scott Linder5792dd02018-06-21 18:48:48 +00001752 BlockWaitcntProcessedSet.clear();
Mark Searles24c92ee2018-02-07 02:21:21 +00001753
Nicolai Haehnle0ab31c92018-11-07 21:53:29 +00001754 // Walk over the blocks in reverse post order, inserting
Kannan Narayananacb089e2017-04-12 03:25:12 +00001755 // s_waitcnt where needed.
1756 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1757 bool Modified = false;
1758 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1759 I = RPOT.begin(),
1760 E = RPOT.end(), J = RPOT.begin();
1761 I != E;) {
1762 MachineBasicBlock &MBB = **I;
1763
1764 BlockVisitedSet.insert(&MBB);
1765
1766 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1767 if (!ScoreBrackets) {
Mark Searlesf0b93f12018-06-04 16:51:59 +00001768 BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001769 ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1770 }
1771 ScoreBrackets->setPostOrder(MBB.getNumber());
1772 MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1773 if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
Eugene Zelenko59e12822017-08-08 00:47:13 +00001774 LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001775
1776 // If we are walking into the block from before the loop, then guarantee
1777 // at least 1 re-walk over the loop to propagate the information, even if
1778 // no S_WAITCNT instructions were generated.
Mark Searles1bc6e712018-04-19 15:42:30 +00001779 if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1780 unsigned Count = countNumBottomBlocks(ContainingLoop);
1781
1782 // If the loop has multiple back-edges, and so more than one "bottom"
1783 // basic block, we have to guarantee a re-walk over every blocks.
1784 if ((std::count(BlockWaitcntProcessedSet.begin(),
Mark Searlesf4e70252018-07-16 10:21:36 +00001785 BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
Mark Searles1bc6e712018-04-19 15:42:30 +00001786 BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
Mark Searles10545412018-05-30 15:47:45 +00001787 LLVM_DEBUG(dbgs() << "set-revisit1: Block"
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001788 << ContainingLoop->getHeader()->getNumber() << '\n';);
Mark Searles1bc6e712018-04-19 15:42:30 +00001789 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001790 }
1791
1792 // Walk over the instructions.
1793 insertWaitcntInBlock(MF, MBB);
1794
Mark Searles10545412018-05-30 15:47:45 +00001795 // Record that waitcnts have been processed at least once for this block.
Mark Searles1bc6e712018-04-19 15:42:30 +00001796 BlockWaitcntProcessedSet.push_back(&MBB);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001797
Mark Searles1bc6e712018-04-19 15:42:30 +00001798 // See if we want to revisit the loop. If a loop has multiple back-edges,
1799 // we shouldn't revisit the same "bottom" basic block.
1800 if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1801 std::count(BlockWaitcntProcessedSet.begin(),
1802 BlockWaitcntProcessedSet.end(), &MBB) == 1) {
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001803 MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001804 BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1805 if (EntrySB && EntrySB->getRevisitLoop()) {
1806 EntrySB->setRevisitLoop(false);
1807 J = I;
1808 int32_t PostOrder = EntrySB->getPostOrder();
1809 // TODO: Avoid this loop. Find another way to set I.
1810 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1811 X = RPOT.begin(),
1812 Y = RPOT.end();
1813 X != Y; ++X) {
1814 MachineBasicBlock &MBBX = **X;
1815 if (MBBX.getNumber() == PostOrder) {
1816 I = X;
1817 break;
1818 }
1819 }
1820 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1821 WaitcntData->incIterCnt();
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001822 LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001823 continue;
1824 } else {
1825 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1826 // Loop converged, reset iteration count. If this loop gets revisited,
1827 // it must be from an outer loop, the counter will restart, this will
1828 // ensure we don't force convergence on such revisits.
1829 WaitcntData->resetIterCnt();
1830 }
1831 }
1832
1833 J = I;
1834 ++I;
1835 }
1836
1837 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1838
1839 bool HaveScalarStores = false;
1840
1841 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1842 ++BI) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001843 MachineBasicBlock &MBB = *BI;
1844
1845 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1846 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001847 if (!HaveScalarStores && TII->isScalarStore(*I))
1848 HaveScalarStores = true;
1849
1850 if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1851 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1852 EndPgmBlocks.push_back(&MBB);
1853 }
1854 }
1855
1856 if (HaveScalarStores) {
1857 // If scalar writes are used, the cache must be flushed or else the next
1858 // wave to reuse the same scratch memory can be clobbered.
1859 //
1860 // Insert s_dcache_wb at wave termination points if there were any scalar
1861 // stores, and only if the cache hasn't already been flushed. This could be
1862 // improved by looking across blocks for flushes in postdominating blocks
1863 // from the stores but an explicitly requested flush is probably very rare.
1864 for (MachineBasicBlock *MBB : EndPgmBlocks) {
1865 bool SeenDCacheWB = false;
1866
1867 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1868 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001869 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1870 SeenDCacheWB = true;
1871 else if (TII->isScalarStore(*I))
1872 SeenDCacheWB = false;
1873
1874 // FIXME: It would be better to insert this before a waitcnt if any.
1875 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1876 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1877 !SeenDCacheWB) {
1878 Modified = true;
1879 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1880 }
1881 }
1882 }
1883 }
1884
Mark Searles11d0a042017-05-31 16:44:23 +00001885 if (!MFI->isEntryFunction()) {
1886 // Wait for any outstanding memory operations that the input registers may
Hiroshi Inouec8e92452018-01-29 05:17:03 +00001887 // depend on. We can't track them and it's better to the wait after the
Mark Searles11d0a042017-05-31 16:44:23 +00001888 // costly call sequence.
1889
1890 // TODO: Could insert earlier and schedule more liberally with operations
1891 // that only use caller preserved registers.
1892 MachineBasicBlock &EntryBB = MF.front();
Mark Searlesed54ff12018-05-30 16:27:57 +00001893 BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1894 .addImm(0);
Mark Searles11d0a042017-05-31 16:44:23 +00001895
1896 Modified = true;
1897 }
1898
Kannan Narayananacb089e2017-04-12 03:25:12 +00001899 return Modified;
1900}