blob: bed4a705b0c63e2657d823e1af05f2e52659a95f [file] [log] [blame]
Eugene Zelenko59e12822017-08-08 00:47:13 +00001//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
Kannan Narayananacb089e2017-04-12 03:25:12 +00002//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Insert wait instructions for memory reads and writes.
12///
13/// Memory reads and writes are issued asynchronously, so we need to insert
14/// S_WAITCNT instructions when we want to access any of their results or
15/// overwrite any register that's used asynchronously.
16//
17//===----------------------------------------------------------------------===//
18
19#include "AMDGPU.h"
20#include "AMDGPUSubtarget.h"
21#include "SIDefines.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000024#include "SIRegisterInfo.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000025#include "Utils/AMDGPUBaseInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000026#include "llvm/ADT/DenseMap.h"
27#include "llvm/ADT/DenseSet.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000028#include "llvm/ADT/PostOrderIterator.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000029#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/SmallVector.h"
31#include "llvm/CodeGen/MachineBasicBlock.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000032#include "llvm/CodeGen/MachineFunction.h"
33#include "llvm/CodeGen/MachineFunctionPass.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000034#include "llvm/CodeGen/MachineInstr.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000035#include "llvm/CodeGen/MachineInstrBuilder.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000036#include "llvm/CodeGen/MachineLoopInfo.h"
37#include "llvm/CodeGen/MachineMemOperand.h"
38#include "llvm/CodeGen/MachineOperand.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000039#include "llvm/CodeGen/MachineRegisterInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000040#include "llvm/IR/DebugLoc.h"
41#include "llvm/Pass.h"
42#include "llvm/Support/Debug.h"
43#include "llvm/Support/ErrorHandling.h"
44#include "llvm/Support/raw_ostream.h"
45#include <algorithm>
46#include <cassert>
47#include <cstdint>
48#include <cstring>
49#include <memory>
50#include <utility>
51#include <vector>
Kannan Narayananacb089e2017-04-12 03:25:12 +000052
53#define DEBUG_TYPE "si-insert-waitcnts"
54
55using namespace llvm;
56
57namespace {
58
59// Class of object that encapsulates latest instruction counter score
60// associated with the operand. Used for determining whether
61// s_waitcnt instruction needs to be emited.
62
63#define CNT_MASK(t) (1u << (t))
64
65enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
66
Eugene Zelenko59e12822017-08-08 00:47:13 +000067using RegInterval = std::pair<signed, signed>;
Kannan Narayananacb089e2017-04-12 03:25:12 +000068
69struct {
70 int32_t VmcntMax;
71 int32_t ExpcntMax;
72 int32_t LgkmcntMax;
73 int32_t NumVGPRsMax;
74 int32_t NumSGPRsMax;
75} HardwareLimits;
76
77struct {
78 unsigned VGPR0;
79 unsigned VGPRL;
80 unsigned SGPR0;
81 unsigned SGPRL;
82} RegisterEncoding;
83
84enum WaitEventType {
85 VMEM_ACCESS, // vector-memory read & write
86 LDS_ACCESS, // lds read & write
87 GDS_ACCESS, // gds read & write
88 SQ_MESSAGE, // send message
89 SMEM_ACCESS, // scalar-memory read & write
90 EXP_GPR_LOCK, // export holding on its data src
91 GDS_GPR_LOCK, // GDS holding on its data and addr src
92 EXP_POS_ACCESS, // write to export position
93 EXP_PARAM_ACCESS, // write to export parameter
94 VMW_GPR_LOCK, // vector-memory write holding on its data src
95 NUM_WAIT_EVENTS,
96};
97
98// The mapping is:
99// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
100// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
101// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
102// We reserve a fixed number of VGPR slots in the scoring tables for
103// special tokens like SCMEM_LDS (needed for buffer load to LDS).
104enum RegisterMapping {
105 SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
106 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
107 NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
108 EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
109 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
110};
111
112#define ForAllWaitEventType(w) \
113 for (enum WaitEventType w = (enum WaitEventType)0; \
114 (w) < (enum WaitEventType)NUM_WAIT_EVENTS; \
115 (w) = (enum WaitEventType)((w) + 1))
116
117// This is a per-basic-block object that maintains current score brackets
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000118// of each wait counter, and a per-register scoreboard for each wait counter.
Kannan Narayananacb089e2017-04-12 03:25:12 +0000119// We also maintain the latest score for every event type that can change the
120// waitcnt in order to know if there are multiple types of events within
121// the brackets. When multiple types of event happen in the bracket,
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000122// wait count may get decreased out of order, therefore we need to put in
Kannan Narayananacb089e2017-04-12 03:25:12 +0000123// "s_waitcnt 0" before use.
124class BlockWaitcntBrackets {
125public:
Eugene Zelenko59e12822017-08-08 00:47:13 +0000126 BlockWaitcntBrackets() {
127 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
128 T = (enum InstCounterType)(T + 1)) {
129 memset(VgprScores[T], 0, sizeof(VgprScores[T]));
130 }
131 }
132
133 ~BlockWaitcntBrackets() = default;
134
Kannan Narayananacb089e2017-04-12 03:25:12 +0000135 static int32_t getWaitCountMax(InstCounterType T) {
136 switch (T) {
137 case VM_CNT:
138 return HardwareLimits.VmcntMax;
139 case LGKM_CNT:
140 return HardwareLimits.LgkmcntMax;
141 case EXP_CNT:
142 return HardwareLimits.ExpcntMax;
143 default:
144 break;
145 }
146 return 0;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000147 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000148
149 void setScoreLB(InstCounterType T, int32_t Val) {
150 assert(T < NUM_INST_CNTS);
151 if (T >= NUM_INST_CNTS)
152 return;
153 ScoreLBs[T] = Val;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000154 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000155
156 void setScoreUB(InstCounterType T, int32_t Val) {
157 assert(T < NUM_INST_CNTS);
158 if (T >= NUM_INST_CNTS)
159 return;
160 ScoreUBs[T] = Val;
161 if (T == EXP_CNT) {
162 int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
163 if (ScoreLBs[T] < UB)
164 ScoreLBs[T] = UB;
165 }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000166 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000167
168 int32_t getScoreLB(InstCounterType T) {
169 assert(T < NUM_INST_CNTS);
170 if (T >= NUM_INST_CNTS)
171 return 0;
172 return ScoreLBs[T];
Eugene Zelenko59e12822017-08-08 00:47:13 +0000173 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000174
175 int32_t getScoreUB(InstCounterType T) {
176 assert(T < NUM_INST_CNTS);
177 if (T >= NUM_INST_CNTS)
178 return 0;
179 return ScoreUBs[T];
Eugene Zelenko59e12822017-08-08 00:47:13 +0000180 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000181
182 // Mapping from event to counter.
183 InstCounterType eventCounter(WaitEventType E) {
184 switch (E) {
185 case VMEM_ACCESS:
186 return VM_CNT;
187 case LDS_ACCESS:
188 case GDS_ACCESS:
189 case SQ_MESSAGE:
190 case SMEM_ACCESS:
191 return LGKM_CNT;
192 case EXP_GPR_LOCK:
193 case GDS_GPR_LOCK:
194 case VMW_GPR_LOCK:
195 case EXP_POS_ACCESS:
196 case EXP_PARAM_ACCESS:
197 return EXP_CNT;
198 default:
199 llvm_unreachable("unhandled event type");
200 }
201 return NUM_INST_CNTS;
202 }
203
204 void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
205 if (GprNo < NUM_ALL_VGPRS) {
206 if (GprNo > VgprUB) {
207 VgprUB = GprNo;
208 }
209 VgprScores[T][GprNo] = Val;
210 } else {
211 assert(T == LGKM_CNT);
212 if (GprNo - NUM_ALL_VGPRS > SgprUB) {
213 SgprUB = GprNo - NUM_ALL_VGPRS;
214 }
215 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
216 }
217 }
218
219 int32_t getRegScore(int GprNo, InstCounterType T) {
220 if (GprNo < NUM_ALL_VGPRS) {
221 return VgprScores[T][GprNo];
222 }
223 return SgprScores[GprNo - NUM_ALL_VGPRS];
224 }
225
226 void clear() {
227 memset(ScoreLBs, 0, sizeof(ScoreLBs));
228 memset(ScoreUBs, 0, sizeof(ScoreUBs));
229 memset(EventUBs, 0, sizeof(EventUBs));
230 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
231 T = (enum InstCounterType)(T + 1)) {
232 memset(VgprScores[T], 0, sizeof(VgprScores[T]));
233 }
234 memset(SgprScores, 0, sizeof(SgprScores));
235 }
236
237 RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
238 const MachineRegisterInfo *MRI,
239 const SIRegisterInfo *TRI, unsigned OpNo,
240 bool Def) const;
241
242 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
243 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
244 unsigned OpNo, int32_t Val);
245
246 void setWaitAtBeginning() { WaitAtBeginning = true; }
247 void clearWaitAtBeginning() { WaitAtBeginning = false; }
248 bool getWaitAtBeginning() const { return WaitAtBeginning; }
249 void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
250 int32_t getMaxVGPR() const { return VgprUB; }
251 int32_t getMaxSGPR() const { return SgprUB; }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000252
Kannan Narayananacb089e2017-04-12 03:25:12 +0000253 int32_t getEventUB(enum WaitEventType W) const {
254 assert(W < NUM_WAIT_EVENTS);
255 return EventUBs[W];
256 }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000257
Kannan Narayananacb089e2017-04-12 03:25:12 +0000258 bool counterOutOfOrder(InstCounterType T);
259 unsigned int updateByWait(InstCounterType T, int ScoreToWait);
260 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
261 const MachineRegisterInfo *MRI, WaitEventType E,
262 MachineInstr &MI);
263
Kannan Narayananacb089e2017-04-12 03:25:12 +0000264 bool hasPendingSMEM() const {
265 return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
266 EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
267 }
268
269 bool hasPendingFlat() const {
270 return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
271 LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
272 (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
273 LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
274 }
275
276 void setPendingFlat() {
277 LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
278 LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
279 }
280
281 int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
282
283 void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
284
285 bool getRevisitLoop() const { return RevisitLoop; }
286 void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
287
288 void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
289 int32_t getPostOrder() const { return PostOrder; }
290
291 void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
Eugene Zelenko59e12822017-08-08 00:47:13 +0000292 void clearWaitcnt() { Waitcnt = nullptr; }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000293 MachineInstr *getWaitcnt() const { return Waitcnt; }
294
295 bool mixedExpTypes() const { return MixedExpTypes; }
296 void setMixedExpTypes(bool MixedExpTypesIn) {
297 MixedExpTypes = MixedExpTypesIn;
298 }
299
300 void print(raw_ostream &);
301 void dump() { print(dbgs()); }
302
303private:
Eugene Zelenko59e12822017-08-08 00:47:13 +0000304 bool WaitAtBeginning = false;
305 bool RevisitLoop = false;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000306 bool MixedExpTypes = false;
Eugene Zelenko59e12822017-08-08 00:47:13 +0000307 int32_t PostOrder = 0;
308 MachineInstr *Waitcnt = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000309 int32_t ScoreLBs[NUM_INST_CNTS] = {0};
310 int32_t ScoreUBs[NUM_INST_CNTS] = {0};
311 int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
312 // Remember the last flat memory operation.
313 int32_t LastFlat[NUM_INST_CNTS] = {0};
314 // wait_cnt scores for every vgpr.
315 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000316 int32_t VgprUB = 0;
317 int32_t SgprUB = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000318 int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
319 // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
320 int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
321};
322
323// This is a per-loop-region object that records waitcnt status at the end of
324// loop footer from the previous iteration. We also maintain an iteration
325// count to track the number of times the loop has been visited. When it
326// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
327// at the end of the loop footer.
328class LoopWaitcntData {
329public:
Eugene Zelenko59e12822017-08-08 00:47:13 +0000330 LoopWaitcntData() = default;
331 ~LoopWaitcntData() = default;
332
Kannan Narayananacb089e2017-04-12 03:25:12 +0000333 void incIterCnt() { IterCnt++; }
334 void resetIterCnt() { IterCnt = 0; }
335 int32_t getIterCnt() { return IterCnt; }
336
Kannan Narayananacb089e2017-04-12 03:25:12 +0000337 void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
338 MachineInstr *getWaitcnt() const { return LfWaitcnt; }
339
340 void print() {
341 DEBUG(dbgs() << " iteration " << IterCnt << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000342 }
343
344private:
345 // s_waitcnt added at the end of loop footer to stablize wait scores
346 // at the end of the loop footer.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000347 MachineInstr *LfWaitcnt = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000348 // Number of iterations the loop has been visited, not including the initial
349 // walk over.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000350 int32_t IterCnt = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000351};
352
353class SIInsertWaitcnts : public MachineFunctionPass {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000354private:
Eugene Zelenko59e12822017-08-08 00:47:13 +0000355 const SISubtarget *ST = nullptr;
356 const SIInstrInfo *TII = nullptr;
357 const SIRegisterInfo *TRI = nullptr;
358 const MachineRegisterInfo *MRI = nullptr;
359 const MachineLoopInfo *MLI = nullptr;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000360 AMDGPU::IsaInfo::IsaVersion IV;
361 AMDGPUAS AMDGPUASI;
362
363 DenseSet<MachineBasicBlock *> BlockVisitedSet;
Mark Searles24c92ee2018-02-07 02:21:21 +0000364 DenseSet<MachineInstr *> TrackedWaitcntSet;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000365 DenseSet<MachineInstr *> VCCZBugHandledSet;
366
367 DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
368 BlockWaitcntBracketsMap;
369
Mark Searles1bc6e712018-04-19 15:42:30 +0000370 std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000371
372 DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
373
374 std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
375
376public:
377 static char ID;
378
Eugene Zelenko59e12822017-08-08 00:47:13 +0000379 SIInsertWaitcnts() : MachineFunctionPass(ID) {}
Kannan Narayananacb089e2017-04-12 03:25:12 +0000380
381 bool runOnMachineFunction(MachineFunction &MF) override;
382
383 StringRef getPassName() const override {
384 return "SI insert wait instructions";
385 }
386
387 void getAnalysisUsage(AnalysisUsage &AU) const override {
388 AU.setPreservesCFG();
389 AU.addRequired<MachineLoopInfo>();
390 MachineFunctionPass::getAnalysisUsage(AU);
391 }
392
393 void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
394 // The waitcnt information is copied because it changes as the block is
395 // traversed.
Eugene Zelenko59e12822017-08-08 00:47:13 +0000396 KillWaitBrackets.push_back(
397 llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000398 }
399
Matt Arsenault0ed39d32017-07-21 18:54:54 +0000400 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +0000401 void generateSWaitCntInstBefore(MachineInstr &MI,
402 BlockWaitcntBrackets *ScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000403 void updateEventWaitCntAfter(MachineInstr &Inst,
404 BlockWaitcntBrackets *ScoreBrackets);
405 void mergeInputScoreBrackets(MachineBasicBlock &Block);
Mark Searles1bc6e712018-04-19 15:42:30 +0000406 bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
407 unsigned countNumBottomBlocks(const MachineLoop *Loop);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000408 void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
409 void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +0000410 bool isWaitcntStronger(unsigned LHS, unsigned RHS);
411 unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000412};
413
Eugene Zelenko59e12822017-08-08 00:47:13 +0000414} // end anonymous namespace
Kannan Narayananacb089e2017-04-12 03:25:12 +0000415
416RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
417 const SIInstrInfo *TII,
418 const MachineRegisterInfo *MRI,
419 const SIRegisterInfo *TRI,
420 unsigned OpNo,
421 bool Def) const {
422 const MachineOperand &Op = MI->getOperand(OpNo);
423 if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
424 (Def && !Op.isDef()))
425 return {-1, -1};
426
427 // A use via a PW operand does not need a waitcnt.
428 // A partial write is not a WAW.
429 assert(!Op.getSubReg() || !Op.isUndef());
430
431 RegInterval Result;
432 const MachineRegisterInfo &MRIA = *MRI;
433
434 unsigned Reg = TRI->getEncodingValue(Op.getReg());
435
436 if (TRI->isVGPR(MRIA, Op.getReg())) {
437 assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
438 Result.first = Reg - RegisterEncoding.VGPR0;
439 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
440 } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
441 assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
442 Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
443 assert(Result.first >= NUM_ALL_VGPRS &&
444 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
445 }
446 // TODO: Handle TTMP
447 // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
448 else
449 return {-1, -1};
450
451 const MachineInstr &MIA = *MI;
452 const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +0000453 unsigned Size = TRI->getRegSizeInBits(*RC);
454 Result.second = Result.first + (Size / 32);
Kannan Narayananacb089e2017-04-12 03:25:12 +0000455
456 return Result;
457}
458
459void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
460 const SIInstrInfo *TII,
461 const SIRegisterInfo *TRI,
462 const MachineRegisterInfo *MRI,
463 unsigned OpNo, int32_t Val) {
464 RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
465 DEBUG({
466 const MachineOperand &Opnd = MI->getOperand(OpNo);
467 assert(TRI->isVGPR(*MRI, Opnd.getReg()));
468 });
469 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
470 setRegScore(RegNo, EXP_CNT, Val);
471 }
472}
473
474void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
475 const SIRegisterInfo *TRI,
476 const MachineRegisterInfo *MRI,
477 WaitEventType E, MachineInstr &Inst) {
478 const MachineRegisterInfo &MRIA = *MRI;
479 InstCounterType T = eventCounter(E);
480 int32_t CurrScore = getScoreUB(T) + 1;
481 // EventUB and ScoreUB need to be update regardless if this event changes
482 // the score of a register or not.
483 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
484 EventUBs[E] = CurrScore;
485 setScoreUB(T, CurrScore);
486
487 if (T == EXP_CNT) {
488 // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
489 // is required.
490 if (!MixedExpTypes) {
491 MixedExpTypes = counterOutOfOrder(EXP_CNT);
492 }
493
494 // Put score on the source vgprs. If this is a store, just use those
495 // specific register(s).
496 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
497 // All GDS operations must protect their address register (same as
498 // export.)
499 if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
500 Inst.getOpcode() != AMDGPU::DS_CONSUME) {
501 setExpScore(
502 &Inst, TII, TRI, MRI,
503 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
504 CurrScore);
505 }
506 if (Inst.mayStore()) {
507 setExpScore(
508 &Inst, TII, TRI, MRI,
509 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
510 CurrScore);
511 if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
512 AMDGPU::OpName::data1) != -1) {
513 setExpScore(&Inst, TII, TRI, MRI,
514 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
515 AMDGPU::OpName::data1),
516 CurrScore);
517 }
518 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
519 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
520 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
521 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
522 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
523 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
524 Inst.getOpcode() != AMDGPU::DS_APPEND &&
525 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
526 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
527 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
528 const MachineOperand &Op = Inst.getOperand(I);
529 if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
530 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
531 }
532 }
533 }
534 } else if (TII->isFLAT(Inst)) {
535 if (Inst.mayStore()) {
536 setExpScore(
537 &Inst, TII, TRI, MRI,
538 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
539 CurrScore);
540 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
541 setExpScore(
542 &Inst, TII, TRI, MRI,
543 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
544 CurrScore);
545 }
546 } else if (TII->isMIMG(Inst)) {
547 if (Inst.mayStore()) {
548 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
549 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
550 setExpScore(
551 &Inst, TII, TRI, MRI,
552 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
553 CurrScore);
554 }
555 } else if (TII->isMTBUF(Inst)) {
556 if (Inst.mayStore()) {
557 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
558 }
559 } else if (TII->isMUBUF(Inst)) {
560 if (Inst.mayStore()) {
561 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
562 } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
563 setExpScore(
564 &Inst, TII, TRI, MRI,
565 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
566 CurrScore);
567 }
568 } else {
569 if (TII->isEXP(Inst)) {
570 // For export the destination registers are really temps that
571 // can be used as the actual source after export patching, so
572 // we need to treat them like sources and set the EXP_CNT
573 // score.
574 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
575 MachineOperand &DefMO = Inst.getOperand(I);
576 if (DefMO.isReg() && DefMO.isDef() &&
577 TRI->isVGPR(MRIA, DefMO.getReg())) {
578 setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
579 CurrScore);
580 }
581 }
582 }
583 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
584 MachineOperand &MO = Inst.getOperand(I);
585 if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
586 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
587 }
588 }
589 }
590#if 0 // TODO: check if this is handled by MUBUF code above.
591 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
Evgeny Mankovbf975172017-08-16 16:47:29 +0000592 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
593 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000594 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
595 unsigned OpNo;//TODO: find the OpNo for this operand;
596 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
597 for (signed RegNo = Interval.first; RegNo < Interval.second;
Evgeny Mankovbf975172017-08-16 16:47:29 +0000598 ++RegNo) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000599 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
600 }
601#endif
602 } else {
603 // Match the score to the destination registers.
604 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
605 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
606 if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
607 continue;
608 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
609 setRegScore(RegNo, T, CurrScore);
610 }
611 }
612 if (TII->isDS(Inst) && Inst.mayStore()) {
613 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
614 }
615 }
616}
617
618void BlockWaitcntBrackets::print(raw_ostream &OS) {
619 OS << '\n';
620 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
621 T = (enum InstCounterType)(T + 1)) {
622 int LB = getScoreLB(T);
623 int UB = getScoreUB(T);
624
625 switch (T) {
626 case VM_CNT:
627 OS << " VM_CNT(" << UB - LB << "): ";
628 break;
629 case LGKM_CNT:
630 OS << " LGKM_CNT(" << UB - LB << "): ";
631 break;
632 case EXP_CNT:
633 OS << " EXP_CNT(" << UB - LB << "): ";
634 break;
635 default:
636 OS << " UNKNOWN(" << UB - LB << "): ";
637 break;
638 }
639
640 if (LB < UB) {
641 // Print vgpr scores.
642 for (int J = 0; J <= getMaxVGPR(); J++) {
643 int RegScore = getRegScore(J, T);
644 if (RegScore <= LB)
645 continue;
646 int RelScore = RegScore - LB - 1;
647 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
648 OS << RelScore << ":v" << J << " ";
649 } else {
650 OS << RelScore << ":ds ";
651 }
652 }
653 // Also need to print sgpr scores for lgkm_cnt.
654 if (T == LGKM_CNT) {
655 for (int J = 0; J <= getMaxSGPR(); J++) {
656 int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
657 if (RegScore <= LB)
658 continue;
659 int RelScore = RegScore - LB - 1;
660 OS << RelScore << ":s" << J << " ";
661 }
662 }
663 }
664 OS << '\n';
665 }
666 OS << '\n';
Kannan Narayananacb089e2017-04-12 03:25:12 +0000667}
668
669unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
670 int ScoreToWait) {
671 unsigned int NeedWait = 0;
672 if (ScoreToWait == -1) {
673 // The score to wait is unknown. This implies that it was not encountered
674 // during the path of the CFG walk done during the current traversal but
675 // may be seen on a different path. Emit an s_wait counter with a
676 // conservative value of 0 for the counter.
677 NeedWait = CNT_MASK(T);
678 setScoreLB(T, getScoreUB(T));
679 return NeedWait;
680 }
681
682 // If the score of src_operand falls within the bracket, we need an
683 // s_waitcnt instruction.
684 const int32_t LB = getScoreLB(T);
685 const int32_t UB = getScoreUB(T);
686 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
687 if (T == VM_CNT && hasPendingFlat()) {
688 // If there is a pending FLAT operation, and this is a VM waitcnt,
689 // then we need to force a waitcnt 0 for VM.
690 NeedWait = CNT_MASK(T);
691 setScoreLB(T, getScoreUB(T));
692 } else if (counterOutOfOrder(T)) {
693 // Counter can get decremented out-of-order when there
Mark Searlesc3c02bd2018-03-14 22:04:32 +0000694 // are multiple types event in the bracket. Also emit an s_wait counter
Kannan Narayananacb089e2017-04-12 03:25:12 +0000695 // with a conservative value of 0 for the counter.
696 NeedWait = CNT_MASK(T);
697 setScoreLB(T, getScoreUB(T));
698 } else {
699 NeedWait = CNT_MASK(T);
700 setScoreLB(T, ScoreToWait);
701 }
702 }
703
704 return NeedWait;
705}
706
707// Where there are multiple types of event in the bracket of a counter,
708// the decrement may go out of order.
709bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
710 switch (T) {
711 case VM_CNT:
712 return false;
713 case LGKM_CNT: {
714 if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
715 EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
716 // Scalar memory read always can go out of order.
717 return true;
718 }
719 int NumEventTypes = 0;
720 if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
721 EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
722 NumEventTypes++;
723 }
724 if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
725 EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
726 NumEventTypes++;
727 }
728 if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
729 EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
730 NumEventTypes++;
731 }
732 if (NumEventTypes <= 1) {
733 return false;
734 }
735 break;
736 }
737 case EXP_CNT: {
738 // If there has been a mixture of export types, then a waitcnt exp(0) is
739 // required.
740 if (MixedExpTypes)
741 return true;
742 int NumEventTypes = 0;
743 if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
744 EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
745 NumEventTypes++;
746 }
747 if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
748 EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
749 NumEventTypes++;
750 }
751 if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
752 EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
753 NumEventTypes++;
754 }
755 if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
756 EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
757 NumEventTypes++;
758 }
759
760 if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
761 EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
762 NumEventTypes++;
763 }
764
765 if (NumEventTypes <= 1) {
766 return false;
767 }
768 break;
769 }
770 default:
771 break;
772 }
773 return true;
774}
775
776INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
777 false)
778INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
779 false)
780
781char SIInsertWaitcnts::ID = 0;
782
783char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
784
785FunctionPass *llvm::createSIInsertWaitcntsPass() {
786 return new SIInsertWaitcnts();
787}
788
789static bool readsVCCZ(const MachineInstr &MI) {
790 unsigned Opc = MI.getOpcode();
791 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
792 !MI.getOperand(1).isUndef();
793}
794
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +0000795/// \brief Given wait count encodings checks if LHS is stronger than RHS.
796bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
797 if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
798 return false;
799 if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
800 return false;
801 if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
802 return false;
803 return true;
804}
805
806/// \brief Given wait count encodings create a new encoding which is stronger
807/// or equal to both.
808unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
809 unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
810 AMDGPU::decodeVmcnt(IV, RHS));
811 unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
812 AMDGPU::decodeLgkmcnt(IV, RHS));
813 unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
814 AMDGPU::decodeExpcnt(IV, RHS));
815 return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
816}
817
Kannan Narayananacb089e2017-04-12 03:25:12 +0000818/// \brief Generate s_waitcnt instruction to be placed before cur_Inst.
819/// Instructions of a given type are returned in order,
820/// but instructions of different types can complete out of order.
821/// We rely on this in-order completion
822/// and simply assign a score to the memory access instructions.
823/// We keep track of the active "score bracket" to determine
824/// if an access of a memory read requires an s_waitcnt
825/// and if so what the value of each counter is.
826/// The "score bracket" is bound by the lower bound and upper bound
827/// scores (*_score_LB and *_score_ub respectively).
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +0000828void SIInsertWaitcnts::generateSWaitCntInstBefore(
Kannan Narayananacb089e2017-04-12 03:25:12 +0000829 MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
830 // To emit, or not to emit - that's the question!
831 // Start with an assumption that there is no need to emit.
832 unsigned int EmitSwaitcnt = 0;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000833 // No need to wait before phi. If a phi-move exists, then the wait should
834 // has been inserted before the move. If a phi-move does not exist, then
835 // wait should be inserted before the real use. The same is true for
836 // sc-merge. It is not a coincident that all these cases correspond to the
837 // instructions that are skipped in the assembling loop.
838 bool NeedLineMapping = false; // TODO: Check on this.
839 if (MI.isDebugValue() &&
840 // TODO: any other opcode?
841 !NeedLineMapping) {
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +0000842 return;
Kannan Narayananacb089e2017-04-12 03:25:12 +0000843 }
844
845 // See if an s_waitcnt is forced at block entry, or is needed at
846 // program end.
847 if (ScoreBrackets->getWaitAtBeginning()) {
848 // Note that we have already cleared the state, so we don't need to update
849 // it.
850 ScoreBrackets->clearWaitAtBeginning();
851 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
852 T = (enum InstCounterType)(T + 1)) {
853 EmitSwaitcnt |= CNT_MASK(T);
854 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
855 }
856 }
857
858 // See if this instruction has a forced S_WAITCNT VM.
859 // TODO: Handle other cases of NeedsWaitcntVmBefore()
860 else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
861 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
862 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
863 EmitSwaitcnt |=
864 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
865 }
866
867 // All waits must be resolved at call return.
868 // NOTE: this could be improved with knowledge of all call sites or
869 // with knowledge of the called routines.
870 if (MI.getOpcode() == AMDGPU::RETURN ||
Mark Searles11d0a042017-05-31 16:44:23 +0000871 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
872 MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000873 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
874 T = (enum InstCounterType)(T + 1)) {
875 if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
876 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
877 EmitSwaitcnt |= CNT_MASK(T);
878 }
879 }
880 }
881 // Resolve vm waits before gs-done.
882 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
883 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
884 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
885 AMDGPU::SendMsg::ID_GS_DONE)) {
886 if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
887 ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
888 EmitSwaitcnt |= CNT_MASK(VM_CNT);
889 }
890 }
891#if 0 // TODO: the following blocks of logic when we have fence.
892 else if (MI.getOpcode() == SC_FENCE) {
893 const unsigned int group_size =
894 context->shader_info->GetMaxThreadGroupSize();
895 // group_size == 0 means thread group size is unknown at compile time
896 const bool group_is_multi_wave =
897 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
898 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
899
900 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
901 SCRegType src_type = Inst->GetSrcType(i);
902 switch (src_type) {
903 case SCMEM_LDS:
904 if (group_is_multi_wave ||
Evgeny Mankovbf975172017-08-16 16:47:29 +0000905 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000906 EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
907 ScoreBrackets->getScoreUB(LGKM_CNT));
908 // LDS may have to wait for VM_CNT after buffer load to LDS
909 if (target_info->HasBufferLoadToLDS()) {
910 EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
911 ScoreBrackets->getScoreUB(VM_CNT));
912 }
913 }
914 break;
915
916 case SCMEM_GDS:
917 if (group_is_multi_wave || fence_is_global) {
918 EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000919 ScoreBrackets->getScoreUB(EXP_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000920 EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000921 ScoreBrackets->getScoreUB(LGKM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000922 }
923 break;
924
925 case SCMEM_UAV:
926 case SCMEM_TFBUF:
927 case SCMEM_RING:
928 case SCMEM_SCATTER:
929 if (group_is_multi_wave || fence_is_global) {
930 EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000931 ScoreBrackets->getScoreUB(EXP_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000932 EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +0000933 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +0000934 }
935 break;
936
937 case SCMEM_SCRATCH:
938 default:
939 break;
940 }
941 }
942 }
943#endif
944
945 // Export & GDS instructions do not read the EXEC mask until after the export
946 // is granted (which can occur well after the instruction is issued).
947 // The shader program must flush all EXP operations on the export-count
948 // before overwriting the EXEC mask.
949 else {
950 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
951 // Export and GDS are tracked individually, either may trigger a waitcnt
952 // for EXEC.
953 EmitSwaitcnt |= ScoreBrackets->updateByWait(
954 EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
955 EmitSwaitcnt |= ScoreBrackets->updateByWait(
956 EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
957 EmitSwaitcnt |= ScoreBrackets->updateByWait(
958 EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
959 EmitSwaitcnt |= ScoreBrackets->updateByWait(
960 EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
961 }
962
963#if 0 // TODO: the following code to handle CALL.
964 // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
965 // However, there is a problem with EXP_CNT, because the call cannot
966 // easily tell if a register is used in the function, and if it did, then
967 // the referring instruction would have to have an S_WAITCNT, which is
968 // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
969 // before the call.
970 if (MI.getOpcode() == SC_CALL) {
971 if (ScoreBrackets->getScoreUB(EXP_CNT) >
Evgeny Mankovbf975172017-08-16 16:47:29 +0000972 ScoreBrackets->getScoreLB(EXP_CNT)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000973 ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
974 EmitSwaitcnt |= CNT_MASK(EXP_CNT);
975 }
976 }
977#endif
978
Matt Arsenault0ed39d32017-07-21 18:54:54 +0000979 // FIXME: Should not be relying on memoperands.
Kannan Narayananacb089e2017-04-12 03:25:12 +0000980 // Look at the source operands of every instruction to see if
981 // any of them results from a previous memory operation that affects
982 // its current usage. If so, an s_waitcnt instruction needs to be
983 // emitted.
984 // If the source operand was defined by a load, add the s_waitcnt
985 // instruction.
986 for (const MachineMemOperand *Memop : MI.memoperands()) {
987 unsigned AS = Memop->getAddrSpace();
988 if (AS != AMDGPUASI.LOCAL_ADDRESS)
989 continue;
990 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
991 // VM_CNT is only relevant to vgpr or LDS.
992 EmitSwaitcnt |= ScoreBrackets->updateByWait(
993 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
994 }
Matt Arsenault0ed39d32017-07-21 18:54:54 +0000995
Kannan Narayananacb089e2017-04-12 03:25:12 +0000996 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
997 const MachineOperand &Op = MI.getOperand(I);
998 const MachineRegisterInfo &MRIA = *MRI;
999 RegInterval Interval =
1000 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
1001 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1002 if (TRI->isVGPR(MRIA, Op.getReg())) {
1003 // VM_CNT is only relevant to vgpr or LDS.
1004 EmitSwaitcnt |= ScoreBrackets->updateByWait(
1005 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1006 }
1007 EmitSwaitcnt |= ScoreBrackets->updateByWait(
1008 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1009 }
1010 }
1011 // End of for loop that looks at all source operands to decide vm_wait_cnt
1012 // and lgk_wait_cnt.
1013
1014 // Two cases are handled for destination operands:
1015 // 1) If the destination operand was defined by a load, add the s_waitcnt
1016 // instruction to guarantee the right WAW order.
1017 // 2) If a destination operand that was used by a recent export/store ins,
1018 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1019 if (MI.mayStore()) {
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001020 // FIXME: Should not be relying on memoperands.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001021 for (const MachineMemOperand *Memop : MI.memoperands()) {
1022 unsigned AS = Memop->getAddrSpace();
1023 if (AS != AMDGPUASI.LOCAL_ADDRESS)
1024 continue;
1025 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1026 EmitSwaitcnt |= ScoreBrackets->updateByWait(
1027 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1028 EmitSwaitcnt |= ScoreBrackets->updateByWait(
1029 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1030 }
1031 }
1032 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1033 MachineOperand &Def = MI.getOperand(I);
1034 const MachineRegisterInfo &MRIA = *MRI;
1035 RegInterval Interval =
1036 ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
1037 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1038 if (TRI->isVGPR(MRIA, Def.getReg())) {
1039 EmitSwaitcnt |= ScoreBrackets->updateByWait(
1040 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1041 EmitSwaitcnt |= ScoreBrackets->updateByWait(
1042 EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1043 }
1044 EmitSwaitcnt |= ScoreBrackets->updateByWait(
1045 LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1046 }
1047 } // End of for loop that looks at all dest operands.
1048 }
1049
Mark Searles94ae3b22018-01-30 17:17:06 +00001050 // TODO: Tie force zero to a compiler triage option.
1051 bool ForceZero = false;
1052
Kannan Narayananacb089e2017-04-12 03:25:12 +00001053 // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1054 // occurs before the instruction. Doing it here prevents any additional
1055 // S_WAITCNTs from being emitted if the instruction was marked as
1056 // requiring a WAITCNT beforehand.
Konstantin Zhuravlyovbe6c0ca2017-06-02 17:40:26 +00001057 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1058 !ST->hasAutoWaitcntBeforeBarrier()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001059 EmitSwaitcnt |=
1060 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1061 EmitSwaitcnt |= ScoreBrackets->updateByWait(
1062 EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1063 EmitSwaitcnt |= ScoreBrackets->updateByWait(
1064 LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
1065 }
1066
1067 // TODO: Remove this work-around, enable the assert for Bug 457939
1068 // after fixing the scheduler. Also, the Shader Compiler code is
1069 // independent of target.
1070 if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
1071 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1072 ScoreBrackets->getScoreUB(LGKM_CNT) &&
1073 ScoreBrackets->hasPendingSMEM()) {
1074 // Wait on everything, not just LGKM. vccz reads usually come from
1075 // terminators, and we always wait on everything at the end of the
1076 // block, so if we only wait on LGKM here, we might end up with
1077 // another s_waitcnt inserted right after this if there are non-LGKM
1078 // instructions still outstanding.
1079 ForceZero = true;
1080 EmitSwaitcnt = true;
1081 }
1082 }
1083
1084 // Does this operand processing indicate s_wait counter update?
Mark Searles94ae3b22018-01-30 17:17:06 +00001085 if (EmitSwaitcnt) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001086 int CntVal[NUM_INST_CNTS];
1087
1088 bool UseDefaultWaitcntStrategy = true;
1089 if (ForceZero) {
1090 // Force all waitcnts to 0.
1091 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1092 T = (enum InstCounterType)(T + 1)) {
1093 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1094 }
1095 CntVal[VM_CNT] = 0;
1096 CntVal[EXP_CNT] = 0;
1097 CntVal[LGKM_CNT] = 0;
1098 UseDefaultWaitcntStrategy = false;
1099 }
1100
1101 if (UseDefaultWaitcntStrategy) {
1102 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1103 T = (enum InstCounterType)(T + 1)) {
1104 if (EmitSwaitcnt & CNT_MASK(T)) {
1105 int Delta =
1106 ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
1107 int MaxDelta = ScoreBrackets->getWaitCountMax(T);
1108 if (Delta >= MaxDelta) {
1109 Delta = -1;
1110 if (T != EXP_CNT) {
1111 ScoreBrackets->setScoreLB(
1112 T, ScoreBrackets->getScoreUB(T) - MaxDelta);
1113 }
1114 EmitSwaitcnt &= ~CNT_MASK(T);
1115 }
1116 CntVal[T] = Delta;
1117 } else {
1118 // If we are not waiting for a particular counter then encode
1119 // it as -1 which means "don't care."
1120 CntVal[T] = -1;
1121 }
1122 }
1123 }
1124
1125 // If we are not waiting on any counter we can skip the wait altogether.
Mark Searles94ae3b22018-01-30 17:17:06 +00001126 if (EmitSwaitcnt != 0) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001127 MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
1128 int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
Mark Searles65207922018-02-19 19:19:59 +00001129 if (!OldWaitcnt ||
1130 (AMDGPU::decodeVmcnt(IV, Imm) !=
Kannan Narayananacb089e2017-04-12 03:25:12 +00001131 (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
1132 (AMDGPU::decodeExpcnt(IV, Imm) !=
1133 (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
1134 (AMDGPU::decodeLgkmcnt(IV, Imm) !=
1135 (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
1136 MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
1137 if (ContainingLoop) {
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001138 MachineBasicBlock *TBB = ContainingLoop->getHeader();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001139 BlockWaitcntBrackets *ScoreBracket =
1140 BlockWaitcntBracketsMap[TBB].get();
1141 if (!ScoreBracket) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001142 assert(!BlockVisitedSet.count(TBB));
Eugene Zelenko59e12822017-08-08 00:47:13 +00001143 BlockWaitcntBracketsMap[TBB] =
1144 llvm::make_unique<BlockWaitcntBrackets>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001145 ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
1146 }
1147 ScoreBracket->setRevisitLoop(true);
Mark Searles65207922018-02-19 19:19:59 +00001148 DEBUG(dbgs() << "set-revisit: Block"
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001149 << ContainingLoop->getHeader()->getNumber() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001150 }
1151 }
1152
1153 // Update an existing waitcount, or make a new one.
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001154 unsigned Enc = AMDGPU::encodeWaitcnt(IV, CntVal[VM_CNT],
1155 CntVal[EXP_CNT], CntVal[LGKM_CNT]);
Mark Searles65207922018-02-19 19:19:59 +00001156 // We don't remove waitcnts that existed prior to the waitcnt
1157 // pass. Check if the waitcnt to-be-inserted can be avoided
1158 // or if the prev waitcnt can be updated.
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001159 bool insertSWaitInst = true;
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001160 for (MachineBasicBlock::iterator I = MI.getIterator(),
1161 B = MI.getParent()->begin();
1162 insertSWaitInst && I != B; --I) {
Mark Searles65207922018-02-19 19:19:59 +00001163 if (I == MI.getIterator())
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001164 continue;
1165
1166 switch (I->getOpcode()) {
1167 case AMDGPU::S_WAITCNT:
1168 if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
1169 insertSWaitInst = false;
1170 else if (!OldWaitcnt) {
1171 OldWaitcnt = &*I;
1172 Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
1173 }
1174 break;
1175 // TODO: skip over instructions which never require wait.
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001176 }
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001177 break;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001178 }
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001179 if (insertSWaitInst) {
1180 if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
1181 OldWaitcnt->getOperand(0).setImm(Enc);
Stanislav Mekhanoshinff2763a2018-02-15 22:03:55 +00001182 if (!OldWaitcnt->getParent())
1183 MI.getParent()->insert(MI, OldWaitcnt);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001184
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001185 DEBUG(dbgs() << "updateWaitcntInBlock\n"
1186 << "Old Instr: " << MI << '\n'
1187 << "New Instr: " << *OldWaitcnt << '\n');
1188 } else {
1189 auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1190 MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1191 .addImm(Enc);
1192 TrackedWaitcntSet.insert(SWaitInst);
1193
1194 DEBUG(dbgs() << "insertWaitcntInBlock\n"
1195 << "Old Instr: " << MI << '\n'
1196 << "New Instr: " << *SWaitInst << '\n');
1197 }
1198 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001199
1200 if (CntVal[EXP_CNT] == 0) {
1201 ScoreBrackets->setMixedExpTypes(false);
1202 }
1203 }
1204 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001205}
1206
1207void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
1208 MachineInstr *Waitcnt) {
1209 if (MBB.empty()) {
1210 MBB.push_back(Waitcnt);
1211 return;
1212 }
1213
1214 MachineBasicBlock::iterator It = MBB.end();
1215 MachineInstr *MI = &*(--It);
1216 if (MI->isBranch()) {
1217 MBB.insert(It, Waitcnt);
1218 } else {
1219 MBB.push_back(Waitcnt);
1220 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001221}
1222
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001223// This is a flat memory operation. Check to see if it has memory
1224// tokens for both LDS and Memory, and if so mark it as a flat.
1225bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1226 if (MI.memoperands_empty())
1227 return true;
1228
1229 for (const MachineMemOperand *Memop : MI.memoperands()) {
1230 unsigned AS = Memop->getAddrSpace();
1231 if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
1232 return true;
1233 }
1234
1235 return false;
1236}
1237
Kannan Narayananacb089e2017-04-12 03:25:12 +00001238void SIInsertWaitcnts::updateEventWaitCntAfter(
1239 MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
1240 // Now look at the instruction opcode. If it is a memory access
1241 // instruction, update the upper-bound of the appropriate counter's
1242 // bracket and the destination operand scores.
1243 // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001244 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001245 if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001246 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1247 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1248 } else {
1249 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1250 }
1251 } else if (TII->isFLAT(Inst)) {
1252 assert(Inst.mayLoad() || Inst.mayStore());
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001253
1254 if (TII->usesVM_CNT(Inst))
1255 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1256
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001257 if (TII->usesLGKM_CNT(Inst)) {
Matt Arsenault6ab9ea92017-07-21 18:34:51 +00001258 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001259
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001260 // This is a flat memory operation, so note it - it will require
1261 // that both the VM and LGKM be flushed to zero if it is pending when
1262 // a VM or LGKM dependency occurs.
1263 if (mayAccessLDSThroughFlat(Inst))
1264 ScoreBrackets->setPendingFlat();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001265 }
1266 } else if (SIInstrInfo::isVMEM(Inst) &&
1267 // TODO: get a better carve out.
1268 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1269 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1270 Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1271 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1272 if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
Mark Searles11d0a042017-05-31 16:44:23 +00001273 (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001274 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1275 }
1276 } else if (TII->isSMRD(Inst)) {
1277 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1278 } else {
1279 switch (Inst.getOpcode()) {
1280 case AMDGPU::S_SENDMSG:
1281 case AMDGPU::S_SENDMSGHALT:
1282 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1283 break;
1284 case AMDGPU::EXP:
1285 case AMDGPU::EXP_DONE: {
1286 int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1287 if (Imm >= 32 && Imm <= 63)
1288 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1289 else if (Imm >= 12 && Imm <= 15)
1290 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1291 else
1292 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1293 break;
1294 }
1295 case AMDGPU::S_MEMTIME:
1296 case AMDGPU::S_MEMREALTIME:
1297 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1298 break;
1299 default:
1300 break;
1301 }
1302 }
1303}
1304
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001305// Merge the score brackets of the Block's predecessors;
1306// this merged score bracket is used when adding waitcnts to the Block
Kannan Narayananacb089e2017-04-12 03:25:12 +00001307void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1308 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1309 int32_t MaxPending[NUM_INST_CNTS] = {0};
1310 int32_t MaxFlat[NUM_INST_CNTS] = {0};
1311 bool MixedExpTypes = false;
1312
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001313 // For single basic block loops, we need to retain the Block's
1314 // score bracket to have accurate Pred info. So, make a copy of Block's
1315 // score bracket, clear() it (which retains several important bits of info),
1316 // populate, and then replace en masse. For non-single basic block loops,
1317 // just clear Block's current score bracket and repopulate in-place.
1318 bool IsSelfPred;
1319 std::unique_ptr<BlockWaitcntBrackets> S;
1320
1321 IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
1322 != Block.pred_end();
1323 if (IsSelfPred) {
1324 S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1325 ScoreBrackets = S.get();
1326 }
1327
Kannan Narayananacb089e2017-04-12 03:25:12 +00001328 ScoreBrackets->clear();
1329
Kannan Narayananacb089e2017-04-12 03:25:12 +00001330 // See if there are any uninitialized predecessors. If so, emit an
1331 // s_waitcnt 0 at the beginning of the block.
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001332 for (MachineBasicBlock *Pred : Block.predecessors()) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001333 BlockWaitcntBrackets *PredScoreBrackets =
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001334 BlockWaitcntBracketsMap[Pred].get();
1335 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001336 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001337 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001338 }
1339 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1340 T = (enum InstCounterType)(T + 1)) {
1341 int span =
1342 PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1343 MaxPending[T] = std::max(MaxPending[T], span);
1344 span =
1345 PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1346 MaxFlat[T] = std::max(MaxFlat[T], span);
1347 }
1348
1349 MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
1350 }
1351
1352 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1353 // Also handle kills for exit block.
1354 if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1355 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1356 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1357 T = (enum InstCounterType)(T + 1)) {
1358 int Span = KillWaitBrackets[I]->getScoreUB(T) -
1359 KillWaitBrackets[I]->getScoreLB(T);
1360 MaxPending[T] = std::max(MaxPending[T], Span);
1361 Span = KillWaitBrackets[I]->pendingFlat(T) -
1362 KillWaitBrackets[I]->getScoreLB(T);
1363 MaxFlat[T] = std::max(MaxFlat[T], Span);
1364 }
1365
1366 MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
1367 }
1368 }
1369
1370 // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
1371 for (MachineBasicBlock *Pred : Block.predecessors()) {
1372 BlockWaitcntBrackets *PredScoreBrackets =
1373 BlockWaitcntBracketsMap[Pred].get();
Mark Searles24c92ee2018-02-07 02:21:21 +00001374 bool Visited = BlockVisitedSet.count(Pred);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001375 if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001376 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001377 }
1378
1379 int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
1380 PredScoreBrackets->getScoreLB(EXP_CNT);
1381 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1382 int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
1383 PredScoreBrackets->getScoreLB(EXP_CNT);
1384 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1385 }
1386
1387 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1388 if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1389 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1390 int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
1391 KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1392 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1393 int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
1394 KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1395 MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1396 }
1397 }
1398
1399#if 0
1400 // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
1401 // TODO: how does LC distinguish between function entry and main entry?
1402 // If this is the entry to a function, force a wait.
1403 MachineBasicBlock &Entry = Block.getParent()->front();
1404 if (Entry.getNumber() == Block.getNumber()) {
1405 ScoreBrackets->setWaitAtBeginning();
1406 return;
1407 }
1408#endif
1409
1410 // Now set the current Block's brackets to the largest ending bracket.
1411 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1412 T = (enum InstCounterType)(T + 1)) {
1413 ScoreBrackets->setScoreUB(T, MaxPending[T]);
1414 ScoreBrackets->setScoreLB(T, 0);
1415 ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1416 }
1417
1418 ScoreBrackets->setMixedExpTypes(MixedExpTypes);
1419
1420 // Set the register scoreboard.
1421 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001422 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001423 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001424 }
1425
1426 BlockWaitcntBrackets *PredScoreBrackets =
1427 BlockWaitcntBracketsMap[Pred].get();
1428
1429 // Now merge the gpr_reg_score information
1430 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1431 T = (enum InstCounterType)(T + 1)) {
1432 int PredLB = PredScoreBrackets->getScoreLB(T);
1433 int PredUB = PredScoreBrackets->getScoreUB(T);
1434 if (PredLB < PredUB) {
1435 int PredScale = MaxPending[T] - PredUB;
1436 // Merge vgpr scores.
1437 for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1438 int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1439 if (PredRegScore <= PredLB)
1440 continue;
1441 int NewRegScore = PredScale + PredRegScore;
1442 ScoreBrackets->setRegScore(
1443 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1444 }
1445 // Also need to merge sgpr scores for lgkm_cnt.
1446 if (T == LGKM_CNT) {
1447 for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1448 int PredRegScore =
1449 PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1450 if (PredRegScore <= PredLB)
1451 continue;
1452 int NewRegScore = PredScale + PredRegScore;
1453 ScoreBrackets->setRegScore(
1454 J + NUM_ALL_VGPRS, LGKM_CNT,
1455 std::max(
1456 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1457 NewRegScore));
1458 }
1459 }
1460 }
1461 }
1462
1463 // Also merge the WaitEvent information.
1464 ForAllWaitEventType(W) {
1465 enum InstCounterType T = PredScoreBrackets->eventCounter(W);
1466 int PredEventUB = PredScoreBrackets->getEventUB(W);
1467 if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
1468 int NewEventUB =
1469 MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
1470 if (NewEventUB > 0) {
1471 ScoreBrackets->setEventUB(
1472 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1473 }
1474 }
1475 }
1476 }
1477
1478 // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1479 // Set the register scoreboard.
1480 if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1481 for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1482 // Now merge the gpr_reg_score information.
1483 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1484 T = (enum InstCounterType)(T + 1)) {
1485 int PredLB = KillWaitBrackets[I]->getScoreLB(T);
1486 int PredUB = KillWaitBrackets[I]->getScoreUB(T);
1487 if (PredLB < PredUB) {
1488 int PredScale = MaxPending[T] - PredUB;
1489 // Merge vgpr scores.
1490 for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
1491 int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
1492 if (PredRegScore <= PredLB)
1493 continue;
1494 int NewRegScore = PredScale + PredRegScore;
1495 ScoreBrackets->setRegScore(
1496 J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1497 }
1498 // Also need to merge sgpr scores for lgkm_cnt.
1499 if (T == LGKM_CNT) {
1500 for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
1501 int PredRegScore =
1502 KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1503 if (PredRegScore <= PredLB)
1504 continue;
1505 int NewRegScore = PredScale + PredRegScore;
1506 ScoreBrackets->setRegScore(
1507 J + NUM_ALL_VGPRS, LGKM_CNT,
1508 std::max(
1509 ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1510 NewRegScore));
1511 }
1512 }
1513 }
1514 }
1515
1516 // Also merge the WaitEvent information.
1517 ForAllWaitEventType(W) {
1518 enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
1519 int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
1520 if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
1521 int NewEventUB =
1522 MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
1523 if (NewEventUB > 0) {
1524 ScoreBrackets->setEventUB(
1525 W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1526 }
1527 }
1528 }
1529 }
1530 }
1531
1532 // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
1533 // sequencing predecessors, because changes to EXEC require waitcnts due to
1534 // the delayed nature of these operations.
1535 for (MachineBasicBlock *Pred : Block.predecessors()) {
Mark Searles24c92ee2018-02-07 02:21:21 +00001536 if (!BlockVisitedSet.count(Pred)) {
Tim Corringham6c6d5e22017-12-04 12:30:49 +00001537 continue;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001538 }
1539
1540 BlockWaitcntBrackets *PredScoreBrackets =
1541 BlockWaitcntBracketsMap[Pred].get();
1542
1543 int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
1544 if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1545 int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
1546 PredScoreBrackets->getScoreUB(EXP_CNT);
1547 if (new_gds_ub > 0) {
1548 ScoreBrackets->setEventUB(
1549 GDS_GPR_LOCK,
1550 std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
1551 }
1552 }
1553 int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
1554 if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1555 int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
1556 PredScoreBrackets->getScoreUB(EXP_CNT);
1557 if (new_exp_ub > 0) {
1558 ScoreBrackets->setEventUB(
1559 EXP_GPR_LOCK,
1560 std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
1561 }
1562 }
1563 }
Mark Searlesc3c02bd2018-03-14 22:04:32 +00001564
1565 // if a single block loop, update the score brackets. Not needed for other
1566 // blocks, as we did this in-place
1567 if (IsSelfPred) {
1568 BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1569 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001570}
1571
Mark Searles1bc6e712018-04-19 15:42:30 +00001572/// Return true if the given basic block is a "bottom" block of a loop. This
1573/// differs from MachineLoop::getBottomBlock in that it works even if the loop
1574/// is discontiguous. This also handles multiple back-edges for the same
1575/// "header" block of a loop.
1576bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1577 const MachineBasicBlock *Block) {
1578 for (MachineBasicBlock *MBB : Loop->blocks()) {
1579 if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1580 return true;
1581 }
1582 }
1583 return false;
1584}
1585
1586/// Count the number of "bottom" basic blocks of a loop.
1587unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1588 unsigned Count = 0;
1589 for (MachineBasicBlock *MBB : Loop->blocks()) {
1590 if (MBB->isSuccessor(Loop->getHeader())) {
1591 Count++;
1592 }
1593 }
1594 return Count;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001595}
1596
1597// Generate s_waitcnt instructions where needed.
1598void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1599 MachineBasicBlock &Block) {
1600 // Initialize the state information.
1601 mergeInputScoreBrackets(Block);
1602
1603 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1604
1605 DEBUG({
Mark Searles94ae3b22018-01-30 17:17:06 +00001606 dbgs() << "Block" << Block.getNumber();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001607 ScoreBrackets->dump();
1608 });
1609
Kannan Narayananacb089e2017-04-12 03:25:12 +00001610 // Walk over the instructions.
1611 for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1612 Iter != E;) {
1613 MachineInstr &Inst = *Iter;
1614 // Remove any previously existing waitcnts.
1615 if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
Mark Searles65207922018-02-19 19:19:59 +00001616 // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
1617 // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
1618 // as needed.
Mark Searles24c92ee2018-02-07 02:21:21 +00001619 if (!TrackedWaitcntSet.count(&Inst))
Kannan Narayananacb089e2017-04-12 03:25:12 +00001620 ++Iter;
1621 else {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001622 ++Iter;
1623 Inst.removeFromParent();
1624 }
Mark Searles65207922018-02-19 19:19:59 +00001625 ScoreBrackets->setWaitcnt(&Inst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001626 continue;
1627 }
1628
1629 // Kill instructions generate a conditional branch to the endmain block.
1630 // Merge the current waitcnt state into the endmain block information.
1631 // TODO: Are there other flavors of KILL instruction?
1632 if (Inst.getOpcode() == AMDGPU::KILL) {
1633 addKillWaitBracket(ScoreBrackets);
1634 }
1635
1636 bool VCCZBugWorkAround = false;
1637 if (readsVCCZ(Inst) &&
Mark Searles24c92ee2018-02-07 02:21:21 +00001638 (!VCCZBugHandledSet.count(&Inst))) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001639 if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1640 ScoreBrackets->getScoreUB(LGKM_CNT) &&
1641 ScoreBrackets->hasPendingSMEM()) {
1642 if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
1643 VCCZBugWorkAround = true;
1644 }
1645 }
1646
1647 // Generate an s_waitcnt instruction to be placed before
1648 // cur_Inst, if needed.
Stanislav Mekhanoshindb39b4b2018-02-08 00:18:35 +00001649 generateSWaitCntInstBefore(Inst, ScoreBrackets);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001650
1651 updateEventWaitCntAfter(Inst, ScoreBrackets);
1652
1653#if 0 // TODO: implement resource type check controlled by options with ub = LB.
1654 // If this instruction generates a S_SETVSKIP because it is an
1655 // indexed resource, and we are on Tahiti, then it will also force
1656 // an S_WAITCNT vmcnt(0)
1657 if (RequireCheckResourceType(Inst, context)) {
1658 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1659 ScoreBrackets->setScoreLB(VM_CNT,
Evgeny Mankovbf975172017-08-16 16:47:29 +00001660 ScoreBrackets->getScoreUB(VM_CNT));
Kannan Narayananacb089e2017-04-12 03:25:12 +00001661 }
1662#endif
1663
1664 ScoreBrackets->clearWaitcnt();
1665
Kannan Narayananacb089e2017-04-12 03:25:12 +00001666 DEBUG({
Mark Searles94ae3b22018-01-30 17:17:06 +00001667 Inst.print(dbgs());
Kannan Narayananacb089e2017-04-12 03:25:12 +00001668 ScoreBrackets->dump();
1669 });
1670
1671 // Check to see if this is a GWS instruction. If so, and if this is CI or
1672 // VI, then the generated code sequence will include an S_WAITCNT 0.
1673 // TODO: Are these the only GWS instructions?
1674 if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1675 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1676 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1677 Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1678 Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1679 // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1680 ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1681 ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1682 ScoreBrackets->updateByWait(LGKM_CNT,
1683 ScoreBrackets->getScoreUB(LGKM_CNT));
1684 }
1685
1686 // TODO: Remove this work-around after fixing the scheduler and enable the
1687 // assert above.
1688 if (VCCZBugWorkAround) {
1689 // Restore the vccz bit. Any time a value is written to vcc, the vcc
1690 // bit is updated, so we can restore the bit by reading the value of
1691 // vcc and then writing it back to the register.
1692 BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1693 AMDGPU::VCC)
1694 .addReg(AMDGPU::VCC);
1695 VCCZBugHandledSet.insert(&Inst);
1696 }
1697
Kannan Narayananacb089e2017-04-12 03:25:12 +00001698 ++Iter;
1699 }
1700
1701 // Check if we need to force convergence at loop footer.
1702 MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
Mark Searles1bc6e712018-04-19 15:42:30 +00001703 if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001704 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1705 WaitcntData->print();
1706 DEBUG(dbgs() << '\n';);
1707
1708 // The iterative waitcnt insertion algorithm aims for optimal waitcnt
1709 // placement and doesn't always guarantee convergence for a loop. Each
1710 // loop should take at most 2 iterations for it to converge naturally.
1711 // When this max is reached and result doesn't converge, we force
1712 // convergence by inserting a s_waitcnt at the end of loop footer.
1713 if (WaitcntData->getIterCnt() > 2) {
1714 // To ensure convergence, need to make wait events at loop footer be no
1715 // more than those from the previous iteration.
Mark Searles65207922018-02-19 19:19:59 +00001716 // As a simplification, instead of tracking individual scores and
1717 // generating the precise wait count, just wait on 0.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001718 bool HasPending = false;
1719 MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
1720 for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1721 T = (enum InstCounterType)(T + 1)) {
1722 if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1723 ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1724 HasPending = true;
1725 }
1726 }
1727
1728 if (HasPending) {
1729 if (!SWaitInst) {
1730 SWaitInst = Block.getParent()->CreateMachineInstr(
1731 TII->get(AMDGPU::S_WAITCNT), DebugLoc());
Mark Searles24c92ee2018-02-07 02:21:21 +00001732 TrackedWaitcntSet.insert(SWaitInst);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001733 const MachineOperand &Op = MachineOperand::CreateImm(0);
1734 SWaitInst->addOperand(MF, Op);
1735#if 0 // TODO: Format the debug output
1736 OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1737 OutputTransformAdd(SWaitInst, context);
1738#endif
1739 }
1740#if 0 // TODO: ??
1741 _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1742#endif
1743 }
1744
1745 if (SWaitInst) {
1746 DEBUG({
1747 SWaitInst->print(dbgs());
1748 dbgs() << "\nAdjusted score board:";
1749 ScoreBrackets->dump();
1750 });
1751
1752 // Add this waitcnt to the block. It is either newly created or
1753 // created in previous iterations and added back since block traversal
Mark Searles65207922018-02-19 19:19:59 +00001754 // always removes waitcnts.
Kannan Narayananacb089e2017-04-12 03:25:12 +00001755 insertWaitcntBeforeCF(Block, SWaitInst);
1756 WaitcntData->setWaitcnt(SWaitInst);
1757 }
1758 }
1759 }
1760}
1761
1762bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1763 ST = &MF.getSubtarget<SISubtarget>();
1764 TII = ST->getInstrInfo();
1765 TRI = &TII->getRegisterInfo();
1766 MRI = &MF.getRegInfo();
1767 MLI = &getAnalysis<MachineLoopInfo>();
1768 IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
Mark Searles11d0a042017-05-31 16:44:23 +00001769 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001770 AMDGPUASI = ST->getAMDGPUAS();
1771
1772 HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1773 HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1774 HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1775
1776 HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1777 HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1778 assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1779 assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1780
1781 RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1782 RegisterEncoding.VGPRL =
1783 RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1784 RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1785 RegisterEncoding.SGPRL =
1786 RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1787
Mark Searles24c92ee2018-02-07 02:21:21 +00001788 TrackedWaitcntSet.clear();
1789 BlockVisitedSet.clear();
1790 VCCZBugHandledSet.clear();
Mark Searles1bc6e712018-04-19 15:42:30 +00001791 LoopWaitcntDataMap.clear();
Mark Searles24c92ee2018-02-07 02:21:21 +00001792
Kannan Narayananacb089e2017-04-12 03:25:12 +00001793 // Walk over the blocks in reverse post-dominator order, inserting
1794 // s_waitcnt where needed.
1795 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1796 bool Modified = false;
1797 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1798 I = RPOT.begin(),
1799 E = RPOT.end(), J = RPOT.begin();
1800 I != E;) {
1801 MachineBasicBlock &MBB = **I;
1802
1803 BlockVisitedSet.insert(&MBB);
1804
1805 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1806 if (!ScoreBrackets) {
Eugene Zelenko59e12822017-08-08 00:47:13 +00001807 BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001808 ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1809 }
1810 ScoreBrackets->setPostOrder(MBB.getNumber());
1811 MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1812 if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
Eugene Zelenko59e12822017-08-08 00:47:13 +00001813 LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001814
1815 // If we are walking into the block from before the loop, then guarantee
1816 // at least 1 re-walk over the loop to propagate the information, even if
1817 // no S_WAITCNT instructions were generated.
Mark Searles1bc6e712018-04-19 15:42:30 +00001818 if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1819 unsigned Count = countNumBottomBlocks(ContainingLoop);
1820
1821 // If the loop has multiple back-edges, and so more than one "bottom"
1822 // basic block, we have to guarantee a re-walk over every blocks.
1823 if ((std::count(BlockWaitcntProcessedSet.begin(),
1824 BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
1825 BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
1826 DEBUG(dbgs() << "set-revisit: Block"
1827 << ContainingLoop->getHeader()->getNumber() << '\n';);
1828 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001829 }
1830
1831 // Walk over the instructions.
1832 insertWaitcntInBlock(MF, MBB);
1833
1834 // Flag that waitcnts have been processed at least once.
Mark Searles1bc6e712018-04-19 15:42:30 +00001835 BlockWaitcntProcessedSet.push_back(&MBB);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001836
Mark Searles1bc6e712018-04-19 15:42:30 +00001837 // See if we want to revisit the loop. If a loop has multiple back-edges,
1838 // we shouldn't revisit the same "bottom" basic block.
1839 if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1840 std::count(BlockWaitcntProcessedSet.begin(),
1841 BlockWaitcntProcessedSet.end(), &MBB) == 1) {
Kannan Narayanan5e73b042017-05-05 21:10:17 +00001842 MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
Kannan Narayananacb089e2017-04-12 03:25:12 +00001843 BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1844 if (EntrySB && EntrySB->getRevisitLoop()) {
1845 EntrySB->setRevisitLoop(false);
1846 J = I;
1847 int32_t PostOrder = EntrySB->getPostOrder();
1848 // TODO: Avoid this loop. Find another way to set I.
1849 for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1850 X = RPOT.begin(),
1851 Y = RPOT.end();
1852 X != Y; ++X) {
1853 MachineBasicBlock &MBBX = **X;
1854 if (MBBX.getNumber() == PostOrder) {
1855 I = X;
1856 break;
1857 }
1858 }
1859 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1860 WaitcntData->incIterCnt();
Mark Searles65207922018-02-19 19:19:59 +00001861 DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
Kannan Narayananacb089e2017-04-12 03:25:12 +00001862 continue;
1863 } else {
1864 LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1865 // Loop converged, reset iteration count. If this loop gets revisited,
1866 // it must be from an outer loop, the counter will restart, this will
1867 // ensure we don't force convergence on such revisits.
1868 WaitcntData->resetIterCnt();
1869 }
1870 }
1871
1872 J = I;
1873 ++I;
1874 }
1875
1876 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1877
1878 bool HaveScalarStores = false;
1879
1880 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1881 ++BI) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001882 MachineBasicBlock &MBB = *BI;
1883
1884 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1885 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001886 if (!HaveScalarStores && TII->isScalarStore(*I))
1887 HaveScalarStores = true;
1888
1889 if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1890 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1891 EndPgmBlocks.push_back(&MBB);
1892 }
1893 }
1894
1895 if (HaveScalarStores) {
1896 // If scalar writes are used, the cache must be flushed or else the next
1897 // wave to reuse the same scratch memory can be clobbered.
1898 //
1899 // Insert s_dcache_wb at wave termination points if there were any scalar
1900 // stores, and only if the cache hasn't already been flushed. This could be
1901 // improved by looking across blocks for flushes in postdominating blocks
1902 // from the stores but an explicitly requested flush is probably very rare.
1903 for (MachineBasicBlock *MBB : EndPgmBlocks) {
1904 bool SeenDCacheWB = false;
1905
1906 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1907 ++I) {
Kannan Narayananacb089e2017-04-12 03:25:12 +00001908 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1909 SeenDCacheWB = true;
1910 else if (TII->isScalarStore(*I))
1911 SeenDCacheWB = false;
1912
1913 // FIXME: It would be better to insert this before a waitcnt if any.
1914 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1915 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1916 !SeenDCacheWB) {
1917 Modified = true;
1918 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1919 }
1920 }
1921 }
1922 }
1923
Mark Searles11d0a042017-05-31 16:44:23 +00001924 if (!MFI->isEntryFunction()) {
1925 // Wait for any outstanding memory operations that the input registers may
Hiroshi Inouec8e92452018-01-29 05:17:03 +00001926 // depend on. We can't track them and it's better to the wait after the
Mark Searles11d0a042017-05-31 16:44:23 +00001927 // costly call sequence.
1928
1929 // TODO: Could insert earlier and schedule more liberally with operations
1930 // that only use caller preserved registers.
1931 MachineBasicBlock &EntryBB = MF.front();
1932 BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1933 .addImm(0);
1934
1935 Modified = true;
1936 }
1937
Kannan Narayananacb089e2017-04-12 03:25:12 +00001938 return Modified;
1939}