//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

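// In debug builds, the counters below can be driven from the command line via
// the generic -debug-counter option; setForceEmitWaitcnt() translates them
// into forced waitcnt emission.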
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);

namespace {

template <typename EnumT>
class enum_iterator
    : public iterator_facade_base<enum_iterator<EnumT>,
                                  std::forward_iterator_tag, const EnumT> {
  EnumT Value;
public:
  enum_iterator() = default;
  enum_iterator(EnumT Value) : Value(Value) {}

  enum_iterator &operator++() {
    Value = static_cast<EnumT>(Value + 1);
    return *this;
  }

  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }

  EnumT operator*() const { return Value; }
};

// Class of object that encapsulates the latest instruction counter score
// associated with an operand. Used for determining whether an s_waitcnt
// instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
  return make_range(enum_iterator<InstCounterType>(VM_CNT),
                    enum_iterator<InstCounterType>(NUM_INST_CNTS));
}

using RegInterval = std::pair<signed, signed>;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;
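// Note: these limits are filled in from the subtarget's ISA version when the
// pass runs. As a rough illustration only (exact values vary by target),
// gfx9 encodes vmcnt in 6 bits (max 63), expcnt in 3 bits (max 7), and
// lgkmcnt in 4 bits (max 15).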

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;
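// First (VGPR0/SGPR0) and last (VGPRL/SGPRL) hardware encoding values of the
// register files; getRegInterval() uses them to translate register operands
// into scoreboard slots.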

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
  (1 << VMEM_ACCESS),
  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
      (1 << SQ_MESSAGE),
  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
      (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
};

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // Placeholder slot used to track LDS accesses
                          // (offset from SQ_MAX_PGM_VGPRS).
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPRs start.
};

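// Tighten Wait so that counter T waits until at most Count operations remain
// outstanding. std::min is correct here because a smaller s_waitcnt operand
// is the stronger wait.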
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  switch (T) {
  case VM_CNT:
    Wait.VmCnt = std::min(Wait.VmCnt, Count);
    break;
  case EXP_CNT:
    Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
    break;
  case LGKM_CNT:
    Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
    break;
  default:
    llvm_unreachable("bad InstCounterType");
  }
}

// This is a per-basic-block object that maintains the current score bracket
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also track which event types are pending for each counter, so we know
// whether multiple types of events are in flight within a bracket. When they
// are, the counter may be decremented out of order, and we must conservatively
// put in an "s_waitcnt 0" before a dependent use.
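// For example, if an SMEM load and an LDS read are both in flight on
// lgkm_cnt, individual decrements can no longer be attributed to a specific
// instruction, so a dependent use must conservatively wait for lgkmcnt(0).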
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    for (auto T : inst_counter_types())
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
  }

  ~BlockWaitcntBrackets() = default;

  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
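    // Clamp the bracket width for EXP_CNT: once the bracket would span more
    // than getWaitCountMax(EXP_CNT) scores, advance the lower bound along
    // with the upper bound.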
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }

  int32_t getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  int32_t getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    if (E == VMEM_ACCESS)
      return VM_CNT;
    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
      return LGKM_CNT;
    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
    return EXP_CNT;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == LGKM_CNT);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    PendingEvents = 0;
    memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
    for (auto T : inst_counter_types())
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  bool counterOutOfOrder(InstCounterType T) const;
  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, int ScoreToWait,
                     AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  bool hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }

  void mergePendingEvents(const BlockWaitcntBrackets &Other);

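  // True if the most recent flat memory operation still lies within the
  // current bracket of either the LGKM or the VM counter, i.e. it has not
  // provably completed yet.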
  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const GCNSubtarget *ST = nullptr;
  bool RevisitLoop = false;
  int32_t PostOrder = 0;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  uint32_t PendingEvents = 0;
  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};

// This is a per-loop-region object that records the waitcnt status at the end
// of the loop footer from the previous iteration. We also maintain an
// iteration count to track the number of times the loop has been visited.
// When it doesn't converge naturally, we force convergence by inserting
// s_waitcnt 0 at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }

private:
  // The s_waitcnt instruction added at the end of the loop footer to
  // stabilize the wait scores there.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of times the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaVersion IV;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  // Force all waitcnt instructions to be emitted as s_waitcnt 0 when the
  // amdgpu-waitcnt-forcezero flag is set.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool isForceEmitWaitcnt() const {
    for (auto T : inst_counter_types())
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
};

} // end anonymous namespace

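// getRegInterval maps a register operand to the half-open range of scoreboard
// slots it occupies. As a hypothetical illustration (actual encodings come
// from RegisterEncoding): a 64-bit pair such as v[4:5] yields {4, 6}, and
// s[2:3] yields {NUM_ALL_VGPRS + 2, NUM_ALL_VGPRS + 4}.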
RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a partial-write (PW) operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}

void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  LLVM_DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not.
  // Examples include vm_cnt for buffer-store and lgkm_cnt for send-message.
  if (!hasPendingEvent(E)) {
    if (PendingEvents & WaitEventMaskForInst[T])
      MixedPendingEvents[T] = true;
    PendingEvents |= 1 << E;
  }
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (auto T : inst_counter_types()) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
bool BlockWaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  // Bitwise | (not ||) so that all three counts are simplified.
  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
         simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
         simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
}

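// A sketch of the logic: with a bracket of [LB=2, UB=7] (five operations
// still outstanding), a requested count of 3 is meaningful (7 - 3 > 2) and is
// kept (returning true), while a count of 6 is already implied by the bracket
// and is reset to ~0u, meaning "no wait required".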
bool BlockWaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                           unsigned &Count) const {
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if (Count < (unsigned)UB && UB - (int32_t)Count > LB)
    return true;

  Count = ~0u;
  return false;
}

void BlockWaitcntBrackets::determineWait(InstCounterType T, int ScoreToWait,
                                         AMDGPU::Waitcnt &Wait) const {
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_waitcnt with a conservative
    // value of 0 for the counter.
    addWait(Wait, T, 0);
    return;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out of order when there are multiple
      // types of events in the bracket. Also emit an s_waitcnt with a
      // conservative value of 0 for the counter.
      addWait(Wait, T, 0);
    } else {
      // Wait only for the events issued after the one we depend on.
      addWait(Wait, T, UB - ScoreToWait);
    }
  }
}

void BlockWaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(VM_CNT, Wait.VmCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
}

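// Update the bracket to reflect an executed s_waitcnt: a count of 0 fully
// resolves counter T and clears its pending events, while a non-zero count
// can only advance the lower bound when T is known to retire in order.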
void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const int32_t UB = getScoreUB(T);
  if (Count >= (unsigned)UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - (int32_t)Count));
  } else {
    setScoreLB(T, UB);
    MixedPendingEvents[T] = false;
    PendingEvents &= ~WaitEventMaskForInst[T];
  }
}

void BlockWaitcntBrackets::mergePendingEvents(const BlockWaitcntBrackets &Other) {
  for (auto T : inst_counter_types()) {
    uint32_t Old = PendingEvents & WaitEventMaskForInst[T];
    uint32_t New = Other.PendingEvents & WaitEventMaskForInst[T];
    if (Other.MixedPendingEvents[T] || (Old && New && Old != New))
      MixedPendingEvents[T] = true;
    PendingEvents |= New;
  }
}

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always return out of order.
  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
    return true;
  return MixedPendingEvents[T];
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// Generate the s_waitcnt instruction, if any, to be placed before MI.
/// Instructions of a given type are returned in order, but instructions of
/// different types can complete out of order. We rely on this in-order
/// completion and simply assign a score to each memory access instruction.
/// We keep track of the active "score bracket" to determine whether a memory
/// access requires an s_waitcnt, and if so, what the value of each counter
/// should be. The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets,
    MachineInstr *OldWaitcntInstr) {
  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr())
    return;

  AMDGPU::Waitcnt Wait;

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    Wait.VmCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //       with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    Wait = AMDGPU::Waitcnt::allZero();
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    Wait.VmCnt = 0;
  }
#if 0 // TODO: enable the following blocks of logic when we have fences.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
        context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
        (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
              context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
                               ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                               ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets->hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets->hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets->hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets->hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
          ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      ScoreBrackets->determineWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If a destination operand was used by a recent export/store
    // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        ScoreBrackets->determineWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        ScoreBrackets->determineWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
          ScoreBrackets->determineWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    Wait = AMDGPU::Waitcnt::allZero();
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
      Wait.LgkmCnt = 0;
    }
  }

  // Early-out if no wait is indicated.
  if (!ScoreBrackets->simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
    if (OldWaitcntInstr) {
      if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
        TrackedWaitcntSet.erase(OldWaitcntInstr);
        OldWaitcntInstr->eraseFromParent();
      } else {
        int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
        ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
      }
    }
    return;
  }

  if (ForceEmitZeroWaitcnts)
    Wait = AMDGPU::Waitcnt::allZero();

  if (ForceEmitWaitcnt[VM_CNT])
    Wait.VmCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[LGKM_CNT])
    Wait.LgkmCnt = 0;

  ScoreBrackets->applyWaitcnt(Wait);

  AMDGPU::Waitcnt OldWait;
  if (OldWaitcntInstr) {
    OldWait =
        AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
  }
  if (OldWait.dominates(Wait))
    return;

  MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
  if (ContainingLoop) {
    MachineBasicBlock *TBB = ContainingLoop->getHeader();
    BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    if (!ScoreBracket) {
      assert(!BlockVisitedSet.count(TBB));
      BlockWaitcntBracketsMap[TBB] =
          llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    }
    ScoreBracket->setRevisitLoop(true);
    LLVM_DEBUG(dbgs() << "set-revisit2: Block"
                      << ContainingLoop->getHeader()->getNumber() << '\n';);
  }

  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
    Wait = Wait.combined(OldWait);

  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
  if (OldWaitcntInstr) {
    OldWaitcntInstr->getOperand(0).setImm(Enc);

    LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *OldWaitcntInstr << '\n');
  } else {
    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                         .addImm(Enc);
    TrackedWaitcntSet.insert(SWaitInst);

    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *SWaitInst << '\n');
  }
}

void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }
}

// This is a flat memory operation. Check to see if it could access LDS:
// conservatively return true if there are no memory operands, or if any
// operand is in the LDS or flat address space.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }

  return false;
}

void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}

// Merge the score brackets of the Block's predecessors;
// this merged score bracket is used when adding waitcnts to the Block.
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
  int32_t MaxPending[NUM_INST_CNTS] = {0};
  int32_t MaxFlat[NUM_INST_CNTS] = {0};

  // For single basic block loops, we need to retain the Block's
  // score bracket to have accurate Pred info. So, make a copy of Block's
  // score bracket, clear() it (which retains several important bits of info),
  // populate, and then replace en masse. For non-single basic block loops,
  // just clear Block's current score bracket and repopulate in-place.
  bool IsSelfPred;
  std::unique_ptr<BlockWaitcntBrackets> S;

  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
                   != Block.pred_end();
  if (IsSelfPred) {
    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    ScoreBrackets = S.get();
  }

  ScoreBrackets->clear();
1257
  // Find the largest number of pending events per counter over all visited
  // predecessors. Predecessors that have not been visited yet (e.g. reached
  // only through a loop back-edge on the first walk) are skipped here.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    if (!Visited)
      continue;
    for (auto T : inst_counter_types()) {
      int span =
          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
      MaxPending[T] = std::max(MaxPending[T], span);
      span =
          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
      MaxFlat[T] = std::max(MaxFlat[T], span);
    }
  }

  // Now set the current Block's brackets to the largest ending bracket.
  for (auto T : inst_counter_types()) {
    ScoreBrackets->setScoreUB(T, MaxPending[T]);
    ScoreBrackets->setScoreLB(T, 0);
    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
  }

  // Set the register scoreboard.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    // Now merge the gpr_reg_score information.
    for (auto T : inst_counter_types()) {
      int PredLB = PredScoreBrackets->getScoreLB(T);
      int PredUB = PredScoreBrackets->getScoreUB(T);
      if (PredLB < PredUB) {
        int PredScale = MaxPending[T] - PredUB;
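        // Rebase this predecessor's scores into the merged bracket, which
        // spans [0, MaxPending[T]]: a predecessor whose own bracket ends at
        // PredUB has each surviving score shifted up by MaxPending[T] -
        // PredUB. E.g. with preds A (3 pending vmcnt events) and B (1), the
        // merged UB is 3 and B's register scores are shifted up by 2.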
        // Merge vgpr scores.
        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
          int PredRegScore = PredScoreBrackets->getRegScore(J, T);
          if (PredRegScore <= PredLB)
            continue;
          int NewRegScore = PredScale + PredRegScore;
          ScoreBrackets->setRegScore(
              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
        }
        // Also need to merge sgpr scores for lgkm_cnt.
        if (T == LGKM_CNT) {
          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
            int PredRegScore =
                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
            if (PredRegScore <= PredLB)
              continue;
            int NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J + NUM_ALL_VGPRS, LGKM_CNT,
                std::max(
                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                    NewRegScore));
          }
        }
      }
    }

    ScoreBrackets->mergePendingEvents(*PredScoreBrackets);
  }

  // If this is a single basic block loop, replace the Block's score brackets
  // with the merged copy. This is not needed for other blocks, since the
  // merge was done in-place.
  if (IsSelfPred) {
    BlockWaitcntBracketsMap[&Block] =
        llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
  }
}

/// Return true if the given basic block is a "bottom" block of a loop.
/// This works even if the loop is discontiguous. This also handles
/// multiple back-edges for the same "header" block of a loop.
bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
                                    const MachineBasicBlock *Block) {
  for (MachineBasicBlock *MBB : Loop->blocks()) {
    if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
      return true;
    }
  }
  return false;
}

/// Count the number of "bottom" basic blocks of a loop.
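/// For example, in a loop {H, A, B} where both A and B branch back to the
/// header H, both A and B are bottom blocks, so the count is 2.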
1349unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1350 unsigned Count = 0;
1351 for (MachineBasicBlock *MBB : Loop->blocks()) {
1352 if (MBB->isSuccessor(Loop->getHeader())) {
1353 Count++;
1354 }
1355 }
1356 return Count;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001357}
1358
1359// Generate s_waitcnt instructions where needed.
1360void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1361 MachineBasicBlock &Block) {
1362 // Initialize the state information.
1363 mergeInputScoreBrackets(Block);
1364
1365 BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1366
Nicola Zaghend34e60c2018-05-14 12:53:11 +00001367 LLVM_DEBUG({
Mark Searlesec581832018-04-25 19:21:26 +00001368 dbgs() << "*** Block" << Block.getNumber() << " ***";
Kannan Narayananacb089e2017-04-12 03:25:12 +00001369 ScoreBrackets->dump();
1370 });
1371
Kannan Narayananacb089e2017-04-12 03:25:12 +00001372 // Walk over the instructions.
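  // OldWaitcntInstr remembers the most recent s_waitcnt we walked past, so
  // that generateWaitcntInstBefore can presumably fold any newly required
  // wait into it rather than emit a redundant second s_waitcnt.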
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Coalesce runs of existing waitcnts: of two adjacent s_waitcnt
    // instructions, one generated by this pass on an earlier iteration is
    // removed, while pre-existing waitcnts are preserved and their effect is
    // applied to the score brackets.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      if (OldWaitcntInstr) {
        if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
          TrackedWaitcntSet.erase(OldWaitcntInstr);
          OldWaitcntInstr->eraseFromParent();
          OldWaitcntInstr = nullptr;
        } else if (!TrackedWaitcntSet.count(&Inst)) {
          // Two successive s_waitcnt's, both of which are pre-existing and
          // are therefore preserved.
          int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
          ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
        } else {
          ++Iter;
          Inst.eraseFromParent();
          continue;
        }
      }

      OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

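    // There is a hardware bug on SI/CI: reading vccz while an outstanding
    // SMEM load is still writing vcc can yield a stale value. Detect that
    // case here; the fix-up (rewriting vcc to itself after the wait) is
    // emitted further down.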
    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) && !VCCZBugHandledSet.count(&Inst)) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
    OldWaitcntInstr = nullptr;

    updateEventWaitcntAfter(Inst, ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates an S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0).
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score to be as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets->dump();
    });

    // Check to see if this is a GWS instruction. If so, and if this is CI or
    // VI, then the generated code sequence will include an S_WAITCNT 0.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero());
    }

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit. Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    ++Iter;
  }

  // Check if we need to force convergence at the loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    LLVM_DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't guarantee convergence for a loop. Each loop
    // should take at most (n+1) iterations to converge naturally, where n is
    // the number of bottom blocks. If this threshold is reached and the
    // result hasn't converged, force convergence by inserting an s_waitcnt at
    // the end of the loop footer.
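    // For example, a loop with two back-edges (two bottom blocks) gets up to
    // three natural walks before a forced wait is inserted.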
    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
      // To ensure convergence, the wait events at the loop footer must be no
      // stronger than those of the previous iteration. As a simplification,
      // instead of tracking individual scores and generating the precise
      // wait count, just wait on 0.
      bool HasPending = false;
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      for (auto T : inst_counter_types()) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
          break;
        }
      }

      if (HasPending) {
        if (!SWaitInst) {
          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                          .addImm(0);
          TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        LLVM_DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or
        // created in previous iterations and added back since block traversal
        // always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

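  // The scoreboard indexes registers by hardware encoding: VGPRs occupy
  // [VGPR0, VGPRL], and SGPRs (tracked only for lgkmcnt) follow at an
  // offset of NUM_ALL_VGPRS.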
  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

  TrackedWaitcntSet.clear();
  BlockVisitedSet.clear();
  VCCZBugHandledSet.clear();
  LoopWaitcntDataMap.clear();
  BlockWaitcntProcessedSet.clear();

  // Walk over the blocks in reverse post order, inserting
  // s_waitcnt where needed.
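  // Reverse post order visits a block only after all of its predecessors
  // (back-edges aside) have been visited, so the merged input score brackets
  // are available when each block is processed; back-edges are handled by
  // the loop-revisit logic below.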
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
           I = RPOT.begin(),
           E = RPOT.end(), J = RPOT.begin();
       I != E;) {
    MachineBasicBlock &MBB = **I;

    BlockVisitedSet.insert(&MBB);

    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    if (!ScoreBrackets) {
      BlockWaitcntBracketsMap[&MBB] =
          llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    }
    ScoreBrackets->setPostOrder(MBB.getNumber());
    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();

    // If we are walking into the block from before the loop, then guarantee
    // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
      unsigned Count = countNumBottomBlocks(ContainingLoop);

      // If the loop has multiple back-edges, and so more than one "bottom"
      // basic block, we have to guarantee a re-walk over every block.
      if ((std::count(BlockWaitcntProcessedSet.begin(),
                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
      }
    }

    // Walk over the instructions.
    insertWaitcntInBlock(MF, MBB);

    // Record that waitcnts have been processed at least once for this block.
    BlockWaitcntProcessedSet.push_back(&MBB);

    // See if we want to revisit the loop. If a loop has multiple back-edges,
    // we shouldn't revisit the same "bottom" basic block.
    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
        std::count(BlockWaitcntProcessedSet.begin(),
                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
      if (EntrySB && EntrySB->getRevisitLoop()) {
        EntrySB->setRevisitLoop(false);
        J = I;
        int32_t PostOrder = EntrySB->getPostOrder();
        // TODO: Avoid this loop. Find another way to set I.
        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
                 X = RPOT.begin(),
                 Y = RPOT.end();
             X != Y; ++X) {
          MachineBasicBlock &MBBX = **X;
          if (MBBX.getNumber() == PostOrder) {
            I = X;
            break;
          }
        }
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        WaitcntData->incIterCnt();
        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
        continue;
      } else {
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // The loop converged, so reset the iteration count. If this loop gets
        // revisited, it must be from an outer loop; the counter will restart,
        // which ensures we don't force convergence on such revisits.
        WaitcntData->resetIterCnt();
      }
    }

    J = I;
    ++I;
  }

  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could
    // be improved by looking across blocks for flushes in postdominating
    // blocks from the stores, but an explicitly requested flush is probably
    // very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
           I != E; ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them, and it's better to do the wait after
    // the costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
            TII->get(AMDGPU::S_WAITCNT))
        .addImm(0);

    Modified = true;
  }

  return Modified;
}