blob: bf553771d75e21500bd82e0573e83029b1132c64 [file] [log] [blame]
Eugene Zelenko59e12822017-08-08 00:47:13 +00001//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
Kannan Narayananacb089e2017-04-12 03:25:12 +00002//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
Adrian Prantl5f8f34e42018-05-01 15:54:18 +000011/// Insert wait instructions for memory reads and writes.
Kannan Narayananacb089e2017-04-12 03:25:12 +000012///
13/// Memory reads and writes are issued asynchronously, so we need to insert
14/// S_WAITCNT instructions when we want to access any of their results or
15/// overwrite any register that's used asynchronously.
Nicolai Haehnled1f45da2018-11-29 11:06:14 +000016///
17/// TODO: This pass currently keeps one timeline per hardware counter. A more
18/// finely-grained approach that keeps one timeline per event type could
19/// sometimes get away with generating weaker s_waitcnt instructions. For
20/// example, when both SMEM and LDS are in flight and we need to wait for
21/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
22/// but the pass will currently generate a conservative lgkmcnt(0) because
23/// multiple event types are in flight.
Kannan Narayananacb089e2017-04-12 03:25:12 +000024//
25//===----------------------------------------------------------------------===//
26
27#include "AMDGPU.h"
28#include "AMDGPUSubtarget.h"
29#include "SIDefines.h"
30#include "SIInstrInfo.h"
31#include "SIMachineFunctionInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000032#include "SIRegisterInfo.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000033#include "Utils/AMDGPUBaseInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000034#include "llvm/ADT/DenseMap.h"
35#include "llvm/ADT/DenseSet.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000036#include "llvm/ADT/PostOrderIterator.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000037#include "llvm/ADT/STLExtras.h"
38#include "llvm/ADT/SmallVector.h"
39#include "llvm/CodeGen/MachineBasicBlock.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000040#include "llvm/CodeGen/MachineFunction.h"
41#include "llvm/CodeGen/MachineFunctionPass.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000042#include "llvm/CodeGen/MachineInstr.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000043#include "llvm/CodeGen/MachineInstrBuilder.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000044#include "llvm/CodeGen/MachineLoopInfo.h"
45#include "llvm/CodeGen/MachineMemOperand.h"
46#include "llvm/CodeGen/MachineOperand.h"
Kannan Narayananacb089e2017-04-12 03:25:12 +000047#include "llvm/CodeGen/MachineRegisterInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000048#include "llvm/IR/DebugLoc.h"
49#include "llvm/Pass.h"
50#include "llvm/Support/Debug.h"
Mark Searlesec581832018-04-25 19:21:26 +000051#include "llvm/Support/DebugCounter.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000052#include "llvm/Support/ErrorHandling.h"
53#include "llvm/Support/raw_ostream.h"
54#include <algorithm>
55#include <cassert>
56#include <cstdint>
57#include <cstring>
58#include <memory>
59#include <utility>
60#include <vector>
Kannan Narayananacb089e2017-04-12 03:25:12 +000061
using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

// Debug counters that force emission of a maximal wait for one specific
// hardware counter; useful for bisecting waitcnt-related miscompiles.
DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

// Command-line escape hatch: emit every waitcnt as a full "wait for
// everything" barrier. NOTE(review): declared as cl::opt<unsigned> with
// init(0) but apparently consumed as a boolean flag — confirm whether a
// cl::opt<bool> was intended.
static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);
Kannan Narayananacb089e2017-04-12 03:25:12 +000077
78namespace {
79
Nicolai Haehnleae369d72018-11-29 11:06:11 +000080template <typename EnumT>
81class enum_iterator
82 : public iterator_facade_base<enum_iterator<EnumT>,
83 std::forward_iterator_tag, const EnumT> {
84 EnumT Value;
85public:
86 enum_iterator() = default;
87 enum_iterator(EnumT Value) : Value(Value) {}
88
89 enum_iterator &operator++() {
90 Value = static_cast<EnumT>(Value + 1);
91 return *this;
92 }
93
94 bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
95
96 EnumT operator*() const { return Value; }
97};
98
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
// s_waitcnt instruction needs to be emitted.

// Bitmask with only the bit for counter type `t` set.
#define CNT_MASK(t) (1u << (t))

// The three hardware wait counters tracked by this pass. NUM_INST_CNTS is a
// sentinel used both as an array size and as the iteration end marker of
// inst_counter_types().
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
106
Nicolai Haehnleae369d72018-11-29 11:06:11 +0000107iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
108 return make_range(enum_iterator<InstCounterType>(VM_CNT),
109 enum_iterator<InstCounterType>(NUM_INST_CNTS));
110}
111
// A register interval as a half-open [first, second) pair of scoreboard
// indices (see RegisterMapping below for the index space).
using RegInterval = std::pair<signed, signed>;

// Per-target maximum counter values and register counts. Filled in before the
// pass body runs — presumably from the subtarget in runOnMachineFunction
// (defined elsewhere in this file); TODO confirm.
struct {
  uint32_t VmcntMax;
  uint32_t ExpcntMax;
  uint32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

// Hardware encoding values of the first/last VGPR and SGPR, used by
// getRegInterval to map an operand's register onto scoreboard indices.
struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;
128
// Kinds of asynchronous events whose completion the hardware counters track.
enum WaitEventType {
  VMEM_ACCESS, // vector-memory read & write
  LDS_ACCESS, // lds read & write
  GDS_ACCESS, // gds read & write
  SQ_MESSAGE, // send message
  SMEM_ACCESS, // scalar-memory read & write
  EXP_GPR_LOCK, // export holding on its data src
  GDS_GPR_LOCK, // GDS holding on its data and addr src
  EXP_POS_ACCESS, // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK, // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

// For each counter (indexed by InstCounterType), the set of events — as a
// bitmask over WaitEventType — that increment that counter.
static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
    (1 << VMEM_ACCESS),
    (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
        (1 << SQ_MESSAGE),
    (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
        (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
};
Nicolai Haehnleae369d72018-11-29 11:06:11 +0000150
// Layout of the scoreboard index space. The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
164
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000165void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
166 switch (T) {
167 case VM_CNT:
168 Wait.VmCnt = std::min(Wait.VmCnt, Count);
169 break;
170 case EXP_CNT:
171 Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
172 break;
173 case LGKM_CNT:
174 Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
175 break;
176 default:
177 llvm_unreachable("bad InstCounterType");
178 }
179}
180
// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    // Only VgprScores needs explicit zeroing; the other arrays have
    // in-class initializers.
    for (auto T : inst_counter_types())
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
  }

  ~BlockWaitcntBrackets() = default;

  // Maximum value the hardware supports for counter T (0 for unknown T).
  static uint32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  // Set the lower bound of the score bracket for counter T.
  // Out-of-range T asserts in debug builds and is ignored in release.
  void setScoreLB(InstCounterType T, uint32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  // Set the upper bound of the score bracket for counter T. For EXP_CNT the
  // lower bound is advanced so the bracket never exceeds the hardware's
  // maximum outstanding export count (the LB < UB < UB-range guard also
  // handles unsigned wraparound of the subtraction).
  void setScoreUB(InstCounterType T, uint32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
      if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
        ScoreLBs[T] = UB;
    }
  }

  uint32_t getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  uint32_t getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    if (E == VMEM_ACCESS)
      return VM_CNT;
    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
      return LGKM_CNT;
    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
    return EXP_CNT;
  }

  // Record score Val for scoreboard slot GprNo under counter T, widening the
  // tracked VGPR/SGPR upper bound as needed. Slots >= NUM_ALL_VGPRS are
  // SGPRs, which only the LGKM counter tracks.
  void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  // Read back the score recorded for slot GprNo under counter T.
  uint32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    assert(T == LGKM_CNT);
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  // Reset all brackets, pending events and per-register scores to the
  // initial (empty) state. Does not touch VgprUB/SgprUB or LastFlat.
  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    PendingEvents = 0;
    memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
    for (auto T : inst_counter_types())
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  // Defined out-of-line below.
  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  // Defined out-of-line below.
  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, uint32_t Val);

  // Highest VGPR / SGPR scoreboard index touched so far (for cheap merges).
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  // The following are defined elsewhere in this file.
  bool counterOutOfOrder(InstCounterType T) const;
  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void determineWait(InstCounterType T, uint32_t ScoreToWait,
                     AMDGPU::Waitcnt &Wait) const;
  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  // True if an event of type E has occurred since the brackets were cleared.
  bool hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }

  void mergePendingEvents(const BlockWaitcntBrackets &Other);

  // True if the score of the last flat memory operation still lies inside
  // either the LGKM or the VM bracket, i.e. a flat op may be outstanding.
  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  // Note a flat memory operation at the current bracket upper bounds.
  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  // Loop-convergence flag: signals that the enclosing loop must be rewalked.
  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  // Post-order number of the block these brackets belong to.
  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const GCNSubtarget *ST = nullptr;
  bool RevisitLoop = false;
  int32_t PostOrder = 0;
  // Score brackets: for each counter, scores in (LB, UB] are outstanding.
  uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
  uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
  // Bitmask over WaitEventType of events seen since the last clear().
  uint32_t PendingEvents = 0;
  // Per counter: true once events of more than one type feed that counter,
  // which means it can decrement out of order.
  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
  // Remember the last flat memory operation.
  uint32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};
359
// This is a per-loop-region object that records waitcnt status at the end of
// loop footer from the previous iteration. We also maintain an iteration
// count to track the number of times the loop has been visited. When it
// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
// at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  // Iteration bookkeeping for the fixed-point walk over the loop.
  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  // The s_waitcnt instruction (if any) placed in the loop footer.
  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';) }

private:
  // s_waitcnt added at the end of loop footer to stablize wait scores
  // at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of iterations the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};
387
// Machine-function pass that inserts s_waitcnt instructions where the results
// of asynchronous memory/export operations are consumed (see file header).
class SIInsertWaitcnts : public MachineFunctionPass {
private:
  // Target/function context; presumably initialized per function in
  // runOnMachineFunction (defined elsewhere in this file) — TODO confirm.
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaVersion IV;

  // Blocks already visited by the walk.
  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  // Waitcnt instructions this pass itself created or took ownership of.
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  // Instructions already handled for the VCCZ hardware-bug workaround.
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  // Per-block score brackets (see BlockWaitcntBrackets).
  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  // Blocks processed so far, in processing order.
  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  // Per-loop convergence bookkeeping (see LoopWaitcntData).
  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
  // because of amdgpu-waitcnt-forcezero flag
  bool ForceEmitZeroWaitcnts;
  // Per-counter force flags driven by the debug counters above.
  // NOTE(review): neither member above is initialized here; the comment in
  // setForceEmitWaitcnt assumes they are set elsewhere (presumably in
  // runOnMachineFunction) before use — confirm.
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    // Reference the debug counters so builds that compile them out do not
    // warn about unused variables.
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // True if any counter is currently being force-emitted (debug aid).
  bool isForceEmitWaitcnt() const {
    for (auto T : inst_counter_types())
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  // Refresh the per-counter force flags from the debug counters.
  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// For debug builds, get the debug counter info and adjust if need be
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  // The following are defined elsewhere in this file.
  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
};
480
Eugene Zelenko59e12822017-08-08 00:47:13 +0000481} // end anonymous namespace
Kannan Narayananacb089e2017-04-12 03:25:12 +0000482
// Compute the half-open scoreboard interval [first, second) covered by
// operand OpNo of MI, or {-1, -1} when the operand needs no tracking
// (non-register, non-allocatable, or — when Def is set — not a def).
// VGPRs map to [0, SQ_MAX_PGM_VGPRS); SGPRs are offset by NUM_ALL_VGPRS.
RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a PW operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  // Map the register's hardware encoding into the scoreboard index space.
  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    // NOTE(review): the upper bound here checks against SQ_MAX_PGM_SGPRS
    // rather than RegisterEncoding.SGPRL, unlike the VGPR case above —
    // confirm this asymmetry is intended.
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  // The interval width is the operand's size in 32-bit registers.
  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}
525
526void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
527 const SIInstrInfo *TII,
528 const SIRegisterInfo *TRI,
529 const MachineRegisterInfo *MRI,
Nicolai Haehnleab43bf62018-11-29 11:06:21 +0000530 unsigned OpNo, uint32_t Val) {
Kannan Narayananacb089e2017-04-12 03:25:12 +0000531 RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
Nicola Zaghend34e60c2018-05-14 12:53:11 +0000532 LLVM_DEBUG({
Kannan Narayananacb089e2017-04-12 03:25:12 +0000533 const MachineOperand &Opnd = MI->getOperand(OpNo);
534 assert(TRI->isVGPR(*MRI, Opnd.getReg()));
535 });
536 for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
537 setRegScore(RegNo, EXP_CNT, Val);
538 }
539}
540
// Record that MI raised event E: bump the corresponding counter's bracket
// upper bound and stamp the new score onto every register the hardware will
// hold until the counter retires this operation. For EXP_CNT the *source*
// registers are scored (the export/store holds its data inputs); for VM/LGKM
// the *destination* registers are scored.
void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  uint32_t CurrScore = getScoreUB(T) + 1;
  if (CurrScore == 0)
    report_fatal_error("InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless if this event
  // changes the score of a register or not.
  // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
  if (!hasPendingEvent(E)) {
    // A second distinct event type on the same counter means it may retire
    // out of order from now on.
    if (PendingEvents & WaitEventMaskForInst[T])
      MixedPendingEvents[T] = true;
    PendingEvents |= 1 << E;
  }
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        // Score data0, and data1 when the opcode has one (e.g. 2-addr DS ops).
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        // Other no-return atomics: conservatively score every VGPR use.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        // MIMG stores hold operand 0 (the data/vdata operand).
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      // Score every VGPR use of the instruction.
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      // SGPR defs are irrelevant to the VM counter.
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    // DS stores additionally occupy the reserved LDS pseudo-slot.
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}
684
// Debug dump: for each counter, print the bracket width (UB - LB) and, for
// every register whose score is still inside the bracket, its relative score
// (how many outstanding operations precede its availability).
void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (auto T : inst_counter_types()) {
    uint32_t LB = getScoreLB(T);
    uint32_t UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << " VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << " LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << " EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << " UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        uint32_t RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        uint32_t RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          // The reserved pseudo-slot for DS (see RegisterMapping).
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          uint32_t RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}
734
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000735/// Simplify the waitcnt, in the sense of removing redundant counts, and return
736/// whether a waitcnt instruction is needed at all.
737bool BlockWaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
738 return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
739 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
740 simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
741}
742
743bool BlockWaitcntBrackets::simplifyWaitcnt(InstCounterType T,
744 unsigned &Count) const {
Nicolai Haehnleab43bf62018-11-29 11:06:21 +0000745 const uint32_t LB = getScoreLB(T);
746 const uint32_t UB = getScoreUB(T);
747 if (Count < UB && UB - Count > LB)
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000748 return true;
749
750 Count = ~0u;
751 return false;
752}
753
/// Merge into \p Wait the wait required on counter \p T before consuming a
/// value whose producing operation has score \p ScoreToWait.
void BlockWaitcntBrackets::determineWait(InstCounterType T,
                                         uint32_t ScoreToWait,
                                         AMDGPU::Waitcnt &Wait) const {
  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const uint32_t LB = getScoreLB(T);
  const uint32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      // Counter can get decremented out-of-order when there
      // are multiple types event in the bracket. Also emit an s_wait counter
      // with a conservative value of 0 for the counter.
      addWait(Wait, T, 0);
    } else {
      // In-order counter: waiting for the (UB - ScoreToWait) most recent
      // events of this type to remain outstanding is sufficient.
      addWait(Wait, T, UB - ScoreToWait);
    }
  }
}
Kannan Narayananacb089e2017-04-12 03:25:12 +0000779
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000780void BlockWaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
781 applyWaitcnt(VM_CNT, Wait.VmCnt);
782 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
783 applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000784}
785
786void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
Nicolai Haehnleab43bf62018-11-29 11:06:21 +0000787 const uint32_t UB = getScoreUB(T);
788 if (Count >= UB)
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000789 return;
790 if (Count != 0) {
791 if (counterOutOfOrder(T))
792 return;
Nicolai Haehnleab43bf62018-11-29 11:06:21 +0000793 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000794 } else {
795 setScoreLB(T, UB);
Nicolai Haehnled1f45da2018-11-29 11:06:14 +0000796 MixedPendingEvents[T] = false;
797 PendingEvents &= ~WaitEventMaskForInst[T];
798 }
799}
800
801void BlockWaitcntBrackets::mergePendingEvents(const BlockWaitcntBrackets &Other) {
802 for (auto T : inst_counter_types()) {
803 uint32_t Old = PendingEvents & WaitEventMaskForInst[T];
804 uint32_t New = Other.PendingEvents & WaitEventMaskForInst[T];
805 if (Other.MixedPendingEvents[T] || (Old && New && Old != New))
806 MixedPendingEvents[T] = true;
807 PendingEvents |= New;
Nicolai Haehnle1a94cbb2018-11-29 11:06:06 +0000808 }
Kannan Narayananacb089e2017-04-12 03:25:12 +0000809}
810
811// Where there are multiple types of event in the bracket of a counter,
812// the decrement may go out of order.
Nicolai Haehnlec548d912018-11-19 12:03:11 +0000813bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
Nicolai Haehnled1f45da2018-11-29 11:06:14 +0000814 // Scalar memory read always can go out of order.
815 if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
816 return true;
817 return MixedPendingEvents[T];
Kannan Narayananacb089e2017-04-12 03:25:12 +0000818}
819
// Register the pass with the LLVM pass infrastructure (no analysis
// dependencies are declared between BEGIN and END).
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

// Unique pass identity; the address of ID is what identifies the pass.
char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

/// Factory used by the AMDGPU target machine to add this pass to the pipeline.
FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}
832
833static bool readsVCCZ(const MachineInstr &MI) {
834 unsigned Opc = MI.getOpcode();
835 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
836 !MI.getOperand(1).isUndef();
837}
838
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets,
    MachineInstr *OldWaitcntInstr) {
  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  // Debug instructions have no machine-level effects; never wait for them.
  if (MI.isDebugInstr())
    return;

  // Accumulates the per-counter requirements for MI; starts as "no wait".
  AMDGPU::Waitcnt Wait;

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    Wait.VmCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  //   with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    Wait = AMDGPU::Waitcnt::allZero();
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    Wait.VmCnt = 0;
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
            context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
              ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
              ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets->hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets->hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets->hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets->hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
          ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      // LDS accesses are tracked in a single extra slot past the VGPRs.
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      ScoreBrackets->determineWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If a destination operand that was used by a recent export/store ins,
    // add s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        ScoreBrackets->determineWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
        ScoreBrackets->determineWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          ScoreBrackets->determineWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT), Wait);
          ScoreBrackets->determineWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT), Wait);
        }
        ScoreBrackets->determineWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT), Wait);
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    Wait = AMDGPU::Waitcnt::allZero();
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    // vccz can be wrong after an out-of-order SMEM write to VCC; drain
    // lgkmcnt before the branch reads vccz on these targets.
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
      Wait.LgkmCnt = 0;
    }
  }

  // Early-out if no wait is indicated.
  if (!ScoreBrackets->simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
    if (OldWaitcntInstr) {
      if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
        // A waitcnt this pass inserted earlier is now redundant: delete it.
        TrackedWaitcntSet.erase(OldWaitcntInstr);
        OldWaitcntInstr->eraseFromParent();
      } else {
        // A pre-existing (user/earlier-pass) waitcnt is kept; fold its
        // effect into the brackets.
        int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
        ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
      }
    }
    return;
  }

  // Testing-only overrides: force conservative waits when requested.
  if (ForceEmitZeroWaitcnts)
    Wait = AMDGPU::Waitcnt::allZero();

  if (ForceEmitWaitcnt[VM_CNT])
    Wait.VmCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[LGKM_CNT])
    Wait.LgkmCnt = 0;

  ScoreBrackets->applyWaitcnt(Wait);

  AMDGPU::Waitcnt OldWait;
  if (OldWaitcntInstr) {
    OldWait =
        AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
  }
  // If the existing waitcnt already waits at least as strongly, keep it as-is.
  if (OldWait.dominates(Wait))
    return;

  // Emitting a new/stronger waitcnt inside a loop may invalidate the loop
  // header's merged state; mark the loop for revisiting.
  MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
  if (ContainingLoop) {
    MachineBasicBlock *TBB = ContainingLoop->getHeader();
    BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    if (!ScoreBracket) {
      assert(!BlockVisitedSet.count(TBB));
      BlockWaitcntBracketsMap[TBB] =
          llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
    }
    ScoreBracket->setRevisitLoop(true);
    LLVM_DEBUG(dbgs() << "set-revisit2: Block"
                      << ContainingLoop->getHeader()->getNumber() << '\n';);
  }

  // Preserve the semantics of a pre-existing waitcnt by combining it with
  // the newly computed one before overwriting.
  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
    Wait = Wait.combined(OldWait);

  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
  if (OldWaitcntInstr) {
    // Reuse the existing S_WAITCNT instruction with the stronger encoding.
    OldWaitcntInstr->getOperand(0).setImm(Enc);

    LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *OldWaitcntInstr << '\n');
  } else {
    // Insert a fresh S_WAITCNT immediately before MI and remember that this
    // pass owns it.
    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                         .addImm(Enc);
    TrackedWaitcntSet.insert(SWaitInst);

    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                      << "Old Instr: " << MI << '\n'
                      << "New Instr: " << *SWaitInst << '\n');
  }
}
1129
1130void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
1131 MachineInstr *Waitcnt) {
1132 if (MBB.empty()) {
1133 MBB.push_back(Waitcnt);
1134 return;
1135 }
1136
1137 MachineBasicBlock::iterator It = MBB.end();
1138 MachineInstr *MI = &*(--It);
1139 if (MI->isBranch()) {
1140 MBB.insert(It, Waitcnt);
1141 } else {
1142 MBB.push_back(Waitcnt);
1143 }
Kannan Narayananacb089e2017-04-12 03:25:12 +00001144}
1145
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001146// This is a flat memory operation. Check to see if it has memory
1147// tokens for both LDS and Memory, and if so mark it as a flat.
1148bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1149 if (MI.memoperands_empty())
1150 return true;
1151
1152 for (const MachineMemOperand *Memop : MI.memoperands()) {
1153 unsigned AS = Memop->getAddrSpace();
Matt Arsenault0da63502018-08-31 05:49:54 +00001154 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
Matt Arsenault0ed39d32017-07-21 18:54:54 +00001155 return true;
1156 }
1157
1158 return false;
1159}
1160
/// After \p Inst has been scored, classify it and record the wait event(s)
/// it generates in \p ScoreBrackets.
void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      // GDS operations additionally lock GPRs until the operation is granted.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    // On targets where VMEM writes also count on EXP, stores and
    // no-return atomics lock their data GPRs until issued.
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      // Classify the export by its target: 32-63 are parameter exports,
      // 12-15 are position exports, everything else locks GPRs.
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      // Counter reads behave like scalar memory accesses for lgkmcnt.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}
1227
// Merge the score brackets of the Block's predecessors;
// this merged score bracket is used when adding waitcnts to the Block
void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
  // Widest pending span (UB - LB) per counter across all visited preds.
  uint32_t MaxPending[NUM_INST_CNTS] = {0};
  // Widest pending-flat span per counter across all visited preds.
  uint32_t MaxFlat[NUM_INST_CNTS] = {0};

  // For single basic block loops, we need to retain the Block's
  // score bracket to have accurate Pred info. So, make a copy of Block's
  // score bracket, clear() it (which retains several important bits of info),
  // populate, and then replace en masse. For non-single basic block loops,
  // just clear Block's current score bracket and repopulate in-place.
  bool IsSelfPred;
  std::unique_ptr<BlockWaitcntBrackets> S;

  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
    != Block.pred_end();
  if (IsSelfPred) {
    // Work on a copy so the original (a predecessor of itself) stays intact
    // while we merge.
    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
    ScoreBrackets = S.get();
  }

  ScoreBrackets->clear();

  // See if there are any uninitialized predecessors. If so, emit an
  // s_waitcnt 0 at the beginning of the block.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();
    bool Visited = BlockVisitedSet.count(Pred);
    // Unvisited predecessors contribute no information yet.
    if (!Visited)
      continue;
    for (auto T : inst_counter_types()) {
      uint32_t span =
          PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
      MaxPending[T] = std::max(MaxPending[T], span);
      span =
          PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
      MaxFlat[T] = std::max(MaxFlat[T], span);
    }
  }

  // Now set the current Block's brackets to the largest ending bracket.
  // Scores are rebased so each counter's bracket becomes [0, MaxPending].
  for (auto T : inst_counter_types()) {
    ScoreBrackets->setScoreUB(T, MaxPending[T]);
    ScoreBrackets->setScoreLB(T, 0);
    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
  }

  // Set the register scoreboard.
  for (MachineBasicBlock *Pred : Block.predecessors()) {
    if (!BlockVisitedSet.count(Pred)) {
      continue;
    }

    BlockWaitcntBrackets *PredScoreBrackets =
        BlockWaitcntBracketsMap[Pred].get();

    // Now merge the gpr_reg_score information
    for (auto T : inst_counter_types()) {
      uint32_t PredLB = PredScoreBrackets->getScoreLB(T);
      uint32_t PredUB = PredScoreBrackets->getScoreUB(T);
      if (PredLB < PredUB) {
        // PredScale shifts the predecessor's scores into the merged
        // [0, MaxPending] bracket (aligning the upper bounds).
        uint32_t PredScale = MaxPending[T] - PredUB;
        // Merge vgpr scores.
        for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
          uint32_t PredRegScore = PredScoreBrackets->getRegScore(J, T);
          if (PredRegScore <= PredLB)
            continue;
          uint32_t NewRegScore = PredScale + PredRegScore;
          ScoreBrackets->setRegScore(
              J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
        }
        // Also need to merge sgpr scores for lgkm_cnt.
        if (T == LGKM_CNT) {
          for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
            uint32_t PredRegScore =
                PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
            if (PredRegScore <= PredLB)
              continue;
            uint32_t NewRegScore = PredScale + PredRegScore;
            ScoreBrackets->setRegScore(
                J + NUM_ALL_VGPRS, LGKM_CNT,
                std::max(
                    ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
                    NewRegScore));
          }
        }
      }
    }

    ScoreBrackets->mergePendingEvents(*PredScoreBrackets);
  }

  // if a single block loop, update the score brackets. Not needed for other
  // blocks, as we did this in-place
  if (IsSelfPred) {
    BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
  }
}
1328
Mark Searles10545412018-05-30 15:47:45 +00001329/// Return true if the given basic block is a "bottom" block of a loop.
1330/// This works even if the loop is discontiguous. This also handles
1331/// multiple back-edges for the same "header" block of a loop.
Mark Searles1bc6e712018-04-19 15:42:30 +00001332bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1333 const MachineBasicBlock *Block) {
1334 for (MachineBasicBlock *MBB : Loop->blocks()) {
1335 if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1336 return true;
1337 }
1338 }
1339 return false;
1340}
1341
1342/// Count the number of "bottom" basic blocks of a loop.
1343unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1344 unsigned Count = 0;
1345 for (MachineBasicBlock *MBB : Loop->blocks()) {
1346 if (MBB->isSuccessor(Loop->getHeader())) {
1347 Count++;
1348 }
1349 }
1350 return Count;
Kannan Narayananacb089e2017-04-12 03:25:12 +00001351}
1352
// Generate s_waitcnt instructions where needed, for a single basic block.
//
// Walks every instruction of \p Block in order, maintaining the score
// brackets (per-counter upper/lower bounds) as it goes:
//  - pre-existing s_waitcnt instructions are either folded into the score
//    brackets (if user-written) or erased (if inserted by a previous
//    iteration of this pass, i.e. present in TrackedWaitcntSet);
//  - a new s_waitcnt is generated before any instruction that needs one
//    (generateWaitcntInstBefore);
//  - the brackets are advanced past the instruction's own memory events
//    (updateEventWaitcntAfter).
// Finally, if \p Block is a loop "bottom" block and the iterative algorithm
// has exceeded its convergence threshold, a forced s_waitcnt 0 is placed at
// the loop footer to guarantee termination of the fixed-point iteration.
void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block) {
  // Initialize the state information by merging the score brackets of all
  // visited predecessors into this block's brackets.
  mergeInputScoreBrackets(Block);

  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();

  LLVM_DEBUG({
    dbgs() << "*** Block" << Block.getNumber() << " ***";
    ScoreBrackets->dump();
  });

  // Walk over the instructions.
  // OldWaitcntInstr tracks the most recently seen s_waitcnt that has not yet
  // been consumed by generateWaitcntInstBefore (which may update it in place
  // instead of creating a new instruction).
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Remove any previously existing waitcnts.
    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
      if (OldWaitcntInstr) {
        if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
          // The previous s_waitcnt was one this pass inserted on an earlier
          // iteration; it will be regenerated if still needed, so drop it.
          TrackedWaitcntSet.erase(OldWaitcntInstr);
          OldWaitcntInstr->eraseFromParent();
          OldWaitcntInstr = nullptr;
        } else if (!TrackedWaitcntSet.count(&Inst)) {
          // Two successive s_waitcnt's, both of which are pre-existing and
          // are therefore preserved. Fold the older one's effect into the
          // score brackets so later waits account for it.
          int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
          ScoreBrackets->applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
        } else {
          // A pass-inserted s_waitcnt following a pre-existing one is
          // redundant; erase the current instruction and keep the old one.
          ++Iter;
          Inst.eraseFromParent();
          continue;
        }
      }

      // Remember this s_waitcnt; it is resolved when the next non-waitcnt
      // instruction is processed.
      OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    // Hardware bug workaround: on pre-VI targets, vccz can be stale after an
    // outstanding SMEM access writes vcc. Detect readers of vccz that may
    // observe the stale value.
    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
        (!VCCZBugHandledSet.count(&Inst))) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
        if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
          VCCZBugWorkAround = true;
      }
    }

    // Generate an s_waitcnt instruction to be placed before
    // cur_Inst, if needed. OldWaitcntInstr, if any, may be reused or erased
    // by this call, so it is cleared afterwards.
    generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
    OldWaitcntInstr = nullptr;

    // Advance the score brackets past the events Inst itself generates.
    updateEventWaitcntAfter(Inst, ScoreBrackets);

#if 0 // TODO: implement resource type check controlled by options with ub = LB.
    // If this instruction generates a S_SETVSKIP because it is an
    // indexed resource, and we are on Tahiti, then it will also force
    // an S_WAITCNT vmcnt(0)
    if (RequireCheckResourceType(Inst, context)) {
      // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
      ScoreBrackets->setScoreLB(VM_CNT,
                                ScoreBrackets->getScoreUB(VM_CNT));
    }
#endif

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets->dump();
    });

    // Check to see if this is a GWS instruction. If so, and if this is CI or
    // VI, then the generated code sequence will include an S_WAITCNT 0.
    // Model that by treating all counters as fully waited-out here.
    // TODO: Are these the only GWS instructions?
    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero());
    }

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (VCCZBugWorkAround) {
      // Restore the vccz bit. Any time a value is written to vcc, the vcc
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
              AMDGPU::VCC)
          .addReg(AMDGPU::VCC);
      VCCZBugHandledSet.insert(&Inst);
    }

    ++Iter;
  }

  // Check if we need to force convergence at loop footer.
  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
    WaitcntData->print();
    LLVM_DEBUG(dbgs() << '\n';);

    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
    // placement, but doesn't guarantee convergence for a loop. Each
    // loop should take at most (n+1) iterations for it to converge naturally,
    // where n is the number of bottom blocks. If this threshold is reached and
    // the result hasn't converged, then we force convergence by inserting
    // a s_waitcnt at the end of loop footer.
    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
      // To ensure convergence, need to make wait events at loop footer be no
      // more than those from the previous iteration.
      // As a simplification, instead of tracking individual scores and
      // generating the precise wait count, just wait on 0.
      bool HasPending = false;
      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
      for (auto T : inst_counter_types()) {
        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
          // Clamp this counter's lower bound to its upper bound, modelling a
          // full wait; one pending counter is enough to require the wait.
          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
          HasPending = true;
          break;
        }
      }

      if (HasPending) {
        if (!SWaitInst) {
          // No waitcnt was recorded for this loop yet; create an
          // s_waitcnt 0 at the top of the block and track it as ours.
          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                          .addImm(0);
          TrackedWaitcntSet.insert(SWaitInst);
#if 0 // TODO: Format the debug output
          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
          OutputTransformAdd(SWaitInst, context);
#endif
        }
#if 0 // TODO: ??
        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
#endif
      }

      if (SWaitInst) {
        LLVM_DEBUG({
          SWaitInst->print(dbgs());
          dbgs() << "\nAdjusted score board:";
          ScoreBrackets->dump();
        });

        // Add this waitcnt to the block. It is either newly created or
        // created in previous iterations and added back since block traversal
        // always removes waitcnts.
        insertWaitcntBeforeCF(Block, SWaitInst);
        WaitcntData->setWaitcnt(SWaitInst);
      }
    }
  }
}
1518
/// Pass entry point: insert s_waitcnt instructions throughout \p MF.
///
/// Processes blocks in reverse post order, re-walking loop bodies until the
/// per-loop score brackets converge (or convergence is forced by
/// insertWaitcntInBlock). Afterwards, inserts s_dcache_wb before program-end
/// points if scalar stores were seen, and a conservative s_waitcnt 0 at the
/// entry of non-entry (callable) functions.
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  // Cache subtarget/analysis handles used by the helper routines.
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  IV = AMDGPU::getIsaVersion(ST->getCPU());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  // Maximum encodable wait values for each counter on this ISA.
  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);

  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  // Encoding ranges used to map physical registers to score-bracket slots.
  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
  RegisterEncoding.VGPRL =
      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;

  // Reset all per-function state from any previous run of the pass.
  TrackedWaitcntSet.clear();
  BlockVisitedSet.clear();
  VCCZBugHandledSet.clear();
  LoopWaitcntDataMap.clear();
  BlockWaitcntProcessedSet.clear();

  // Walk over the blocks in reverse post order, inserting
  // s_waitcnt where needed.
  // NOTE(review): Modified is not set when insertWaitcntInBlock changes the
  // function in this walk; it only becomes true via the s_dcache_wb and
  // entry-waitcnt insertions below — TODO confirm this is intended.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  bool Modified = false;
  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
           I = RPOT.begin(),
           E = RPOT.end(), J = RPOT.begin();
       I != E;) {
    MachineBasicBlock &MBB = **I;

    BlockVisitedSet.insert(&MBB);

    // Lazily create the score brackets for this block.
    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    if (!ScoreBrackets) {
      BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
    }
    ScoreBrackets->setPostOrder(MBB.getNumber());
    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
      LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();

    // If we are walking into the block from before the loop, then guarantee
    // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
      unsigned Count = countNumBottomBlocks(ContainingLoop);

      // If the loop has multiple back-edges, and so more than one "bottom"
      // basic block, we have to guarantee a re-walk over every blocks.
      if ((std::count(BlockWaitcntProcessedSet.begin(),
                      BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
      }
    }

    // Walk over the instructions.
    insertWaitcntInBlock(MF, MBB);

    // Record that waitcnts have been processed at least once for this block.
    BlockWaitcntProcessedSet.push_back(&MBB);

    // See if we want to revisit the loop. If a loop has multiple back-edges,
    // we shouldn't revisit the same "bottom" basic block.
    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
        std::count(BlockWaitcntProcessedSet.begin(),
                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
      MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
      if (EntrySB && EntrySB->getRevisitLoop()) {
        // Rewind the traversal iterator back to the loop header so the whole
        // loop body is processed again with the merged bottom-block state.
        EntrySB->setRevisitLoop(false);
        J = I;
        int32_t PostOrder = EntrySB->getPostOrder();
        // TODO: Avoid this loop. Find another way to set I.
        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
                 X = RPOT.begin(),
                 Y = RPOT.end();
             X != Y; ++X) {
          MachineBasicBlock &MBBX = **X;
          if (MBBX.getNumber() == PostOrder) {
            I = X;
            break;
          }
        }
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        WaitcntData->incIterCnt();
        LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
        continue;
      } else {
        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
        // Loop converged, reset iteration count. If this loop gets revisited,
        // it must be from an outer loop, the counter will restart, this will
        // ensure we don't force convergence on such revisits.
        WaitcntData->resetIterCnt();
      }
    }

    J = I;
    ++I;
  }

  // Collect all program-end blocks and note whether any scalar stores exist,
  // for the cache-flush insertion below.
  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
           ++I) {
        // Track whether the most recent relevant event was a flush; any
        // scalar store after a flush makes the cache dirty again.
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
            !SeenDCacheWB) {
          Modified = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
      .addImm(0);

    Modified = true;
  }

  return Modified;
}