blob: bc86515d8b1fe835dde29a43e37699f2f1299b3a [file] [log] [blame]
//===-- SIInsertWaits.cpp - Insert wait instructions ----------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Insert wait instructions for memory reads and writes.
12///
13/// Memory reads and writes are issued asynchronously, so we need to insert
14/// S_WAITCNT instructions when we want to access any of their results or
15/// overwrite any register that's used asynchronously.
16//
17//===----------------------------------------------------------------------===//
18
19#include "AMDGPU.h"
Eric Christopherd9134482014-08-04 21:25:23 +000020#include "AMDGPUSubtarget.h"
Matt Arsenault9783e002014-09-29 15:50:26 +000021#include "SIDefines.h"
Matt Arsenault1fd0c622014-09-29 15:53:15 +000022#include "SIInstrInfo.h"
Tom Stellardc4cabef2013-01-18 21:15:53 +000023#include "SIMachineFunctionInfo.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000024#include "SIRegisterInfo.h"
Konstantin Zhuravlyov836cbff2016-09-30 17:01:40 +000025#include "Utils/AMDGPUBaseInfo.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000026#include "llvm/ADT/SmallVector.h"
27#include "llvm/ADT/StringRef.h"
28#include "llvm/CodeGen/MachineBasicBlock.h"
Tom Stellardc4cabef2013-01-18 21:15:53 +000029#include "llvm/CodeGen/MachineFunction.h"
30#include "llvm/CodeGen/MachineFunctionPass.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000031#include "llvm/CodeGen/MachineInstr.h"
Tom Stellardc4cabef2013-01-18 21:15:53 +000032#include "llvm/CodeGen/MachineInstrBuilder.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000033#include "llvm/CodeGen/MachineOperand.h"
Tom Stellardc4cabef2013-01-18 21:15:53 +000034#include "llvm/CodeGen/MachineRegisterInfo.h"
Eugene Zelenko66203762017-01-21 00:53:49 +000035#include "llvm/IR/DebugLoc.h"
36#include "llvm/Pass.h"
37#include "llvm/Support/Debug.h"
38#include "llvm/Support/raw_ostream.h"
39#include "llvm/Target/TargetRegisterInfo.h"
40#include <algorithm>
41#include <cassert>
42#include <cstdint>
43#include <cstring>
44#include <new>
45#include <utility>
Tom Stellardc4cabef2013-01-18 21:15:53 +000046
Tom Stellard6e1967e2016-02-05 17:42:38 +000047#define DEBUG_TYPE "si-insert-waits"
48
Tom Stellardc4cabef2013-01-18 21:15:53 +000049using namespace llvm;
50
51namespace {
52
/// \brief One variable for each of the hardware counters.
///
/// The same three values can be addressed by name (Named.VM, Named.EXP,
/// Named.LGKM) or by position (Array[0], Array[1], Array[2]).
union Counters {
  struct {
    unsigned VM;
    unsigned EXP;
    unsigned LGKM;
  } Named;
  unsigned Array[3];
};
62
/// \brief Categories recorded for the last opcode seen (see LastOpcodeType).
enum InstType { OTHER, SMEM, VMEM };
68
Tom Stellardc4cabef2013-01-18 21:15:53 +000069typedef Counters RegCounters[512];
70typedef std::pair<unsigned, unsigned> RegInterval;
71
72class SIInsertWaits : public MachineFunctionPass {
Tom Stellardc4cabef2013-01-18 21:15:53 +000073private:
Eugene Zelenko66203762017-01-21 00:53:49 +000074 const SISubtarget *ST = nullptr;
75 const SIInstrInfo *TII = nullptr;
76 const SIRegisterInfo *TRI = nullptr;
Tom Stellardc4cabef2013-01-18 21:15:53 +000077 const MachineRegisterInfo *MRI;
Konstantin Zhuravlyov9f89ede2017-02-08 14:05:23 +000078 AMDGPU::IsaInfo::IsaVersion ISA;
Tom Stellardc4cabef2013-01-18 21:15:53 +000079
Tom Stellardc4cabef2013-01-18 21:15:53 +000080 /// \brief Constant zero value
81 static const Counters ZeroCounts;
82
Konstantin Zhuravlyovcdd45472016-10-11 18:58:22 +000083 /// \brief Hardware limits
84 Counters HardwareLimits;
85
Tom Stellardc4cabef2013-01-18 21:15:53 +000086 /// \brief Counter values we have already waited on.
87 Counters WaitedOn;
88
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +000089 /// \brief Counter values that we must wait on before the next counter
90 /// increase.
91 Counters DelayedWaitOn;
92
Tom Stellardc4cabef2013-01-18 21:15:53 +000093 /// \brief Counter values for last instruction issued.
94 Counters LastIssued;
95
96 /// \brief Registers used by async instructions.
97 RegCounters UsedRegs;
98
99 /// \brief Registers defined by async instructions.
100 RegCounters DefinedRegs;
101
102 /// \brief Different export instruction types seen since last wait.
Eugene Zelenko66203762017-01-21 00:53:49 +0000103 unsigned ExpInstrTypesSeen = 0;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000104
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000105 /// \brief Type of the last opcode.
106 InstType LastOpcodeType;
107
Marek Olsak1bd24632015-02-03 17:37:52 +0000108 bool LastInstWritesM0;
109
Tom Stellard6695ba02016-10-28 23:53:48 +0000110 /// Whether or not we have flat operations outstanding.
111 bool IsFlatOutstanding;
112
Marek Olsak8e9cc632016-01-13 17:23:09 +0000113 /// \brief Whether the machine function returns void
114 bool ReturnsVoid;
115
Tom Stellard30961762016-02-08 19:49:20 +0000116 /// Whether the VCCZ bit is possibly corrupt
Eugene Zelenko66203762017-01-21 00:53:49 +0000117 bool VCCZCorrupt = false;
Tom Stellard30961762016-02-08 19:49:20 +0000118
Tom Stellardc4cabef2013-01-18 21:15:53 +0000119 /// \brief Get increment/decrement amount for this instruction.
120 Counters getHwCounts(MachineInstr &MI);
121
122 /// \brief Is operand relevant for async execution?
123 bool isOpRelevant(MachineOperand &Op);
124
125 /// \brief Get register interval an operand affects.
Matt Arsenaultd1d499a2015-10-01 21:43:15 +0000126 RegInterval getRegInterval(const TargetRegisterClass *RC,
127 const MachineOperand &Reg) const;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000128
129 /// \brief Handle instructions async components
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000130 void pushInstruction(MachineBasicBlock &MBB,
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000131 MachineBasicBlock::iterator I,
132 const Counters& Increment);
Tom Stellardc4cabef2013-01-18 21:15:53 +0000133
134 /// \brief Insert the actual wait instruction
135 bool insertWait(MachineBasicBlock &MBB,
136 MachineBasicBlock::iterator I,
137 const Counters &Counts);
138
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000139 /// \brief Handle existing wait instructions (from intrinsics)
140 void handleExistingWait(MachineBasicBlock::iterator I);
141
Christian Konig862fd9f2013-03-01 09:46:04 +0000142 /// \brief Do we need def2def checks?
143 bool unorderedDefines(MachineInstr &MI);
144
Tom Stellardc4cabef2013-01-18 21:15:53 +0000145 /// \brief Resolve all operand dependencies to counter requirements
146 Counters handleOperands(MachineInstr &MI);
147
Marek Olsak1bd24632015-02-03 17:37:52 +0000148 /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
149 void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
150
Tom Stellard30961762016-02-08 19:49:20 +0000151 /// Return true if there are LGKM instrucitons that haven't been waited on
152 /// yet.
153 bool hasOutstandingLGKM() const;
154
Tom Stellardc4cabef2013-01-18 21:15:53 +0000155public:
Tom Stellard6e1967e2016-02-05 17:42:38 +0000156 static char ID;
157
Eugene Zelenko66203762017-01-21 00:53:49 +0000158 SIInsertWaits() : MachineFunctionPass(ID) {}
Tom Stellardc4cabef2013-01-18 21:15:53 +0000159
Craig Topper5656db42014-04-29 07:57:24 +0000160 bool runOnMachineFunction(MachineFunction &MF) override;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000161
Mehdi Amini117296c2016-10-01 02:56:57 +0000162 StringRef getPassName() const override {
Matt Arsenault0cb85172015-09-25 17:21:28 +0000163 return "SI insert wait instructions";
Tom Stellardc4cabef2013-01-18 21:15:53 +0000164 }
165
Matt Arsenault0cb85172015-09-25 17:21:28 +0000166 void getAnalysisUsage(AnalysisUsage &AU) const override {
167 AU.setPreservesCFG();
168 MachineFunctionPass::getAnalysisUsage(AU);
169 }
Tom Stellardc4cabef2013-01-18 21:15:53 +0000170};
171
Eugene Zelenko66203762017-01-21 00:53:49 +0000172} // end anonymous namespace
Tom Stellardc4cabef2013-01-18 21:15:53 +0000173
Tom Stellard6e1967e2016-02-05 17:42:38 +0000174INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
175 "SI Insert Waits", false, false)
176INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
177 "SI Insert Waits", false, false)
178
Tom Stellardc4cabef2013-01-18 21:15:53 +0000179char SIInsertWaits::ID = 0;
180
Tom Stellard6e1967e2016-02-05 17:42:38 +0000181char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
182
183FunctionPass *llvm::createSIInsertWaitsPass() {
184 return new SIInsertWaits();
185}
186
Tom Stellardc4cabef2013-01-18 21:15:53 +0000187const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
188
Matt Arsenault52f14ec2016-11-07 19:09:27 +0000189static bool readsVCCZ(const MachineInstr &MI) {
190 unsigned Opc = MI.getOpcode();
191 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
192 !MI.getOperand(1).isUndef();
Tom Stellard30961762016-02-08 19:49:20 +0000193}
194
195bool SIInsertWaits::hasOutstandingLGKM() const {
196 return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
197}
Tom Stellardc4cabef2013-01-18 21:15:53 +0000198
199Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
Matt Arsenaultd1d499a2015-10-01 21:43:15 +0000200 uint64_t TSFlags = MI.getDesc().TSFlags;
Matt Arsenaulte66621b2015-09-24 19:52:27 +0000201 Counters Result = { { 0, 0, 0 } };
Tom Stellardc4cabef2013-01-18 21:15:53 +0000202
203 Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
204
205 // Only consider stores or EXP for EXP_CNT
Matt Arsenault7bee6ac2016-12-05 20:23:10 +0000206 Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
Tom Stellardc4cabef2013-01-18 21:15:53 +0000207
208 // LGKM may uses larger values
209 if (TSFlags & SIInstrFlags::LGKM_CNT) {
210
Matt Arsenault3add6432015-10-20 04:35:43 +0000211 if (TII->isSMRD(MI)) {
Tom Stellardc4cabef2013-01-18 21:15:53 +0000212
Matt Arsenaulte66621b2015-09-24 19:52:27 +0000213 if (MI.getNumOperands() != 0) {
Matt Arsenaultb733f002015-10-01 22:40:35 +0000214 assert(MI.getOperand(0).isReg() &&
215 "First LGKM operand must be a register!");
Michel Danzer20680b12013-08-16 16:19:24 +0000216
Matt Arsenaulte66621b2015-09-24 19:52:27 +0000217 // XXX - What if this is a write into a super register?
Matt Arsenaultd1d499a2015-10-01 21:43:15 +0000218 const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +0000219 unsigned Size = TRI->getRegSizeInBits(*RC);
220 Result.Named.LGKM = Size > 32 ? 2 : 1;
Matt Arsenaulte66621b2015-09-24 19:52:27 +0000221 } else {
222 // s_dcache_inv etc. do not have a a destination register. Assume we
223 // want a wait on these.
224 // XXX - What is the right value?
225 Result.Named.LGKM = 1;
226 }
Michel Danzer20680b12013-08-16 16:19:24 +0000227 } else {
228 // DS
229 Result.Named.LGKM = 1;
230 }
Tom Stellardc4cabef2013-01-18 21:15:53 +0000231
232 } else {
233 Result.Named.LGKM = 0;
234 }
235
236 return Result;
237}
238
239bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
Tom Stellardc4cabef2013-01-18 21:15:53 +0000240 // Constants are always irrelevant
Matt Arsenaultd1d499a2015-10-01 21:43:15 +0000241 if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
Tom Stellardc4cabef2013-01-18 21:15:53 +0000242 return false;
243
244 // Defines are always relevant
245 if (Op.isDef())
246 return true;
247
Matt Arsenault7bee6ac2016-12-05 20:23:10 +0000248 // For exports all registers are relevant.
249 // TODO: Skip undef/disabled registers.
Tom Stellardc4cabef2013-01-18 21:15:53 +0000250 MachineInstr &MI = *Op.getParent();
Matt Arsenault7bee6ac2016-12-05 20:23:10 +0000251 if (TII->isEXP(MI))
Tom Stellardc4cabef2013-01-18 21:15:53 +0000252 return true;
253
254 // For stores the stored value is also relevant
255 if (!MI.getDesc().mayStore())
256 return false;
257
Tom Stellardb3931b82015-01-06 19:52:04 +0000258 // Check if this operand is the value being stored.
Tom Stellard2d26fe72016-02-19 15:33:13 +0000259 // Special case for DS/FLAT instructions, since the address
Tom Stellardb3931b82015-01-06 19:52:04 +0000260 // operand comes before the value operand and it may have
261 // multiple data operands.
262
Tom Stellard2d26fe72016-02-19 15:33:13 +0000263 if (TII->isDS(MI)) {
Tom Stellardb3931b82015-01-06 19:52:04 +0000264 MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
265 if (Data0 && Op.isIdenticalTo(*Data0))
266 return true;
267
268 MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
Matt Arsenault8226fc42016-03-02 23:00:21 +0000269 return Data1 && Op.isIdenticalTo(*Data1);
Tom Stellardb3931b82015-01-06 19:52:04 +0000270 }
271
Matt Arsenault97279a82016-11-29 19:30:44 +0000272 if (TII->isFLAT(MI)) {
273 MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
274 if (Data && Op.isIdenticalTo(*Data))
275 return true;
276 }
277
Tom Stellardb3931b82015-01-06 19:52:04 +0000278 // NOTE: This assumes that the value operand is before the
279 // address operand, and that there is only one value operand.
Tom Stellardc4cabef2013-01-18 21:15:53 +0000280 for (MachineInstr::mop_iterator I = MI.operands_begin(),
281 E = MI.operands_end(); I != E; ++I) {
282
283 if (I->isReg() && I->isUse())
284 return Op.isIdenticalTo(*I);
285 }
286
287 return false;
288}
289
Matt Arsenaultd1d499a2015-10-01 21:43:15 +0000290RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
291 const MachineOperand &Reg) const {
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +0000292 unsigned Size = TRI->getRegSizeInBits(*RC);
293 assert(Size >= 32);
Tom Stellardc4cabef2013-01-18 21:15:53 +0000294
295 RegInterval Result;
Matt Arsenaultd1d499a2015-10-01 21:43:15 +0000296 Result.first = TRI->getEncodingValue(Reg.getReg());
Krzysztof Parzyszek44e25f32017-04-24 18:55:33 +0000297 Result.second = Result.first + Size / 32;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000298
299 return Result;
300}
301
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000302void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000303 MachineBasicBlock::iterator I,
304 const Counters &Increment) {
Tom Stellardc4cabef2013-01-18 21:15:53 +0000305 // Get the hardware counter increments and sum them up
Tom Stellardbd8a0852015-08-21 22:47:27 +0000306 Counters Limit = ZeroCounts;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000307 unsigned Sum = 0;
308
Tom Stellard6695ba02016-10-28 23:53:48 +0000309 if (TII->mayAccessFlatAddressSpace(*I))
310 IsFlatOutstanding = true;
311
Tom Stellardc4cabef2013-01-18 21:15:53 +0000312 for (unsigned i = 0; i < 3; ++i) {
313 LastIssued.Array[i] += Increment.Array[i];
Tom Stellardbd8a0852015-08-21 22:47:27 +0000314 if (Increment.Array[i])
315 Limit.Array[i] = LastIssued.Array[i];
Tom Stellardc4cabef2013-01-18 21:15:53 +0000316 Sum += Increment.Array[i];
317 }
318
319 // If we don't increase anything then that's it
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000320 if (Sum == 0) {
321 LastOpcodeType = OTHER;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000322 return;
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000323 }
324
Matt Arsenault43e92fe2016-06-24 06:30:11 +0000325 if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
Benjamin Kramerdf005cb2015-08-08 18:27:36 +0000326 // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000327 // or SMEM clause, respectively.
328 //
329 // The temporary workaround is to break the clauses with S_NOP.
330 //
331 // The proper solution would be to allocate registers such that all source
332 // and destination registers don't overlap, e.g. this is illegal:
333 // r0 = load r2
334 // r2 = load r0
Tom Stellard1f520e52016-05-02 17:39:06 +0000335 if (LastOpcodeType == VMEM && Increment.Named.VM) {
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000336 // Insert a NOP to break the clause.
337 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
338 .addImm(0);
Marek Olsak1bd24632015-02-03 17:37:52 +0000339 LastInstWritesM0 = false;
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000340 }
341
Matt Arsenault3add6432015-10-20 04:35:43 +0000342 if (TII->isSMRD(*I))
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000343 LastOpcodeType = SMEM;
344 else if (Increment.Named.VM)
345 LastOpcodeType = VMEM;
346 }
Tom Stellardc4cabef2013-01-18 21:15:53 +0000347
348 // Remember which export instructions we have seen
349 if (Increment.Named.EXP) {
Matt Arsenault7bee6ac2016-12-05 20:23:10 +0000350 ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000351 }
352
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000353 for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000354 MachineOperand &Op = I->getOperand(i);
Tom Stellardc4cabef2013-01-18 21:15:53 +0000355 if (!isOpRelevant(Op))
356 continue;
357
Matt Arsenaultd1d499a2015-10-01 21:43:15 +0000358 const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
359 RegInterval Interval = getRegInterval(RC, Op);
Tom Stellardc4cabef2013-01-18 21:15:53 +0000360 for (unsigned j = Interval.first; j < Interval.second; ++j) {
361
362 // Remember which registers we define
363 if (Op.isDef())
Tom Stellardbd8a0852015-08-21 22:47:27 +0000364 DefinedRegs[j] = Limit;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000365
366 // and which one we are using
367 if (Op.isUse())
Tom Stellardbd8a0852015-08-21 22:47:27 +0000368 UsedRegs[j] = Limit;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000369 }
370 }
371}
372
373bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
374 MachineBasicBlock::iterator I,
375 const Counters &Required) {
Tom Stellardc4cabef2013-01-18 21:15:53 +0000376 // End of program? No need to wait on anything
Marek Olsak8e9cc632016-01-13 17:23:09 +0000377 // A function not returning void needs to wait, because other bytecode will
378 // be appended after it and we don't know what it will be.
379 if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
Tom Stellardc4cabef2013-01-18 21:15:53 +0000380 return false;
381
382 // Figure out if the async instructions execute in order
383 bool Ordered[3];
384
Tom Stellard6695ba02016-10-28 23:53:48 +0000385 // VM_CNT is always ordered except when there are flat instructions, which
386 // can return out of order.
387 Ordered[0] = !IsFlatOutstanding;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000388
389 // EXP_CNT is unordered if we have both EXP & VM-writes
390 Ordered[1] = ExpInstrTypesSeen == 3;
391
392 // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
393 Ordered[2] = false;
394
395 // The values we are going to put into the S_WAITCNT instruction
Konstantin Zhuravlyovcdd45472016-10-11 18:58:22 +0000396 Counters Counts = HardwareLimits;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000397
398 // Do we really need to wait?
399 bool NeedWait = false;
400
401 for (unsigned i = 0; i < 3; ++i) {
Tom Stellardc4cabef2013-01-18 21:15:53 +0000402 if (Required.Array[i] <= WaitedOn.Array[i])
403 continue;
404
405 NeedWait = true;
Matt Arsenault97483692014-07-17 17:50:22 +0000406
Tom Stellardc4cabef2013-01-18 21:15:53 +0000407 if (Ordered[i]) {
408 unsigned Value = LastIssued.Array[i] - Required.Array[i];
409
Matt Arsenault97483692014-07-17 17:50:22 +0000410 // Adjust the value to the real hardware possibilities.
Konstantin Zhuravlyovcdd45472016-10-11 18:58:22 +0000411 Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
Tom Stellardc4cabef2013-01-18 21:15:53 +0000412
413 } else
414 Counts.Array[i] = 0;
415
Matt Arsenault97483692014-07-17 17:50:22 +0000416 // Remember on what we have waited on.
Tom Stellardc4cabef2013-01-18 21:15:53 +0000417 WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
418 }
419
420 if (!NeedWait)
421 return false;
422
423 // Reset EXP_CNT instruction types
424 if (Counts.Named.EXP == 0)
425 ExpInstrTypesSeen = 0;
426
427 // Build the wait instruction
428 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
Konstantin Zhuravlyov9f89ede2017-02-08 14:05:23 +0000429 .addImm(AMDGPU::encodeWaitcnt(ISA,
430 Counts.Named.VM,
431 Counts.Named.EXP,
432 Counts.Named.LGKM));
Tom Stellardc4cabef2013-01-18 21:15:53 +0000433
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000434 LastOpcodeType = OTHER;
Marek Olsak1bd24632015-02-03 17:37:52 +0000435 LastInstWritesM0 = false;
Tom Stellard6695ba02016-10-28 23:53:48 +0000436 IsFlatOutstanding = false;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000437 return true;
438}
439
440/// \brief helper function for handleOperands
441static void increaseCounters(Counters &Dst, const Counters &Src) {
Tom Stellardc4cabef2013-01-18 21:15:53 +0000442 for (unsigned i = 0; i < 3; ++i)
443 Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
444}
445
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000446/// \brief check whether any of the counters is non-zero
447static bool countersNonZero(const Counters &Counter) {
448 for (unsigned i = 0; i < 3; ++i)
449 if (Counter.Array[i])
450 return true;
451 return false;
452}
453
454void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
455 assert(I->getOpcode() == AMDGPU::S_WAITCNT);
456
457 unsigned Imm = I->getOperand(0).getImm();
458 Counters Counts, WaitOn;
459
Konstantin Zhuravlyov9f89ede2017-02-08 14:05:23 +0000460 Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
461 Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
462 Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000463
464 for (unsigned i = 0; i < 3; ++i) {
465 if (Counts.Array[i] <= LastIssued.Array[i])
466 WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
467 else
468 WaitOn.Array[i] = 0;
469 }
470
471 increaseCounters(DelayedWaitOn, WaitOn);
472}
473
Tom Stellardc4cabef2013-01-18 21:15:53 +0000474Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
Tom Stellardc4cabef2013-01-18 21:15:53 +0000475 Counters Result = ZeroCounts;
476
Matt Arsenaultd1d499a2015-10-01 21:43:15 +0000477 // For each register affected by this instruction increase the result
478 // sequence.
479 //
480 // TODO: We could probably just look at explicit operands if we removed VCC /
481 // EXEC from SMRD dest reg classes.
Tom Stellardc4cabef2013-01-18 21:15:53 +0000482 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
Tom Stellardc4cabef2013-01-18 21:15:53 +0000483 MachineOperand &Op = MI.getOperand(i);
Matt Arsenaultd1d499a2015-10-01 21:43:15 +0000484 if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
485 continue;
486
487 const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
488 RegInterval Interval = getRegInterval(RC, Op);
Tom Stellardc4cabef2013-01-18 21:15:53 +0000489 for (unsigned j = Interval.first; j < Interval.second; ++j) {
Christian Konig862fd9f2013-03-01 09:46:04 +0000490 if (Op.isDef()) {
Tom Stellardc4cabef2013-01-18 21:15:53 +0000491 increaseCounters(Result, UsedRegs[j]);
Christian Konigf1fd5fa2013-03-18 11:33:45 +0000492 increaseCounters(Result, DefinedRegs[j]);
Christian Konig862fd9f2013-03-01 09:46:04 +0000493 }
Tom Stellardc4cabef2013-01-18 21:15:53 +0000494
495 if (Op.isUse())
496 increaseCounters(Result, DefinedRegs[j]);
497 }
498 }
499
500 return Result;
501}
502
Marek Olsak1bd24632015-02-03 17:37:52 +0000503void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
504 MachineBasicBlock::iterator I) {
Matt Arsenault43e92fe2016-06-24 06:30:11 +0000505 if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
Marek Olsak1bd24632015-02-03 17:37:52 +0000506 return;
507
508 // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
Jan Veselyd48445d2017-01-04 18:06:55 +0000509 if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
Marek Olsak1bd24632015-02-03 17:37:52 +0000510 BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
511 LastInstWritesM0 = false;
512 return;
513 }
514
515 // Set whether this instruction sets M0
516 LastInstWritesM0 = false;
517
518 unsigned NumOperands = I->getNumOperands();
519 for (unsigned i = 0; i < NumOperands; i++) {
520 const MachineOperand &Op = I->getOperand(i);
521
522 if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
523 LastInstWritesM0 = true;
524 }
525}
526
Matt Arsenault52d1b622017-03-08 01:06:58 +0000527/// Return true if \p MBB has one successor immediately following, and is its
528/// only predecessor
529static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
530 if (MBB.succ_size() != 1)
531 return false;
532
533 const MachineBasicBlock *Succ = *MBB.succ_begin();
534 return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
535}
536
Matt Arsenaulta0050b02014-06-19 01:19:19 +0000537// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
538// around other non-memory instructions.
Tom Stellardc4cabef2013-01-18 21:15:53 +0000539bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
Tom Stellardc4cabef2013-01-18 21:15:53 +0000540 bool Changes = false;
541
Matt Arsenault43e92fe2016-06-24 06:30:11 +0000542 ST = &MF.getSubtarget<SISubtarget>();
543 TII = ST->getInstrInfo();
544 TRI = &TII->getRegisterInfo();
Tom Stellardc4cabef2013-01-18 21:15:53 +0000545 MRI = &MF.getRegInfo();
Konstantin Zhuravlyov9f89ede2017-02-08 14:05:23 +0000546 ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
Marek Olsak79c05872016-11-25 17:37:09 +0000547 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Tom Stellardc4cabef2013-01-18 21:15:53 +0000548
Konstantin Zhuravlyov9f89ede2017-02-08 14:05:23 +0000549 HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
550 HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
551 HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
Konstantin Zhuravlyovcdd45472016-10-11 18:58:22 +0000552
Tom Stellardc4cabef2013-01-18 21:15:53 +0000553 WaitedOn = ZeroCounts;
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000554 DelayedWaitOn = ZeroCounts;
Tom Stellardc4cabef2013-01-18 21:15:53 +0000555 LastIssued = ZeroCounts;
Marek Olsakfa58e5e2014-12-07 17:17:43 +0000556 LastOpcodeType = OTHER;
Marek Olsak1bd24632015-02-03 17:37:52 +0000557 LastInstWritesM0 = false;
Tom Stellard6695ba02016-10-28 23:53:48 +0000558 IsFlatOutstanding = false;
Marek Olsak79c05872016-11-25 17:37:09 +0000559 ReturnsVoid = MFI->returnsVoid();
Tom Stellardc4cabef2013-01-18 21:15:53 +0000560
561 memset(&UsedRegs, 0, sizeof(UsedRegs));
562 memset(&DefinedRegs, 0, sizeof(DefinedRegs));
563
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000564 SmallVector<MachineInstr *, 4> RemoveMI;
Marek Olsak79c05872016-11-25 17:37:09 +0000565 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
566
567 bool HaveScalarStores = false;
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000568
Tom Stellardc4cabef2013-01-18 21:15:53 +0000569 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
570 BI != BE; ++BI) {
571
572 MachineBasicBlock &MBB = *BI;
Marek Olsak79c05872016-11-25 17:37:09 +0000573
Tom Stellardc4cabef2013-01-18 21:15:53 +0000574 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
575 I != E; ++I) {
576
Marek Olsak79c05872016-11-25 17:37:09 +0000577 if (!HaveScalarStores && TII->isScalarStore(*I))
578 HaveScalarStores = true;
579
Matt Arsenault43e92fe2016-06-24 06:30:11 +0000580 if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
Tom Stellard30961762016-02-08 19:49:20 +0000581 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
582 // vccz bit, so when we detect that an instruction may read from a
583 // corrupt vccz bit, we need to:
584 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
585 // complete.
586 // 2. Restore the correct value of vccz by writing the current value
587 // of vcc back to vcc.
588
589 if (TII->isSMRD(I->getOpcode())) {
590 VCCZCorrupt = true;
591 } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
592 // FIXME: We only care about SMRD instructions here, not LDS or GDS.
593 // Whenever we store a value in vcc, the correct value of vccz is
594 // restored.
595 VCCZCorrupt = false;
596 }
597
598 // Check if we need to apply the bug work-around
Matt Arsenault52f14ec2016-11-07 19:09:27 +0000599 if (VCCZCorrupt && readsVCCZ(*I)) {
Tom Stellard30961762016-02-08 19:49:20 +0000600 DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
601
602 // Wait on everything, not just LGKM. vccz reads usually come from
603 // terminators, and we always wait on everything at the end of the
604 // block, so if we only wait on LGKM here, we might end up with
605 // another s_waitcnt inserted right after this if there are non-LGKM
606 // instructions still outstanding.
607 insertWait(MBB, I, LastIssued);
608
609 // Restore the vccz bit. Any time a value is written to vcc, the vcc
610 // bit is updated, so we can restore the bit by reading the value of
611 // vcc and then writing it back to the register.
612 BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
613 AMDGPU::VCC)
Matt Arsenault52f14ec2016-11-07 19:09:27 +0000614 .addReg(AMDGPU::VCC);
Tom Stellard30961762016-02-08 19:49:20 +0000615 }
616 }
617
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000618 // Record pre-existing, explicitly requested waits
619 if (I->getOpcode() == AMDGPU::S_WAITCNT) {
620 handleExistingWait(*I);
Duncan P. N. Exon Smith4d295112016-07-08 19:16:05 +0000621 RemoveMI.push_back(&*I);
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000622 continue;
623 }
Marek Olsak1bd24632015-02-03 17:37:52 +0000624
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000625 Counters Required;
626
627 // Wait for everything before a barrier.
628 //
629 // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
630 // but we also want to wait for any other outstanding transfers before
631 // signalling other hardware blocks
Konstantin Zhuravlyovd7bdf242016-09-30 16:50:36 +0000632 if ((I->getOpcode() == AMDGPU::S_BARRIER &&
Konstantin Zhuravlyovbe6c0ca2017-06-02 17:40:26 +0000633 !ST->hasAutoWaitcntBeforeBarrier()) ||
Jan Veselyd48445d2017-01-04 18:06:55 +0000634 I->getOpcode() == AMDGPU::S_SENDMSG ||
635 I->getOpcode() == AMDGPU::S_SENDMSGHALT)
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000636 Required = LastIssued;
637 else
638 Required = handleOperands(*I);
639
640 Counters Increment = getHwCounts(*I);
641
642 if (countersNonZero(Required) || countersNonZero(Increment))
643 increaseCounters(Required, DelayedWaitOn);
644
645 Changes |= insertWait(MBB, I, Required);
646
647 pushInstruction(MBB, I, Increment);
Marek Olsak1bd24632015-02-03 17:37:52 +0000648 handleSendMsg(MBB, I);
Marek Olsak79c05872016-11-25 17:37:09 +0000649
650 if (I->getOpcode() == AMDGPU::S_ENDPGM ||
Matt Arsenault5b20fbb2017-03-21 22:18:10 +0000651 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
Marek Olsak79c05872016-11-25 17:37:09 +0000652 EndPgmBlocks.push_back(&MBB);
Tom Stellardc4cabef2013-01-18 21:15:53 +0000653 }
654
Matt Arsenault52d1b622017-03-08 01:06:58 +0000655 // Wait for everything at the end of the MBB. If there is only one
656 // successor, we can defer this until the uses there.
657 if (!hasTrivialSuccessor(MBB))
658 Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
Tom Stellardc4cabef2013-01-18 21:15:53 +0000659 }
660
Marek Olsak79c05872016-11-25 17:37:09 +0000661 if (HaveScalarStores) {
662 // If scalar writes are used, the cache must be flushed or else the next
663 // wave to reuse the same scratch memory can be clobbered.
664 //
665 // Insert s_dcache_wb at wave termination points if there were any scalar
666 // stores, and only if the cache hasn't already been flushed. This could be
667 // improved by looking across blocks for flushes in postdominating blocks
668 // from the stores but an explicitly requested flush is probably very rare.
669 for (MachineBasicBlock *MBB : EndPgmBlocks) {
670 bool SeenDCacheWB = false;
671
672 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
673 I != E; ++I) {
674
675 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
676 SeenDCacheWB = true;
677 else if (TII->isScalarStore(*I))
678 SeenDCacheWB = false;
679
680 // FIXME: It would be better to insert this before a waitcnt if any.
681 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
Matt Arsenault5b20fbb2017-03-21 22:18:10 +0000682 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
Marek Olsak79c05872016-11-25 17:37:09 +0000683 Changes = true;
684 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
685 }
686 }
687 }
688 }
689
Nicolai Haehnlef66bdb52016-04-27 15:46:01 +0000690 for (MachineInstr *I : RemoveMI)
691 I->eraseFromParent();
692
Matt Arsenault9ac40022017-04-11 22:29:31 +0000693 if (!MFI->isEntryFunction()) {
694 // Wait for any outstanding memory operations that the input registers may
695 // depend on. We can't track them and it's better to to the wait after the
696 // costly call sequence.
697
698 // TODO: Could insert earlier and schedule more liberally with operations
699 // that only use caller preserved registers.
700 MachineBasicBlock &EntryBB = MF.front();
701 BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
702 .addImm(0);
703
704 Changes = true;
705 }
706
Tom Stellardc4cabef2013-01-18 21:15:53 +0000707 return Changes;
708}