//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Recomputing live intervals seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together that their differences fit in 8
//   bits, we can add to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
};

enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
};

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned Width0;
    unsigned Width1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    SmallVector<MachineInstr *, 8> InstsToMove;
  };

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  static bool offsetsCanBeCombined(CombineInfo &CI);
  static bool widthsFit(const CombineInfo &CI);
  static unsigned getNewOpcode(const CombineInfo &CI);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
  unsigned getOpcodeWidth(const MachineInstr &MI);
  InstClassEnum getInstClass(unsigned Opc);
  unsigned getRegs(unsigned Opc);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() &&
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
                                  DenseSet<unsigned> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    const SIInstrInfo *TII, AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

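// Return true if the two offsets recorded in CI can be encoded by a single
// merged instruction. For DS accesses this also rewrites CI.Offset0/1 into
// encodable element offsets and may set CI.BaseOff and CI.UseST64.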
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width0 == EltOffset1 ||
            EltOffset1 + CI.Width1 == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

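// Return true if a single merged instruction can cover the combined width
// (in dwords) of the two candidate instructions.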
bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) {
  const unsigned Width = (CI.Width0 + CI.Width1);
  switch (CI.InstClass) {
  default:
    return Width <= 4;
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

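// Return the width of MI's memory access in dwords, or 0 for opcodes this
// pass does not handle.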
unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();

  if (TII->isMUBUF(MI)) {
    return AMDGPU::getMUBUFDwords(Opc);
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return 4;
  }
}

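// Classify an opcode into one of the instruction classes this pass knows how
// to merge, or UNKNOWN otherwise.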
InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);

    // If we couldn't identify the opcode, bail out.
    if (baseOpcode == -1) {
      return UNKNOWN;
    }

    switch (baseOpcode) {
    default:
      return UNKNOWN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      return BUFFER_LOAD_OFFEN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      return BUFFER_LOAD_OFFSET;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      return BUFFER_STORE_OFFEN;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      return BUFFER_STORE_OFFSET;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      return BUFFER_LOAD_OFFEN_exact;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      return BUFFER_LOAD_OFFSET_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      return BUFFER_STORE_OFFEN_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      return BUFFER_STORE_OFFSET_exact;
    }
  }

  switch (Opc) {
  default:
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  }
}

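// Return a RegisterEnum bitmask of the address operands (ADDR, SBASE, SRSRC,
// SOFFSET, VADDR) used by the given opcode.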
unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    unsigned result = 0;

    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
      result |= VADDR;
    }

    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
      result |= SRSRC;
    }

    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
      result |= SOFFSET;
    }

    return result;
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return SBASE;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return ADDR;
  }
}

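// Scan forward from CI.I for an instruction that can be merged with it. On
// success CI.Paired, the offsets, widths and cache policy bits are recorded,
// along with any instructions that must be moved below the merge point.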
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc);

  if (InstClass == UNKNOWN) {
    return false;
  }

  const unsigned Regs = getRegs(Opc);

  unsigned AddrOpName[5] = {0};
  int AddrIdx[5];
  const MachineOperand *AddrReg[5];
  unsigned NumAddresses = 0;

  if (Regs & ADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  }

  if (Regs & SBASE) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  }

  if (Regs & SRSRC) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  }

  if (Regs & SOFFSET) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  }

  if (Regs & VADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for (; MBBI != E; ++MBBI) {
    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);

    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
        (IsDS && (MBBI->getOpcode() != Opc))) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

531 // When we match I with another DS instruction we will be moving I down
532 // to the location of the matched instruction any uses of I will need to
533 // be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx =
          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Width0 = getOpcodeWidth(*CI.I);
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Width1 = getOpcodeWidth(*MBBI);
      CI.Paired = MBBI;

      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (widthsFit(CI) && offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

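// Merge the paired DS reads in CI into a single ds_read2 (or ds_read2st64)
// and copy the merged result back into the original destination registers.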
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC =
      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

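// Merge the paired DS writes in CI into a single ds_write2 (or ds_write2st64)
// that stores both data operands.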
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add() and not .addReg() with these operands, so that the
  // subregister index and any register flags set on them are preserved.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

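// Merge two s_buffer_load_dword instructions into one wider s_buffer_load and
// copy the result into the original destination registers.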
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

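// Merge two MUBUF loads into a single wider load and copy the result into the
// original destination registers.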
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  // Copy to the new source register.
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

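// Return the opcode of the single instruction that replaces the pair
// described by CI, based on the combined width.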
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  const unsigned Width = CI.Width0 + CI.Width1;

  switch (CI.InstClass) {
  default:
    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    }
  }
}

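// Return the subregister indices that the two original values occupy within
// the merged wide register; the ordering depends on which instruction has the
// smaller offset.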
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  if (CI.Offset0 > CI.Offset1) {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
      case 3:
        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
      }
    }
  } else {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
      case 2:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
      case 3:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
      case 2:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
      }
    }
  }
}

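// Return a register class wide enough to hold the merged value: SGPR classes
// for S_BUFFER loads, VGPR classes otherwise.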
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SReg_128RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16:
      return &AMDGPU::SReg_512RegClass;
    }
  } else {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    }
  }
}

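// Merge two MUBUF stores: combine the source values with a REG_SEQUENCE and
// emit a single wider store.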
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    const unsigned Opc = MI.getOpcode();

    CombineInfo CI;
    CI.I = I;
    CI.InstClass = getInstClass(Opc);

    switch (CI.InstClass) {
    default:
      break;
    case DS_READ:
      CI.EltSize =
          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                          : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case DS_WRITE:
      CI.EltSize =
          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                            : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case S_BUFFER_LOAD_IMM:
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
      } else {
        ++I;
      }
      continue;
    case BUFFER_LOAD_OFFEN:
    case BUFFER_LOAD_OFFSET:
    case BUFFER_LOAD_OFFEN_exact:
    case BUFFER_LOAD_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    case BUFFER_STORE_OFFEN:
    case BUFFER_STORE_OFFSET:
    case BUFFER_STORE_OFFEN_exact:
    case BUFFER_STORE_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    do {
      OptimizeAgain = false;
      Modified |= optimizeBlock(MBB);
    } while (OptimizeAgain);
  }

  return Modified;
}