//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together, we can add to the base pointer and
//   use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
  enum InstClassEnum {
    DS_READ_WRITE,
    S_BUFFER_LOAD_IMM,
    BUFFER_LOAD_OFFEN,
    BUFFER_LOAD_OFFSET,
  };

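  // Information about a candidate pair of memory operations to merge: the two
  // instructions, their offsets, the cache policy bits of each, whether the
  // ST64 forms or a rebased address (BaseOff) are needed, and the instructions
  // that must be moved below the merge point.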
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    bool IsX2;
    SmallVector<MachineInstr*, 8> InstsToMove;
  };

private:
  const SISubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  unsigned CreatedX2;

  static bool offsetsCanBeCombined(CombineInfo &CI);

  bool findMatchingInst(CombineInfo &CI);
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

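// Move all instructions in InstsToMove to just after I, preserving their
// relative order.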
static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
  // XXX: Should this be looking for implicit defs?
  for (const MachineOperand &Def : MI.defs())
    Defs.insert(Def.getReg());
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      DenseSet<unsigned> &Defs,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.

    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
      Insts.push_back(&MI);
      addDefsToList(MI, Defs);
      return true;
    }
  }

  return false;
}

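// Return true if every load/store in InstsToMove can be reordered with respect
// to MemOp, so that the whole list can safely be moved below it.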
static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

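// Check whether the two offsets in CI can be encoded in a single merged
// instruction, rewriting CI.Offset0/1 into the encoded form. DS offsets are
// expressed in units of CI.EltSize and must fit in 8 bits; if they do not,
// the ST64 variants (offsets in units of 64 elements) or a rebased address
// (CI.BaseOff) may still work. For example, two ds_read_b32 at byte offsets
// 0x400 and 0x404 can be merged by adding 0x400 to the base address and using
// offset0:0, offset1:1. SMEM and buffer loads instead require consecutive
// dword (or dword-pair for the x2 forms) offsets and matching cache policy
// bits.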
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // SMEM and VMEM offsets must be consecutive.
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == BUFFER_LOAD_OFFEN ||
      CI.InstClass == BUFFER_LOAD_OFFSET) {
    unsigned Diff = CI.IsX2 ? 2 : 1;
    return (EltOffset0 + Diff == EltOffset1 ||
            EltOffset1 + Diff == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

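// Scan forward from CI.I for an instruction with the same opcode and the same
// base address operands whose offset can be combined with CI.I's offset.
// Instructions that only need to be moved below the eventual merge point are
// collected in CI.InstsToMove. On success, CI.Paired and the offset and cache
// policy fields are filled in and true is returned.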
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  unsigned AddrOpName[3] = {0};
  int AddrIdx[3];
  const MachineOperand *AddrReg[3];
  unsigned NumAddresses = 0;

  switch (CI.InstClass) {
  case DS_READ_WRITE:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
    break;
  case S_BUFFER_LOAD_IMM:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
    break;
  case BUFFER_LOAD_OFFEN:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  case BUFFER_LOAD_OFFSET:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  default:
    llvm_unreachable("invalid InstClass");
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> DefsToMove;
  addDefsToList(*CI.I, DefsToMove);

  for ( ; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != CI.I->getOpcode()) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsToList(*MBBI, DefsToMove);
        continue;
      }

      // When we match I with another instruction, we will be moving I down to
      // the location of the matched instruction, so any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Paired = MBBI;

      if (CI.InstClass == DS_READ_WRITE) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

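// Merge the pair in CI into a single ds_read2_b32/_b64 (or the ST64 variant)
// that loads into a new super-register, then copy each half back into the
// original destination registers, e.g.
//   ds_read_b32 v0, v2 offset:16
//   ds_read_b32 v1, v2 offset:32
// becomes a ds_read2_b32 with offset0:4 offset1:8 followed by two copies.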
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
                                   : AMDGPU::DS_READ2_B64;

  if (CI.UseST64)
    Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
                            : AMDGPU::DS_READ2ST64_B64;

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
        .addImm(CI.BaseOff)
        .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags) // addr
          .addImm(NewOffset0)            // offset0
          .addImm(NewOffset1)            // offset1
          .addImm(0)                     // gds
          .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

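// Merge the pair in CI into a single ds_write2_b32/_b64 (or the ST64 variant),
// reusing the original data operands and placing the smaller offset first.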
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add(), and not .addReg() with these. We want to be sure we
  // preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
                                   : AMDGPU::DS_WRITE2_B64;

  if (CI.UseST64)
    Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                            : AMDGPU::DS_WRITE2ST64_B64;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = Addr->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
        .addImm(CI.BaseOff)
        .addReg(Addr->getReg());
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags) // addr
          .add(*Data0)                   // data0
          .add(*Data1)                   // data1
          .addImm(NewOffset0)            // offset0
          .addImm(NewOffset1)            // offset1
          .addImm(0)                     // gds
          .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

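// Merge two s_buffer_load_dword (or _dwordx2) instructions into the next wider
// s_buffer_load_dwordx2/x4 and copy the sub-registers back into the original
// destinations.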
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

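// Merge two buffer_load_dword (or _dwordx2) OFFEN/OFFSET instructions into the
// next wider dwordx2/dwordx4 form and copy the sub-registers back into the
// original destinations.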
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode;

  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  } else {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  }

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  if (CI.InstClass == BUFFER_LOAD_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Scan through looking for adjacent memory operations with constant offsets
// from the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    CombineInfo CI;
    CI.I = I;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
        (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
      // EltSize is in units of the offset encoding.
      CI.InstClass = S_BUFFER_LOAD_IMM;
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }
    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
        CI.InstClass = BUFFER_LOAD_OFFEN;
      else
        CI.InstClass = BUFFER_LOAD_OFFSET;

      CI.EltSize = 4;
      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<SISubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;
  CreatedX2 = 0;

  for (MachineBasicBlock &MBB : MF)
    Modified |= optimizeBlock(MBB);

  // Run again to convert x2 to x4.
  if (CreatedX2 >= 1) {
    for (MachineBasicBlock &MBB : MF)
      Modified |= optimizeBlock(MBB);
  }

  return Modified;
}