//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions
// that allows it to have a 13-bit constant offset, and then promotes that
// 13-bit offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly
//   run before scheduling. It currently misses stores of constants because
//   loading the constant into the data register is placed between the stores,
//   although this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together, we can add to the base
//   pointer and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
};

enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
};

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned Width0;
    unsigned Width1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool DLC0;
    bool DLC1;
    bool UseST64;
    SmallVector<MachineInstr *, 8> InstsToMove;
  };

  struct BaseRegisters {
    unsigned LoReg = 0;
    unsigned HiReg = 0;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  static bool offsetsCanBeCombined(CombineInfo &CI);
  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
  static unsigned getNewOpcode(const CombineInfo &CI);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
  unsigned getOpcodeWidth(const MachineInstr &MI);
  InstClassEnum getInstClass(unsigned Opc);
  unsigned getRegs(unsigned Opc);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                           int32_t NewOffset);
  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
  /// Promotes a constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset, which then gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() &&
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
                                  DenseSet<unsigned> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

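// Check whether the two offsets in CI can be expressed by one merged
// instruction, canonicalizing CI's offset fields in the process. SMEM/VMEM
// accesses must be exactly adjacent and have matching cache-policy bits.
// DS accesses try, in turn, the stride-64 (ST64) encoding, the plain 8-bit
// offset encoding, and finally a shifted base (CI.BaseOff) that brings both
// offsets back into range.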
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width0 == EltOffset1 ||
            EltOffset1 + CI.Width1 == EltOffset0) &&
           CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

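// Check that the combined width is one this pass can produce: only the x2
// and x4 S_BUFFER forms are handled, and the dwordx3 buffer variants need
// subtarget support.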
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI) {
  const unsigned Width = (CI.Width0 + CI.Width1);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

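// Return the width of MI's memory access in dwords, or 0 if unknown.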
unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();

  if (TII->isMUBUF(MI)) {
    return AMDGPU::getMUBUFDwords(Opc);
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return 4;
  }
}

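// Classify Opc into one of the instruction classes this pass can merge,
// mapping MUBUF opcodes through their base opcode so that all variants of a
// buffer access share one class. Returns UNKNOWN for anything else.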
InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);

    // If we couldn't identify the opcode, bail out.
    if (baseOpcode == -1) {
      return UNKNOWN;
    }

    switch (baseOpcode) {
    default:
      return UNKNOWN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      return BUFFER_LOAD_OFFEN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      return BUFFER_LOAD_OFFSET;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      return BUFFER_STORE_OFFEN;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      return BUFFER_STORE_OFFSET;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      return BUFFER_LOAD_OFFEN_exact;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      return BUFFER_LOAD_OFFSET_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      return BUFFER_STORE_OFFEN_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      return BUFFER_STORE_OFFSET_exact;
    }
  }

  switch (Opc) {
  default:
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  }
}

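// Return a bitmask of RegisterEnum values naming the address operands
// (addr, sbase, srsrc, soffset, vaddr) that Opc carries.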
unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    unsigned result = 0;

    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
      result |= VADDR;
    }

    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
      result |= SRSRC;
    }

    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
      result |= SOFFSET;
    }

    return result;
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return SBASE;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return ADDR;
  }
}

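// Scan forward from CI.I for a second instruction to merge with it: one of
// the same class whose address operands match and whose offsets can be
// combined. Intervening instructions that would block the merge are either
// queued in CI.InstsToMove, to be moved below the merged instruction, or
// end the search if they cannot be reordered safely.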
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc);

  if (InstClass == UNKNOWN) {
    return false;
  }

  const unsigned Regs = getRegs(Opc);

  unsigned AddrOpName[5] = {0};
  int AddrIdx[5];
  const MachineOperand *AddrReg[5];
  unsigned NumAddresses = 0;

  if (Regs & ADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  }

  if (Regs & SBASE) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  }

  if (Regs & SRSRC) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  }

  if (Regs & SOFFSET) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  }

  if (Regs & VADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for (; MBBI != E; ++MBBI) {
    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);

    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
        (IsDS && (MBBI->getOpcode() != Opc))) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another instruction we will be moving I down to
      // the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles or other ordered memory references.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx =
          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Width0 = getOpcodeWidth(*CI.I);
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Width1 = getOpcodeWidth(*MBBI);
      CI.Paired = MBBI;

      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
        CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
        CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
      }

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check whether we can move I across MBBI and whether we can move all
    // of I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
      break;
  }
  return false;
}

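// Pick the DS opcode variant for the subtarget: targets that still require
// M0 to be initialized for LDS access use the original encodings, while
// newer targets use the _gfx9 variants.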
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

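// Merge the two DS reads in CI into a single ds_read2 (or ds_read2st64)
// that loads into a wide register, then copy the two halves back into the
// original destination registers. If a base offset was chosen, materialize
// it and fold it into the address first.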
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC =
      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

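// Merge the two DS writes in CI into a single ds_write2 (or ds_write2st64),
// mirroring mergeRead2Pair but passing the two data operands through
// directly instead of splitting a destination register.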
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add(), and not .addReg() with these. We want to be sure
  // we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

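// Merge two S_BUFFER_LOAD_DWORD(xN)_IMM instructions into one wider load at
// the smaller of the two offsets, then copy each original result out of the
// merged destination via its subregister.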
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.DLC0)      // dlc
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

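// Merge two MUBUF loads into one wider load. Which address operands are
// emitted (vaddr/srsrc/soffset) depends on the merged opcode's addressing
// mode, as reported by getRegs().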
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  // Create the new, merged destination register.
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .addImm(CI.DLC0)      // dlc
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

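// Return the opcode that performs the merged access of Width0 + Width1
// dwords, e.g. two 1-dword S_BUFFER loads become S_BUFFER_LOAD_DWORDX2_IMM.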
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  const unsigned Width = CI.Width0 + CI.Width1;

  switch (CI.InstClass) {
  default:
    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    }
  }
}

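// Return the pair of subregister indices that address the first and second
// original values inside the merged register, ordered so that the access
// with the lower offset occupies the low subregisters.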
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  if (CI.Offset0 > CI.Offset1) {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
      case 3:
        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
      }
    }
  } else {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
      case 2:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
      case 3:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
      case 2:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
      }
    }
  }
}

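// Pick a register class wide enough for the merged result: scalar classes
// for S_BUFFER loads, vector classes for everything else.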
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SReg_128RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16:
      return &AMDGPU::SReg_512RegClass;
    }
  } else {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    }
  }
}

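// Merge two MUBUF stores: gather the two data operands into one wide source
// register with a REG_SEQUENCE, then emit a single wider store.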
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      .addImm(CI.DLC0)                          // dlc
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

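// Return Val as an immediate operand if it is an inline constant; otherwise
// materialize it into an SGPR with S_MOV_B32 and return a register operand.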
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
  unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  unsigned DeadCarryReg =
      MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               unsigned NewBase,
                                               int32_t NewOffset) {
  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

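// Return Op's value if it is an immediate, or the immediate moved by the
// S_MOV_B32 that defines it; otherwise return None.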
Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

1208// Analyze Base and extracts:
1209// - 32bit base registers, subregisters
1210// - 64bit constant offset
1211// Expecting base computation as:
1212// %OFFSET0:sgpr_32 = S_MOV_B32 8000
1213// %LO:vgpr_32, %c:sreg_64_xexec =
1214// V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1215// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1216// %Base:vreg_64 =
1217// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
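  // Stitch the two 32-bit halves back into the 64-bit constant offset; the
  // mask keeps the (possibly sign-extended) low half from clobbering the
  // high word.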
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

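// Try to fold part of MI's 64-bit constant address offset into the
// instruction's immediate offset field, reusing a nearby instruction's base
// computation as the anchor.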
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) {
  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  // TODO: Support stores.
  if (!MI.mayLoad())
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and the 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset is at the largest
  // legal 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // are within a 13-bit distance of &a + 4096. The heuristic picks &a + 8192
  // as the new base (anchor) because the larger distance can presumably
  // accommodate more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
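  // Note: with a 13-bit signed immediate offset, a legal distance lies in
  // [-4096, 4095]; load5 above is +4096 from the anchor, which is out of
  // range, so it keeps its original base.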
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
               << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    unsigned Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  // Contains the cached base address (base registers plus 64-bit constant
  // offset) extracted for each instruction already analyzed.
  MemInfoMap Visited;
  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM.
  SmallPtrSet<MachineInstr *, 4> AnchorList;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    const unsigned Opc = MI.getOpcode();

    CombineInfo CI;
    CI.I = I;
    CI.InstClass = getInstClass(Opc);

    switch (CI.InstClass) {
    default:
      break;
    case DS_READ:
      CI.EltSize =
          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                          : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case DS_WRITE:
      CI.EltSize =
          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                            : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case S_BUFFER_LOAD_IMM:
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
      } else {
        ++I;
      }
      continue;
    case BUFFER_LOAD_OFFEN:
    case BUFFER_LOAD_OFFSET:
    case BUFFER_LOAD_OFFEN_exact:
    case BUFFER_LOAD_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    case BUFFER_STORE_OFFEN:
    case BUFFER_STORE_OFFSET:
    case BUFFER_STORE_OFFEN_exact:
    case BUFFER_STORE_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
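    // Merging two loads can itself create a new mergeable pair (e.g. two x2
    // s_buffer loads becoming an x4), so reprocess the block until a pass
    // makes no further progress.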
    do {
      OptimizeAgain = false;
      Modified |= optimizeBlock(MBB);
    } while (OptimizeAgain);
  }

  return Modified;
}