//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
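// Writes are merged in the same way, e.g.:
//  ds_write_b32 v2, v0 offset:16
//  ds_write_b32 v2, v1 offset:32
// ==>
//   ds_write2_b32 v2, v0, v1 offset0:4 offset1:8
//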
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together to fit in 8 bits after rebasing,
//   we can add to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
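  // The kind of instruction a CombineInfo describes. The class determines
  // which named operands form the address and which cache policy bits must
  // match for two instructions to be paired.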
  enum InstClassEnum {
    DS_READ_WRITE,
    S_BUFFER_LOAD_IMM,
    BUFFER_LOAD_OFFEN,
    BUFFER_LOAD_OFFSET,
    BUFFER_STORE_OFFEN,
    BUFFER_STORE_OFFSET,
  };

  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    bool IsX2;
    SmallVector<MachineInstr*, 8> InstsToMove;
  };

private:
  const SISubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  unsigned CreatedX2;

  static bool offsetsCanBeCombined(CombineInfo &CI);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
                                    bool &IsOffen) const;
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

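// Reinsert each instruction in InstsToMove immediately after I, preserving
// the original relative order.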
static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

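// Record MI's register defs in RegDefs and its physical-register reads in
// PhysRegUses so that later instructions can be tested for dependence on MI.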
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() &&
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      DenseSet<unsigned> &RegDefs,
                      DenseSet<unsigned> &PhysRegUses,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

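// Check that it is safe to reorder MemOp with respect to every memory access
// already queued for moving in InstsToMove.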
static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

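// Decide whether the two offsets in CI can be encoded in a single merged
// instruction. On success the offsets are rewritten into the units the merged
// encoding expects, selecting the stride-64 forms via CI.UseST64 and
// recording any required base-address adjustment in CI.BaseOff.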
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if (CI.InstClass != DS_READ_WRITE) {
    unsigned Diff = CI.IsX2 ? 2 : 1;
    return (EltOffset0 + Diff == EltOffset1 ||
            EltOffset1 + Diff == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
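  // For example, with EltSize == 4, byte offsets 1024 and 1032 give element
  // offsets 256 and 258, too large for 8 bits; rebasing at BaseOff == 1024
  // leaves encodable offsets 0 and 2.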
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

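// Scan forward from CI.I for an instruction that can be merged with it. On
// success, fill in CI.Paired together with the offsets and cache policy bits,
// and return true. Instructions that must be moved below the merged pair are
// collected in CI.InstsToMove along the way.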
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  unsigned AddrOpName[3] = {0};
  int AddrIdx[3];
  const MachineOperand *AddrReg[3];
  unsigned NumAddresses = 0;

  switch (CI.InstClass) {
  case DS_READ_WRITE:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
    break;
  case S_BUFFER_LOAD_IMM:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
    break;
  case BUFFER_LOAD_OFFEN:
  case BUFFER_STORE_OFFEN:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  case BUFFER_LOAD_OFFSET:
  case BUFFER_STORE_OFFSET:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for (; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != CI.I->getOpcode()) {
      // This is not a matching instruction, but we can keep looking as long
      // as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2.  Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another instruction, we will be moving I down to
      // the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Paired = MBBI;

      if (CI.InstClass == DS_READ_WRITE) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

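// The _gfx9 forms exist because LDS access on gfx9 no longer requires M0 to
// be initialized; choose the encoding based on the subtarget.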
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
}

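// Merge CI.I and CI.Paired into a single read2/read2st64, copy each half of
// the wide result back into the original destination registers, and return
// the iterator from which scanning should resume.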
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
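  // If the offsets were rebased, materialize the new base: move CI.BaseOff
  // into an SGPR and add it to the original address with a carry-less VALU
  // add (getAddNoCarry picks the right opcode for the subtarget).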
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Read2 =
    BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
      .addReg(BaseReg, BaseRegFlags) // addr
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 :
                          AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 :
                            AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add(), and not .addReg() with these. We want to be sure
  // we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
    TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
    TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Write2 =
    BuildMI(*MBB, CI.Paired, DL, Write2Desc)
      .addReg(BaseReg, BaseRegFlags) // addr
      .add(*Data0)                   // data0
      .add(*Data1)                   // data1
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode;

  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  } else {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  }

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  if (CI.InstClass == BUFFER_LOAD_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

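// Return the double-width opcode for a mergeable buffer store, setting IsX2
// if the input is already a two-dword store and IsOffen for the OFFEN
// addressing forms. Returns 0 for opcodes this pass does not handle.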
unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
  const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
  IsX2 = false;
  IsOffen = false;

  switch (I.getOpcode()) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
  }
  return 0;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  bool Unused1, Unused2;
  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

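  // Pack the two stored values into the wide source register with a
  // REG_SEQUENCE; the subregister indices were swapped above so the value
  // with the smaller offset lands in the low half.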
  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  if (CI.InstClass == BUFFER_STORE_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Scan through looking for adjacent memory operations with constant offsets
// from the same base register. We rely on the scheduler to do the hard work
// of clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    CombineInfo CI;
    CI.I = I;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {

      CI.InstClass = DS_READ_WRITE;
      CI.EltSize = (Opc == AMDGPU::DS_READ_B64 ||
                    Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
               Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
               Opc == AMDGPU::DS_WRITE_B64_gfx9) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64 ||
                    Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
      // EltSize is in units of the offset encoding.
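      // (A 4-byte element encodes as 1 on subtargets whose SMEM offsets are
      // in dwords, and as 4 where they are in bytes.)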
      CI.InstClass = S_BUFFER_LOAD_IMM;
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }
    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
        CI.InstClass = BUFFER_LOAD_OFFEN;
      else
        CI.InstClass = BUFFER_LOAD_OFFSET;

      CI.EltSize = 4;
      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    bool StoreIsX2, IsOffen;
    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
      CI.EltSize = 4;
      CI.IsX2 = StoreIsX2;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<SISubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    CreatedX2 = 0;
    Modified |= optimizeBlock(MBB);

    // Run again to convert x2 to x4.
    if (CreatedX2 >= 1)
      Modified |= optimizeBlock(MBB);
  }

  return Modified;
}