//===-- SIFoldOperands.cpp - Fold operands ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

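// Describes one pending fold: the use instruction and operand slot being
// folded into, the value to fold (register, immediate, frame index, or global
// address), and whether the use had to be commuted or shrunk to a 32-bit
// encoding first.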
struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false,
                int ShrinkOp = -1) :
    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
    Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg() || FoldOp->isGlobal());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  bool isCommuted() const {
    return Commuted;
  }

  bool needsShrink() const {
    return ShrinkOpcode != -1;
  }

  int getShrinkOpcode() const {
    return ShrinkOpcode;
  }
};

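// Machine function pass that folds the source of a mov/copy (an immediate,
// frame index, global address, or register) directly into the instructions
// that use it, along with related peepholes such as clamp and omod folding.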
class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F32_e64: {
    // Special case for mac. Since this is replaced with mad when folded into
    // src2, we need to check the legality for the final instruction.
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (static_cast<int>(OpNo) == Src2Idx) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;

      unsigned Opc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
      const MCInstrDesc &MadDesc = TII->get(Opc);
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
    }
    return false;
  }
  default:
    return false;
  }
}

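// Returns true if a frame index operand can be folded into this use: only
// MUBUF and flat-scratch accesses through their vaddr operand qualify.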
// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
static bool frameIndexMayFold(const SIInstrInfo *TII,
                              const MachineInstr &UseMI,
                              int OpNo,
                              const MachineOperand &OpToFold) {
  return OpToFold.isFI() &&
    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
    OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

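// Apply a previously recorded fold to its use instruction, rewriting the
// operand to the immediate, frame index, global address, or register being
// folded. Handles the special cases of packed (op_sel) immediates and of
// shrinking a carry-out VOP3 add/sub to its 32-bit encoding first.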
static bool updateOperand(FoldCandidate &Fold,
                          const SIInstrInfo &TII,
                          const TargetRegisterInfo &TRI,
                          const GCNSubtarget &ST) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
        AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
                                       ST.hasInv2PiInlineImm())) {
      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
      // already set.
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
      int ModIdx = -1;
      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
        ModIdx = AMDGPU::OpName::src0_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
        ModIdx = AMDGPU::OpName::src1_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
        ModIdx = AMDGPU::OpName::src2_modifiers;
      assert(ModIdx != -1);
      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
      MachineOperand &Mod = MI->getOperand(ModIdx);
      unsigned Val = Mod.getImm();
      if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
        return false;
      // Only apply the following transformation if that operand requires
      // a packed immediate.
      switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
      case AMDGPU::OPERAND_REG_IMM_V2FP16:
      case AMDGPU::OPERAND_REG_IMM_V2INT16:
      case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
      case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
        // If upper part is all zero we do not need op_sel_hi.
        if (!isUInt<16>(Fold.ImmToFold)) {
          if (!(Fold.ImmToFold & 0xffff)) {
            Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
            Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
            Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
            return true;
          }
          Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
          return true;
        }
        break;
      default:
        break;
      }
    }
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
    if (Liveness != MachineBasicBlock::LQR_Dead)
      return false;

    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    int Op32 = Fold.getShrinkOpcode();
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
    unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
        .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->RemoveOperand(I);
    MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));

    if (Fold.isCommuted())
      TII.commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
  Old.setIsUndef(New->isUndef());
  return true;
}

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

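// Record OpToFold as a fold candidate for operand OpNo of MI if it is (or can
// be made) legal there, trying mac->mad/fma promotion, the s_setreg_b32
// immediate form, and operand commutation before giving up.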
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
         Opc == AMDGPU::V_FMAC_F32_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
      unsigned NewOpc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);

      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_setreg_b32
    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
      return true;
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    unsigned CommuteOpNo = OpNo;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        CommuteOpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        CommuteOpNo = CommuteIdx0;
    }

    // One of operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc == AMDGPU::V_ADD_I32_e64 ||
           Opc == AMDGPU::V_SUB_I32_e64 ||
           Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
          (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

        // Verify the other operand is a VGPR, otherwise we would violate the
        // constant bus restriction.
        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
        if (!OtherOp.isReg() ||
            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
          return false;

        assert(MI->getOperand(1).isDef());

        // Make sure to get the 32-bit version of the commuted opcode.
        unsigned MaybeCommutedOpc = MI->getOpcode();
        int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);

        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
                                         Op32));
        return true;
      }

      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
      return false;
    }

    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
    return true;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const SIInstrInfo *TII,
                            const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef() && !TII->isSDWA(MI);
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

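// Try to fold OpToFold into the use at UseOpIdx of UseMI, recursing through
// REG_SEQUENCE uses and converting foldable copies into movs. Successful
// candidates are queued in FoldList; copies that must become movs are queued
// in CopiesToReplace.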
void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  int UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands, only if it is a full
    // copy since a subregister use tied to a full register def doesn't really
    // make sense. e.g. don't fold:
    //
    // %1 = COPY %0:sub1
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
    //
    // into
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    MachineRegisterInfo::use_iterator Next;
    for (MachineRegisterInfo::use_iterator
           RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
         RSUse != RSE; RSUse = Next) {
      Next = std::next(RSUse);

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
    // Sanity check that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.
    MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
    if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
                           SOff->getReg() != MFI->getStackPtrOffsetReg()))
      return;

    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
        MFI->getScratchRSrcReg())
      return;

    // A frame index will resolve to a positive constant, so it should always be
    // safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
    SOff->setReg(MFI->getStackPtrOffsetReg());
    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    unsigned DestReg = UseMI->getOperand(0).getReg();
    const TargetRegisterClass *DestRC
      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
      MRI->getRegClass(DestReg) :
      TRI->getPhysRegClass(DestReg);

    unsigned SrcReg = UseMI->getOperand(1).getReg();
    if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
        TargetRegisterInfo::isVirtualRegister(SrcReg)) {
      const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
      if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
        MachineRegisterInfo::use_iterator NextUse;
        SmallVector<FoldCandidate, 4> CopyUses;
        for (MachineRegisterInfo::use_iterator
               Use = MRI->use_begin(DestReg), E = MRI->use_end();
             Use != E; Use = NextUse) {
          NextUse = std::next(Use);
          FoldCandidate FC = FoldCandidate(Use->getParent(),
                                           Use.getOperandNo(),
                                           &UseMI->getOperand(1));
          CopyUses.push_back(FC);
        }
        for (auto &F : CopyUses) {
          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
                      FoldList, CopiesToReplace);
        }
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
        TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
        TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
        !UseMI->getOperand(1).getSubReg()) {
      UseMI->getOperand(1).setReg(OpToFold.getReg());
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);
      return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
         AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        // FIXME: ChangeToImmediate should clear subreg
        UseMI->getOperand(1).setSubReg(0);
        if (OpToFold.isImm())
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        else
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
        UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes.  Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseOp.isImplicit() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImmLike) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities.  The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
    TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}

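// Constant-fold a two-source bitwise or shift opcode, writing the folded
// value to Result. Returns false for opcodes this helper does not handle.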
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
    Desc.getNumImplicitUses() +
    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

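// Look through a register operand to the immediate it was materialized from
// (a move-immediate def of a virtual register without a subregister);
// otherwise return the operand itself.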
static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (Op.isReg()) {
    // If this has a subregister, it obviously is a register source.
    if (Op.getSubReg() != AMDGPU::NoSubRegister ||
        !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
      return &Op;

    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &ImmSrc = Def->getOperand(1);
      if (ImmSrc.isImm())
        return &ImmSrc;
    }
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI,
                              MachineOperand *ImmOp) {
  unsigned Opc = MI->getOpcode();
  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
    if (Src0->isImm() && Src0->getImm() == 0) {
      // v_lshl_or_b32 0, X, Y -> copy Y
      // v_lshl_or_b32 0, X, K -> v_mov_b32 K
      bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
      MI->RemoveOperand(Src1Idx);
      MI->RemoveOperand(Src0Idx);

      MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
      return true;
    }
  }

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one.
static bool tryFoldInst(const SIInstrInfo *TII,
                        MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_CNDMASK_B32_e32    ||
      Opc == AMDGPU::V_CNDMASK_B32_e64    ||
      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
    int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
    int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
    if (Src1->isIdenticalTo(*Src0) &&
        (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
        (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
      LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
      auto &NewDesc =
          TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      if (Src2Idx != -1)
        MI->RemoveOperand(Src2Idx);
      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
      if (Src1ModIdx != -1)
        MI->RemoveOperand(Src1ModIdx);
      if (Src0ModIdx != -1)
        MI->RemoveOperand(Src0ModIdx);
      mutateCopyOp(*MI, NewDesc);
      LLVM_DEBUG(dbgs() << *MI << '\n');
      return true;
    }
  }

  return false;
}

| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 887 | void SIFoldOperands::foldInstOperand(MachineInstr &MI, | 
|  | 888 | MachineOperand &OpToFold) const { | 
|  | 889 | // We need to mutate the operands of new mov instructions to add implicit | 
|  | 890 | // uses of EXEC, but adding them invalidates the use_iterator, so defer | 
|  | 891 | // this. | 
|  | 892 | SmallVector<MachineInstr *, 4> CopiesToReplace; | 
|  | 893 | SmallVector<FoldCandidate, 4> FoldList; | 
|  | 894 | MachineOperand &Dst = MI.getOperand(0); | 
|  | 895 |  | 
| Nicolai Haehnle | 2710171 | 2019-06-25 11:52:30 +0000 | [diff] [blame] | 896 | bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 897 | if (FoldingImm) { | 
|  | 898 | unsigned NumLiteralUses = 0; | 
|  | 899 | MachineOperand *NonInlineUse = nullptr; | 
|  | 900 | int NonInlineUseOpNo = -1; | 
|  | 901 |  | 
| Vitaly Buka | 7450398 | 2017-10-15 05:35:02 +0000 | [diff] [blame] | 902 | MachineRegisterInfo::use_iterator NextUse; | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 903 | for (MachineRegisterInfo::use_iterator | 
|  | 904 | Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); | 
|  | 905 | Use != E; Use = NextUse) { | 
|  | 906 | NextUse = std::next(Use); | 
|  | 907 | MachineInstr *UseMI = Use->getParent(); | 
|  | 908 | unsigned OpNo = Use.getOperandNo(); | 
|  | 909 |  | 
|  | 910 | // Folding the immediate may reveal operations that can be constant | 
|  | 911 | // folded or replaced with a copy. This can happen for example after | 
|  | 912 | // frame indices are lowered to constants or from splitting 64-bit | 
|  | 913 | // constants. | 
|  | 914 | // | 
|  | 915 | // We may also encounter cases where one or both operands are | 
|  | 916 | // immediates materialized into a register, which would ordinarily not | 
|  | 917 | // be folded due to multiple uses or operand constraints. | 
|  | 918 |  | 
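|  |  | // Illustrative sketch (not from the original source): after folding the 0 | 
|  |  | // from | 
|  |  | //   %0 = S_MOV_B32 0 | 
|  |  | //   %1 = V_AND_B32_e32 %0, %2 | 
|  |  | // the AND itself constant-folds to a mov of 0 via tryConstantFoldOp(). | 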
|  | 919 | if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) { | 
| Nicola Zaghen | d34e60c | 2018-05-14 12:53:11 +0000 | [diff] [blame] | 920 | LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n'); | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 921 |  | 
|  | 922 | // Some constant-folding cases change the same immediate's use to a new | 
|  | 923 | // instruction, e.g. and x, 0 -> 0, so make sure we re-visit the user. | 
|  | 924 | // The same constant-folded instruction could also have a second | 
|  | 925 | // use operand. | 
|  | 926 | NextUse = MRI->use_begin(Dst.getReg()); | 
| Nicolai Haehnle | a253e4c | 2017-07-18 14:54:41 +0000 | [diff] [blame] | 927 | FoldList.clear(); | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 928 | continue; | 
|  | 929 | } | 
|  | 930 |  | 
|  | 931 | // Try to fold any inline immediate uses, and then only fold other | 
|  | 932 | // constants if they have one use. | 
|  | 933 | // | 
|  | 934 | // The legality of the inline immediate must be checked based on the use | 
|  | 935 | // operand, not the defining instruction, because 32-bit instructions | 
|  | 936 | // with 32-bit inline immediate sources may be used to materialize | 
|  | 937 | // constants used in 16-bit operands. | 
|  | 938 | // | 
|  | 939 | // e.g. it is unsafe to fold: | 
|  | 940 | //  s_mov_b32 s0, 1.0    // materializes 0x3f800000 | 
|  | 941 | //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 | 
|  | 942 |  | 
|  | 943 | // Folding immediates with more than one use will increase program size. | 
|  | 944 | // FIXME: This will also reduce register usage, which may be better | 
|  | 945 | // in some cases. A better heuristic is needed. | 
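|  |  | // Illustrative sketch (not from the original source): with a literal that | 
|  |  | // is not an inline constant, e.g. | 
|  |  | //   %0 = S_MOV_B32 0x1234 | 
|  |  | //   %1 = V_OR_B32_e32 %0, %2 | 
|  |  | //   %3 = V_OR_B32_e32 %0, %4 | 
|  |  | // folding 0x1234 into both ORs would encode the 32-bit literal twice, so | 
|  |  | // it is only folded when there is a single such use. | 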
| Matt Arsenault | 69e3001 | 2017-01-11 22:00:02 +0000 | [diff] [blame] | 946 | if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 947 | foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); | 
| Matt Arsenault | 60957cb | 2019-06-24 14:53:56 +0000 | [diff] [blame] | 948 | } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) { | 
|  | 949 | foldOperand(OpToFold, UseMI, OpNo, FoldList, | 
|  | 950 | CopiesToReplace); | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 951 | } else { | 
|  | 952 | if (++NumLiteralUses == 1) { | 
|  | 953 | NonInlineUse = &*Use; | 
|  | 954 | NonInlineUseOpNo = OpNo; | 
|  | 955 | } | 
|  | 956 | } | 
|  | 957 | } | 
|  | 958 |  | 
|  | 959 | if (NumLiteralUses == 1) { | 
|  | 960 | MachineInstr *UseMI = NonInlineUse->getParent(); | 
|  | 961 | foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); | 
|  | 962 | } | 
|  | 963 | } else { | 
|  | 964 | // Folding register. | 
| Alexander Timofeev | 993e279 | 2019-01-03 19:55:32 +0000 | [diff] [blame] | 965 | SmallVector<MachineRegisterInfo::use_iterator, 4> UsesToProcess; | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 966 | for (MachineRegisterInfo::use_iterator | 
|  | 967 | Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); | 
|  | 968 | Use != E; ++Use) { | 
| Alexander Timofeev | 993e279 | 2019-01-03 19:55:32 +0000 | [diff] [blame] | 969 | UsesToProcess.push_back(Use); | 
|  | 970 | } | 
|  | 971 | for (auto U : UsesToProcess) { | 
|  | 972 | MachineInstr *UseMI = U->getParent(); | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 973 |  | 
| Alexander Timofeev | 993e279 | 2019-01-03 19:55:32 +0000 | [diff] [blame] | 974 | foldOperand(OpToFold, UseMI, U.getOperandNo(), | 
|  | 975 | FoldList, CopiesToReplace); | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 976 | } | 
|  | 977 | } | 
|  | 978 |  | 
|  | 979 | MachineFunction *MF = MI.getParent()->getParent(); | 
|  | 980 | // Make sure we add EXEC uses to any new v_mov instructions created. | 
|  | 981 | for (MachineInstr *Copy : CopiesToReplace) | 
|  | 982 | Copy->addImplicitDefUseOperands(*MF); | 
|  | 983 |  | 
|  | 984 | for (FoldCandidate &Fold : FoldList) { | 
| Stanislav Mekhanoshin | 5cf8167 | 2019-05-02 04:01:39 +0000 | [diff] [blame] | 985 | if (updateOperand(Fold, *TII, *TRI, *ST)) { | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 986 | // Clear kill flags. | 
|  | 987 | if (Fold.isReg()) { | 
|  | 988 | assert(Fold.OpToFold && Fold.OpToFold->isReg()); | 
|  | 989 | // FIXME: Probably shouldn't bother trying to fold if not an | 
|  | 990 | // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR | 
|  | 991 | // copies. | 
|  | 992 | MRI->clearKillFlags(Fold.OpToFold->getReg()); | 
|  | 993 | } | 
| Nicola Zaghen | d34e60c | 2018-05-14 12:53:11 +0000 | [diff] [blame] | 994 | LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " | 
|  | 995 | << static_cast<int>(Fold.UseOpNo) << " of " | 
|  | 996 | << *Fold.UseMI << '\n'); | 
| Stanislav Mekhanoshin | 70603dc | 2017-03-24 18:55:20 +0000 | [diff] [blame] | 997 | tryFoldInst(TII, Fold.UseMI); | 
| Stanislav Mekhanoshin | f154b4f | 2017-06-03 00:41:52 +0000 | [diff] [blame] | 998 | } else if (Fold.isCommuted()) { | 
|  | 999 | // Restore the instruction's original operand order if the fold failed. | 
|  | 1000 | TII->commuteInstruction(*Fold.UseMI, false); | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 1001 | } | 
|  | 1002 | } | 
|  | 1003 | } | 
|  | 1004 |  | 
| Matt Arsenault | f48e5c9 | 2017-10-05 00:13:20 +0000 | [diff] [blame] | 1005 | // Clamp patterns are canonically selected to v_max_* instructions, so only | 
|  | 1006 | // those are handled here. | 
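|  |  | // Illustrative sketch (not from the original source; operands shown | 
|  |  | // schematically): the pattern recognized here is a self-max with the clamp | 
|  |  | // bit set and no other modifiers, e.g. | 
|  |  | //   %1 = V_MAX_F32_e64 %0, %0, clamp | 
|  |  | // for which isClamp() returns the %0 operand. | 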
| Matt Arsenault | d5c6515 | 2017-02-22 23:27:53 +0000 | [diff] [blame] | 1007 | const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { | 
|  | 1008 | unsigned Op = MI.getOpcode(); | 
|  | 1009 | switch (Op) { | 
|  | 1010 | case AMDGPU::V_MAX_F32_e64: | 
| Matt Arsenault | 79a45db | 2017-02-22 23:53:37 +0000 | [diff] [blame] | 1011 | case AMDGPU::V_MAX_F16_e64: | 
| Matt Arsenault | ab4a5cd | 2017-08-31 23:53:50 +0000 | [diff] [blame] | 1012 | case AMDGPU::V_MAX_F64: | 
|  | 1013 | case AMDGPU::V_PK_MAX_F16: { | 
| Matt Arsenault | d5c6515 | 2017-02-22 23:27:53 +0000 | [diff] [blame] | 1014 | if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm()) | 
|  | 1015 | return nullptr; | 
|  | 1016 |  | 
|  | 1017 | // Make sure sources are identical. | 
|  | 1018 | const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | 
|  | 1019 | const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | 
| Stanislav Mekhanoshin | 286a422 | 2017-06-05 01:03:04 +0000 | [diff] [blame] | 1020 | if (!Src0->isReg() || !Src1->isReg() || | 
| Matt Arsenault | aafff87 | 2017-10-05 00:13:17 +0000 | [diff] [blame] | 1021 | Src0->getReg() != Src1->getReg() || | 
| Stanislav Mekhanoshin | 286a422 | 2017-06-05 01:03:04 +0000 | [diff] [blame] | 1022 | Src0->getSubReg() != Src1->getSubReg() || | 
| Matt Arsenault | d5c6515 | 2017-02-22 23:27:53 +0000 | [diff] [blame] | 1023 | Src0->getSubReg() != AMDGPU::NoSubRegister) | 
|  | 1024 | return nullptr; | 
|  | 1025 |  | 
|  | 1026 | // Can't fold up if we have modifiers. | 
| Matt Arsenault | ab4a5cd | 2017-08-31 23:53:50 +0000 | [diff] [blame] | 1027 | if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) | 
|  | 1028 | return nullptr; | 
|  | 1029 |  | 
|  | 1030 | unsigned Src0Mods | 
|  | 1031 | = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm(); | 
|  | 1032 | unsigned Src1Mods | 
|  | 1033 | = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm(); | 
|  | 1034 |  | 
|  | 1035 | // Having a 0 op_sel_hi would require swizzling the output in the source | 
|  | 1036 | // instruction, which we can't do. | 
| Stanislav Mekhanoshin | da644c0 | 2019-03-13 21:15:52 +0000 | [diff] [blame] | 1037 | unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 | 
|  | 1038 | : 0u; | 
| Matt Arsenault | ab4a5cd | 2017-08-31 23:53:50 +0000 | [diff] [blame] | 1039 | if (Src0Mods != UnsetMods && Src1Mods != UnsetMods) | 
| Matt Arsenault | d5c6515 | 2017-02-22 23:27:53 +0000 | [diff] [blame] | 1040 | return nullptr; | 
|  | 1041 | return Src0; | 
|  | 1042 | } | 
|  | 1043 | default: | 
|  | 1044 | return nullptr; | 
|  | 1045 | } | 
|  | 1046 | } | 
|  | 1047 |  | 
|  | 1048 | // A clamp necessarily has multiple register operand uses, since the register | 
|  | 1049 | // appears twice in the same instruction, so count using instructions instead. | 
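|  |  | // E.g. (schematic) in %2 = V_MAX_F32_e64 %1, %1, clamp the register %1 has | 
|  |  | // two operand uses but a single using instruction, which is the quantity | 
|  |  | // counted here. | 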
|  | 1050 | static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) { | 
|  | 1051 | int Count = 0; | 
|  | 1052 | for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end(); | 
|  | 1053 | I != E; ++I) { | 
|  | 1054 | if (++Count > 1) | 
|  | 1055 | return false; | 
|  | 1056 | } | 
|  | 1057 |  | 
|  | 1058 | return true; | 
|  | 1059 | } | 
|  | 1060 |  | 
| Matt Arsenault | 8cbb488 | 2017-09-20 21:01:24 +0000 | [diff] [blame] | 1061 | // FIXME: Clamp for v_mad_mixhi_f16 is handled during isel. | 
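|  |  | // Illustrative sketch (not from the original source; operands shown | 
|  |  | // schematically): given | 
|  |  | //   %1 = V_ADD_F32_e64 %a, %b            ; clamp not set | 
|  |  | //   %2 = V_MAX_F32_e64 %1, %1, clamp     ; matched by isClamp() | 
|  |  | // the clamp bit is moved onto the add, uses of %2 are rewritten to %1, and | 
|  |  | // the max is erased. | 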
| Matt Arsenault | d5c6515 | 2017-02-22 23:27:53 +0000 | [diff] [blame] | 1062 | bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { | 
|  | 1063 | const MachineOperand *ClampSrc = isClamp(MI); | 
|  | 1064 | if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg())) | 
|  | 1065 | return false; | 
|  | 1066 |  | 
|  | 1067 | MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg()); | 
| Matt Arsenault | ab4a5cd | 2017-08-31 23:53:50 +0000 | [diff] [blame] | 1068 |  | 
|  | 1069 | // The type of clamp must be compatible. | 
|  | 1070 | if (TII->getClampMask(*Def) != TII->getClampMask(MI)) | 
| Matt Arsenault | d5c6515 | 2017-02-22 23:27:53 +0000 | [diff] [blame] | 1071 | return false; | 
| Matt Arsenault | ab4a5cd | 2017-08-31 23:53:50 +0000 | [diff] [blame] | 1072 |  | 
| Matt Arsenault | d5c6515 | 2017-02-22 23:27:53 +0000 | [diff] [blame] | 1073 | MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp); | 
|  | 1074 | if (!DefClamp) | 
|  | 1075 | return false; | 
|  | 1076 |  | 
| Nicola Zaghen | d34e60c | 2018-05-14 12:53:11 +0000 | [diff] [blame] | 1077 | LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def | 
|  | 1078 | << '\n'); | 
| Matt Arsenault | d5c6515 | 2017-02-22 23:27:53 +0000 | [diff] [blame] | 1079 |  | 
|  | 1080 | // Clamp is applied after omod, so it is OK if omod is set. | 
|  | 1081 | DefClamp->setImm(1); | 
|  | 1082 | MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg()); | 
|  | 1083 | MI.eraseFromParent(); | 
|  | 1084 | return true; | 
|  | 1085 | } | 
|  | 1086 |  | 
| Matt Arsenault | 3cb3904 | 2017-02-27 19:35:42 +0000 | [diff] [blame] | 1087 | static int getOModValue(unsigned Opc, int64_t Val) { | 
|  | 1088 | switch (Opc) { | 
|  | 1089 | case AMDGPU::V_MUL_F32_e64: { | 
|  | 1090 | switch (static_cast<uint32_t>(Val)) { | 
|  | 1091 | case 0x3f000000: // 0.5 | 
|  | 1092 | return SIOutMods::DIV2; | 
|  | 1093 | case 0x40000000: // 2.0 | 
|  | 1094 | return SIOutMods::MUL2; | 
|  | 1095 | case 0x40800000: // 4.0 | 
|  | 1096 | return SIOutMods::MUL4; | 
|  | 1097 | default: | 
|  | 1098 | return SIOutMods::NONE; | 
|  | 1099 | } | 
|  | 1100 | } | 
|  | 1101 | case AMDGPU::V_MUL_F16_e64: { | 
|  | 1102 | switch (static_cast<uint16_t>(Val)) { | 
|  | 1103 | case 0x3800: // 0.5 | 
|  | 1104 | return SIOutMods::DIV2; | 
|  | 1105 | case 0x4000: // 2.0 | 
|  | 1106 | return SIOutMods::MUL2; | 
|  | 1107 | case 0x4400: // 4.0 | 
|  | 1108 | return SIOutMods::MUL4; | 
|  | 1109 | default: | 
|  | 1110 | return SIOutMods::NONE; | 
|  | 1111 | } | 
|  | 1112 | } | 
|  | 1113 | default: | 
|  | 1114 | llvm_unreachable("invalid mul opcode"); | 
|  | 1115 | } | 
|  | 1116 | } | 
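|  |  | // For reference (standard IEEE-754 encodings, not from the original | 
|  |  | // source): as f32, 0.5 = 0x3f000000, 2.0 = 0x40000000, 4.0 = 0x40800000; | 
|  |  | // as f16, 0.5 = 0x3800, 2.0 = 0x4000, 4.0 = 0x4400. The switches above key | 
|  |  | // directly on these raw source encodings of the multiplier. | 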
|  | 1117 |  | 
|  | 1118 | // FIXME: Does this really not support denormals with f16? | 
|  | 1119 | // FIXME: Does this need to check IEEE mode bit? SNaNs are generally not | 
|  | 1120 | // handled, so will anything other than that break? | 
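|  |  | // Illustrative sketch (not from the original source; operands shown | 
|  |  | // schematically): the patterns recognized below are | 
|  |  | //   %1 = V_MUL_F32_e64 %0, 0.5   -> (%0, SIOutMods::DIV2) | 
|  |  | //   %1 = V_MUL_F32_e64 %0, 2.0   -> (%0, SIOutMods::MUL2) | 
|  |  | //   %1 = V_MUL_F32_e64 %0, 4.0   -> (%0, SIOutMods::MUL4) | 
|  |  | //   %1 = V_ADD_F32_e64 %0, %0    -> (%0, SIOutMods::MUL2) | 
|  |  | // provided no source modifiers, clamp or omod are already present and | 
|  |  | // output denormals are disabled. | 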
|  | 1121 | std::pair<const MachineOperand *, int> | 
|  | 1122 | SIFoldOperands::isOMod(const MachineInstr &MI) const { | 
|  | 1123 | unsigned Op = MI.getOpcode(); | 
|  | 1124 | switch (Op) { | 
|  | 1125 | case AMDGPU::V_MUL_F32_e64: | 
|  | 1126 | case AMDGPU::V_MUL_F16_e64: { | 
|  | 1127 | // If output denormals are enabled, omod is ignored. | 
|  | 1128 | if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) || | 
|  | 1129 | (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals())) | 
|  | 1130 | return std::make_pair(nullptr, SIOutMods::NONE); | 
|  | 1131 |  | 
|  | 1132 | const MachineOperand *RegOp = nullptr; | 
|  | 1133 | const MachineOperand *ImmOp = nullptr; | 
|  | 1134 | const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | 
|  | 1135 | const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | 
|  | 1136 | if (Src0->isImm()) { | 
|  | 1137 | ImmOp = Src0; | 
|  | 1138 | RegOp = Src1; | 
|  | 1139 | } else if (Src1->isImm()) { | 
|  | 1140 | ImmOp = Src1; | 
|  | 1141 | RegOp = Src0; | 
|  | 1142 | } else | 
|  | 1143 | return std::make_pair(nullptr, SIOutMods::NONE); | 
|  | 1144 |  | 
|  | 1145 | int OMod = getOModValue(Op, ImmOp->getImm()); | 
|  | 1146 | if (OMod == SIOutMods::NONE || | 
|  | 1147 | TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || | 
|  | 1148 | TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || | 
|  | 1149 | TII->hasModifiersSet(MI, AMDGPU::OpName::omod) || | 
|  | 1150 | TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) | 
|  | 1151 | return std::make_pair(nullptr, SIOutMods::NONE); | 
|  | 1152 |  | 
|  | 1153 | return std::make_pair(RegOp, OMod); | 
|  | 1154 | } | 
|  | 1155 | case AMDGPU::V_ADD_F32_e64: | 
|  | 1156 | case AMDGPU::V_ADD_F16_e64: { | 
|  | 1157 | // If output denormals are enabled, omod is ignored. | 
|  | 1158 | if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) || | 
|  | 1159 | (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals())) | 
|  | 1160 | return std::make_pair(nullptr, SIOutMods::NONE); | 
|  | 1161 |  | 
|  | 1162 | // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x | 
|  | 1163 | const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | 
|  | 1164 | const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | 
|  | 1165 |  | 
|  | 1166 | if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() && | 
|  | 1167 | Src0->getSubReg() == Src1->getSubReg() && | 
|  | 1168 | !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) && | 
|  | 1169 | !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) && | 
|  | 1170 | !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) && | 
|  | 1171 | !TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) | 
|  | 1172 | return std::make_pair(Src0, SIOutMods::MUL2); | 
|  | 1173 |  | 
|  | 1174 | return std::make_pair(nullptr, SIOutMods::NONE); | 
|  | 1175 | } | 
|  | 1176 | default: | 
|  | 1177 | return std::make_pair(nullptr, SIOutMods::NONE); | 
|  | 1178 | } | 
|  | 1179 | } | 
|  | 1180 |  | 
|  | 1181 | // FIXME: Does this need to check IEEE bit on function? | 
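|  |  | // Illustrative sketch (not from the original source; operands shown | 
|  |  | // schematically): given | 
|  |  | //   %1 = V_ADD_F32_e64 %a, %b            ; omod not set | 
|  |  | //   %2 = V_MUL_F32_e64 %1, 4.0           ; recognized by isOMod() as MUL4 | 
|  |  | // the MUL4 output modifier is set on the add, uses of %2 are rewritten to | 
|  |  | // %1, and the multiply is erased. | 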
|  | 1182 | bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { | 
|  | 1183 | const MachineOperand *RegOp; | 
|  | 1184 | int OMod; | 
|  | 1185 | std::tie(RegOp, OMod) = isOMod(MI); | 
|  | 1186 | if (OMod == SIOutMods::NONE || !RegOp->isReg() || | 
|  | 1187 | RegOp->getSubReg() != AMDGPU::NoSubRegister || | 
|  | 1188 | !hasOneNonDBGUseInst(*MRI, RegOp->getReg())) | 
|  | 1189 | return false; | 
|  | 1190 |  | 
|  | 1191 | MachineInstr *Def = MRI->getVRegDef(RegOp->getReg()); | 
|  | 1192 | MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod); | 
|  | 1193 | if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE) | 
|  | 1194 | return false; | 
|  | 1195 |  | 
|  | 1196 | // Clamp is applied after omod. If the source already has clamp set, don't | 
|  | 1197 | // fold it. | 
|  | 1198 | if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp)) | 
|  | 1199 | return false; | 
|  | 1200 |  | 
| Nicola Zaghen | d34e60c | 2018-05-14 12:53:11 +0000 | [diff] [blame] | 1201 | LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n'); | 
| Matt Arsenault | 3cb3904 | 2017-02-27 19:35:42 +0000 | [diff] [blame] | 1202 |  | 
|  | 1203 | DefOMod->setImm(OMod); | 
|  | 1204 | MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg()); | 
|  | 1205 | MI.eraseFromParent(); | 
|  | 1206 | return true; | 
|  | 1207 | } | 
|  | 1208 |  | 
| Tom Stellard | 6596ba7 | 2014-11-21 22:06:37 +0000 | [diff] [blame] | 1209 | bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { | 
| Matthias Braun | f1caa28 | 2017-12-15 22:22:58 +0000 | [diff] [blame] | 1210 | if (skipFunction(MF.getFunction())) | 
| Andrew Kaylor | 7de74af | 2016-04-25 22:23:44 +0000 | [diff] [blame] | 1211 | return false; | 
|  | 1212 |  | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 1213 | MRI = &MF.getRegInfo(); | 
| Tom Stellard | 5bfbae5 | 2018-07-11 20:59:01 +0000 | [diff] [blame] | 1214 | ST = &MF.getSubtarget<GCNSubtarget>(); | 
| Matt Arsenault | d5c6515 | 2017-02-22 23:27:53 +0000 | [diff] [blame] | 1215 | TII = ST->getInstrInfo(); | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 1216 | TRI = &TII->getRegisterInfo(); | 
| Matt Arsenault | 60957cb | 2019-06-24 14:53:56 +0000 | [diff] [blame] | 1217 | MFI = MF.getInfo<SIMachineFunctionInfo>(); | 
| Matt Arsenault | 3cb3904 | 2017-02-27 19:35:42 +0000 | [diff] [blame] | 1218 |  | 
|  | 1219 | // omod is ignored by hardware if IEEE bit is enabled. omod also does not | 
|  | 1220 | // correctly handle signed zeros. | 
|  | 1221 | // | 
| Matt Arsenault | 055e4dc | 2019-03-29 19:14:54 +0000 | [diff] [blame] | 1222 | // FIXME: Also need to check strictfp | 
|  | 1223 | bool IsIEEEMode = MFI->getMode().IEEE; | 
| Matt Arsenault | 13b0db9 | 2018-08-12 08:44:25 +0000 | [diff] [blame] | 1224 | bool HasNSZ = MFI->hasNoSignedZerosFPMath(); | 
| Matt Arsenault | 3cb3904 | 2017-02-27 19:35:42 +0000 | [diff] [blame] | 1225 |  | 
| Matt Arsenault | ff3f912 | 2017-06-20 18:56:32 +0000 | [diff] [blame] | 1226 | for (MachineBasicBlock *MBB : depth_first(&MF)) { | 
| Tom Stellard | 6596ba7 | 2014-11-21 22:06:37 +0000 | [diff] [blame] | 1227 | MachineBasicBlock::iterator I, Next; | 
| Matt Arsenault | ff3f912 | 2017-06-20 18:56:32 +0000 | [diff] [blame] | 1228 | for (I = MBB->begin(); I != MBB->end(); I = Next) { | 
| Tom Stellard | 6596ba7 | 2014-11-21 22:06:37 +0000 | [diff] [blame] | 1229 | Next = std::next(I); | 
|  | 1230 | MachineInstr &MI = *I; | 
|  | 1231 |  | 
| Stanislav Mekhanoshin | 70603dc | 2017-03-24 18:55:20 +0000 | [diff] [blame] | 1232 | tryFoldInst(TII, &MI); | 
|  | 1233 |  | 
| Sam Kolton | 27e0f8b | 2017-03-31 11:42:43 +0000 | [diff] [blame] | 1234 | if (!TII->isFoldableCopy(MI)) { | 
| Matt Arsenault | 13b0db9 | 2018-08-12 08:44:25 +0000 | [diff] [blame] | 1235 | // TODO: Omod might be OK if there is NSZ only on the source | 
|  | 1236 | // instruction, and not the omod multiply. | 
|  | 1237 | if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || | 
|  | 1238 | !tryFoldOMod(MI)) | 
| Matt Arsenault | 3cb3904 | 2017-02-27 19:35:42 +0000 | [diff] [blame] | 1239 | tryFoldClamp(MI); | 
| Tom Stellard | 6596ba7 | 2014-11-21 22:06:37 +0000 | [diff] [blame] | 1240 | continue; | 
| Matt Arsenault | d5c6515 | 2017-02-22 23:27:53 +0000 | [diff] [blame] | 1241 | } | 
| Tom Stellard | 6596ba7 | 2014-11-21 22:06:37 +0000 | [diff] [blame] | 1242 |  | 
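|  |  | // A foldable copy is, schematically, something like %0 = V_MOV_B32_e32 7 | 
|  |  | // or %0 = COPY %1: operand 1 is the value this pass then tries to | 
|  |  | // propagate into every use of %0 (sketch, not from the original source). | 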
|  | 1243 | MachineOperand &OpToFold = MI.getOperand(1); | 
| Nicolai Haehnle | 2710171 | 2019-06-25 11:52:30 +0000 | [diff] [blame] | 1244 | bool FoldingImm = | 
|  | 1245 | OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); | 
| Tom Stellard | 26cc18d | 2015-01-07 22:18:27 +0000 | [diff] [blame] | 1246 |  | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 1247 | // FIXME: We could also be folding things like TargetIndexes. | 
| Tom Stellard | 0599297 | 2015-01-07 22:44:19 +0000 | [diff] [blame] | 1248 | if (!FoldingImm && !OpToFold.isReg()) | 
|  | 1249 | continue; | 
|  | 1250 |  | 
| Tom Stellard | 6596ba7 | 2014-11-21 22:06:37 +0000 | [diff] [blame] | 1251 | if (OpToFold.isReg() && | 
| Nicolai Haehnle | 82fc962 | 2016-01-07 17:10:29 +0000 | [diff] [blame] | 1252 | !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg())) | 
| Tom Stellard | 6596ba7 | 2014-11-21 22:06:37 +0000 | [diff] [blame] | 1253 | continue; | 
|  | 1254 |  | 
| Marek Olsak | 926c56f | 2016-01-13 11:44:29 +0000 | [diff] [blame] | 1255 | // Prevent folding operands backwards in the function. For example, | 
|  | 1256 | // the COPY opcode below must not be replaced by 1: | 
|  | 1257 | // | 
| Francis Visoiu Mistrih | a8a83d1 | 2017-12-07 10:40:31 +0000 | [diff] [blame] | 1258 | //    %3 = COPY %vgpr0; VGPR_32:%3 | 
| Marek Olsak | 926c56f | 2016-01-13 11:44:29 +0000 | [diff] [blame] | 1259 | //    ... | 
| Francis Visoiu Mistrih | a8a83d1 | 2017-12-07 10:40:31 +0000 | [diff] [blame] | 1260 | //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec | 
| Marek Olsak | 926c56f | 2016-01-13 11:44:29 +0000 | [diff] [blame] | 1261 | MachineOperand &Dst = MI.getOperand(0); | 
|  | 1262 | if (Dst.isReg() && | 
|  | 1263 | !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) | 
|  | 1264 | continue; | 
|  | 1265 |  | 
| Matt Arsenault | 51818c1 | 2017-01-10 23:32:04 +0000 | [diff] [blame] | 1266 | foldInstOperand(MI, OpToFold); | 
| Tom Stellard | 6596ba7 | 2014-11-21 22:06:37 +0000 | [diff] [blame] | 1267 | } | 
|  | 1268 | } | 
|  | 1269 | return false; | 
|  | 1270 | } |