//===- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements a pass that converts X86 cmov instructions into
/// branches when profitable. This pass is conservative. It transforms if and
/// only if it can guarantee a gain with high confidence.
///
/// Thus, the optimization applies under the following conditions:
///   1. Consider as candidates only CMOVs in innermost loops (assume that
///      most hotspots are represented by these loops).
///   2. Given a group of CMOV instructions that are using the same EFLAGS def
///      instruction:
///      a. Consider them as candidates only if all have the same condition
///         code or the opposite one, to prevent generating more than one
///         conditional jump per EFLAGS def instruction.
///      b. Consider them as candidates only if all are profitable to be
///         converted (assume that one bad conversion may cause a degradation).
///   3. Apply conversion only for loops that were found profitable and only
///      for CMOV candidates that were found profitable.
///      a. A loop is considered profitable only if conversion will reduce its
///         depth cost by some threshold.
///      b. A CMOV is considered profitable if the cost of its condition is
///         higher than the average cost of its true-value and false-value by
///         25% of the branch-misprediction penalty (see the example below).
///         This ensures no degradation even with a 25% branch misprediction
///         rate.
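///
/// Illustrative example for condition (3b), using hypothetical numbers rather
/// than values from any particular scheduling model: if the subtarget's
/// branch-misprediction penalty is 20 cycles, a CMOV whose condition depth
/// exceeds the average depth of its value operands by at least 20 / 4 = 5
/// cycles is considered profitable to convert.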
///
/// Note: This pass is assumed to run on SSA machine code.
//
//===----------------------------------------------------------------------===//
//
//  External interfaces:
//      FunctionPass *llvm::createX86CmovConverterPass();
//      bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "x86-cmov-conversion"

STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");

namespace llvm {

void initializeX86CmovConverterPassPass(PassRegistry &);

} // end namespace llvm

// This internal switch can be used to turn off the cmov/branch optimization.
static cl::opt<bool>
    EnableCmovConverter("x86-cmov-converter",
                        cl::desc("Enable the X86 cmov-to-branch optimization."),
                        cl::init(true), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("x86-cmov-converter-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<bool> ForceMemOperand(
    "x86-cmov-converter-force-mem-operand",
    cl::desc("Convert cmovs to branches whenever they have memory operands."),
    cl::init(true), cl::Hidden);
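
// Usage sketch (an assumption for illustration, not part of this file): the
// hidden flags registered above can be toggled from the llc command line, e.g.
//   llc -x86-cmov-converter=false foo.ll           # disable the conversion
//   llc -x86-cmov-converter-threshold=8 foo.ll     # demand a larger gain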

namespace {

/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
  X86CmovConverterPass() : MachineFunctionPass(ID) {
    initializeX86CmovConverterPassPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "X86 cmov Conversion"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  MachineRegisterInfo *MRI;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  TargetSchedModel TSchedModel;

  /// List of consecutive CMOV instructions.
  using CmovGroup = SmallVector<MachineInstr *, 2>;
  using CmovGroups = SmallVector<CmovGroup, 2>;

  /// Collect all CMOV-group-candidates in \p Blocks and update \p
  /// CmovInstGroups accordingly.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in \p Blocks.
  /// \returns true iff it found any CMOV-group-candidate.
  bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                             CmovGroups &CmovInstGroups,
                             bool IncludeLoads = false);

  /// Check if it is profitable to transform each CMOV-group-candidate into a
  /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in \p Blocks.
  /// \returns true iff any CMOV-group-candidate remains.
  bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                                        CmovGroups &CmovInstGroups);

  /// Convert the given list of consecutive CMOV instructions into a branch.
  ///
  /// \param Group Consecutive CMOV instructions to be converted into a branch.
  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
};

} // end anonymous namespace

char X86CmovConverterPass::ID = 0;

void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
  AU.addRequired<MachineLoopInfo>();
}

bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  if (!EnableCmovConverter)
    return false;

  DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
               << "**********\n");

  bool Changed = false;
  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  MRI = &MF.getRegInfo();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  TSchedModel.init(&STI);

  // Before we handle the more subtle cases of register-register CMOVs inside
  // of potentially hot loops, we want to quickly remove all CMOVs with
  // a memory operand. Such a CMOV risks a stall waiting for the load to
  // complete, which speculative execution behind a branch is better suited to
  // handle on modern x86 chips.
  if (ForceMemOperand) {
    CmovGroups AllCmovGroups;
    SmallVector<MachineBasicBlock *, 4> Blocks;
    for (auto &MBB : MF)
      Blocks.push_back(&MBB);
    if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
      for (auto &Group : AllCmovGroups) {
        // Skip any group that doesn't do at least one memory operand cmov.
        if (!llvm::any_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
          continue;

        // For CMOV groups which we can rewrite and which contain a memory
        // load, always rewrite them. On x86, a CMOV will dramatically amplify
        // any memory latency by blocking speculative execution.
        Changed = true;
        convertCmovInstsToBranches(Group);
      }
    }
  }

  //===--------------------------------------------------------------------===//
  // Register-operand Conversion Algorithm
  // ---------
  //   For each innermost loop
  //     collectCmovCandidates() {
  //       Find all CMOV-group-candidates.
  //     }
  //
  //     checkForProfitableCmovCandidates() {
  //       * Calculate both loop-depth and optimized-loop-depth.
  //       * Use these depths to check for loop transformation profitability.
  //       * Check for CMOV-group-candidate transformation profitability.
  //     }
  //
  //   For each profitable CMOV-group-candidate
  //     convertCmovInstsToBranches() {
  //       * Create FalseBB, SinkBB, and a conditional branch to SinkBB.
  //       * Replace each CMOV instruction with a PHI instruction in SinkBB.
  //     }
  //
  // Note: For more details, see each function description.
  //===--------------------------------------------------------------------===//

  // Build up the loops in pre-order.
  SmallVector<MachineLoop *, 4> Loops(MLI.begin(), MLI.end());
  // Note that we need to check the size on each iteration as we accumulate
  // child loops.
  for (int i = 0; i < (int)Loops.size(); ++i)
    for (MachineLoop *Child : Loops[i]->getSubLoops())
      Loops.push_back(Child);

  for (MachineLoop *CurrLoop : Loops) {
    // Optimize only innermost loops.
    if (!CurrLoop->getSubLoops().empty())
      continue;

    // List of consecutive CMOV instructions to be processed.
    CmovGroups CmovInstGroups;

    if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
      continue;

    if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
                                          CmovInstGroups))
      continue;

    Changed = true;
    for (auto &Group : CmovInstGroups)
      convertCmovInstsToBranches(Group);
  }

  return Changed;
}

bool X86CmovConverterPass::collectCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
    bool IncludeLoads) {
  //===--------------------------------------------------------------------===//
  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
  //
  // CMOV-group:
  //   CMOV instructions, in the same MBB, that use the same EFLAGS def
  //   instruction.
  //
  // CMOV-group-candidate:
  //   CMOV-group where all the CMOV instructions are
  //     1. consecutive.
  //     2. have the same condition code or the opposite one.
  //     3. have only register operands (X86::CMOVrr).
  //===--------------------------------------------------------------------===//
  // List of possible improvements (TODOs):
  // --------------------------------------
  //   TODO: Add support for X86::CMOVrm instructions.
  //   TODO: Add support for X86::SETcc instructions.
  //   TODO: Add support for CMOV-groups with non-consecutive CMOV
  //         instructions.
  //===--------------------------------------------------------------------===//

  // Currently processed CMOV-Group.
  CmovGroup Group;
  for (auto *MBB : Blocks) {
    Group.clear();
    // Condition code of the first CMOV instruction in the currently processed
    // range, and its opposite condition code.
    X86::CondCode FirstCC, FirstOppCC, MemOpCC;
    // Indicator of a non-CMOVrr instruction in the currently processed range.
    bool FoundNonCMOVInst = false;
    // Indicator of whether the currently processed CMOV-group should be
    // skipped.
    bool SkipGroup = false;

    for (auto &I : *MBB) {
      // Skip debug instructions.
      if (I.isDebugValue())
        continue;
      X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
      // Check if we found an X86::CMOVrr instruction.
      if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) {
        if (Group.empty()) {
          // We found the first CMOV in the range; reset the flags.
          FirstCC = CC;
          FirstOppCC = X86::GetOppositeBranchCondition(CC);
          // Clear out the prior group's memory operand CC.
          MemOpCC = X86::COND_INVALID;
          FoundNonCMOVInst = false;
          SkipGroup = false;
        }
        Group.push_back(&I);
        // Check if it is a non-consecutive CMOV instruction or it has a
        // different condition code than FirstCC or FirstOppCC.
        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
          // Mark the SkipGroup indicator to skip the currently processed
          // CMOV-Group.
          SkipGroup = true;
        if (I.mayLoad()) {
          if (MemOpCC == X86::COND_INVALID)
            // The first memory operand CMOV.
            MemOpCC = CC;
          else if (CC != MemOpCC)
            // Can't handle mixed conditions with memory operands.
            SkipGroup = true;
        }
        // Check if we were relying on the zero-extending behavior of the CMOV.
        if (!SkipGroup &&
            llvm::any_of(
                MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
                [&](MachineInstr &UseI) {
                  return UseI.getOpcode() == X86::SUBREG_TO_REG;
                }))
          // FIXME: We should model the cost of using an explicit MOV to handle
          // the zero-extension rather than just refusing to handle this.
          SkipGroup = true;
        continue;
      }
      // If Group is empty, keep looking for the first CMOV in the range.
      if (Group.empty())
        continue;

      // We found a non-X86::CMOVrr instruction.
      FoundNonCMOVInst = true;
      // Check if this instruction defines EFLAGS to determine the end of the
      // processed range, as there would be no more instructions using the
      // current EFLAGS def.
      if (I.definesRegister(X86::EFLAGS)) {
        // Check if the currently processed CMOV-group should not be skipped
        // and add it as a CMOV-group-candidate.
        if (!SkipGroup)
          CmovInstGroups.push_back(Group);
        else
          ++NumOfSkippedCmovGroups;
        Group.clear();
      }
    }
    // The end of the basic block is considered the end of the range; check if
    // the currently processed CMOV-group should not be skipped and add it as
    // a CMOV-group-candidate.
    if (Group.empty())
      continue;
    if (!SkipGroup)
      CmovInstGroups.push_back(Group);
    else
      ++NumOfSkippedCmovGroups;
  }

  NumOfCmovGroupCandidate += CmovInstGroups.size();
  return !CmovInstGroups.empty();
}

/// \returns Depth of CMOV instruction as if it were converted into a branch.
/// \param TrueOpDepth depth cost of CMOV true-value operand.
/// \param FalseOpDepth depth cost of CMOV false-value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  //===--------------------------------------------------------------------===//
  // With no info about branch weight, we assume 50% for each value operand.
  // Thus, the depth of an optimized CMOV instruction is the rounded-up average
  // of its True-Operand-Value-Depth and False-Operand-Value-Depth.
  //===--------------------------------------------------------------------===//
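  // For example (hypothetical depths), with TrueOpDepth = 2 and
  // FalseOpDepth = 5 the exact average is 3.5 cycles, and the rounded-up
  // result below is (2 + 5 + 1) / 2 = 4.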
  return (TrueOpDepth + FalseOpDepth + 1) / 2;
}

bool X86CmovConverterPass::checkForProfitableCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
  struct DepthInfo {
    /// Depth of original loop.
    unsigned Depth;
    /// Depth of optimized loop.
    unsigned OptDepth;
  };
  /// Number of loop iterations to calculate depth for.
  static const unsigned LoopIterations = 2;
  DenseMap<MachineInstr *, DepthInfo> DepthMap;
  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
  /// For each register type, maps the register to its last def instruction.
  DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
  /// Maps a register operand to its def instruction, which can be nullptr if
  /// it is unknown (e.g., the operand is defined outside the loop).
  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;

  // Set the depth of an unknown instruction (i.e., nullptr) to zero.
  DepthMap[nullptr] = {0, 0};

  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
  for (auto &Group : CmovInstGroups)
    CmovInstructions.insert(Group.begin(), Group.end());

  //===--------------------------------------------------------------------===//
  // Step 1: Calculate instruction depth and loop depth.
  // Optimized-Loop:
  //   loop with CMOV-group-candidates converted into branches.
  //
  // Instruction-Depth:
  //   instruction latency + max operand depth.
  //   * For a CMOV instruction in the optimized loop the depth is calculated
  //     as:
  //       CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth)
  // TODO: Find a better way to estimate the latency of the branch instruction
  //       rather than using the CMOV latency.
  //
  // Loop-Depth:
  //   max instruction depth of all instructions in the loop.
  // Note: the instruction with max depth represents the critical path in the
  //       loop.
  //
  // Loop-Depth[i]:
  //   Loop-Depth calculated for the first `i` iterations.
  // Note: it is enough to calculate depth for up to two iterations.
  //
  // Depth-Diff[i]:
  //   Number of cycles saved in the first `i` iterations by optimizing the
  //   loop.
  //===--------------------------------------------------------------------===//
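  // For example (hypothetical latencies): an ADD with a latency of 1 cycle
  // whose two source operands are defined at depths 3 and 5 gets an
  // instruction depth of max(3, 5) + 1 = 6.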
  for (unsigned I = 0; I < LoopIterations; ++I) {
    DepthInfo &MaxDepth = LoopDepth[I];
    for (auto *MBB : Blocks) {
      // Clear the physical registers Def map.
      RegDefMaps[PhyRegType].clear();
      for (MachineInstr &MI : *MBB) {
        // Skip debug instructions.
        if (MI.isDebugValue())
          continue;
        unsigned MIDepth = 0;
        unsigned MIDepthOpt = 0;
        bool IsCMOV = CmovInstructions.count(&MI);
        for (auto &MO : MI.uses()) {
          // Check for "isUse()" as "uses()" also returns implicit definitions.
          if (!MO.isReg() || !MO.isUse())
            continue;
          unsigned Reg = MO.getReg();
          auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)];
          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
            OperandToDefMap[&MO] = DefMI;
            DepthInfo Info = DepthMap.lookup(DefMI);
            MIDepth = std::max(MIDepth, Info.Depth);
            if (!IsCMOV)
              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
          }
        }

        if (IsCMOV)
          MIDepthOpt = getDepthOfOptCmov(
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);

        // Iterate over all operands to handle implicit definitions as well.
        for (auto &MO : MI.operands()) {
          if (!MO.isReg() || !MO.isDef())
            continue;
          unsigned Reg = MO.getReg();
          RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI;
        }

        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
        DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
      }
    }
  }

  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};

  //===--------------------------------------------------------------------===//
  // Step 2: Check if the loop is worth optimizing.
  // Worth-Optimize-Loop:
  //   case 1: Diff[1] == Diff[0]
  //           The critical path is iteration independent - there is no
  //           dependency of critical-path instructions on critical-path
  //           instructions of the previous iteration.
  //           Thus, it is enough to check the gain percent of the 1st
  //           iteration - to be conservative, the optimized loop needs to
  //           have a depth at least 12.5% less than the original loop, per
  //           iteration.
  //
  //   case 2: Diff[1] > Diff[0]
  //           The critical path is iteration dependent - there is a
  //           dependency of critical-path instructions on critical-path
  //           instructions of the previous iteration.
  //           Thus, check the gain percent of the 2nd iteration (similar to
  //           the previous case), but it is also required to check the
  //           gradient of the gain - the change in Depth-Diff compared to
  //           the change in Loop-Depth between the 1st and 2nd iterations.
  //           To be conservative, the gradient needs to be at least 50%.
  //
  //   In addition, in order not to optimize loops with a very small gain, the
  //   gain (in cycles) after the 2nd iteration should not be less than a
  //   given threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must
  //   apply.
  //
  // If the loop is not worth optimizing, remove all CMOV-group-candidates.
  //===--------------------------------------------------------------------===//
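  // Illustrative example (hypothetical numbers): with LoopDepth[0].Depth = 20,
  // LoopDepth[1].Depth = 30, Diff[0] = 5 and Diff[1] = 11, case 2 applies:
  // the gradient (11 - 5) * 2 = 12 >= 30 - 20 = 10, the gain
  // 11 * 8 = 88 >= 30, and Diff[1] = 11 >= GainCycleThreshold = 4, so the
  // loop is considered worth optimizing.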
  if (Diff[1] < GainCycleThreshold)
    return false;

  bool WorthOptLoop = false;
  if (Diff[1] == Diff[0])
    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
  else if (Diff[1] > Diff[0])
    WorthOptLoop =
        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
        (Diff[1] * 8 >= LoopDepth[1].Depth);

  if (!WorthOptLoop)
    return false;

  ++NumOfLoopCandidate;

  //===--------------------------------------------------------------------===//
  // Step 3: Check for each CMOV-group-candidate if it is worth optimizing.
  // Worth-Optimize-Group:
  //   Iff it is worthwhile to optimize all CMOV instructions in the group.
  //
  // Worth-Optimize-CMOV:
  //   A predicted branch is faster than CMOV by the difference between the
  //   depth of the condition operand and the depth of the taken (predicted)
  //   value operand.
  //   To be conservative, the gain of such a CMOV transformation should cover
  //   at least 25% of the branch-misprediction penalty.
  //===--------------------------------------------------------------------===//
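  // For example (hypothetical numbers): with MispredictPenalty = 20,
  // CondCost = 10 and ValCost = 4, the check below computes
  // (10 - 4) * 4 = 24 >= 20, so the CMOV qualifies.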
  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
  CmovGroups TempGroups;
  std::swap(TempGroups, CmovInstGroups);
  for (auto &Group : TempGroups) {
    bool WorthOpGroup = true;
    for (auto *MI : Group) {
      // Avoid a CMOV instruction whose value is used as a pointer to load
      // from. This is another conservative check to avoid converting a CMOV
      // instruction used in a tree-search-like algorithm, where the branch is
      // unpredictable.
      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
      if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) {
        unsigned Op = UIs.begin()->getOpcode();
        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
          WorthOpGroup = false;
          break;
        }
      }

      unsigned CondCost =
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth;
      unsigned ValCost = getDepthOfOptCmov(
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
      if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
        WorthOpGroup = false;
        break;
      }
    }

    if (WorthOpGroup)
      CmovInstGroups.push_back(Group);
  }

  return !CmovInstGroups.empty();
}

static bool checkEFLAGSLive(MachineInstr *MI) {
  if (MI->killsRegister(X86::EFLAGS))
    return false;

  // The EFLAGS operand of MI might be missing a kill marker.
  // Figure out whether EFLAGS is live after the MI instruction.
  MachineBasicBlock *BB = MI->getParent();
  MachineBasicBlock::iterator ItrMI = MI;

  // Scan forward through BB for a use/def of EFLAGS.
  for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
    if (I->readsRegister(X86::EFLAGS))
      return true;
    if (I->definesRegister(X86::EFLAGS))
      return false;
  }

  // We hit the end of the block; check whether EFLAGS is live into a
  // successor.
  for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) {
    if ((*I)->isLiveIn(X86::EFLAGS))
      return true;
  }

  return false;
}

/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
/// group of CMOV instructions, which may contain debug instructions in
/// between, move all debug instructions to after the last CMOV instruction,
/// making the CMOV group consecutive.
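///
/// For example (schematic only, using the same informal notation as the
/// conversion diagram later in this file), the group
///   v1 = CMOVge t1, f1, cond ; DBG_VALUE x ; v2 = CMOVge t2, f2, cond
/// is packed into
///   v1 = CMOVge t1, f1, cond ; v2 = CMOVge t2, f2, cond ; DBG_VALUE x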
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
  assert(X86::getCondFromCMovOpc(Last->getOpcode()) != X86::COND_INVALID &&
         "Last instruction in a CMOV group must be a CMOV instruction");

  SmallVector<MachineInstr *, 2> DBGInstructions;
  for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
    if (I->isDebugValue())
      DBGInstructions.push_back(&*I);
  }

  // Splice the debug instructions after the cmov group.
  MachineBasicBlock *MBB = First->getParent();
  for (auto *MI : DBGInstructions)
    MBB->insertAfter(Last, MI->removeFromParent());
}

void X86CmovConverterPass::convertCmovInstsToBranches(
    SmallVectorImpl<MachineInstr *> &Group) const {
  assert(!Group.empty() && "No CMOV instructions to convert");
  ++NumOfOptimizedCmovGroups;

  // If the CMOV group is not packed, e.g., there are debug instructions
  // between the first CMOV and the last CMOV, then pack the group and make the
  // CMOV instructions consecutive by moving the debug instructions to after
  // the last CMOV.
  packCmovGroup(Group.front(), Group.back());

  // To convert a CMOVcc instruction, we actually have to insert the diamond
  // control-flow pattern. The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.

  // Before
  // -----
  // MBB:
  //   cond = cmp ...
  //   v1 = CMOVge t1, f1, cond
  //   v2 = CMOVlt t2, f2, cond
  //   v3 = CMOVge v1, f3, cond
  //
  // After
  // -----
  // MBB:
  //   cond = cmp ...
  //   jge %SinkMBB
  //
  // FalseMBB:
  //   jmp %SinkMBB
  //
  // SinkMBB:
  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
  //                                          ; true-value with false-value
  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
  //                                          ; previous Phi instruction result

  MachineInstr &MI = *Group.front();
  MachineInstr *LastCMOV = Group.back();
  DebugLoc DL = MI.getDebugLoc();

  X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  // Potentially swap the condition codes so that any memory operand to a CMOV
  // is in the *false* position instead of the *true* position. We can invert
  // any non-memory operand CMOV instructions to cope with this and we ensure
  // memory operand CMOVs are only included with a single condition code.
  if (llvm::any_of(Group, [&](MachineInstr *I) {
        return I->mayLoad() && X86::getCondFromCMovOpc(I->getOpcode()) == CC;
      }))
    std::swap(CC, OppCC);

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction::iterator It = ++MBB->getIterator();
  MachineFunction *F = MBB->getParent();
  const BasicBlock *BB = MBB->getBasicBlock();

  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  if (checkEFLAGSLive(LastCMOV)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Add the false and sink blocks as its successors.
  MBB->addSuccessor(FalseMBB);
  MBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction.
  BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB);

  // Add the sink block to the false block successors.
  FalseMBB->addSuccessor(SinkMBB);

  MachineInstrBuilder MIB;
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // First we need to insert an explicit load on the false path for any memory
  // operand. We also need to potentially do register rewriting here, but it is
  // simpler as the memory operands are always on the false path so we can
  // simply take that input, whatever it is.
  DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
    auto &MI = *MIIt++;
    // Skip any CMOVs in this group which don't load from memory.
    if (!MI.mayLoad()) {
      // Remember the false-side register input.
      unsigned FalseReg =
          MI.getOperand(X86::getCondFromCMovOpc(MI.getOpcode()) == CC ? 1 : 2)
              .getReg();
      // Walk back through any intermediate cmovs referenced.
      while (true) {
        auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
        if (FRIt == FalseBBRegRewriteTable.end())
          break;
        FalseReg = FRIt->second;
      }
      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
      continue;
    }

    // The condition must be the *opposite* of the one we've decided to branch
    // on as the branch will go *around* the load and the load should happen
    // when the CMOV condition is false.
    assert(X86::getCondFromCMovOpc(MI.getOpcode()) == OppCC &&
           "Can only handle memory-operand cmov instructions with a condition "
           "opposite to the selected branch direction.");

    // The goal is to rewrite the cmov from:
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), (mem)
    //
    // to
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), %C
    //   FalseMBB:
    //     %C = MOV (mem)
    //
    // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
    //
    //   MBB:
    //     JMP!cc SinkMBB
    //   FalseMBB:
    //     %C = MOV (mem)
    //   SinkMBB:
    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]

    // Get a fresh register to use as the destination of the MOV.
    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
    unsigned TmpReg = MRI->createVirtualRegister(RC);

    SmallVector<MachineInstr *, 4> NewMIs;
    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
                                             /*UnfoldLoad*/ true,
                                             /*UnfoldStore*/ false, NewMIs);
    (void)Unfolded;
    assert(Unfolded && "Should never fail to unfold a loading cmov!");

    // Move the new CMOV to just before the old one and reset any impacted
    // iterator.
    auto *NewCMOV = NewMIs.pop_back_val();
    assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC &&
           "Last new instruction isn't the expected CMOV!");
    DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
    if (&*MIItBegin == &MI)
      MIItBegin = MachineBasicBlock::iterator(NewCMOV);

    // Sink whatever instructions were needed to produce the unfolded operand
    // into the false block.
    for (auto *NewMI : NewMIs) {
      DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
      FalseMBB->insert(FalseInsertionPoint, NewMI);
      // Re-map any operands that are from other cmovs to the inputs for this
      // block.
      for (auto &MOp : NewMI->uses()) {
        if (!MOp.isReg())
          continue;
        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
        if (It == FalseBBRegRewriteTable.end())
          continue;

        MOp.setReg(It->second);
        // This might have been a kill when it referenced the cmov result, but
        // it won't necessarily be once rewritten.
        // FIXME: We could potentially improve this by tracking whether the
        // operand to the cmov was also a kill, and then skipping the PHI node
        // construction below.
        MOp.setIsKill(false);
      }
    }
    MBB->erase(MachineBasicBlock::iterator(MI),
               std::next(MachineBasicBlock::iterator(MI)));

    // Add this CMOV's rewrite to the rewrite table.
    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
  }

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from earlier PHIs'
  // destination registers to the registers that went into the PHI.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    unsigned DestReg = MIIt->getOperand(0).getReg();
    unsigned Op1Reg = MIIt->getOperand(1).getReg();
    unsigned Op2Reg = MIIt->getOperand(2).getReg();

    // If the CMOV we are processing has the opposite condition from the jump
    // we generated, then we have to swap the operands for the PHI that is
    // going to be generated.
    if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC)
      std::swap(Op1Reg, Op2Reg);

    auto Op1Itr = RegRewriteTable.find(Op1Reg);
    if (Op1Itr != RegRewriteTable.end())
      Op1Reg = Op1Itr->second.first;

    auto Op2Itr = RegRewriteTable.find(Op2Reg);
    if (Op2Itr != RegRewriteTable.end())
      Op2Reg = Op2Itr->second.second;

    // SinkMBB:
    //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
    //   ...
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(MBB);
    (void)MIB;
    DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
    DEBUG(dbgs() << "\tTo: "; MIB->dump());

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // Now remove the CMOV(s).
  MBB->erase(MIItBegin, MIItEnd);
}

INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                    false, false)

FunctionPass *llvm::createX86CmovConverterPass() {
  return new X86CmovConverterPass();
}