//===-- X86CmovConversion.cpp - Convert Cmov to Branch -------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a pass that converts X86 cmov instructions into
/// branches when profitable. This pass is conservative, i.e., it applies the
/// transformation only if it can guarantee a gain with high confidence.
///
/// Thus, the optimization applies under the following conditions:
///   1. Consider as candidates only CMOVs in innermost loops, assuming that
///      most hotspots are represented by these loops.
///   2. Given a group of CMOV instructions that use the same EFLAGS def
///      instruction:
///      a. Consider them as candidates only if all have the same condition
///         code or the opposite one, to prevent generating more than one
///         conditional jump per EFLAGS def instruction.
///      b. Consider them as candidates only if all are profitable to be
///         converted, assuming that one bad conversion may cause a
///         degradation.
///   3. Apply the conversion only for loops that were found profitable and
///      only for CMOV candidates that were found profitable.
///      a. A loop is considered profitable only if the conversion will reduce
///         its depth cost by some threshold.
///      b. A CMOV is considered profitable if the cost of its condition is
///         higher than the average cost of its true-value and false-value by
///         25% of the branch-misprediction-penalty; this assures no
///         degradation even with a 25% branch misprediction rate.
///
/// Note: This pass is assumed to run on SSA machine code.
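///
/// For a flavor of the transformation at the assembly level, an illustrative
/// (hand-written, not compiler output) sketch of a single converted CMOV:
///
///   cmovge %rbx, %rax            jl   .Lsink     # skip the move if not GE
///                         -->    movq %rbx, %rax
///                               .Lsink: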
//===----------------------------------------------------------------------===//
//
// External interfaces:
//   FunctionPass *llvm::createX86CmovConverterPass();
//   bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
//
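// A target would typically schedule this pass from its TargetPassConfig. A
// minimal sketch of the wiring (the exact hook used here is an assumption;
// see X86TargetMachine.cpp for the authoritative version):
//
//   void X86PassConfig::addPreRegAlloc() {
//     addPass(createX86CmovConverterPass());
//   }
//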

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

#define DEBUG_TYPE "x86-cmov-converter"

STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");

namespace {
// This internal switch can be used to turn off the cmov/branch optimization.
static cl::opt<bool>
    EnableCmovConverter("x86-cmov-converter",
                        cl::desc("Enable the X86 cmov-to-branch optimization."),
                        cl::init(true), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("x86-cmov-converter-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);
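
// For example, the gain threshold above can be tuned from the command line
// when running llc directly (illustrative invocation; the flag is hidden but
// still accepted):
//   llc -x86-cmov-converter-threshold=8 foo.ll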

/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
  X86CmovConverterPass() : MachineFunctionPass(ID) {}
  ~X86CmovConverterPass() {}

  StringRef getPassName() const override { return "X86 cmov Conversion"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  /// Pass identification, replacement for typeid.
  static char ID;

  const MachineRegisterInfo *MRI;
  const TargetInstrInfo *TII;
  TargetSchedModel TSchedModel;

  /// List of consecutive CMOV instructions.
  typedef SmallVector<MachineInstr *, 2> CmovGroup;
  typedef SmallVector<CmovGroup, 2> CmovGroups;

  /// Collect all CMOV-group-candidates in \p Blocks and update \p
  /// CmovInstGroups accordingly.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in \p Blocks.
  /// \returns true iff it found any CMOV-group-candidate.
  bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                             CmovGroups &CmovInstGroups);

  /// Check if it is profitable to transform each CMOV-group-candidate into a
  /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in \p Blocks.
  /// \returns true iff any CMOV-group-candidate remains.
  bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                                        CmovGroups &CmovInstGroups);

  /// Convert the given list of consecutive CMOV instructions into a branch.
  ///
  /// \param Group Consecutive CMOV instructions to be converted into branch.
  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
};

char X86CmovConverterPass::ID = 0;

void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
  AU.addRequired<MachineLoopInfo>();
}

bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;
  if (!EnableCmovConverter)
    return false;

  DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
               << " **********\n");

  bool Changed = false;
  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  MRI = &MF.getRegInfo();
  TII = STI.getInstrInfo();
  TSchedModel.init(STI.getSchedModel(), &STI, TII);

  //===--------------------------------------------------------------------===//
  // Algorithm
  // ---------
  //   For each innermost loop
  //     collectCmovCandidates() {
  //       Find all CMOV-group-candidates.
  //     }
  //
  //     checkForProfitableCmovCandidates() {
  //       * Calculate both loop-depth and optimized-loop-depth.
  //       * Use these depths to check for loop transformation profitability.
  //       * Check for CMOV-group-candidate transformation profitability.
  //     }
  //
  //     For each profitable CMOV-group-candidate
  //       convertCmovInstsToBranches() {
  //         * Create FalseBB, SinkBB, Conditional branch to SinkBB.
  //         * Replace each CMOV instruction with a PHI instruction in SinkBB.
  //       }
  //
  // Note: For more details, see each function description.
  //===--------------------------------------------------------------------===//

  // Build up the loops in pre-order.
  SmallVector<MachineLoop *, 4> Loops(MLI.begin(), MLI.end());
  // Note that we need to check size on each iteration as we accumulate child
  // loops.
  for (int i = 0; i < (int)Loops.size(); ++i)
    for (MachineLoop *Child : Loops[i]->getSubLoops())
      Loops.push_back(Child);

  for (MachineLoop *CurrLoop : Loops) {
    // Optimize only innermost loops.
    if (!CurrLoop->getSubLoops().empty())
      continue;

    // List of consecutive CMOV instructions to be processed.
    CmovGroups CmovInstGroups;

    if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
      continue;

    if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
                                          CmovInstGroups))
      continue;

    Changed = true;
    for (auto &Group : CmovInstGroups)
      convertCmovInstsToBranches(Group);
  }
  return Changed;
}

bool X86CmovConverterPass::collectCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
  //===--------------------------------------------------------------------===//
  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
  //
  // CMOV-group:
  //   CMOV instructions, in the same MBB, that use the same EFLAGS def
  //   instruction.
  //
  // CMOV-group-candidate:
  //   CMOV-group where all the CMOV instructions are
  //     1. consecutive.
  //     2. have the same condition code or the opposite one.
  //     3. use only register operands (X86::CMOVrr).
  //===--------------------------------------------------------------------===//
  // List of possible improvements (TODOs):
  // --------------------------------------
  // TODO: Add support for X86::CMOVrm instructions.
  // TODO: Add support for X86::SETcc instructions.
  // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
  //===--------------------------------------------------------------------===//
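
  // For illustration, a CMOV-group-candidate in schematic machine IR (the
  // register names are made up and the operand layout is simplified):
  //   CMP32rr %a, %b, implicit-def %eflags
  //   %v1 = CMOVGE32rr %f1, %t1, implicit %eflags
  //   %v2 = CMOVL32rr %f2, %t2, implicit %eflags
  // Both CMOVs use the same EFLAGS def, are consecutive, and have opposite
  // condition codes (GE/L), so together they form one candidate group.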

  // Currently processed CMOV-group.
  CmovGroup Group;
  for (auto *MBB : Blocks) {
    Group.clear();
    // Condition code of the first CMOV instruction in the currently processed
    // range and its opposite condition code.
    X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID;
    // Indicator of a non-CMOVrr instruction in the currently processed range.
    bool FoundNonCMOVInst = false;
    // Indicator that the currently processed CMOV-group should be skipped.
    bool SkipGroup = false;

    for (auto &I : *MBB) {
      X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
      // Check if we found an X86::CMOVrr instruction.
      if (CC != X86::COND_INVALID && !I.mayLoad()) {
        if (Group.empty()) {
          // We found the first CMOV in the range; reset the flags.
          FirstCC = CC;
          FirstOppCC = X86::GetOppositeBranchCondition(CC);
          FoundNonCMOVInst = false;
          SkipGroup = false;
        }
        Group.push_back(&I);
        // Check if it is a non-consecutive CMOV instruction or has a
        // condition code different from FirstCC and FirstOppCC.
        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
          // Set the SkipGroup indicator to skip the currently processed
          // CMOV-group.
          SkipGroup = true;
        continue;
      }
      // If Group is empty, keep looking for the first CMOV in the range.
      if (Group.empty())
        continue;

      // We found a non-X86::CMOVrr instruction.
      FoundNonCMOVInst = true;
      // Check if this instruction defines EFLAGS, to determine the end of the
      // processed range, as there will be no more instructions using the
      // current EFLAGS def.
      if (I.definesRegister(X86::EFLAGS)) {
        // Check if the currently processed CMOV-group should not be skipped,
        // and add it as a CMOV-group-candidate.
        if (!SkipGroup)
          CmovInstGroups.push_back(Group);
        else
          ++NumOfSkippedCmovGroups;
        Group.clear();
      }
    }
    // The end of the basic block is considered the end of the range; check if
    // the currently processed CMOV-group should not be skipped, and add it as
    // a CMOV-group-candidate.
    if (Group.empty())
      continue;
    if (!SkipGroup)
      CmovInstGroups.push_back(Group);
    else
      ++NumOfSkippedCmovGroups;
  }

  NumOfCmovGroupCandidate += CmovInstGroups.size();
  return !CmovInstGroups.empty();
}

/// \returns Depth of CMOV instruction as if it was converted into branch.
/// \param TrueOpDepth depth cost of CMOV true-value operand.
/// \param FalseOpDepth depth cost of CMOV false-value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  //===--------------------------------------------------------------------===//
  // With no info about branch weight, we assume 50% for each value operand.
  // Thus, the depth of the optimized CMOV instruction is the rounded-up
  // average of its True-Operand-Value-Depth and False-Operand-Value-Depth.
  //===--------------------------------------------------------------------===//
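  // For example (illustrative numbers): TrueOpDepth = 2 and FalseOpDepth = 5
  // yield (2 + 5 + 1) / 2 = 4 cycles for the converted CMOV.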
  return (TrueOpDepth + FalseOpDepth + 1) / 2;
}

bool X86CmovConverterPass::checkForProfitableCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
  struct DepthInfo {
    /// Depth of original loop.
    unsigned Depth;
    /// Depth of optimized loop.
    unsigned OptDepth;
  };
  /// Number of loop iterations for which to calculate instruction depth.
  static const unsigned LoopIterations = 2;
  DenseMap<MachineInstr *, DepthInfo> DepthMap;
  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
  /// For each register type, maps the register to its last def instruction.
  DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
  /// Maps register operand to its def instruction, which can be nullptr if it
  /// is unknown (e.g., operand is defined outside the loop).
  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;

  // Set the depth of the unknown instruction (i.e., nullptr) to zero.
  DepthMap[nullptr] = {0, 0};

  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
  for (auto &Group : CmovInstGroups)
    CmovInstructions.insert(Group.begin(), Group.end());

  //===--------------------------------------------------------------------===//
  // Step 1: Calculate instruction depth and loop depth.
  // Optimized-Loop:
  //   loop with CMOV-group-candidates converted into branches.
  //
  // Instruction-Depth:
  //   instruction latency + max operand depth.
  //   * For a CMOV instruction in the optimized loop the depth is calculated
  //     as:
  //       CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth)
  // TODO: Find a better way to estimate the latency of the branch instruction
  //       rather than using the CMOV latency.
  //
  // Loop-Depth:
  //   max instruction depth of all instructions in the loop.
  // Note: the instruction with max depth represents the critical-path in the
  //       loop.
  //
  // Loop-Depth[i]:
  //   Loop-Depth calculated for the first `i` iterations.
  // Note: it is enough to calculate depth for up to two iterations.
  //
  // Depth-Diff[i]:
  //   Number of cycles saved in the first `i` iterations by optimizing the
  //   loop.
  //===--------------------------------------------------------------------===//
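  // For example (illustrative numbers): if the original loop's critical path
  // is 10 cycles after one iteration and 20 after two, while the optimized
  // loop's is 8 and 16, then Depth-Diff[0] = 2 and Depth-Diff[1] = 4.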
  for (unsigned I = 0; I < LoopIterations; ++I) {
    DepthInfo &MaxDepth = LoopDepth[I];
    for (auto *MBB : Blocks) {
      // Clear physical registers Def map.
      RegDefMaps[PhyRegType].clear();
      for (MachineInstr &MI : *MBB) {
        unsigned MIDepth = 0;
        unsigned MIDepthOpt = 0;
        bool IsCMOV = CmovInstructions.count(&MI);
        for (auto &MO : MI.uses()) {
          // Check for "isUse()" because "uses()" also returns implicit
          // definitions.
          if (!MO.isReg() || !MO.isUse())
            continue;
          unsigned Reg = MO.getReg();
          auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)];
          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
            OperandToDefMap[&MO] = DefMI;
            DepthInfo Info = DepthMap.lookup(DefMI);
            MIDepth = std::max(MIDepth, Info.Depth);
            if (!IsCMOV)
              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
          }
        }

        if (IsCMOV)
          MIDepthOpt = getDepthOfOptCmov(
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);

        // Iterate over all operands to handle implicit definitions as well.
        for (auto &MO : MI.operands()) {
          if (!MO.isReg() || !MO.isDef())
            continue;
          unsigned Reg = MO.getReg();
          RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI;
        }

        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
        MIDepth += Latency;
        MIDepthOpt += Latency;
        DepthMap[&MI] = {MIDepth, MIDepthOpt};
        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
      }
    }
  }

  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};

  //===--------------------------------------------------------------------===//
  // Step 2: Check if the loop is worth optimizing.
  // Worth-Optimize-Loop:
  //   case 1: Diff[1] == Diff[0]
  //           The critical-path is iteration independent - there is no
  //           dependency of critical-path instructions on critical-path
  //           instructions of the previous iteration.
  //           Thus, it is enough to check the gain percent of the 1st
  //           iteration - to be conservative, the optimized loop needs to have
  //           a depth at least 12.5% smaller than the original loop, per
  //           iteration.
  //
  //   case 2: Diff[1] > Diff[0]
  //           The critical-path is iteration dependent - there is a dependency
  //           of critical-path instructions on critical-path instructions of
  //           the previous iteration.
  //           Thus, check the gain percent of the 2nd iteration (similar to
  //           the previous case), but it is also required to check the
  //           gradient of the gain - the change in Depth-Diff compared to the
  //           change in Loop-Depth between the 1st and 2nd iterations.
  //           To be conservative, the gradient needs to be at least 50%.
  //
  //   In addition, in order not to optimize loops with a very small gain, the
  //   gain (in cycles) after the 2nd iteration must not be less than a given
  //   threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must hold.
  //
  //   If the loop is not worth optimizing, remove all CMOV-group-candidates.
  //===--------------------------------------------------------------------===//
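  // For example (illustrative numbers): with Diff[0] == Diff[1] == 4 and
  // LoopDepth[0].Depth == 32, the threshold check passes (4 >= the default
  // GainCycleThreshold of 4) and case 1 applies: 4 * 8 == 32 >= 32, i.e. the
  // per-iteration gain is exactly 12.5%, so the loop is (barely) considered
  // worth optimizing.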
  if (Diff[1] < GainCycleThreshold)
    return false;

  bool WorthOptLoop = false;
  if (Diff[1] == Diff[0])
    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
  else if (Diff[1] > Diff[0])
    WorthOptLoop =
        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
        (Diff[1] * 8 >= LoopDepth[1].Depth);

  if (!WorthOptLoop)
    return false;

  ++NumOfLoopCandidate;

  //===--------------------------------------------------------------------===//
  // Step 3: Check for each CMOV-group-candidate whether it is worth
  //         optimizing.
  // Worth-Optimize-Group:
  //   Iff it is worth optimizing all CMOV instructions in the group.
  //
  // Worth-Optimize-CMOV:
  //   A predicted branch is faster than CMOV by the difference between the
  //   depth of the condition operand and the depth of the taken (predicted)
  //   value operand.
  //   To be conservative, the gain of such a CMOV transformation should cover
  //   at least 25% of the branch-misprediction-penalty.
  //===--------------------------------------------------------------------===//
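  // For example (illustrative numbers): with MispredictPenalty == 20, a CMOV
  // whose condition depth (CondCost) is 11 and whose predicted-value depth
  // (ValCost) is 5 gains 6 cycles, and 6 * 4 == 24 >= 20, so it is considered
  // worth converting.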
  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
  CmovGroups TempGroups;
  std::swap(TempGroups, CmovInstGroups);
  for (auto &Group : TempGroups) {
    bool WorthOpGroup = true;
    for (auto *MI : Group) {
      // Avoid converting a CMOV instruction whose result is used as a pointer
      // to load from. This is another conservative check, intended to avoid
      // converting CMOV instructions used in tree-search-like algorithms,
      // where the branch is unpredictable.
      // Check whether the CMOV result has exactly one use and whether that
      // use is a load.
      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
      if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) {
        unsigned Op = UIs.begin()->getOpcode();
        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
          WorthOpGroup = false;
          break;
        }
      }

      unsigned CondCost =
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth;
      unsigned ValCost = getDepthOfOptCmov(
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
      if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
        WorthOpGroup = false;
        break;
      }
    }

    if (WorthOpGroup)
      CmovInstGroups.push_back(Group);
  }

  return !CmovInstGroups.empty();
}

static bool checkEFLAGSLive(MachineInstr *MI) {
  if (MI->killsRegister(X86::EFLAGS))
    return false;

  // The EFLAGS operand of MI might be missing a kill marker.
  // Figure out whether EFLAGS is live after the MI instruction.
  MachineBasicBlock *BB = MI->getParent();
  MachineBasicBlock::iterator ItrMI = MI;

  // Scan forward through BB for a use/def of EFLAGS.
  for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
    if (I->readsRegister(X86::EFLAGS))
      return true;
    if (I->definesRegister(X86::EFLAGS))
      return false;
  }

  // We hit the end of the block; check whether EFLAGS is live into a
  // successor.
  for (MachineBasicBlock *Succ : BB->successors())
    if (Succ->isLiveIn(X86::EFLAGS))
      return true;

  return false;
}

void X86CmovConverterPass::convertCmovInstsToBranches(
    SmallVectorImpl<MachineInstr *> &Group) const {
  assert(!Group.empty() && "No CMOV instructions to convert");
  ++NumOfOptimizedCmovGroups;

  // To convert a CMOVcc instruction, we actually have to insert the diamond
  // control-flow pattern. The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.

  // Before
  // -----
  // MBB:
  //   cond = cmp ...
  //   v1 = CMOVge t1, f1, cond
  //   v2 = CMOVlt t2, f2, cond
  //   v3 = CMOVge v1, f3, cond
  //
  // After
  // -----
  // MBB:
  //   cond = cmp ...
  //   jge %SinkMBB
  //
  // FalseMBB:
  //   jmp %SinkMBB
  //
  // SinkMBB:
  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
  //                                          ; true-value with false-value
  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
  //                                          ; previous Phi instruction result

  MachineInstr &MI = *Group.front();
  MachineInstr *LastCMOV = Group.back();
  DebugLoc DL = MI.getDebugLoc();
  X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction::iterator It = ++MBB->getIterator();
  MachineFunction *F = MBB->getParent();
  const BasicBlock *BB = MBB->getBasicBlock();

  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the false and sink blocks.
  if (checkEFLAGSLive(LastCMOV)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of MBB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Add the false and sink blocks as its successors.
  MBB->addSuccessor(FalseMBB);
  MBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction.
  BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB);

  // Add the sink block as the false block's successor.
  FalseMBB->addSuccessor(SinkMBB);

  MachineInstrBuilder MIB;
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from each earlier PHI's
  // destination register to the registers that went into that PHI.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
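  // Continuing the "After" example above: once the %v1 PHI has been emitted,
  // the table maps %v1 to its {FalseMBB, MBB} inputs; when the %v3 PHI later
  // names %v1 as an input, the lookup below substitutes the value %v1 takes
  // on the corresponding edge (%t1 on the edge from MBB, per the comment
  // above).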

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    unsigned DestReg = MIIt->getOperand(0).getReg();
    unsigned Op1Reg = MIIt->getOperand(1).getReg();
    unsigned Op2Reg = MIIt->getOperand(2).getReg();

    // If the CMOV we are processing has the opposite condition of the jump we
    // generated, then we have to swap the operands for the PHI that is going
    // to be generated.
    if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC)
      std::swap(Op1Reg, Op2Reg);

    auto Op1Itr = RegRewriteTable.find(Op1Reg);
    if (Op1Itr != RegRewriteTable.end())
      Op1Reg = Op1Itr->second.first;

    auto Op2Itr = RegRewriteTable.find(Op2Reg);
    if (Op2Itr != RegRewriteTable.end())
      Op2Reg = Op2Itr->second.second;

    // SinkMBB:
    //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
    //   ...
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(MBB);
    (void)MIB;
    DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
    DEBUG(dbgs() << "\tTo: "; MIB->dump());

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // Now remove the CMOV(s).
  MBB->erase(MIItBegin, MIItEnd);
}

} // End anonymous namespace.

FunctionPass *llvm::createX86CmovConverterPass() {
  return new X86CmovConverterPass();
}