Evandro Menezes | 94edf02 | 2017-02-01 02:54:34 +0000 | [diff] [blame] | 1 | //===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
Florian Hahn | e16d310 | 2017-06-19 11:26:15 +0000 | [diff] [blame] | 5 | // \file This file is distributed under the University of Illinois Open Source |
Evandro Menezes | 94edf02 | 2017-02-01 02:54:34 +0000 | [diff] [blame] | 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // |
Florian Hahn | e16d310 | 2017-06-19 11:26:15 +0000 | [diff] [blame] | 10 | // This file contains the X86 implementation of the DAG scheduling mutation to |
| 11 | // pair instructions back to back. |
Evandro Menezes | 94edf02 | 2017-02-01 02:54:34 +0000 | [diff] [blame] | 12 | // |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #include "X86MacroFusion.h" |
| 16 | #include "X86Subtarget.h" |
Florian Hahn | e16d310 | 2017-06-19 11:26:15 +0000 | [diff] [blame] | 17 | #include "llvm/ADT/Statistic.h" |
| 18 | #include "llvm/Support/CommandLine.h" |
Evandro Menezes | 94edf02 | 2017-02-01 02:54:34 +0000 | [diff] [blame] | 19 | #include "llvm/Target/TargetInstrInfo.h" |
Florian Hahn | e16d310 | 2017-06-19 11:26:15 +0000 | [diff] [blame] | 20 | |
| 21 | #define DEBUG_TYPE "misched" |
| 22 | |
| 23 | STATISTIC(NumFused, "Number of instr pairs fused"); |
Evandro Menezes | a8d3301 | 2017-02-21 22:16:13 +0000 | [diff] [blame] | 24 | |
Evandro Menezes | 94edf02 | 2017-02-01 02:54:34 +0000 | [diff] [blame] | 25 | using namespace llvm; |
| 26 | |
Florian Hahn | e16d310 | 2017-06-19 11:26:15 +0000 | [diff] [blame] | 27 | static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden, |
| 28 | cl::desc("Enable scheduling for macro fusion."), cl::init(true)); |
| 29 | |
| 30 | namespace { |
| 31 | |
| 32 | /// \brief Verify that the instruction pair, First and Second, |
| 33 | /// should be scheduled back to back. If either instruction is unspecified, |
| 34 | /// then verify that the other instruction may be part of a pair at all. |
| 35 | static bool shouldScheduleAdjacent(const X86Subtarget &ST, |
| 36 | const MachineInstr *First, |
| 37 | const MachineInstr *Second) { |
Evandro Menezes | 94edf02 | 2017-02-01 02:54:34 +0000 | [diff] [blame] | 38 | // Check if this processor supports macro-fusion. Since this is a minor |
| 39 | // heuristic, we haven't specifically reserved a feature. hasAVX is a decent |
| 40 | // proxy for SandyBridge+. |
| 41 | if (!ST.hasAVX()) |
| 42 | return false; |
| 43 | |
| 44 | enum { |
| 45 | FuseTest, |
| 46 | FuseCmp, |
| 47 | FuseInc |
| 48 | } FuseKind; |
| 49 | |
Florian Hahn | e16d310 | 2017-06-19 11:26:15 +0000 | [diff] [blame] | 50 | assert((First || Second) && "At least one instr must be specified"); |
| 51 | unsigned FirstOpcode = First |
| 52 | ? First->getOpcode() |
Evandro Menezes | b9b7f4b | 2017-02-21 22:16:11 +0000 | [diff] [blame] | 53 | : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); |
Florian Hahn | e16d310 | 2017-06-19 11:26:15 +0000 | [diff] [blame] | 54 | unsigned SecondOpcode = Second |
| 55 | ? Second->getOpcode() |
| 56 | : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); |
Evandro Menezes | 94edf02 | 2017-02-01 02:54:34 +0000 | [diff] [blame] | 57 | |
| 58 | switch (SecondOpcode) { |
| 59 | default: |
| 60 | return false; |
| 61 | case X86::JE_1: |
| 62 | case X86::JNE_1: |
| 63 | case X86::JL_1: |
| 64 | case X86::JLE_1: |
| 65 | case X86::JG_1: |
| 66 | case X86::JGE_1: |
| 67 | FuseKind = FuseInc; |
| 68 | break; |
| 69 | case X86::JB_1: |
| 70 | case X86::JBE_1: |
| 71 | case X86::JA_1: |
| 72 | case X86::JAE_1: |
| 73 | FuseKind = FuseCmp; |
| 74 | break; |
| 75 | case X86::JS_1: |
| 76 | case X86::JNS_1: |
| 77 | case X86::JP_1: |
| 78 | case X86::JNP_1: |
| 79 | case X86::JO_1: |
| 80 | case X86::JNO_1: |
| 81 | FuseKind = FuseTest; |
| 82 | break; |
| 83 | } |
| 84 | |
| 85 | switch (FirstOpcode) { |
| 86 | default: |
| 87 | return false; |
| 88 | case X86::TEST8rr: |
| 89 | case X86::TEST16rr: |
| 90 | case X86::TEST32rr: |
| 91 | case X86::TEST64rr: |
| 92 | case X86::TEST8ri: |
| 93 | case X86::TEST16ri: |
| 94 | case X86::TEST32ri: |
| 95 | case X86::TEST32i32: |
| 96 | case X86::TEST64i32: |
| 97 | case X86::TEST64ri32: |
| 98 | case X86::TEST8rm: |
| 99 | case X86::TEST16rm: |
| 100 | case X86::TEST32rm: |
| 101 | case X86::TEST64rm: |
| 102 | case X86::TEST8ri_NOREX: |
| 103 | case X86::AND16i16: |
| 104 | case X86::AND16ri: |
| 105 | case X86::AND16ri8: |
| 106 | case X86::AND16rm: |
| 107 | case X86::AND16rr: |
| 108 | case X86::AND32i32: |
| 109 | case X86::AND32ri: |
| 110 | case X86::AND32ri8: |
| 111 | case X86::AND32rm: |
| 112 | case X86::AND32rr: |
| 113 | case X86::AND64i32: |
| 114 | case X86::AND64ri32: |
| 115 | case X86::AND64ri8: |
| 116 | case X86::AND64rm: |
| 117 | case X86::AND64rr: |
| 118 | case X86::AND8i8: |
| 119 | case X86::AND8ri: |
| 120 | case X86::AND8rm: |
| 121 | case X86::AND8rr: |
| 122 | return true; |
| 123 | case X86::CMP16i16: |
| 124 | case X86::CMP16ri: |
| 125 | case X86::CMP16ri8: |
| 126 | case X86::CMP16rm: |
| 127 | case X86::CMP16rr: |
| 128 | case X86::CMP32i32: |
| 129 | case X86::CMP32ri: |
| 130 | case X86::CMP32ri8: |
| 131 | case X86::CMP32rm: |
| 132 | case X86::CMP32rr: |
| 133 | case X86::CMP64i32: |
| 134 | case X86::CMP64ri32: |
| 135 | case X86::CMP64ri8: |
| 136 | case X86::CMP64rm: |
| 137 | case X86::CMP64rr: |
| 138 | case X86::CMP8i8: |
| 139 | case X86::CMP8ri: |
| 140 | case X86::CMP8rm: |
| 141 | case X86::CMP8rr: |
| 142 | case X86::ADD16i16: |
| 143 | case X86::ADD16ri: |
| 144 | case X86::ADD16ri8: |
| 145 | case X86::ADD16ri8_DB: |
| 146 | case X86::ADD16ri_DB: |
| 147 | case X86::ADD16rm: |
| 148 | case X86::ADD16rr: |
| 149 | case X86::ADD16rr_DB: |
| 150 | case X86::ADD32i32: |
| 151 | case X86::ADD32ri: |
| 152 | case X86::ADD32ri8: |
| 153 | case X86::ADD32ri8_DB: |
| 154 | case X86::ADD32ri_DB: |
| 155 | case X86::ADD32rm: |
| 156 | case X86::ADD32rr: |
| 157 | case X86::ADD32rr_DB: |
| 158 | case X86::ADD64i32: |
| 159 | case X86::ADD64ri32: |
| 160 | case X86::ADD64ri32_DB: |
| 161 | case X86::ADD64ri8: |
| 162 | case X86::ADD64ri8_DB: |
| 163 | case X86::ADD64rm: |
| 164 | case X86::ADD64rr: |
| 165 | case X86::ADD64rr_DB: |
| 166 | case X86::ADD8i8: |
| 167 | case X86::ADD8mi: |
| 168 | case X86::ADD8mr: |
| 169 | case X86::ADD8ri: |
| 170 | case X86::ADD8rm: |
| 171 | case X86::ADD8rr: |
| 172 | case X86::SUB16i16: |
| 173 | case X86::SUB16ri: |
| 174 | case X86::SUB16ri8: |
| 175 | case X86::SUB16rm: |
| 176 | case X86::SUB16rr: |
| 177 | case X86::SUB32i32: |
| 178 | case X86::SUB32ri: |
| 179 | case X86::SUB32ri8: |
| 180 | case X86::SUB32rm: |
| 181 | case X86::SUB32rr: |
| 182 | case X86::SUB64i32: |
| 183 | case X86::SUB64ri32: |
| 184 | case X86::SUB64ri8: |
| 185 | case X86::SUB64rm: |
| 186 | case X86::SUB64rr: |
| 187 | case X86::SUB8i8: |
| 188 | case X86::SUB8ri: |
| 189 | case X86::SUB8rm: |
| 190 | case X86::SUB8rr: |
| 191 | return FuseKind == FuseCmp || FuseKind == FuseInc; |
| 192 | case X86::INC16r: |
| 193 | case X86::INC32r: |
| 194 | case X86::INC64r: |
| 195 | case X86::INC8r: |
| 196 | case X86::DEC16r: |
| 197 | case X86::DEC32r: |
| 198 | case X86::DEC64r: |
| 199 | case X86::DEC8r: |
| 200 | return FuseKind == FuseInc; |
| 201 | case X86::INSTRUCTION_LIST_END: |
| 202 | return true; |
| 203 | } |
| 204 | } |
| 205 | |
Florian Hahn | e16d310 | 2017-06-19 11:26:15 +0000 | [diff] [blame] | 206 | /// \brief Post-process the DAG to create cluster edges between instructions |
| 207 | /// that may be fused by the processor into a single operation. |
| 208 | class X86MacroFusion : public ScheduleDAGMutation { |
| 209 | public: |
| 210 | X86MacroFusion() {} |
| 211 | |
| 212 | void apply(ScheduleDAGInstrs *DAGInstrs) override; |
| 213 | }; |
| 214 | |
| 215 | void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { |
| 216 | ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); |
| 217 | const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>(); |
| 218 | |
| 219 | // For now, assume targets can only fuse with the branch. |
| 220 | SUnit &ExitSU = DAG->ExitSU; |
| 221 | MachineInstr *Branch = ExitSU.getInstr(); |
| 222 | if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch)) |
| 223 | return; |
| 224 | |
| 225 | for (SDep &PredDep : ExitSU.Preds) { |
| 226 | if (PredDep.isWeak()) |
| 227 | continue; |
| 228 | SUnit &SU = *PredDep.getSUnit(); |
| 229 | MachineInstr &Pred = *SU.getInstr(); |
| 230 | if (!shouldScheduleAdjacent(ST, &Pred, Branch)) |
| 231 | continue; |
| 232 | |
| 233 | // Create a single weak edge from SU to ExitSU. The only effect is to cause |
| 234 | // bottom-up scheduling to heavily prioritize the clustered SU. There is no |
| 235 | // need to copy predecessor edges from ExitSU to SU, since top-down |
| 236 | // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling |
| 237 | // of SU, we could create an artificial edge from the deepest root, but it |
| 238 | // hasn't been needed yet. |
| 239 | bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); |
| 240 | (void)Success; |
| 241 | assert(Success && "No DAG nodes should be reachable from ExitSU"); |
| 242 | |
| 243 | // Adjust latency of data deps between the nodes. |
| 244 | for (SDep &PredDep : ExitSU.Preds) |
| 245 | if (PredDep.getSUnit() == &SU) |
| 246 | PredDep.setLatency(0); |
| 247 | for (SDep &SuccDep : SU.Succs) |
| 248 | if (SuccDep.getSUnit() == &ExitSU) |
| 249 | SuccDep.setLatency(0); |
| 250 | |
| 251 | ++NumFused; |
| 252 | DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; |
| 253 | SU.print(dbgs(), DAG); |
| 254 | dbgs() << " - ExitSU" |
| 255 | << " / " << DAG->TII->getName(Pred.getOpcode()) << " - " |
| 256 | << DAG->TII->getName(Branch->getOpcode()) << '\n';); |
| 257 | |
| 258 | break; |
| 259 | } |
| 260 | } |
| 261 | |
| 262 | } // end namespace |
| 263 | |
Evandro Menezes | 94edf02 | 2017-02-01 02:54:34 +0000 | [diff] [blame] | 264 | namespace llvm { |
| 265 | |
| 266 | std::unique_ptr<ScheduleDAGMutation> |
| 267 | createX86MacroFusionDAGMutation () { |
Florian Hahn | e16d310 | 2017-06-19 11:26:15 +0000 | [diff] [blame] | 268 | return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr; |
Evandro Menezes | 94edf02 | 2017-02-01 02:54:34 +0000 | [diff] [blame] | 269 | } |
| 270 | |
| 271 | } // end namespace llvm |