Evandro Menezes | 94edf02 | 2017-02-01 02:54:34 +0000 | [diff] [blame] | 1 | //===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // |
| 10 | // \file This file contains the AArch64 implementation of the DAG scheduling mutation |
| 11 | // to pair instructions back to back. |
| 12 | // |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #include "AArch64MacroFusion.h" |
| 16 | #include "AArch64Subtarget.h" |
| 17 | #include "llvm/Support/CommandLine.h" |
| 18 | #include "llvm/Target/TargetInstrInfo.h" |
| 19 | |
| 20 | #define DEBUG_TYPE "misched" |
| 21 | |
| 22 | using namespace llvm; |
| 23 | |
| 24 | static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden, |
| 25 | cl::desc("Enable scheduling for macro fusion."), cl::init(true)); |
| 26 | |
| 27 | namespace { |
| 28 | |
| 29 | /// \brief Verify that the instruction pair, \param First and \param Second, |
| 30 | /// should be scheduled back to back. Given an anchor instruction, if the other |
| 31 | /// instruction is unspecified, then verify that the anchor instruction may be |
| 32 | /// part of a pair at all. |
| 33 | static bool shouldScheduleAdjacent(const AArch64InstrInfo &TII, |
| 34 | const AArch64Subtarget &ST, |
| 35 | const MachineInstr *First, |
| 36 | const MachineInstr *Second) { |
| 37 | unsigned FirstOpcode = First ? |
| 38 | First->getOpcode() : AArch64::INSTRUCTION_LIST_END; |
| 39 | unsigned SecondOpcode = Second ? |
| 40 | Second->getOpcode() : AArch64::INSTRUCTION_LIST_END; |
| 41 | |
| 42 | if (ST.hasArithmeticBccFusion()) |
| 43 | // Fuse CMN, CMP, TST followed by Bcc. |
| 44 | if (SecondOpcode == AArch64::Bcc) |
| 45 | switch (FirstOpcode) { |
| 46 | default: |
| 47 | return false; |
| 48 | case AArch64::ADDSWri: |
| 49 | case AArch64::ADDSWrr: |
| 50 | case AArch64::ADDSXri: |
| 51 | case AArch64::ADDSXrr: |
| 52 | case AArch64::ANDSWri: |
| 53 | case AArch64::ANDSWrr: |
| 54 | case AArch64::ANDSXri: |
| 55 | case AArch64::ANDSXrr: |
| 56 | case AArch64::SUBSWri: |
| 57 | case AArch64::SUBSWrr: |
| 58 | case AArch64::SUBSXri: |
| 59 | case AArch64::SUBSXrr: |
| 60 | case AArch64::BICSWrr: |
| 61 | case AArch64::BICSXrr: |
| 62 | return true; |
| 63 | case AArch64::ADDSWrs: |
| 64 | case AArch64::ADDSXrs: |
| 65 | case AArch64::ANDSWrs: |
| 66 | case AArch64::ANDSXrs: |
| 67 | case AArch64::SUBSWrs: |
| 68 | case AArch64::SUBSXrs: |
| 69 | case AArch64::BICSWrs: |
| 70 | case AArch64::BICSXrs: |
| 71 | // Shift value can be 0 making these behave like the "rr" variant... |
| 72 | return !TII.hasShiftedReg(*First); |
| 73 | case AArch64::INSTRUCTION_LIST_END: |
| 74 | return true; |
| 75 | } |
| 76 | |
| 77 | if (ST.hasArithmeticCbzFusion()) |
| 78 | // Fuse ALU operations followed by CBZ/CBNZ. |
| 79 | if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || |
| 80 | SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) |
| 81 | switch (FirstOpcode) { |
| 82 | default: |
| 83 | return false; |
| 84 | case AArch64::ADDWri: |
| 85 | case AArch64::ADDWrr: |
| 86 | case AArch64::ADDXri: |
| 87 | case AArch64::ADDXrr: |
| 88 | case AArch64::ANDWri: |
| 89 | case AArch64::ANDWrr: |
| 90 | case AArch64::ANDXri: |
| 91 | case AArch64::ANDXrr: |
| 92 | case AArch64::EORWri: |
| 93 | case AArch64::EORWrr: |
| 94 | case AArch64::EORXri: |
| 95 | case AArch64::EORXrr: |
| 96 | case AArch64::ORRWri: |
| 97 | case AArch64::ORRWrr: |
| 98 | case AArch64::ORRXri: |
| 99 | case AArch64::ORRXrr: |
| 100 | case AArch64::SUBWri: |
| 101 | case AArch64::SUBWrr: |
| 102 | case AArch64::SUBXri: |
| 103 | case AArch64::SUBXrr: |
| 104 | return true; |
| 105 | case AArch64::ADDWrs: |
| 106 | case AArch64::ADDXrs: |
| 107 | case AArch64::ANDWrs: |
| 108 | case AArch64::ANDXrs: |
| 109 | case AArch64::SUBWrs: |
| 110 | case AArch64::SUBXrs: |
| 111 | case AArch64::BICWrs: |
| 112 | case AArch64::BICXrs: |
| 113 | // Shift value can be 0 making these behave like the "rr" variant... |
| 114 | return !TII.hasShiftedReg(*First); |
| 115 | case AArch64::INSTRUCTION_LIST_END: |
| 116 | return true; |
| 117 | } |
| 118 | |
Evandro Menezes | b21fb29 | 2017-02-01 02:54:39 +0000 | [diff] [blame^] | 119 | if (ST.hasFuseAES()) |
| 120 | // Fuse AES crypto operations. |
| 121 | switch(FirstOpcode) { |
| 122 | // AES encode. |
| 123 | case AArch64::AESErr: |
| 124 | return SecondOpcode == AArch64::AESMCrr || |
| 125 | SecondOpcode == AArch64::INSTRUCTION_LIST_END; |
| 126 | // AES decode. |
| 127 | case AArch64::AESDrr: |
| 128 | return SecondOpcode == AArch64::AESIMCrr || |
| 129 | SecondOpcode == AArch64::INSTRUCTION_LIST_END; |
| 130 | } |
| 131 | |
Evandro Menezes | 94edf02 | 2017-02-01 02:54:34 +0000 | [diff] [blame] | 132 | return false; |
| 133 | } |
| 134 | |
| 135 | /// \brief Implement the fusion of instruction pairs in the scheduling |
| 136 | /// \param DAG, anchored at the instruction in \param ASU. \param Preds |
| 137 | /// indicates if its dependencies in \param APreds are predecessors instead of |
| 138 | /// successors. |
| 139 | static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit *ASU, |
| 140 | SmallVectorImpl<SDep> &APreds, bool Preds) { |
| 141 | const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>(DAG->TII); |
| 142 | const AArch64Subtarget &ST = DAG->MF.getSubtarget<AArch64Subtarget>(); |
| 143 | |
| 144 | const MachineInstr *AMI = ASU->getInstr(); |
| 145 | if (!AMI || AMI->isPseudo() || AMI->isTransient() || |
| 146 | (Preds && !shouldScheduleAdjacent(*TII, ST, nullptr, AMI)) || |
| 147 | (!Preds && !shouldScheduleAdjacent(*TII, ST, AMI, nullptr))) |
| 148 | return false; |
| 149 | |
| 150 | for (SDep &BDep : APreds) { |
| 151 | if (BDep.isWeak()) |
| 152 | continue; |
| 153 | |
| 154 | SUnit *BSU = BDep.getSUnit(); |
| 155 | const MachineInstr *BMI = BSU->getInstr(); |
| 156 | if (!BMI || BMI->isPseudo() || BMI->isTransient() || |
| 157 | (Preds && !shouldScheduleAdjacent(*TII, ST, BMI, AMI)) || |
| 158 | (!Preds && !shouldScheduleAdjacent(*TII, ST, AMI, BMI))) |
| 159 | continue; |
| 160 | |
| 161 | // Create a single weak edge between the adjacent instrs. The only |
| 162 | // effect is to cause bottom-up scheduling to heavily prioritize the |
| 163 | // clustered instrs. |
| 164 | if (Preds) |
| 165 | DAG->addEdge(ASU, SDep(BSU, SDep::Cluster)); |
| 166 | else |
| 167 | DAG->addEdge(BSU, SDep(ASU, SDep::Cluster)); |
| 168 | |
| 169 | // Adjust the latency between the 1st instr and its predecessors/successors. |
| 170 | for (SDep &Dep : APreds) |
| 171 | if (Dep.getSUnit() == BSU) |
| 172 | Dep.setLatency(0); |
| 173 | |
| 174 | // Adjust the latency between the 2nd instr and its successors/predecessors. |
| 175 | auto &BSuccs = Preds ? BSU->Succs : BSU->Preds; |
| 176 | for (SDep &Dep : BSuccs) |
| 177 | if (Dep.getSUnit() == ASU) |
| 178 | Dep.setLatency(0); |
| 179 | |
| 180 | DEBUG(dbgs() << "Macro fuse "; |
| 181 | Preds ? BSU->print(dbgs(), DAG) : ASU->print(dbgs(), DAG); |
| 182 | dbgs() << " - "; |
| 183 | Preds ? ASU->print(dbgs(), DAG) : BSU->print(dbgs(), DAG); |
| 184 | dbgs() << '\n'); |
| 185 | |
| 186 | return true; |
| 187 | } |
| 188 | |
| 189 | return false; |
| 190 | } |
| 191 | |
| 192 | /// \brief Post-process the DAG to create cluster edges between instructions |
| 193 | /// that may be fused by the processor into a single operation. |
| 194 | class AArch64MacroFusion : public ScheduleDAGMutation { |
| 195 | public: |
| 196 | AArch64MacroFusion() {} |
| 197 | |
| 198 | void apply(ScheduleDAGInstrs *DAGInstrs) override; |
| 199 | }; |
| 200 | |
| 201 | void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { |
| 202 | ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); |
| 203 | |
| 204 | // For each of the SUnits in the scheduling block, try to fuse the instruction |
| 205 | // in it with one in its successors. |
| 206 | for (SUnit &ASU : DAG->SUnits) |
| 207 | scheduleAdjacentImpl(DAG, &ASU, ASU.Succs, false); |
| 208 | |
| 209 | // Try to fuse the instruction in the ExitSU with one in its predecessors. |
| 210 | scheduleAdjacentImpl(DAG, &DAG->ExitSU, DAG->ExitSU.Preds, true); |
| 211 | } |
| 212 | |
| 213 | } // end namespace |
| 214 | |
| 215 | |
| 216 | namespace llvm { |
| 217 | |
| 218 | std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation () { |
| 219 | return EnableMacroFusion ? make_unique<AArch64MacroFusion>() : nullptr; |
| 220 | } |
| 221 | |
| 222 | } // end namespace llvm |