//===- AArch64SIMDInstrOpt.cpp - AArch64 SIMD instructions optimization --===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into a more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with a vector element operand, due to
// their inefficiency on some targets.
//
// For example:
//    fmla v0.4s, v1.4s, v2.s[1]
//
// Is rewritten into:
//    dup  v3.4s, v2.s[1]
//    fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
// inefficiency on some targets.
//
// For example:
//    st2 {v0.4s, v1.4s}, addr
//
// Is rewritten into:
//    zip1 v2.4s, v0.4s, v1.4s
//    zip2 v3.4s, v0.4s, v1.4s
//    stp  q2, q3, addr
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include <map>
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "aarch64-simdinstr-opt"

STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");

#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"

namespace {

struct AArch64SIMDInstrOpt : public MachineFunctionPass {
  static char ID;

  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;

  // The two maps below are used to cache decisions instead of recomputing:
  // This is used to cache instruction replacement decisions within function
  // units and across function units.
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
  // This is used to cache the decision of whether to leave the interleaved
  // store instructions replacement pass early or not for a particular target.
  std::unordered_map<std::string, bool> InterlEarlyExit;

  typedef enum {
    VectorElem,
    Interleave
  } Subpass;

  // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
  struct InstReplInfo {
    unsigned OrigOpc;
    std::vector<unsigned> ReplOpc;
    const TargetRegisterClass RC;
  };

#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}

  // The Instruction Replacement Table:
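  // Each rule lists the replacement opcodes in the order in which they are
  // emitted: for ST2, a ZIP1/ZIP2 pair followed by one STP; for ST4, four
  // ZIP1/ZIP2 pairs followed by two STPs. RC is the register class used for
  // the temporary ZIP destination registers.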
  std::vector<InstReplInfo> IRT = {
    // ST2 instructions
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::FPR64RegClass),
    // ST4 instructions
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };

  // A costly instruction is replaced in this work by N efficient instructions.
  // The maximum of N is currently 10, and it occurs in the ST4 case.
  static const unsigned MaxNumRepl = 10;

  AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
    initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
  }

  /// Based only on latency of instructions, determine if it is cost efficient
  /// to replace the instruction InstDesc by the instructions stored in the
  /// array InstDescRepl.
  /// Return true if replacement is expected to be faster.
  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);

  /// Determine if we need to exit the instruction replacement optimization
  /// passes early. This makes sure that no compile time is spent in this pass
  /// for targets with no need for any of these optimizations.
  /// Return true if early exit of the pass is recommended.
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);

  /// Check whether an equivalent DUP instruction has already been
  /// created or not.
  /// Return true when the DUP instruction already exists. In this case,
  /// DestReg will point to the destination of the already created DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  /// Certain SIMD instructions with a vector element operand are not
  /// efficient. Rewrite them into SIMD instructions with vector operands.
  /// This rewrite is driven by the latency of the instructions.
  /// Return true if the SIMD instruction is modified.
  bool optimizeVectElement(MachineInstr &MI);

  /// Process the REG_SEQUENCE instruction, and extract the source
  /// operands of the ST2/ST4 instruction from it.
  /// Example of such an instruction:
  ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
  /// Return true when the instruction is processed successfully.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
                         unsigned* StRegKill, unsigned NumArg) const;

  /// Load/Store interleaving instructions are not always beneficial.
  /// Replace them by ZIP instructions and classical load/store.
  /// Return true if the SIMD instruction is modified.
  bool optimizeLdStInterleave(MachineInstr &MI);

  /// Return the number of useful source registers for this
  /// instruction (2 for ST2 and 4 for ST4).
  unsigned determineSrcReg(MachineInstr &MI) const;

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};

char AArch64SIMDInstrOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)

/// Based only on latency of instructions, determine if it is cost efficient
/// to replace the instruction InstDesc by the instructions stored in the
/// array InstDescRepl.
/// Return true if replacement is expected to be faster.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
  // Check if the replacement decision is already available in the cached
  // table. If so, return it.
  std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end())
    return SIMDInstrTable[InstID];

  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If a target does not define resources for the instructions
  // of interest, then return false for no replacement.
  const MCSchedClassDesc *SCDescRepl;
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (auto IDesc : InstDescRepl) {
    SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
        IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }

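  // The replacement is considered profitable only when the latency of the
  // original instruction strictly exceeds the total latency of the
  // replacement instructions, computed below.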
  // Replacement cost.
  unsigned ReplCost = 0;
  for (auto IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  } else {
    SIMDInstrTable[InstID] = false;
    return false;
  }
}

/// Determine if we need to exit this pass early for a given kind of
/// instruction replacement. This makes sure that no compile time is spent in
/// this pass for targets with no need for any of these optimizations beyond
/// performing this check.
/// Return true if early exit of this pass for a kind of instruction
/// replacement is recommended for a target.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc* OriginalMCID;
  SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // For this optimization, check by comparing the latency of a representative
  // instruction to that of the replacement instructions.
  // TODO: check for all concerned instructions.
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;

  // For this optimization, check for all concerned instructions.
  case Interleave:
    std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
    if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end())
      return InterlEarlyExit[Subtarget];

    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }

  return true;
}

/// Check whether an equivalent DUP instruction has already been
/// created or not.
/// Return true when the DUP instruction already exists. In this case,
/// DestReg will point to the destination of the already created DUP.
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
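  // Walk backwards from MI to the beginning of the basic block, looking for a
  // DUP that copies the same lane of the same source register.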
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }

  return false;
}

/// Certain SIMD instructions with a vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instructions of concern are, for the time being, FMLA, FMLS, FMUL,
/// and FMULX, and hence they are hardcoded here.
///
/// For example:
///    fmla v0.4s, v1.4s, v2.s[1]
///
/// Is rewritten into
///    dup  v3.4s, v2.s[1]      // DUP not necessary if redundant
///    fmla v0.4s, v1.4s, v3.4s
///
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }

  SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Get the operands of the current SIMD arithmetic instruction.
  unsigned MulDest = MI.getOperand(0).getReg();
  unsigned SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  unsigned SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands.
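  // FMLA/FMLS carry a tied accumulator operand in addition to the two source
  // registers and the lane index (5 operands); FMUL/FMULX do not (4 operands).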
  if (MI.getNumOperands() == 5) {
    unsigned SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction. Note that if an equivalent DUP instruction
    // has already been created before, then use that one instead of creating
    // a new one.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  ++NumModifiedInstr;
  return true;
}

/// Load/Store interleaving instructions are not always beneficial.
/// Replace them by ZIP instructions and classical load/store.
///
/// For example:
///    st2 {v0.4s, v1.4s}, addr
///
/// Is rewritten into:
///    zip1 v2.4s, v0.4s, v1.4s
///    zip2 v3.4s, v0.4s, v1.4s
///    stp  q2, q3, addr
///
/// For example:
///    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
///
/// Is rewritten into:
///    zip1 v4.4s, v0.4s, v2.4s
///    zip2 v5.4s, v0.4s, v2.4s
///    zip1 v6.4s, v1.4s, v3.4s
///    zip2 v7.4s, v1.4s, v3.4s
///    zip1 v8.4s, v4.4s, v6.4s
///    zip2 v9.4s, v4.4s, v6.4s
///    zip1 v10.4s, v5.4s, v7.4s
///    zip2 v11.4s, v5.4s, v7.4s
///    stp  q8, q9, addr
///    stp  q10, q11, addr+32
///
/// Currently only instructions related to ST2 and ST4 are considered.
/// Others may be added later.
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {

  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  MachineInstr *DefiningMI;
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;

  // If the current instruction matches any of the rewriting rules, then
  // gather information about the parameters of the new instructions.
  bool Match = false;
  for (auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg  = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;

      for (auto &Repl : I.ReplOpc) {
        ReplInstrMCID.push_back(&TII->get(Repl));
        // Generate destination registers but only for non-store instructions.
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
      }
      Match = true;
      break;
    }
  }

  if (!Match)
    return false;

  // Determine if it is profitable to replace MI by the series of instructions
  // represented in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // Generate the replacement instructions composed of ZIP1, ZIP2, and STP.
  // At this point the code generation is hardcoded rather than driven by the
  // IRT table used above, because the replacement sequence for ST2 is somewhat
  // different from the one for ST4; encoding how the new instructions are
  // built in the table would add more complexity than it saves.
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
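    // Interleave the two source registers with a ZIP1/ZIP2 pair and store the
    // results with a single STP, as in the ST2 example in the file header.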
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    // STP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
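    // A first level of ZIPs interleaves source registers 0/2 and 1/3, a second
    // level interleaves those intermediate results, and the four final
    // registers are stored with two STPs. The STP immediate is scaled by the
    // register size, so offsets 0 and 2 address two adjacent register pairs.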
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1])
        .addReg(StReg[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    // STP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
        .addReg(ZipDest[4])
        .addReg(ZipDest[5])
        .addReg(AddrReg)
        .addImm(0);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
        .addReg(ZipDest[6])
        .addReg(ZipDest[7])
        .addReg(AddrReg)
        .addImm(2);
    break;
  }

  ++NumModifiedInstr;
  return true;
}

/// Process the REG_SEQUENCE instruction, and extract the source
/// operands of the ST2/ST4 instruction from it.
/// Example of such an instruction:
///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
/// Return true when the instruction is processed successfully.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

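  // REG_SEQUENCE operands come in (register, subregister index) pairs after
  // the destination, so operand 2*i+1 is the i-th source register and operand
  // 2*i+2 is the corresponding subregister index.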
  for (unsigned i = 0; i < NumArg; ++i) {
    StReg[i]     = DefiningMI->getOperand(2*i+1).getReg();
    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());

    // Sanity check for the other arguments.
    if (DefiningMI->getOperand(2*i+2).isImm()) {
      switch (DefiningMI->getOperand(2*i+2).getImm()) {
      default:
        return false;

      case AArch64::dsub0:
      case AArch64::dsub1:
      case AArch64::dsub2:
      case AArch64::dsub3:
      case AArch64::qsub0:
      case AArch64::qsub1:
      case AArch64::qsub2:
      case AArch64::qsub3:
        break;
      }
    } else {
      return false;
    }
  }
  return true;
}

/// Return the number of useful source registers for this instruction
/// (2 for ST2 and 4 for ST4).
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction for this pass");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}

bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = MF.getSubtarget().getInstrInfo();
  MRI = &MF.getRegInfo();
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  const AArch64InstrInfo *AAII =
      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
  if (!AAII)
    return false;
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
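  // Run each subpass over the whole function. Rewritten instructions are only
  // collected during the walk and erased afterwards, so the iteration over the
  // basic blocks remains valid.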
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF) {
        for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
             MII != MIE;) {
          MachineInstr &MI = *MII;
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            // Add MI to the list of instructions to be removed given that it
            // has been replaced.
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
          ++MII;
        }
      }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }

  return Changed;
}

/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}