//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file This file contains the AArch64 implementation of the DAG scheduling
// mutation to pair instructions back to back.
//
//===----------------------------------------------------------------------===//

| #include "AArch64MacroFusion.h" |
| #include "AArch64Subtarget.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Target/TargetInstrInfo.h" |
| |
| #define DEBUG_TYPE "misched" |
| |
| STATISTIC(NumFused, "Number of instr pairs fused"); |
| |
| using namespace llvm; |
| |
| static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden, |
| cl::desc("Enable scheduling for macro fusion."), cl::init(true)); |
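
// Note: being a cl::opt, the flag above can be toggled on the command line;
// for example, assuming the llc driver:
//   llc -aarch64-misched-fusion=false ...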

namespace {

/// \brief Verify that the instr pair, FirstMI and SecondMI, should be fused
/// together. When only the anchor instr is specified and the other is null,
/// check whether the anchor instr may be part of any fused pair at all.
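///
/// For example, a call with a null SecondMI asks whether FirstMI may ever be
/// the first instr of some fused pair; a null FirstMI likewise asks whether
/// SecondMI may ever be the second.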
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
                                   const TargetSubtargetInfo &TSI,
                                   const MachineInstr *FirstMI,
                                   const MachineInstr *SecondMI) {
  assert((FirstMI || SecondMI) && "At least one instr must be specified");

  const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII);
  const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);

  // Assume wildcards for unspecified instrs.
  unsigned FirstOpcode =
    FirstMI ? FirstMI->getOpcode()
            : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
  unsigned SecondOpcode =
    SecondMI ? SecondMI->getOpcode()
             : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);

  if (ST.hasArithmeticBccFusion())
    // Fuse CMN, CMP, TST followed by Bcc.
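    // For example (hypothetical operands; CMP is an alias of SUBS that
    // discards the result into the zero register):
    //   cmp  w0, #7      (SUBSWri)
    //   b.eq .Ltarget    (Bcc)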
    if (SecondOpcode == AArch64::Bcc)
      switch (FirstOpcode) {
      default:
        return false;
      case AArch64::ADDSWri:
      case AArch64::ADDSWrr:
      case AArch64::ADDSXri:
      case AArch64::ADDSXrr:
      case AArch64::ANDSWri:
      case AArch64::ANDSWrr:
      case AArch64::ANDSXri:
      case AArch64::ANDSXrr:
      case AArch64::SUBSWri:
      case AArch64::SUBSWrr:
      case AArch64::SUBSXri:
      case AArch64::SUBSXrr:
      case AArch64::BICSWrr:
      case AArch64::BICSXrr:
        return true;
      case AArch64::ADDSWrs:
      case AArch64::ADDSXrs:
      case AArch64::ANDSWrs:
      case AArch64::ANDSXrs:
      case AArch64::SUBSWrs:
      case AArch64::SUBSXrs:
      case AArch64::BICSWrs:
      case AArch64::BICSXrs:
        // The shift amount may be zero, making these behave like the "rr"
        // variants.
        return !II.hasShiftedReg(*FirstMI);
      case AArch64::INSTRUCTION_LIST_END:
        return true;
      }

  if (ST.hasArithmeticCbzFusion())
    // Fuse ALU operations followed by CBZ/CBNZ.
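    // For example (hypothetical operands):
    //   add  w8, w8, #1    (ADDWri)
    //   cbnz w8, .Lloop    (CBNZW)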
    if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
      switch (FirstOpcode) {
      default:
        return false;
      case AArch64::ADDWri:
      case AArch64::ADDWrr:
      case AArch64::ADDXri:
      case AArch64::ADDXrr:
      case AArch64::ANDWri:
      case AArch64::ANDWrr:
      case AArch64::ANDXri:
      case AArch64::ANDXrr:
      case AArch64::EORWri:
      case AArch64::EORWrr:
      case AArch64::EORXri:
      case AArch64::EORXrr:
      case AArch64::ORRWri:
      case AArch64::ORRWrr:
      case AArch64::ORRXri:
      case AArch64::ORRXrr:
      case AArch64::SUBWri:
      case AArch64::SUBWrr:
      case AArch64::SUBXri:
      case AArch64::SUBXrr:
        return true;
      case AArch64::ADDWrs:
      case AArch64::ADDXrs:
      case AArch64::ANDWrs:
      case AArch64::ANDXrs:
      case AArch64::SUBWrs:
      case AArch64::SUBXrs:
      case AArch64::BICWrs:
      case AArch64::BICXrs:
        // The shift amount may be zero, making these behave like the "rr"
        // variants.
        return !II.hasShiftedReg(*FirstMI);
      case AArch64::INSTRUCTION_LIST_END:
        return true;
      }

  if (ST.hasFuseAES())
    // Fuse AES crypto operations.
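    // For example (hypothetical registers):
    //   aese  v0.16b, v1.16b    (AESErr)
    //   aesmc v0.16b, v0.16b    (AESMCrr)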
    switch (FirstOpcode) {
    // AES encode.
    case AArch64::AESErr:
      return SecondOpcode == AArch64::AESMCrr ||
             SecondOpcode == AArch64::INSTRUCTION_LIST_END;
    // AES decode.
    case AArch64::AESDrr:
      return SecondOpcode == AArch64::AESIMCrr ||
             SecondOpcode == AArch64::INSTRUCTION_LIST_END;
    }

  if (ST.hasFuseLiterals())
    // Fuse literal generation operations.
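    // For example (hypothetical symbol and immediates):
    //   adrp x0, sym                 (ADRP)
    //   add  x0, x0, :lo12:sym       (ADDXri)
    // or, materializing a 64-bit immediate:
    //   movz x1, #0x1234             (MOVZXi)
    //   movk x1, #0x5678, lsl #16    (MOVKXi, shift 16)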
    switch (FirstOpcode) {
    // PC relative address.
    case AArch64::ADRP:
      return SecondOpcode == AArch64::ADDXri ||
             SecondOpcode == AArch64::INSTRUCTION_LIST_END;
    // 32 bit immediate.
    case AArch64::MOVZWi:
      return (SecondOpcode == AArch64::MOVKWi &&
              SecondMI->getOperand(3).getImm() == 16) ||
             SecondOpcode == AArch64::INSTRUCTION_LIST_END;
    // Lower half of 64 bit immediate.
    case AArch64::MOVZXi:
      return (SecondOpcode == AArch64::MOVKXi &&
              SecondMI->getOperand(3).getImm() == 16) ||
             SecondOpcode == AArch64::INSTRUCTION_LIST_END;
    // Upper half of 64 bit immediate.
    case AArch64::MOVKXi:
      return FirstMI->getOperand(3).getImm() == 32 &&
             ((SecondOpcode == AArch64::MOVKXi &&
               SecondMI->getOperand(3).getImm() == 48) ||
              SecondOpcode == AArch64::INSTRUCTION_LIST_END);
    }

  return false;
}

/// \brief Implement the fusion of instr pairs in the scheduling DAG,
/// anchored at the instr in AnchorSU.
static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) {
  const MachineInstr *AnchorMI = AnchorSU.getInstr();
  if (!AnchorMI || AnchorMI->isPseudo() || AnchorMI->isTransient())
    return false;

  // If the anchor instr is the ExitSU, then consider its predecessors;
  // otherwise, its successors.
  bool Preds = (&AnchorSU == &DAG->ExitSU);
  SmallVectorImpl<SDep> &AnchorDeps = Preds ? AnchorSU.Preds : AnchorSU.Succs;

  const MachineInstr *FirstMI = Preds ? nullptr : AnchorMI;
  const MachineInstr *SecondMI = Preds ? AnchorMI : nullptr;

  // Check if the anchor instr may be fused.
  if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
                              FirstMI, SecondMI))
    return false;

  // Explore fusion candidates among the dependencies of the anchor instr.
  for (SDep &Dep : AnchorDeps) {
    // Ignore dependencies that don't enforce ordering.
    if (Dep.isWeak())
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    // When looking at successors, ignore the ExitSU.
    if (!Preds && &DepSU == &DAG->ExitSU)
      continue;

    const MachineInstr *DepMI = DepSU.getInstr();
    if (!DepMI || DepMI->isPseudo() || DepMI->isTransient())
      continue;

    FirstMI = Preds ? DepMI : AnchorMI;
    SecondMI = Preds ? AnchorMI : DepMI;
    if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
                                FirstMI, SecondMI))
      continue;

    // Create a single weak edge between the adjacent instrs. The only effect
    // is to cause bottom-up scheduling to heavily prioritize the clustered
    // instrs.
    SUnit &FirstSU = Preds ? DepSU : AnchorSU;
    SUnit &SecondSU = Preds ? AnchorSU : DepSU;
    DAG->addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster));

    // Adjust the latency between the anchor instr and its
    // predecessors/successors.
    for (SDep &IDep : AnchorDeps)
      if (IDep.getSUnit() == &DepSU)
        IDep.setLatency(0);

    // Adjust the latency between the dependent instr and its
    // successors/predecessors.
    for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds)
      if (IDep.getSUnit() == &AnchorSU)
        IDep.setLatency(0);

    DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";
          FirstSU.print(dbgs(), DAG); dbgs() << " - ";
          SecondSU.print(dbgs(), DAG); dbgs() << " / ";
          dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " <<
                    DAG->TII->getName(SecondMI->getOpcode()) << '\n';);

    ++NumFused;
    return true;
  }

  return false;
}

/// \brief Post-process the DAG to create cluster edges between instrs that may
/// be fused by the processor into a single operation.
class AArch64MacroFusion : public ScheduleDAGMutation {
public:
  AArch64MacroFusion() {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};

void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

  // For each of the SUnits in the scheduling block, try to fuse the instr in
  // it with one in its successors.
  for (SUnit &ISU : DAG->SUnits)
    scheduleAdjacentImpl(DAG, ISU);

  // Try to fuse the instr in the ExitSU with one in its predecessors.
  scheduleAdjacentImpl(DAG, DAG->ExitSU);
}

} // end anonymous namespace

namespace llvm {
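
// A minimal registration sketch (an illustrative assumption, not code in this
// file): the factory below is intended to be called from a target's
// createMachineScheduler hook, along the lines of:
//
//   ScheduleDAGMILive *DAG = createGenericSchedLive(C);
//   if (auto Fusion = createAArch64MacroFusionDAGMutation())
//     DAG->addMutation(std::move(Fusion));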

std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation() {
  return EnableMacroFusion ? make_unique<AArch64MacroFusion>() : nullptr;
}

} // end namespace llvm