Re-commit: [globalisel] Add combiner helpers for extending loads and use them in a pre-legalize combiner for AArch64

Summary: Depends on D45541
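
The combine folds extends of a loaded value into the load itself. For
illustration, a minimal MIR sketch (the same shape as the examples in the new
CombinerHelper comments below):

    %1:_(s8) = G_LOAD ...
    %2:_(s32) = G_SEXT %1(s8)
  rewrites to:
    %2:_(s32) = G_SEXTLOAD ...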

Reviewers: ab, aditya_nandakumar, bogner, rtereshin, volkan, rovka, javed.absar, aemerson

Subscribers: aemerson, rengolin, mgorny, javed.absar, kristof.beyls, llvm-commits

Differential Revision: https://reviews.llvm.org/D45543

The previous commit failed portions of the test-suite on GreenDragon due to
duplicate COPY instructions and iterator invalidation. Both issues have now
been fixed. To assist with this, a helper (cloneVirtualRegister) has been added
to MachineRegisterInfo that can be used to get another register that has the same
type and class/bank as an existing one.
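
A condensed sketch of how the combine uses the helper when it has to truncate
a non-matching use back to the loaded type (identifiers as in this patch; not
the exact patch code, since the real combine defers the operand update to
avoid iterator invalidation):

    // Clone the load's result vreg so the G_TRUNC destination gets the same
    // LLT and register class/bank, then retarget the use at the clone.
    unsigned NewVReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
    Builder.buildTrunc(NewVReg, ChosenDstReg);
    UseMO.setReg(NewVReg);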

llvm-svn: 343654
diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
index 0bc5b87..f3f075a 100644
--- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
@@ -25,6 +25,34 @@
 
 using namespace llvm;
 
+namespace {
+/// This class acts as the glue that joins the CombinerHelper to the overall
+/// Combine algorithm. The CombinerHelper is intended to report the
+/// modifications it makes to the MIR to the CombinerChangeObserver and the
+/// observer subclass will act on these events. In this case, instruction
+/// erasure will cancel any future visits to the erased instruction and
+/// instruction creation will schedule that instruction for a future visit.
+/// Other Combiner implementations may require more complex behaviour from
+/// their CombinerChangeObserver subclass.
+class WorkListMaintainer : public CombinerChangeObserver {
+  using WorkListTy = GISelWorkList<512>;
+  WorkListTy &WorkList;
+
+public:
+  WorkListMaintainer(WorkListTy &WorkList) : WorkList(WorkList) {}
+  virtual ~WorkListMaintainer() {}
+
+  void erasedInstr(MachineInstr &MI) override {
+    LLVM_DEBUG(dbgs() << "Erased: "; MI.print(dbgs()); dbgs() << "\n");
+    WorkList.remove(&MI);
+  }
+  void createdInstr(MachineInstr &MI) override {
+    LLVM_DEBUG(dbgs() << "Created: "; MI.print(dbgs()); dbgs() << "\n");
+    WorkList.insert(&MI);
+  }
+};
+} // end anonymous namespace
+
 Combiner::Combiner(CombinerInfo &Info, const TargetPassConfig *TPC)
     : CInfo(Info), TPC(TPC) {
   (void)this->TPC; // FIXME: Remove when used.
@@ -53,6 +81,7 @@
     // down RPOT.
     Changed = false;
     GISelWorkList<512> WorkList;
+    WorkListMaintainer Observer(WorkList);
     for (MachineBasicBlock *MBB : post_order(&MF)) {
       if (MBB->empty())
         continue;
@@ -72,7 +101,7 @@
     while (!WorkList.empty()) {
       MachineInstr *CurrInst = WorkList.pop_back_val();
       LLVM_DEBUG(dbgs() << "Try combining " << *CurrInst << "\n";);
-      Changed |= CInfo.combine(*CurrInst, Builder);
+      Changed |= CInfo.combine(Observer, *CurrInst, Builder);
     }
     MFChanged |= Changed;
   } while (Changed);
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 44e904a..8d44cde 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6,18 +6,28 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define DEBUG_TYPE "gi-combine"
 
 using namespace llvm;
 
-CombinerHelper::CombinerHelper(MachineIRBuilder &B) :
-  Builder(B), MRI(Builder.getMF().getRegInfo()) {}
+CombinerHelper::CombinerHelper(CombinerChangeObserver &Observer,
+                               MachineIRBuilder &B)
+    : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer) {}
+
+void CombinerHelper::eraseInstr(MachineInstr &MI) {
+  Observer.erasedInstr(MI);
+}
+void CombinerHelper::scheduleForVisit(MachineInstr &MI) {
+  Observer.createdInstr(MI);
+}
 
 bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
   if (MI.getOpcode() != TargetOpcode::COPY)
@@ -36,6 +46,214 @@
   return false;
 }
 
+namespace {
+struct PreferredTuple {
+  LLT Ty;                // The result type of the extend.
+  unsigned ExtendOpcode; // G_ANYEXT/G_SEXT/G_ZEXT
+  MachineInstr *MI;
+};
+
+/// Select a preference between two uses. CurrentUse is the current preference
+/// while the *ForCandidate arguments describe the candidate under
+/// consideration.
+PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse,
+                                  const LLT &TyForCandidate,
+                                  unsigned OpcodeForCandidate,
+                                  MachineInstr *MIForCandidate) {
+  if (!CurrentUse.Ty.isValid()) {
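+    // An invalid type means this is still the initial dummy preference,
+    // which only carries the extend opcode implied by the load. Accept the
+    // candidate if it matches that opcode, or if it upgrades an any-extend
+    // preference to a real extend.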
+    if (CurrentUse.ExtendOpcode == OpcodeForCandidate)
+      return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
+    if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
+        (OpcodeForCandidate == TargetOpcode::G_SEXT ||
+         OpcodeForCandidate == TargetOpcode::G_ZEXT ||
+         OpcodeForCandidate == TargetOpcode::G_ANYEXT))
+      return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
+    return CurrentUse;
+  }
+
+  // We permit the extend to hoist through basic blocks, but this is only
+  // sensible if the target has extending loads. If the legalizer later
+  // lowers the result back into a separate load and extend, the net effect
+  // is merely hoisting the extend up to the load.
+
+  // Prefer defined extensions to undefined extensions as these are more
+  // likely to reduce the number of instructions.
+  if (OpcodeForCandidate == TargetOpcode::G_ANYEXT &&
+      CurrentUse.ExtendOpcode != TargetOpcode::G_ANYEXT)
+    return CurrentUse;
+  else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
+           OpcodeForCandidate != TargetOpcode::G_ANYEXT)
+    return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
+
+  // Prefer sign extensions to zero extensions as sign-extensions tend to be
+  // more expensive, so folding the sign extension into the load saves more.
+  if (CurrentUse.Ty == TyForCandidate) {
+    if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT &&
+        OpcodeForCandidate == TargetOpcode::G_ZEXT)
+      return CurrentUse;
+    else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ZEXT &&
+             OpcodeForCandidate == TargetOpcode::G_SEXT)
+      return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
+  }
+
+  // This is potentially target specific. We've chosen the largest type
+  // because G_TRUNC is usually free. One potential catch with this is that
+  // some targets have a reduced number of larger registers than smaller
+  // registers and this choice potentially increases the live-range for the
+  // larger value.
+  if (TyForCandidate.getSizeInBits() > CurrentUse.Ty.getSizeInBits()) {
+    return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
+  }
+  return CurrentUse;
+}
+} // end anonymous namespace
+
+bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
+  // We match the loads and follow the uses to the extend instead of matching
+  // the extends and following the def to the load. This is because the load
+  // must remain in the same position for correctness (unless we also add code
+  // to find a safe place to sink it) whereas the extend is freely movable.
+  // It also avoids duplicating the load, which would be wrong for volatile
+  // loads and wasteful for performance otherwise.
+
+  if (MI.getOpcode() != TargetOpcode::G_LOAD &&
+      MI.getOpcode() != TargetOpcode::G_SEXTLOAD &&
+      MI.getOpcode() != TargetOpcode::G_ZEXTLOAD)
+    return false;
+
+  auto &LoadValue = MI.getOperand(0);
+  assert(LoadValue.isReg() && "Result wasn't a register?");
+
+  LLT LoadValueTy = MRI.getType(LoadValue.getReg());
+  if (!LoadValueTy.isScalar())
+    return false;
+
+  // Find the preferred type, ignoring any-extends (unless an any-extend is
+  // the only use) and non-extending ops. We'll emit an extending load to that
+  // type and emit a variant of (extend (trunc X)) for the other uses
+  // according to the relative type sizes. At the same time, pick the extend
+  // opcode to fold into the load based on the extend of the chosen use.
+  unsigned PreferredOpcode = MI.getOpcode() == TargetOpcode::G_LOAD
+                                 ? TargetOpcode::G_ANYEXT
+                                 : MI.getOpcode() == TargetOpcode::G_SEXTLOAD
+                                       ? TargetOpcode::G_SEXT
+                                       : TargetOpcode::G_ZEXT;
+  PreferredTuple Preferred = {LLT(), PreferredOpcode, nullptr};
+  for (auto &UseMI : MRI.use_instructions(LoadValue.getReg())) {
+    if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
+        UseMI.getOpcode() == TargetOpcode::G_ZEXT || !Preferred.Ty.isValid())
+      Preferred = ChoosePreferredUse(Preferred,
+                                     MRI.getType(UseMI.getOperand(0).getReg()),
+                                     UseMI.getOpcode(), &UseMI);
+  }
+
+  // There were no extends to fold; there is nothing to do.
+  if (!Preferred.MI)
+    return false;
+  // It should be impossible to choose an extend without selecting a different
+  // type since by definition the result of an extend is larger.
+  assert(Preferred.Ty != LoadValueTy && "Extending to same type?");
+
+  // Helper that inserts a G_TRUNC from SrcReg into DstReg immediately before
+  // the use, so non-matching uses keep seeing the type they expect.
+  const auto TruncateUse = [](MachineIRBuilder &Builder, MachineOperand &UseMO,
+                              unsigned DstReg, unsigned SrcReg) {
+    MachineInstr &UseMI = *UseMO.getParent();
+    MachineBasicBlock &UseMBB = *UseMI.getParent();
+
+    Builder.setInsertPt(UseMBB, MachineBasicBlock::iterator(UseMI));
+    Builder.buildTrunc(DstReg, SrcReg);
+  };
+
+  // Rewrite the load to the chosen extending load.
+  unsigned ChosenDstReg = Preferred.MI->getOperand(0).getReg();
+  MI.setDesc(
+      Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT
+                               ? TargetOpcode::G_SEXTLOAD
+                               : Preferred.ExtendOpcode == TargetOpcode::G_ZEXT
+                                     ? TargetOpcode::G_ZEXTLOAD
+                                     : TargetOpcode::G_LOAD));
+
+  // Rewrite all the uses to fix up the types.
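+  // Register reassignment and instruction erasure are deferred until after
+  // the use iteration below; mutating the use list while walking it would
+  // invalidate the iterators (the cause of the earlier GreenDragon failure).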
+  SmallVector<MachineInstr *, 1> ScheduleForErase;
+  SmallVector<std::pair<MachineOperand*, unsigned>, 4> ScheduleForAssignReg;
+  for (auto &UseMO : MRI.use_operands(LoadValue.getReg())) {
+    MachineInstr *UseMI = UseMO.getParent();
+
+    // If the extend is compatible with the preferred extend, fix up its type
+    // and source so that it reuses the result of the chosen extending load.
+    if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
+        UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
+      unsigned UseDstReg = UseMI->getOperand(0).getReg();
+      unsigned UseSrcReg = UseMI->getOperand(1).getReg();
+      const LLT &UseDstTy = MRI.getType(UseDstReg);
+      if (UseDstReg != ChosenDstReg) {
+        if (Preferred.Ty == UseDstTy) {
+          // If the use has the same type as the preferred use, then merge
+          // the vregs and erase the extend. For example:
+          //    %1:_(s8) = G_LOAD ...
+          //    %2:_(s32) = G_SEXT %1(s8)
+          //    %3:_(s32) = G_ANYEXT %1(s8)
+          //    ... = ... %3(s32)
+          // rewrites to:
+          //    %2:_(s32) = G_SEXTLOAD ...
+          //    ... = ... %2(s32)
+          MRI.replaceRegWith(UseDstReg, ChosenDstReg);
+          ScheduleForErase.push_back(UseMO.getParent());
+          Observer.erasedInstr(*UseMO.getParent());
+        } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
+          // If the preferred size is smaller, then keep the extend but extend
+          // from the result of the extending load. For example:
+          //    %1:_(s8) = G_LOAD ...
+          //    %2:_(s32) = G_SEXT %1(s8)
+          //    %3:_(s64) = G_ANYEXT %1(s8)
+          //    ... = ... %3(s64)
+          // rewrites to:
+          //    %2:_(s32) = G_SEXTLOAD ...
+          //    %3:_(s64) = G_ANYEXT %2:_(s32)
+          //    ... = ... %3(s64)
+          MRI.replaceRegWith(UseSrcReg, ChosenDstReg);
+        } else {
+          // If the preferred size is larger, then insert a truncate. For
+          // example:
+          //    %1:_(s8) = G_LOAD ...
+          //    %2:_(s64) = G_SEXT %1(s8)
+          //    %3:_(s32) = G_ZEXT %1(s8)
+          //    ... = ... %3(s32)
+          // rewrites to:
+          //    %2:_(s64) = G_SEXTLOAD ...
+          //    %4:_(s8) = G_TRUNC %2:_(s64)
+          //    %3:_(s32) = G_ZEXT %4:_(s8)
+          //    ... = ... %3(s32)
+          unsigned NewVReg =
+              MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
+          TruncateUse(Builder, UseMO, NewVReg, ChosenDstReg);
+          ScheduleForAssignReg.emplace_back(&UseMO, NewVReg);
+        }
+        continue;
+      }
+      // This is the preferred use we chose earlier. The load will be
+      // rewritten to define this value directly, so just erase the old
+      // extend.
+      ScheduleForErase.push_back(UseMO.getParent());
+      Observer.erasedInstr(*UseMO.getParent());
+      continue;
+    }
+
+    // The use isn't an extend. Truncate back to the type we originally loaded.
+    // This is free on many targets.
+    unsigned NewVReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
+    TruncateUse(Builder, UseMO, NewVReg, ChosenDstReg);
+    ScheduleForAssignReg.emplace_back(&UseMO, NewVReg);
+  }
+  for (auto &Assignment : ScheduleForAssignReg)
+    Assignment.first->setReg(Assignment.second);
+  for (auto &EraseMI : ScheduleForErase)
+    EraseMI->eraseFromParent();
+  MI.getOperand(0).setReg(ChosenDstReg);
+
+  return true;
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
-  return tryCombineCopy(MI);
+  if (tryCombineCopy(MI))
+    return true;
+  return tryCombineExtendingLoads(MI);
 }
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index f632a9b..1da99d9 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -177,6 +177,16 @@
   return Reg;
 }
 
+unsigned MachineRegisterInfo::cloneVirtualRegister(unsigned VReg,
+                                                   StringRef Name) {
+  unsigned Reg = createIncompleteVirtualRegister(Name);
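+  // Copy the register class/bank and the LLT from the original vreg so the
+  // clone can stand in for it anywhere.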
+  VRegInfo[Reg].first = VRegInfo[VReg].first;
+  setType(Reg, getType(VReg));
+  if (TheDelegate)
+    TheDelegate->MRI_NoteNewVirtualRegister(Reg);
+  return Reg;
+}
+
 void MachineRegisterInfo::setType(unsigned VReg, LLT Ty) {
   // Check that VReg doesn't have a class.
   assert((getRegClassOrRegBank(VReg).isNull() ||
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index edda13c..74f22e2 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -53,6 +53,7 @@
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
                                  AArch64Subtarget &, AArch64RegisterBankInfo &);
+FunctionPass *createAArch64PreLegalizeCombiner();
 
 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
@@ -65,6 +66,7 @@
 void initializeAArch64ExpandPseudoPass(PassRegistry&);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
 void initializeAArch64SIMDInstrOptPass(PassRegistry&);
+void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
 void initializeAArch64PromoteConstantPass(PassRegistry&);
 void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
 void initializeAArch64StorePairSuppressPass(PassRegistry&);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index c4b9b45..a66f527 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -158,6 +158,7 @@
   initializeAArch64ExpandPseudoPass(*PR);
   initializeAArch64LoadStoreOptPass(*PR);
   initializeAArch64SIMDInstrOptPass(*PR);
+  initializeAArch64PreLegalizerCombinerPass(*PR);
   initializeAArch64PromoteConstantPass(*PR);
   initializeAArch64RedundantCopyEliminationPass(*PR);
   initializeAArch64StorePairSuppressPass(*PR);
@@ -348,6 +349,7 @@
   bool addPreISel() override;
   bool addInstSelector() override;
   bool addIRTranslator() override;
+  void addPreLegalizeMachineIR() override;
   bool addLegalizeMachineIR() override;
   bool addRegBankSelect() override;
   void addPreGlobalInstructionSelect() override;
@@ -449,6 +451,10 @@
   return false;
 }
 
+void AArch64PassConfig::addPreLegalizeMachineIR() {
+  addPass(createAArch64PreLegalizeCombiner());
+}
+
 bool AArch64PassConfig::addLegalizeMachineIR() {
   addPass(new Legalizer());
   return false;
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index d9a0051..e6ca69c 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -43,6 +43,7 @@
   AArch64LoadStoreOptimizer.cpp
   AArch64MacroFusion.cpp
   AArch64MCInstLower.cpp
+  AArch64PreLegalizerCombiner.cpp
   AArch64PromoteConstant.cpp
   AArch64PBQPRegAlloc.cpp
   AArch64RegisterBankInfo.cpp