AMDGPU/SI: Enable the post-ra scheduler

Summary:
This includes a hazard recognizer implementation to replace some of
the hazard handling we had during frame index elimination.

Reviewers: arsenm

Subscribers: qcolombet, arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18602

llvm-svn: 268143
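
For context, the recognizer below implements LLVM's ScheduleHazardRecognizer
interface. A minimal sketch of how the post-RA list scheduler drives such a
recognizer (a simplification for illustration, not the actual scheduler code;
HazardRec is the recognizer, and pickNextSUnit() and emitNoopInstr() are
hypothetical placeholders):

    // Sketch: query the recognizer before issuing each scheduling unit.
    while (SUnit *SU = pickNextSUnit()) {
      if (HazardRec->getHazardType(SU, /*Stalls=*/0) ==
          ScheduleHazardRecognizer::NoopHazard) {
        // The target reports how many no-ops clear the hazard.
        for (unsigned I = 0, E = HazardRec->PreEmitNoops(SU); I != E; ++I) {
          emitNoopInstr();        // emit a real no-op into the block
          HazardRec->EmitNoop();  // tell the recognizer it was issued
        }
      }
      HazardRec->EmitInstruction(SU);  // record the issued instruction
      HazardRec->AdvanceCycle();       // move to the next cycle
    }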
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 13abe7f..342afff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -384,6 +384,17 @@
 }
 
 void GCNPassConfig::addPreEmitPass() {
+
+  // The hazard recognizer that runs as part of the post-ra scheduler does not
+  // guarantee that it can handle all hazards correctly.  This is because if
+  // there are multiple scheduling regions in a basic block, the regions are
+  // scheduled bottom up, so when we begin to schedule a region we don't know
+  // what instructions were emitted directly before it.
+  //
+  // Here we add a stand-alone hazard recognizer pass which can handle all
+  // cases.
+  addPass(&PostRAHazardRecognizerID);
+
   addPass(createSIInsertWaitsPass(), false);
   addPass(createSIShrinkInstructionsPass());
   addPass(createSILowerControlFlowPass(), false);
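
The stand-alone pass added above sidesteps the region problem because it runs
after scheduling and walks each function linearly, so the instruction emitted
directly before the one being checked is always known. Roughly (an assumed
shape based on the MachineInstr overloads added in this patch, not the pass's
verbatim implementation; HazardRec and TII are the recognizer and the
instruction info):

    // Sketch: visit every instruction in layout order and insert no-ops
    // wherever the recognizer demands them.
    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        for (unsigned I = 0, E = HazardRec->PreEmitNoops(&MI); I != E; ++I) {
          TII->insertNoop(MBB, MI.getIterator());
          HazardRec->EmitNoop();
        }
        HazardRec->EmitInstruction(&MI);
        HazardRec->AdvanceCycle();
      }
    }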
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index ef68e95..d09791f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -47,6 +47,7 @@
   AMDGPUInstrInfo.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPURegisterInfo.cpp
+  GCNHazardRecognizer.cpp
   R600ClauseMergePass.cpp
   R600ControlFlowFinalizer.cpp
   R600EmitClauseMarkers.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
new file mode 100644
index 0000000..4c830bc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -0,0 +1,182 @@
+//===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNHazardRecognizer.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Hazard Recognizer Implementation
+//===----------------------------------------------------------------------===//
+
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+  CurrCycleInstr(nullptr),
+  MF(MF) {
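+  // 5 is the deepest hazard window handled below: a VMEM read of an SGPR
+  // written by a VALU requires 5 wait states.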
+  MaxLookAhead = 5;
+}
+
+void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
+  EmitInstruction(SU->getInstr());
+}
+
+void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
+  CurrCycleInstr = MI;
+}
+
+ScheduleHazardRecognizer::HazardType
+GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(MF.getSubtarget().getInstrInfo());
+  MachineInstr *MI = SU->getInstr();
+
+  if (TII->isSMRD(*MI) && checkSMRDHazards(MI) > 0)
+    return NoopHazard;
+
+  if (TII->isVMEM(*MI) && checkVMEMHazards(MI) > 0)
+    return NoopHazard;
+
+  return NoHazard;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
+  return PreEmitNoops(SU->getInstr());
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(MF.getSubtarget().getInstrInfo());
+
+  if (TII->isSMRD(*MI))
+    return std::max(0, checkSMRDHazards(MI));
+
+  if (TII->isVMEM(*MI))
+    return std::max(0, checkVMEMHazards(MI));
+
+  return 0;
+}
+
+void GCNHazardRecognizer::EmitNoop() {
+  EmittedInstrs.push_front(nullptr);
+}
+
+void GCNHazardRecognizer::AdvanceCycle() {
+  // When the scheduler detects a stall, it will call AdvanceCycle() without
+  // emitting any instructions.
+  if (!CurrCycleInstr)
+    return;
+
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(MF.getSubtarget().getInstrInfo());
+  unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr);
+
+  // Keep track of emitted instructions
+  EmittedInstrs.push_front(CurrCycleInstr);
+
+  // Add a nullptr for each additional wait state after the first.  Make sure
+  // not to add more than getMaxLookAhead() items to the list, since we
+  // truncate the list to that size right after this loop.
+  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
+       i < e; ++i) {
+    EmittedInstrs.push_front(nullptr);
+  }
+
+  // getMaxLookAhead() is the largest number of wait states we will ever need
+  // to insert, so there is no point in keeping track of more than that many
+  // wait states.
+  EmittedInstrs.resize(getMaxLookAhead());
+
+  CurrCycleInstr = nullptr;
+}
+
+void GCNHazardRecognizer::RecedeCycle() {
+  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
+                              std::function<bool(MachineInstr*)> IsHazardDef) {
+  const TargetRegisterInfo *TRI =
+      MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo();
+
+  int WaitStates = -1;
+  for (MachineInstr *MI : EmittedInstrs) {
+    ++WaitStates;
+    if (!MI || !IsHazardDef(MI))
+      continue;
+    if (MI->modifiesRegister(Reg, TRI))
+      return WaitStates;
+  }
+  return std::numeric_limits<int>::max();
+}
+
+//===----------------------------------------------------------------------===//
+// No-op Hazard Detection
+//===----------------------------------------------------------------------===//
+
+int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+
+  // This SMRD hazard only affects SI.
+  if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    return 0;
+
+  // A read of an SGPR by SMRD instruction requires 4 wait states when the
+  // SGPR was written by a VALU instruction.
+  int SmrdSgprWaitStates = 4;
+  int WaitStatesNeeded = 0;
+  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+  for (const MachineOperand &Use : SMRD->uses()) {
+    if (!Use.isReg())
+      continue;
+    int WaitStatesNeededForUse =
+        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+  return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+
+  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return 0;
+
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
+  // SGPR was written by a VALU instruction.
+  int VmemSgprWaitStates = 5;
+  int WaitStatesNeeded = 0;
+  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+  for (const MachineOperand &Use : VMEM->uses()) {
+    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+      continue;
+
+    int WaitStatesNeededForUse =
+        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+  return WaitStatesNeeded;
+}
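
A worked example of the arithmetic above (a hypothetical sequence, not taken
from a real test): suppose on SI a VALU instruction writes s0, one unrelated
SALU instruction follows, and an SMRD then reads s0. Walking EmittedInstrs,
getWaitStatesSinceDef returns 1, so WaitStatesNeededForUse = 4 - 1 = 3 and
three no-ops must be placed before the SMRD. If no VALU def of s0 is found
within MaxLookAhead, getWaitStatesSinceDef returns INT_MAX, the subtraction
goes negative, and the std::max calls clamp the answer to zero no-ops.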
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
new file mode 100644
index 0000000..e75c350
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -0,0 +1,59 @@
+//===-- GCNHazardRecognizer.h - GCN Hazard Recognizers ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNHAZARDRECOGNIZER_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNHAZARDRECOGNIZER_H
+
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include <functional>
+#include <list>
+
+namespace llvm {
+
+class MachineFunction;
+class MachineInstr;
+class ScheduleDAG;
+class SIInstrInfo;
+
+class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
+
+  // This variable stores the instruction that has been emitted this cycle.
+  // It will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is
+  // called.
+  MachineInstr *CurrCycleInstr;
+  std::list<MachineInstr*> EmittedInstrs;
+  const MachineFunction &MF;
+
+  int getWaitStatesSinceDef(unsigned Reg,
+                            std::function<bool(MachineInstr*)> IsHazardDef =
+                            [](MachineInstr*) { return true; });
+
+  int checkSMRDHazards(MachineInstr *SMRD);
+  int checkVMEMHazards(MachineInstr *VMEM);
+public:
+  GCNHazardRecognizer(const MachineFunction &MF);
+  // We can only issue one instruction per cycle.
+  bool atIssueLimit() const override { return true; }
+  void EmitInstruction(SUnit *SU) override;
+  void EmitInstruction(MachineInstr *MI) override;
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  void EmitNoop() override;
+  unsigned PreEmitNoops(SUnit *SU) override;
+  unsigned PreEmitNoops(MachineInstr *) override;
+  void AdvanceCycle() override;
+  void RecedeCycle() override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNHAZARDRECOGNIZER_H
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2ab4f78..eb17ffe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -15,11 +15,13 @@
 
 #include "SIInstrInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "GCNHazardRecognizer.h"
 #include "SIDefines.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/IR/Function.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -816,6 +818,20 @@
   }
 }
 
+void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MI) const {
+  insertWaitStates(MBB, MI, 1);
+}
+
+unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default: return 1; // FIXME: Do wait states equal cycles?
+
+  case AMDGPU::S_NOP:
+    return MI.getOperand(0).getImm() + 1;
+  }
+}
+
 bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
@@ -1188,8 +1204,11 @@
 
   if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
       getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
-    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
-           "read2 / write2 not expected here yet");
+
+    if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand()) {
+      // FIXME: Handle ds_read2 / ds_write2.
+      return false;
+    }
     unsigned Width0 = (*MIa->memoperands_begin())->getSize();
     unsigned Width1 = (*MIb->memoperands_begin())->getSize();
     if (BaseReg0 == BaseReg1 &&
@@ -2964,3 +2983,18 @@
       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
   return makeArrayRef(TargetIndices);
 }
+
+/// This is used by the post-RA scheduler (PostRASchedulerList.cpp).  The
+/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                                const ScheduleDAG *DAG) const {
+  return new GCNHazardRecognizer(DAG->MF);
+}
+
+/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
+/// pass.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
+  return new GCNHazardRecognizer(MF);
+}
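
A small worked example for getNumWaitStates above: S_NOP stalls for imm + 1
wait states, so "s_nop 3" provides the 4 wait states the SI SMRD hazard needs
and "s_nop 4" provides the 5 needed for VMEM on VI (matching the constants the
hazard checks use); every other instruction currently counts as a single wait
state, per the FIXME. insertNoop is just insertWaitStates with a count of 1.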
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a5cd2e1..2121ae1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -169,6 +169,14 @@
     return get(Opcode).TSFlags & SIInstrFlags::VALU;
   }
 
+  static bool isVMEM(const MachineInstr &MI) {
+    return isMUBUF(MI) || isMTBUF(MI) || isMIMG(MI);
+  }
+
+  bool isVMEM(uint16_t Opcode) const {
+    return isMUBUF(Opcode) || isMTBUF(Opcode) || isMIMG(Opcode);
+  }
+
   static bool isSOP1(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::SOP1;
   }
@@ -440,6 +448,12 @@
   void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI,
                         int Count) const;
 
+  void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const;
+
+  /// \brief Return the number of wait states that result from executing this
+  /// instruction.
+  unsigned getNumWaitStates(const MachineInstr &MI) const;
+
   /// \brief Returns the operand named \p Op.  If \p MI does not have an
   /// operand named \c Op, this function returns nullptr.
   LLVM_READONLY
@@ -472,6 +486,13 @@
   ArrayRef<std::pair<int, const char *>>
   getSerializableTargetIndices() const override;
 
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                     const ScheduleDAG *DAG) const override;
+
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
+
 };
 
 namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 4bc89ea..d0ba8e6 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -596,22 +596,6 @@
         }
       }
 
-      // TODO: only do this when it is needed
-      switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) {
-      case AMDGPUSubtarget::SOUTHERN_ISLANDS:
-        // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states
-        // ("S_NOP 3") on SI
-        TII->insertWaitStates(*MBB, MI, 4);
-        break;
-      case AMDGPUSubtarget::SEA_ISLANDS:
-        break;
-      default: // VOLCANIC_ISLANDS and later
-        // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states
-        // ("S_NOP 4") on VI and later. This also applies to VALUs which write
-        // VCC, but we're unlikely to see VMEM use VCC.
-        TII->insertWaitStates(*MBB, MI, 5);
-      }
-
       MI->eraseFromParent();
       break;
     }
@@ -991,3 +975,14 @@
     }
   }
 }
+
+bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
+                            unsigned Reg) const {
+  const TargetRegisterClass *RC;
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    RC = MRI.getRegClass(Reg);
+  else
+    RC = getPhysRegClass(Reg);
+
+  return hasVGPRs(RC);
+}
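
The new SIRegisterInfo::isVGPR helper resolves a register to its class
(MachineRegisterInfo for virtual registers, getPhysRegClass for physical ones)
and reports whether that class holds VGPRs. checkVMEMHazards uses it to skip
VGPR operands, since the five-wait-state hazard only applies to SGPRs read by
a VMEM instruction.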
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index f641031..e43b2c1 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -188,6 +188,8 @@
   unsigned getSGPR32PressureSet() const { return SGPR32SetID; };
   unsigned getVGPR32PressureSet() const { return VGPR32SetID; };
 
+  bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+
 private:
   void buildScratchLoadStore(MachineBasicBlock::iterator MI,
                              unsigned LoadStoreOp, unsigned Value,
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 4a46eb4..26f73c4 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -42,6 +42,7 @@
 class SISchedMachineModel : SchedMachineModel {
   let CompleteModel = 0;
   let IssueWidth = 1;
+  let PostRAScheduler = 1;
 }
 
 def SIFullSpeedModel : SISchedMachineModel;
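
Setting PostRAScheduler = 1 on the machine model is what opts the GCN
subtargets into the post-RA list scheduler named in the title: the default
TargetSubtargetInfo::enablePostRAScheduler() implementation returns this
model flag. The hazard handling during that scheduling is supplied by the
GCNHazardRecognizer above.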