[Hexagon] Improve scheduling heuristic for large basic blocks This patch changes the isLatencyBound heuristic to look at the path length based upon the number of packets needed to schedule a basic block. For small basic blocks, the heuristic uses a small threshold for isLatencyBound. For large basic blocks, the heuristic uses a large threshold. The goal is to increase the priority of an instruction in a small basic block that has a large height or depth relative to the code size. For large functions, the height and depth are ignored because it increases the live range of a register and causes more spills. That is, for large functions, it is more important to schedule instructions when available, and attempt to keep the defs and uses closer together. Patch by Brendon Cahoon. llvm-svn: 327987

commit: 5ffd808a27ec8fe211b86310f74a3e3a5c8c1950 [log] [tgz]
author: Krzysztof Parzyszek <kparzysz@codeaurora.org> Tue Mar 20 14:54:01 2018 +0000
committer: Krzysztof Parzyszek <kparzysz@codeaurora.org> Tue Mar 20 14:54:01 2018 +0000
tree: 098d4cbee7ba861221751b5d33c4482f36b250d8
parent: ad991868c1c4d4e778f41ebccc1b4ad8ce978d31 [diff]
diff --git a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
index 9d5c984..8ac333f4 100644
--- a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp

@@ -32,6 +32,7 @@
   UsesDotCur = nullptr;
   DotCurPNum = -1;
   UsesLoad = false;
+  PrefVectorStoreNew = nullptr;
   RegDefs.clear();
 }
 
@@ -80,6 +81,7 @@
     DotCurPNum = -1;
   }
   UsesLoad = false;
+  PrefVectorStoreNew = nullptr;
   PacketNum++;
   RegDefs.clear();
 }
@@ -89,8 +91,14 @@
 /// bank conflict. Case 2 - if a packet contains a dot cur instruction, then we
 /// prefer the instruction that can use the dot cur result. However, if the use
 /// is not scheduled in the same packet, then prefer other instructions in the
-/// subsequent packet.
+/// subsequent packet. Case 3 - we prefer a vector store that can be converted
+/// to a .new store. The packetizer will not generate the .new store if the
+/// store doesn't have resources to fit in the packet (but the .new store may
+/// have resources). We attempt to schedule the store as soon as possible to
+/// help packetize the two instructions together.
 bool HexagonHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
+  if (PrefVectorStoreNew != nullptr && PrefVectorStoreNew != SU)
+    return true;
   if (UsesLoad && SU->isInstr() && SU->getInstr()->mayLoad())
     return true;
   return UsesDotCur && ((SU == UsesDotCur) ^ (DotCurPNum == (int)PacketNum));
@@ -144,4 +152,13 @@
   }
 
   UsesLoad = MI->mayLoad();
+
+  if (TII->isHVXVec(*MI) && !MI->mayLoad() && !MI->mayStore())
+    for (auto &S : SU->Succs)
+      if (S.isAssignedRegDep() && S.getLatency() == 0 &&
+          TII->mayBeNewStore(*S.getSUnit()->getInstr()) &&
+          Resources->canReserveResources(*S.getSUnit()->getInstr())) {
+        PrefVectorStoreNew = S.getSUnit();
+        break;
+      }
 }

diff --git a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
index f904b05..2874d73 100644
--- a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
+++ b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h

@@ -23,15 +23,21 @@
 class HexagonHazardRecognizer : public ScheduleHazardRecognizer {
   DFAPacketizer *Resources;
   const HexagonInstrInfo *TII;
-  unsigned PacketNum;
+  unsigned PacketNum = 0;
   // If the packet contains a potential dot cur instruction. This is
   // used for the scheduling priority function.
-  SUnit *UsesDotCur;
+  SUnit *UsesDotCur = nullptr;
   // The packet number when a dor cur is emitted. If its use is not generated
   // in the same packet, then try to wait another cycle before emitting.
-  int DotCurPNum;
+  int DotCurPNum = -1;
   // Does the packet contain a load. Used to restrict another load, if possible.
   bool UsesLoad = false;
+  // Check if we should prefer a vector store that will become a .new version.
+  // The .new store uses different resources than a normal store, and the
+  // packetizer will not generate the .new if the regular store does not have
+  // resources available (even if the .new version does). To help, the schedule
+  // attempts to schedule the .new as soon as possible in the packet.
+  SUnit *PrefVectorStoreNew = nullptr;
   // The set of registers defined by instructions in the current packet.
   SmallSet<unsigned, 8> RegDefs;
 
@@ -39,8 +45,7 @@
   HexagonHazardRecognizer(const InstrItineraryData *II,
                           const HexagonInstrInfo *HII,
                           const HexagonSubtarget &ST)
-    : Resources(ST.createDFAPacketizer(II)), TII(HII), PacketNum(0),
-    UsesDotCur(nullptr), DotCurPNum(-1) { }
+    : Resources(ST.createDFAPacketizer(II)), TII(HII) { }
 
   ~HexagonHazardRecognizer() override {
     if (Resources)

diff --git a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index d7f670c..8599c1a 100644
--- a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp

@@ -473,7 +473,11 @@
   if (CheckPending)
     releasePending();
 
-  for (unsigned i = 0; Available.empty(); ++i) {
+  for (unsigned i = 0;
+       Available.empty() ||
+         (Available.size() == 1 &&
+          !ResourceModel->isResourceAvailable(*Available.begin(), isTop()));
+       ++i) {
     assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
            "permanent hazard"); (void)i;
     ResourceModel->reserveResources(nullptr, isTop());
@@ -625,6 +629,10 @@
   return 0;
 }
 
+static unsigned getWeakLeft(const SUnit *SU, bool IsTop) {
+  return (IsTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
+}
+
 // Constants used to denote relative importance of
 // heuristic components for cost computation.
 static const unsigned PriorityOne = 200;
@@ -782,7 +790,7 @@
 
   // Give preference to a zero latency instruction if the dependent
   // instruction is in the current packet.
-  if (Q.getID() == TopQID) {
+  if (Q.getID() == TopQID && getWeakLeft(SU, true) == 0) {
     for (const SDep &PI : SU->Preds) {
       if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() &&
           PI.getLatency() == 0 &&
@@ -791,7 +799,7 @@
         DEBUG(if (verbose) dbgs() << "Z|");
       }
     }
-  } else {
+  } else if (Q.getID() == BotQID && getWeakLeft(SU, false) == 0) {
     for (const SDep &SI : SU->Succs) {
       if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() &&
           SI.getLatency() == 0 &&

diff --git a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
index e26e9ab..3248c6a 100644
--- a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h

@@ -98,6 +98,7 @@
   void schedule() override;
 
   RegisterClassInfo *getRegClassInfo() { return RegClassInfo; }
+  int getBBSize() { return BB->size(); }
 };
 
 //===----------------------------------------------------------------------===//
@@ -143,6 +144,7 @@
 
     unsigned CurrCycle = 0;
     unsigned IssueCount = 0;
+    unsigned CriticalPathLength = 0;
 
     /// MinReadyCycle - Cycle of the soonest available instruction.
     unsigned MinReadyCycle = std::numeric_limits<unsigned>::max();
@@ -166,6 +168,25 @@
       SchedModel = smodel;
       CurrCycle = 0;
       IssueCount = 0;
+      // Initialize the critical path length limit, which used by the scheduling
+      // cost model to determine the value for scheduling an instruction. We use
+      // a slightly different heuristic for small and large functions. For small
+      // functions, it's important to use the height/depth of the instruction.
+      // For large functions, prioritizing by height or depth increases spills.
+      CriticalPathLength = DAG->getBBSize() / SchedModel->getIssueWidth();
+      if (DAG->getBBSize() < 50)
+        // We divide by two as a cheap and simple heuristic to reduce the
+        // critcal path length, which increases the priority of using the graph
+        // height/depth in the scheduler's cost computation.
+        CriticalPathLength >>= 1;
+      else {
+        // For large basic blocks, we prefer a larger critical path length to
+        // decrease the priority of using the graph height/depth.
+        unsigned MaxPath = 0;
+        for (auto &SU : DAG->SUnits)
+          MaxPath = std::max(MaxPath, isTop() ? SU.getHeight() : SU.getDepth());
+        CriticalPathLength = std::max(CriticalPathLength, MaxPath) + 1;
+      }
     }
 
     bool isTop() const {
@@ -185,6 +206,13 @@
     void removeReady(SUnit *SU);
 
     SUnit *pickOnlyChoice();
+ 
+    bool isLatencyBound(SUnit *SU) {
+      if (CurrCycle >= CriticalPathLength)
+        return true;
+      unsigned PathLength = isTop() ? SU->getHeight() : SU->getDepth();
+      return CriticalPathLength - CurrCycle <= PathLength;
+    }
   };
 
   VLIWMachineScheduler *DAG = nullptr;
commit	5ffd808a27ec8fe211b86310f74a3e3a5c8c1950	[log] [tgz]
author	Krzysztof Parzyszek <kparzysz@codeaurora.org>	Tue Mar 20 14:54:01 2018 +0000
committer	Krzysztof Parzyszek <kparzysz@codeaurora.org>	Tue Mar 20 14:54:01 2018 +0000
tree	098d4cbee7ba861221751b5d33c4482f36b250d8
parent	ad991868c1c4d4e778f41ebccc1b4ad8ce978d31 [diff]