[llvm-mca] Print the "Block RThroughput" in the SummaryView.

This patch implements the "block reciprocal throughput" computation in the
SummaryView.

The block reciprocal throughput is computed as the MAX of:
  - NumMicroOps / DispatchWidth
  - Resource Cycles / #Units   (for every resource consumed).

The block throughput is bounded from above by the hardware dispatch throughput.
That is because the DispatchWidth is an upper bound on how many micro opcodes
can be part of a single dispatch group.

The block throughput is also limited by the amount of hardware parallelism. The
number of available resource units affects how the resource pressure is
distributed, and also how many blocks can be delivered every cycle.

llvm-svn: 333095
diff --git a/llvm/tools/llvm-mca/SummaryView.cpp b/llvm/tools/llvm-mca/SummaryView.cpp
index 511727b..9b6e1d9 100644
--- a/llvm/tools/llvm-mca/SummaryView.cpp
+++ b/llvm/tools/llvm-mca/SummaryView.cpp
@@ -14,6 +14,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "SummaryView.h"
+#include "Support.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Format.h"
 
 namespace mca {
@@ -22,19 +24,83 @@
 
 using namespace llvm;
 
+void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) {
+  // We are only interested in the "instruction dispatched" events generated by
+  // the dispatch stage for instructions that are part of iteration #0.
+  if (Event.Type != HWInstructionEvent::Dispatched)
+    return;
+
+  // Source indices at or beyond Source.size() fall outside iteration #0, so
+  // those events must not contribute to the per-block totals below.
+  if (Event.IR.getSourceIndex() >= Source.size())
+    return;
+
+  // Update the cumulative number of resource cycles based on the processor
+  // resource usage information available from the instruction descriptor. We
+  // need to compute the cumulative number of resource cycles for every
+  // processor resource which is consumed by an instruction of the block.
+  const Instruction &Inst = *Event.IR.getInstruction();
+  const InstrDesc &Desc = Inst.getDesc();
+  NumMicroOps += Desc.NumMicroOps;
+  for (const std::pair<uint64_t, const ResourceUsage> &RU : Desc.Resources) {
+    // Nothing to accumulate for a resource with a zero-sized usage.
+    if (!RU.second.size())
+      continue;
+
+    assert(RU.second.NumUnits && "Expected at least one unit used!");
+    // operator[] value-initializes the entry of a resource not seen before,
+    // so first and subsequent uses are accumulated with a single lookup.
+    ProcResourceUsage[RU.first] += RU.second.size();
+  }
+}
+
+double SummaryView::getBlockRThroughput() const {
+  assert(NumMicroOps && "Expected at least one micro opcode!");
+
+  SmallVector<uint64_t, 8> Masks(SM.getNumProcResourceKinds());
+  computeProcResourceMasks(SM, Masks);
+
+  // The block throughput is bounded from above by the hardware dispatch
+  // throughput. That is because the DispatchWidth is an upper bound on the
+  // number of micro opcodes that can be part of a single dispatch group.
+  double Max = static_cast<double>(NumMicroOps) / DispatchWidth;
+
+  // The block throughput is also limited by the amount of hardware parallelism.
+  // The number of available resource units affects the resource pressure
+  // distribution, as well as how many blocks can be executed every cycle.
+  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    uint64_t Mask = Masks[I]; // ProcResourceUsage is keyed by resource mask.
+    const auto It = ProcResourceUsage.find_as(Mask);
+    if (It != ProcResourceUsage.end()) {
+      const MCProcResourceDesc &MCDesc = *SM.getProcResource(I);
+      unsigned NumUnits = MCDesc.NumUnits;
+      double Throughput = static_cast<double>(It->second) / NumUnits;
+      Max = std::max(Max, Throughput);
+    }
+  }
+
+  // The block reciprocal throughput is computed as the MAX of:
+  //  -  (#uOps / DispatchWidth)
+  //  -  (resource cycles / #units) for every consumed processor resource.
+  return Max;
+}
+
 void SummaryView::printView(raw_ostream &OS) const {
   unsigned Iterations = Source.getNumIterations();
   unsigned Instructions = Source.size();
   unsigned TotalInstructions = Instructions * Iterations;
   double IPC = (double)TotalInstructions / TotalCycles;
+  double BlockRThroughput = getBlockRThroughput();
 
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
-  TempStream << "Iterations:     " << Iterations;
-  TempStream << "\nInstructions:   " << TotalInstructions;
-  TempStream << "\nTotal Cycles:   " << TotalCycles;
-  TempStream << "\nDispatch Width: " << DispatchWidth;
-  TempStream << "\nIPC:            " << format("%.2f", IPC) << '\n';
+  TempStream << "Iterations:        " << Iterations;
+  TempStream << "\nInstructions:      " << TotalInstructions;
+  TempStream << "\nTotal Cycles:      " << TotalCycles;
+  TempStream << "\nDispatch Width:    " << DispatchWidth;
+  TempStream << "\nIPC:               " << format("%.2f", IPC);
+  TempStream << "\nBlock RThroughput: " << format("%.1f", BlockRThroughput)
+             << '\n'; // One fractional digit; IPC above uses two.
   TempStream.flush();
   OS << Buffer;
 }