Re-land r329156 "Add llvm-exegesis tool."

Fixed to depend on and initialize the native target instead of X86.

llvm-svn: 329169
diff --git a/llvm/tools/llvm-exegesis/lib/Uops.cpp b/llvm/tools/llvm-exegesis/lib/Uops.cpp
new file mode 100644
index 0000000..bcfc557
--- /dev/null
+++ b/llvm/tools/llvm-exegesis/lib/Uops.cpp
@@ -0,0 +1,248 @@
+//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Uops.h"
+#include "BenchmarkResult.h"
+#include "InstructionSnippetGenerator.h"
+#include "PerfHelper.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <random>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace exegesis {
+
+// FIXME: Handle memory (see PR36906)
+static bool isInvalidOperand(const llvm::MCOperandInfo &OpInfo) {
+  switch (OpInfo.OperandType) {
+  default:
+    return true;
+  case llvm::MCOI::OPERAND_IMMEDIATE:
+  case llvm::MCOI::OPERAND_REGISTER:
+    return false;
+  }
+}
+
+static llvm::Error makeError(llvm::Twine Msg) {
+  return llvm::make_error<llvm::StringError>(Msg,
+                                             llvm::inconvertibleErrorCode());
+}
+
+// FIXME: Read the counter names from the ProcResourceUnits when PR36984 is
+// fixed.
+static const std::string *getEventNameFromProcResName(const char *ProcResName) {
+  static const std::unordered_map<std::string, std::string> Entries = {
+      {"SBPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
+      {"SBPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
+      {"SBPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
+      {"SBPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
+      {"HWPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
+      {"HWPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
+      {"HWPort2", "UOPS_DISPATCHED_PORT:PORT_2"},
+      {"HWPort3", "UOPS_DISPATCHED_PORT:PORT_3"},
+      {"HWPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
+      {"HWPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
+      {"HWPort6", "UOPS_DISPATCHED_PORT:PORT_6"},
+      {"HWPort7", "UOPS_DISPATCHED_PORT:PORT_7"},
+      {"SKLPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
+      {"SKLPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
+      {"SKLPort2", "UOPS_DISPATCHED_PORT:PORT_2"},
+      {"SKLPort3", "UOPS_DISPATCHED_PORT:PORT_3"},
+      {"SKLPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
+      {"SKLPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
+      {"SKLPort6", "UOPS_DISPATCHED_PORT:PORT_6"},
+      {"SKXPort7", "UOPS_DISPATCHED_PORT:PORT_7"},
+      {"SKXPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
+      {"SKXPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
+      {"SKXPort2", "UOPS_DISPATCHED_PORT:PORT_2"},
+      {"SKXPort3", "UOPS_DISPATCHED_PORT:PORT_3"},
+      {"SKXPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
+      {"SKXPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
+      {"SKXPort6", "UOPS_DISPATCHED_PORT:PORT_6"},
+      {"SKXPort7", "UOPS_DISPATCHED_PORT:PORT_7"},
+  };
+  const auto It = Entries.find(ProcResName);
+  return It == Entries.end() ? nullptr : &It->second;
+}
+
+static std::vector<llvm::MCInst> generateIndependentAssignments(
+    const LLVMState &State, const llvm::MCInstrDesc &InstrDesc,
+    llvm::SmallVector<Variable, 8> Vars, int MaxAssignments) {
+  std::unordered_set<llvm::MCPhysReg> IsUsedByAnyVar;
+  for (const Variable &Var : Vars) {
+    if (Var.IsUse) {
+      IsUsedByAnyVar.insert(Var.PossibleRegisters.begin(),
+                            Var.PossibleRegisters.end());
+    }
+  }
+
+  std::vector<llvm::MCInst> Pattern;
+  for (int A = 0; A < MaxAssignments; ++A) {
+    // FIXME: This is a bit pessimistic. We should get away with an
+    // assignment that ensures that the set of assigned registers for uses and
+    // the set of assigned registers for defs do not intersect (registers
+    // for uses (resp defs) do not have to be all distinct).
+    const std::vector<llvm::MCPhysReg> Regs = getExclusiveAssignment(Vars);
+    if (Regs.empty())
+      break;
+    // Remove all assigned registers defs that are used by at least one other
+    // variable from the list of possible variable registers. This ensures that
+    // we never create a RAW hazard that would lead to serialization.
+    for (size_t I = 0, E = Vars.size(); I < E; ++I) {
+      llvm::MCPhysReg Reg = Regs[I];
+      if (Vars[I].IsDef && IsUsedByAnyVar.count(Reg)) {
+        Vars[I].PossibleRegisters.remove(Reg);
+      }
+    }
+    // Create an MCInst and check assembly.
+    llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
+    if (!State.canAssemble(Inst))
+      continue;
+    Pattern.push_back(std::move(Inst));
+  }
+  return Pattern;
+}
+
+UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
+
+const char *UopsBenchmarkRunner::getDisplayName() const { return "uops"; }
+
+llvm::Expected<std::vector<llvm::MCInst>> UopsBenchmarkRunner::createCode(
+    const LLVMState &State, const unsigned OpcodeIndex,
+    const unsigned NumRepetitions, const JitFunctionContext &Context) const {
+  const auto &InstrInfo = State.getInstrInfo();
+  const auto &RegInfo = State.getRegInfo();
+  const llvm::MCInstrDesc &InstrDesc = InstrInfo.get(OpcodeIndex);
+  for (const llvm::MCOperandInfo &OpInfo : InstrDesc.operands()) {
+    if (isInvalidOperand(OpInfo))
+      return makeError("Only registers and immediates are supported");
+  }
+
+  // FIXME: Load constants into registers (e.g. with fld1) to not break
+  // instructions like x87.
+
+  // Ideally we would like the only limitation on executing uops to be the issue
+  // ports. Maximizing port pressure increases the likelihood that the load is
+  // distributed evenly across possible ports.
+
+  // To achieve that, one approach is to generate instructions that do not have
+  // data dependencies between them.
+  //
+  // For some instructions, this is trivial:
+  //    mov rax, qword ptr [rsi]
+  //    mov rax, qword ptr [rsi]
+  //    mov rax, qword ptr [rsi]
+  //    mov rax, qword ptr [rsi]
+  // For the above snippet, haswell just renames rax four times and executes the
+  // four instructions two at a time on P23 and P0126.
+  //
+  // For some instructions, we just need to make sure that the source is
+  // different from the destination. For example, IDIV8r reads from GPR and
+  // writes to AX. We just need to ensure that the variable is assigned a
+  // register which is different from AX:
+  //    idiv bx
+  //    idiv bx
+  //    idiv bx
+  //    idiv bx
+  // The above snippet will be able to fully saturate the ports, while the same
+  // with ax would issue one uop every `latency(IDIV8r)` cycles.
+  //
+  // Some instructions make this harder because they both read and write from
+  // the same register:
+  //    inc rax
+  //    inc rax
+  //    inc rax
+  //    inc rax
+  // This has a data dependency from each instruction to the next, limit the
+  // number of instructions that can be issued in parallel.
+  // It turns out that this is not a big issue on recent Intel CPUs because they
+  // have heuristics to balance port pressure. In the snippet above, subsequent
+  // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
+  // might end up executing them all on P0 (just because they can), or try
+  // avoiding P5 because it's usually under high pressure from vector
+  // instructions.
+  // This issue is even more important for high-latency instructions because
+  // they increase the idle time of the CPU, e.g. :
+  //    imul rax, rbx
+  //    imul rax, rbx
+  //    imul rax, rbx
+  //    imul rax, rbx
+  //
+  // To avoid that, we do the renaming statically by generating as many
+  // independent exclusive assignments as possible (until all possible registers
+  // are exhausted) e.g.:
+  //    imul rax, rbx
+  //    imul rcx, rbx
+  //    imul rdx, rbx
+  //    imul r8,  rbx
+  //
+  // Some instruction even make the above static renaming impossible because
+  // they implicitly read and write from the same operand, e.g. ADC16rr reads
+  // and writes from EFLAGS.
+  // In that case we just use a greedy register assignment and hope for the
+  // best.
+
+  const auto Vars = getVariables(RegInfo, InstrDesc, Context.getReservedRegs());
+
+  // Generate as many independent exclusive assignments as possible.
+  constexpr const int MaxStaticRenames = 20;
+  std::vector<llvm::MCInst> Pattern =
+      generateIndependentAssignments(State, InstrDesc, Vars, MaxStaticRenames);
+  if (Pattern.empty()) {
+    // We don't even have a single exclusive assignment, fallback to a greedy
+    // assignment.
+    // FIXME: Tell the user about this decision to help debugging.
+    const std::vector<llvm::MCPhysReg> Regs = getGreedyAssignment(Vars);
+    if (!Vars.empty() && Regs.empty())
+      return makeError("No feasible greedy assignment");
+    llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
+    if (!State.canAssemble(Inst))
+      return makeError("Cannot assemble greedy assignment");
+    Pattern.push_back(std::move(Inst));
+  }
+
+  // Generate repetitions of the pattern until benchmark_iterations is reached.
+  std::vector<llvm::MCInst> Result;
+  Result.reserve(NumRepetitions);
+  for (unsigned I = 0; I < NumRepetitions; ++I)
+    Result.push_back(Pattern[I % Pattern.size()]);
+  return Result;
+}
+
+std::vector<BenchmarkMeasure>
+UopsBenchmarkRunner::runMeasurements(const LLVMState &State,
+                                     const JitFunction &Function,
+                                     const unsigned NumRepetitions) const {
+  const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
+
+  std::vector<BenchmarkMeasure> Result;
+  for (unsigned ProcResIdx = 1;
+       ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
+    const llvm::MCProcResourceDesc &ProcRes =
+        *SchedModel.getProcResource(ProcResIdx);
+    const std::string *const EventName =
+        getEventNameFromProcResName(ProcRes.Name);
+    if (!EventName)
+      continue;
+    pfm::Counter Counter{pfm::PerfEvent(*EventName)};
+    Counter.start();
+    Function();
+    Counter.stop();
+    Result.push_back({llvm::itostr(ProcResIdx),
+                      static_cast<double>(Counter.read()) / NumRepetitions,
+                      ProcRes.Name});
+  }
+  return Result;
+}
+
+} // namespace exegesis