[llvm-mca] Introduce the llvm-mca library and organize the directory accordingly. NFC.

Summary:
This patch introduces llvm-mca as a library.  The driver (llvm-mca.cpp), the views, and the stats are not part of the library;
they are separate components that are not required for the core of llvm-mca to function.

The directory has been organized as follows:
All library source files now reside in:
  - `lib/HardwareUnits/` - All subclasses of HardwareUnit (these represent the simulated hardware components of a backend).
      (LSUnit does not inherit from HardwareUnit, but the Scheduler, which does, uses LSUnit.)
  - `lib/Stages/` - All subclasses of Stage (the pipeline stages).
  - `lib/` - The root of the library; it contains library code that does not fit into the `Stages/` or `HardwareUnits/` subdirectories.

All library header files now reside in the `include` directory and mimic the same layout as the `lib` directory mentioned above.

In the (near) future we would like to move the library contents (include and lib) out of tools and into the core of llvm.
That change would allow various analysis and optimization passes to make use of MCA functionality for things like cost modeling.
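
For illustration, here is a minimal, hedged sketch of how such a client (say, a cost-modeling analysis or an out-of-tree tool) might drive the library through the Context interface added in this patch. The PipelineOptions field names and the createDefaultPipeline() signature come from Context.cpp below; the Context and PipelineOptions constructor shapes, and the final step that runs the pipeline, are assumptions about API that is not shown in this diff.

```cpp
// Sketch only -- not part of this patch.  Assumes a Context constructible from
// MCRegisterInfo/MCSubtargetInfo, a PipelineOptions constructor taking the
// fields referenced in Context.cpp, and an entry point on Pipeline that steps
// the simulation (named run() here as a placeholder).
#include "Context.h"
#include "InstrBuilder.h"
#include "Pipeline.h"
#include "SourceMgr.h"
#include <memory>

static void simulateBlock(const llvm::MCSubtargetInfo &STI,
                          const llvm::MCRegisterInfo &MRI,
                          mca::InstrBuilder &IB, mca::SourceMgr &Insts) {
  // These are the options consumed by Context::createDefaultPipeline(); the
  // constructor form is an assumption.
  mca::PipelineOptions Opts(/*DispatchWidth=*/4, /*RegisterFileSize=*/0,
                            /*LoadQueueSize=*/16, /*StoreQueueSize=*/16,
                            /*AssumeNoAlias=*/false);
  mca::Context Ctx(MRI, STI); // constructor shape assumed
  std::unique_ptr<mca::Pipeline> P =
      Ctx.createDefaultPipeline(Opts, IB, Insts);
  // A client would attach HWEventListener-derived views to P here, then
  // advance the pipeline until every instruction in Insts has retired.
  P->run(); // assumed driver entry point; not part of this diff
}
```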

I left all of the non-library code where it has always been, in the root of the llvm-mca directory.
The include directives in the non-library source files have been updated to refer to the llvm-mca library headers.
I updated the llvm-mca/CMakeLists.txt file to include the library headers, and made the non-library code
explicitly reference the library's 'include' directory.  Once we eventually (hopefully) migrate the MCA library
components into llvm, the include directives used by the non-library source files will be updated to point to the
proper location in llvm.
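
As a rough sketch of that wiring (assuming the usual LLVM tool CMake conventions; apart from the LLVMMCA target and the lib/include layout, the names below are hypothetical and the actual tool-level CMakeLists.txt may differ):

```cmake
# Hypothetical tool-level CMakeLists.txt fragment, not part of this patch.
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
add_subdirectory(lib)

add_llvm_tool(llvm-mca
  llvm-mca.cpp
  # ... the views and stats sources remain here, outside the library ...
  )
target_link_libraries(llvm-mca PRIVATE LLVMMCA)
```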

Reviewers: andreadb, courbet, RKSimon

Reviewed By: andreadb

Subscribers: mgorny, javed.absar, tschuett, gbedwell, llvm-commits

Differential Revision: https://reviews.llvm.org/D50929

llvm-svn: 340755
diff --git a/llvm/tools/llvm-mca/lib/CMakeLists.txt b/llvm/tools/llvm-mca/lib/CMakeLists.txt
new file mode 100644
index 0000000..df9ed32
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/CMakeLists.txt
@@ -0,0 +1,33 @@
+include_directories(${LLVM_MCA_SOURCE_DIR}/include)
+
+add_library(LLVMMCA
+  STATIC
+  Context.cpp
+  HWEventListener.cpp
+  HardwareUnits/HardwareUnit.cpp
+  HardwareUnits/LSUnit.cpp
+  HardwareUnits/RegisterFile.cpp
+  HardwareUnits/ResourceManager.cpp
+  HardwareUnits/RetireControlUnit.cpp
+  HardwareUnits/Scheduler.cpp
+  InstrBuilder.cpp
+  Instruction.cpp
+  Pipeline.cpp
+  Stages/DispatchStage.cpp
+  Stages/ExecuteStage.cpp
+  Stages/FetchStage.cpp
+  Stages/InstructionTables.cpp
+  Stages/RetireStage.cpp
+  Stages/Stage.cpp
+  Support.cpp
+  )
+
+llvm_update_compile_flags(LLVMMCA)
+llvm_map_components_to_libnames(libs
+  CodeGen
+  MC
+  Support
+  )
+
+target_link_libraries(LLVMMCA ${libs})
+set_target_properties(LLVMMCA PROPERTIES FOLDER "Libraries")
diff --git a/llvm/tools/llvm-mca/lib/Context.cpp b/llvm/tools/llvm-mca/lib/Context.cpp
new file mode 100644
index 0000000..31c09dc
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/Context.cpp
@@ -0,0 +1,65 @@
+//===---------------------------- Context.cpp -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a class for holding ownership of various simulated
+/// hardware units.  A Context also provides a utility routine for constructing
+/// a default out-of-order pipeline with fetch, dispatch, execute, and retire
+/// stages.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Context.h"
+#include "HardwareUnits/RegisterFile.h"
+#include "HardwareUnits/RetireControlUnit.h"
+#include "HardwareUnits/Scheduler.h"
+#include "Stages/DispatchStage.h"
+#include "Stages/ExecuteStage.h"
+#include "Stages/FetchStage.h"
+#include "Stages/RetireStage.h"
+
+namespace mca {
+
+using namespace llvm;
+
+std::unique_ptr<Pipeline>
+Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
+                               SourceMgr &SrcMgr) {
+  const MCSchedModel &SM = STI.getSchedModel();
+
+  // Create the hardware units defining the backend.
+  auto RCU = llvm::make_unique<RetireControlUnit>(SM);
+  auto PRF = llvm::make_unique<RegisterFile>(SM, MRI, Opts.RegisterFileSize);
+  auto LSU = llvm::make_unique<LSUnit>(Opts.LoadQueueSize, Opts.StoreQueueSize,
+                                       Opts.AssumeNoAlias);
+  auto HWS = llvm::make_unique<Scheduler>(SM, LSU.get());
+
+  // Create the pipeline and its stages.
+  auto StagePipeline = llvm::make_unique<Pipeline>();
+  auto Fetch = llvm::make_unique<FetchStage>(IB, SrcMgr);
+  auto Dispatch = llvm::make_unique<DispatchStage>(
+      STI, MRI, Opts.RegisterFileSize, Opts.DispatchWidth, *RCU, *PRF);
+  auto Execute = llvm::make_unique<ExecuteStage>(*HWS);
+  auto Retire = llvm::make_unique<RetireStage>(*RCU, *PRF);
+
+  // Pass the ownership of all the hardware units to this Context.
+  addHardwareUnit(std::move(RCU));
+  addHardwareUnit(std::move(PRF));
+  addHardwareUnit(std::move(LSU));
+  addHardwareUnit(std::move(HWS));
+
+  // Build the pipeline.
+  StagePipeline->appendStage(std::move(Fetch));
+  StagePipeline->appendStage(std::move(Dispatch));
+  StagePipeline->appendStage(std::move(Execute));
+  StagePipeline->appendStage(std::move(Retire));
+  return StagePipeline;
+}
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/HWEventListener.cpp b/llvm/tools/llvm-mca/lib/HWEventListener.cpp
new file mode 100644
index 0000000..f27a04a
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/HWEventListener.cpp
@@ -0,0 +1,21 @@
+//===----------------------- HWEventListener.cpp ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a vtable anchor for class HWEventListener.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HWEventListener.h"
+
+namespace mca {
+
+// Anchor the vtable here.
+void HWEventListener::anchor() {}
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp b/llvm/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp
new file mode 100644
index 0000000..daeda06
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp
@@ -0,0 +1,23 @@
+//===------------------------- HardwareUnit.cpp -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the anchor for the base class that describes
+/// simulated hardware units.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HardwareUnits/HardwareUnit.h"
+
+namespace mca {
+
+// Pin the vtable with this method.
+HardwareUnit::~HardwareUnit() = default;
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
new file mode 100644
index 0000000..b845c5b
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
@@ -0,0 +1,156 @@
+//===----------------------- LSUnit.cpp --------------------------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A Load-Store Unit for the llvm-mca tool.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HardwareUnits/LSUnit.h"
+#include "Instruction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+#ifndef NDEBUG
+void LSUnit::dump() const {
+  dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
+  dbgs() << "[LSUnit] SQ_Size = " << SQ_Size << '\n';
+  dbgs() << "[LSUnit] NextLQSlotIdx = " << LoadQueue.size() << '\n';
+  dbgs() << "[LSUnit] NextSQSlotIdx = " << StoreQueue.size() << '\n';
+}
+#endif
+
+void LSUnit::assignLQSlot(unsigned Index) {
+  assert(!isLQFull());
+  assert(LoadQueue.count(Index) == 0);
+
+  LLVM_DEBUG(dbgs() << "[LSUnit] - AssignLQSlot <Idx=" << Index
+                    << ",slot=" << LoadQueue.size() << ">\n");
+  LoadQueue.insert(Index);
+}
+
+void LSUnit::assignSQSlot(unsigned Index) {
+  assert(!isSQFull());
+  assert(StoreQueue.count(Index) == 0);
+
+  LLVM_DEBUG(dbgs() << "[LSUnit] - AssignSQSlot <Idx=" << Index
+                    << ",slot=" << StoreQueue.size() << ">\n");
+  StoreQueue.insert(Index);
+}
+
+void LSUnit::dispatch(const InstRef &IR) {
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  unsigned IsMemBarrier = Desc.HasSideEffects;
+  assert((Desc.MayLoad || Desc.MayStore) && "Not a memory operation!");
+
+  const unsigned Index = IR.getSourceIndex();
+  if (Desc.MayLoad) {
+    if (IsMemBarrier)
+      LoadBarriers.insert(Index);
+    assignLQSlot(Index);
+  }
+
+  if (Desc.MayStore) {
+    if (IsMemBarrier)
+      StoreBarriers.insert(Index);
+    assignSQSlot(Index);
+  }
+}
+
+LSUnit::Status LSUnit::isAvailable(const InstRef &IR) const {
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  if (Desc.MayLoad && isLQFull())
+    return LSUnit::LSU_LQUEUE_FULL;
+  if (Desc.MayStore && isSQFull())
+    return LSUnit::LSU_SQUEUE_FULL;
+  return LSUnit::LSU_AVAILABLE;
+}
+
+bool LSUnit::isReady(const InstRef &IR) const {
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  const unsigned Index = IR.getSourceIndex();
+  bool IsALoad = Desc.MayLoad;
+  bool IsAStore = Desc.MayStore;
+  assert((IsALoad || IsAStore) && "Not a memory operation!");
+  assert((!IsALoad || LoadQueue.count(Index) == 1) && "Load not in queue!");
+  assert((!IsAStore || StoreQueue.count(Index) == 1) && "Store not in queue!");
+
+  if (IsALoad && !LoadBarriers.empty()) {
+    unsigned LoadBarrierIndex = *LoadBarriers.begin();
+    if (Index > LoadBarrierIndex)
+      return false;
+    if (Index == LoadBarrierIndex && Index != *LoadQueue.begin())
+      return false;
+  }
+
+  if (IsAStore && !StoreBarriers.empty()) {
+    unsigned StoreBarrierIndex = *StoreBarriers.begin();
+    if (Index > StoreBarrierIndex)
+      return false;
+    if (Index == StoreBarrierIndex && Index != *StoreQueue.begin())
+      return false;
+  }
+
+  if (NoAlias && IsALoad)
+    return true;
+
+  if (StoreQueue.size()) {
+    // Check if this memory operation is younger than the oldest store.
+    if (Index > *StoreQueue.begin())
+      return false;
+  }
+
+  // Okay, we are older than the oldest store in the queue.
+  // If there are no pending loads, then we can say for sure that this
+  // instruction is ready.
+  if (isLQEmpty())
+    return true;
+
+  // Check if there are no older loads.
+  if (Index <= *LoadQueue.begin())
+    return true;
+
+  // There is at least one younger load.
+  return !IsAStore;
+}
+
+void LSUnit::onInstructionExecuted(const InstRef &IR) {
+  const unsigned Index = IR.getSourceIndex();
+  std::set<unsigned>::iterator it = LoadQueue.find(Index);
+  if (it != LoadQueue.end()) {
+    LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+                      << " has been removed from the load queue.\n");
+    LoadQueue.erase(it);
+  }
+
+  it = StoreQueue.find(Index);
+  if (it != StoreQueue.end()) {
+    LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+                      << " has been removed from the store queue.\n");
+    StoreQueue.erase(it);
+  }
+
+  if (!StoreBarriers.empty() && Index == *StoreBarriers.begin()) {
+    LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+                      << " has been removed from the set of store barriers.\n");
+    StoreBarriers.erase(StoreBarriers.begin());
+  }
+  if (!LoadBarriers.empty() && Index == *LoadBarriers.begin()) {
+    LLVM_DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+                      << " has been removed from the set of load barriers.\n");
+    LoadBarriers.erase(LoadBarriers.begin());
+  }
+}
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/llvm/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
new file mode 100644
index 0000000..081fd3b
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
@@ -0,0 +1,350 @@
+//===--------------------- RegisterFile.cpp ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the RegisterFile class, which is responsible for
+/// managing hardware register files and for tracking data dependencies
+/// between registers.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HardwareUnits/RegisterFile.h"
+#include "Instruction.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+RegisterFile::RegisterFile(const llvm::MCSchedModel &SM,
+                           const llvm::MCRegisterInfo &mri, unsigned NumRegs)
+    : MRI(mri), RegisterMappings(mri.getNumRegs(),
+                                 {WriteRef(), {IndexPlusCostPairTy(0, 1), 0}}) {
+  initialize(SM, NumRegs);
+}
+
+void RegisterFile::initialize(const MCSchedModel &SM, unsigned NumRegs) {
+  // Create a default register file that "sees" all the machine registers
+  // declared by the target. The number of physical registers in the default
+  // register file is set equal to `NumRegs`. A value of zero for `NumRegs`
+  // means: this register file has an unbounded number of physical registers.
+  addRegisterFile({} /* all registers */, NumRegs);
+  if (!SM.hasExtraProcessorInfo())
+    return;
+
+  // For each user defined register file, allocate a RegisterMappingTracker
+  // object. The size of every register file, as well as the mapping between
+  // register files and register classes is specified via tablegen.
+  const MCExtraProcessorInfo &Info = SM.getExtraProcessorInfo();
+  for (unsigned I = 0, E = Info.NumRegisterFiles; I < E; ++I) {
+    const MCRegisterFileDesc &RF = Info.RegisterFiles[I];
+    // Skip invalid register files with zero physical registers.
+    unsigned Length = RF.NumRegisterCostEntries;
+    if (!RF.NumPhysRegs)
+      continue;
+    // The cost of a register definition is equivalent to the number of
+    // physical registers that are allocated at register renaming stage.
+    const MCRegisterCostEntry *FirstElt =
+        &Info.RegisterCostTable[RF.RegisterCostEntryIdx];
+    addRegisterFile(ArrayRef<MCRegisterCostEntry>(FirstElt, Length),
+                    RF.NumPhysRegs);
+  }
+}
+
+void RegisterFile::addRegisterFile(ArrayRef<MCRegisterCostEntry> Entries,
+                                   unsigned NumPhysRegs) {
+  // A default register file is always allocated at index #0. That register file
+  // is mainly used to count the total number of mappings created by all
+  // register files at runtime. Users can limit the number of available physical
+  // registers in register file #0 through the command line flag
+  // `-register-file-size`.
+  unsigned RegisterFileIndex = RegisterFiles.size();
+  RegisterFiles.emplace_back(NumPhysRegs);
+
+  // Special case where there is no register class identifier in the set.
+  // An empty set of register classes means: this register file contains all
+  // the physical registers specified by the target.
+  // We optimistically assume that a register can be renamed at the cost of a
+  // single physical register. The constructor of RegisterFile ensures that
+  // a RegisterMapping exists for each logical register defined by the Target.
+  if (Entries.empty())
+    return;
+
+  // Now update the cost of individual registers.
+  for (const MCRegisterCostEntry &RCE : Entries) {
+    const MCRegisterClass &RC = MRI.getRegClass(RCE.RegisterClassID);
+    for (const MCPhysReg Reg : RC) {
+      RegisterRenamingInfo &Entry = RegisterMappings[Reg].second;
+      IndexPlusCostPairTy &IPC = Entry.IndexPlusCost;
+      if (IPC.first && IPC.first != RegisterFileIndex) {
+        // The only register file that is allowed to overlap is the default
+        // register file at index #0. The analysis is inaccurate if register
+        // files overlap.
+        errs() << "warning: register " << MRI.getName(Reg)
+               << " defined in multiple register files.";
+      }
+      IPC = std::make_pair(RegisterFileIndex, RCE.Cost);
+      Entry.RenameAs = Reg;
+
+      // Assume the same cost for each sub-register.
+      for (MCSubRegIterator I(Reg, &MRI); I.isValid(); ++I) {
+        RegisterRenamingInfo &OtherEntry = RegisterMappings[*I].second;
+        if (!OtherEntry.IndexPlusCost.first &&
+            (!OtherEntry.RenameAs ||
+             MRI.isSuperRegister(*I, OtherEntry.RenameAs))) {
+          OtherEntry.IndexPlusCost = IPC;
+          OtherEntry.RenameAs = Reg;
+        }
+      }
+    }
+  }
+}
+
+void RegisterFile::allocatePhysRegs(const RegisterRenamingInfo &Entry,
+                                    MutableArrayRef<unsigned> UsedPhysRegs) {
+  unsigned RegisterFileIndex = Entry.IndexPlusCost.first;
+  unsigned Cost = Entry.IndexPlusCost.second;
+  if (RegisterFileIndex) {
+    RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex];
+    RMT.NumUsedPhysRegs += Cost;
+    UsedPhysRegs[RegisterFileIndex] += Cost;
+  }
+
+  // Now update the default register mapping tracker.
+  RegisterFiles[0].NumUsedPhysRegs += Cost;
+  UsedPhysRegs[0] += Cost;
+}
+
+void RegisterFile::freePhysRegs(const RegisterRenamingInfo &Entry,
+                                MutableArrayRef<unsigned> FreedPhysRegs) {
+  unsigned RegisterFileIndex = Entry.IndexPlusCost.first;
+  unsigned Cost = Entry.IndexPlusCost.second;
+  if (RegisterFileIndex) {
+    RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex];
+    RMT.NumUsedPhysRegs -= Cost;
+    FreedPhysRegs[RegisterFileIndex] += Cost;
+  }
+
+  // Now update the default register mapping tracker.
+  RegisterFiles[0].NumUsedPhysRegs -= Cost;
+  FreedPhysRegs[0] += Cost;
+}
+
+void RegisterFile::addRegisterWrite(WriteRef Write,
+                                    MutableArrayRef<unsigned> UsedPhysRegs,
+                                    bool ShouldAllocatePhysRegs) {
+  WriteState &WS = *Write.getWriteState();
+  unsigned RegID = WS.getRegisterID();
+  assert(RegID && "Adding an invalid register definition?");
+
+  LLVM_DEBUG({
+    dbgs() << "RegisterFile: addRegisterWrite [ " << Write.getSourceIndex()
+           << ", " << MRI.getName(RegID) << "]\n";
+  });
+
+  // If RenameAs is equal to RegID, then RegID is subject to register renaming
+  // and false dependencies on RegID are all eliminated.
+
+  // If RenameAs references the invalid register, then we optimistically assume
+  // that it can be renamed. In the absence of tablegen descriptors for register
+  // files, RenameAs is always set to the invalid register ID.  In all other
+  // cases, RenameAs must be either equal to RegID, or it must reference a
+  // super-register of RegID.
+
+  // If RenameAs is a super-register of RegID, then a write to RegID has always
+  // a false dependency on RenameAs. The only exception is for when the write
+  // implicitly clears the upper portion of the underlying register.
+  // If a write clears its super-registers, then it is renamed as `RenameAs`.
+  const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+  if (RRI.RenameAs && RRI.RenameAs != RegID) {
+    RegID = RRI.RenameAs;
+    WriteRef &OtherWrite = RegisterMappings[RegID].first;
+
+    if (!WS.clearsSuperRegisters()) {
+      // The processor keeps the definition of `RegID` together with register
+      // `RenameAs`. Since this partial write is not renamed, no physical
+      // register is allocated.
+      ShouldAllocatePhysRegs = false;
+
+      if (OtherWrite.getWriteState() &&
+          (OtherWrite.getSourceIndex() != Write.getSourceIndex())) {
+        // This partial write has a false dependency on RenameAs.
+        WS.setDependentWrite(OtherWrite.getWriteState());
+      }
+    }
+  }
+
+  // Update the mapping for register RegID including its sub-registers.
+  RegisterMappings[RegID].first = Write;
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    RegisterMappings[*I].first = Write;
+
+  // No physical registers are allocated for instructions that are optimized in
+  // hardware. For example, zero-latency data-dependency breaking instructions
+  // don't consume physical registers.
+  if (ShouldAllocatePhysRegs)
+    allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs);
+
+  if (!WS.clearsSuperRegisters())
+    return;
+
+  for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    RegisterMappings[*I].first = Write;
+}
+
+void RegisterFile::removeRegisterWrite(const WriteState &WS,
+                                       MutableArrayRef<unsigned> FreedPhysRegs,
+                                       bool ShouldFreePhysRegs) {
+  unsigned RegID = WS.getRegisterID();
+
+  assert(RegID != 0 && "Invalidating an already invalid register?");
+  assert(WS.getCyclesLeft() != UNKNOWN_CYCLES &&
+         "Invalidating a write of unknown cycles!");
+  assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!");
+
+  unsigned RenameAs = RegisterMappings[RegID].second.RenameAs;
+  if (RenameAs && RenameAs != RegID) {
+    RegID = RenameAs;
+
+    if (!WS.clearsSuperRegisters()) {
+      // Keep the definition of `RegID` together with register `RenameAs`.
+      ShouldFreePhysRegs = false;
+    }
+  }
+
+  if (ShouldFreePhysRegs)
+    freePhysRegs(RegisterMappings[RegID].second, FreedPhysRegs);
+
+  WriteRef &WR = RegisterMappings[RegID].first;
+  if (WR.getWriteState() == &WS)
+    WR.invalidate();
+
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
+    WriteRef &OtherWR = RegisterMappings[*I].first;
+    if (OtherWR.getWriteState() == &WS)
+      OtherWR.invalidate();
+  }
+
+  if (!WS.clearsSuperRegisters())
+    return;
+
+  for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) {
+    WriteRef &OtherWR = RegisterMappings[*I].first;
+    if (OtherWR.getWriteState() == &WS)
+      OtherWR.invalidate();
+  }
+}
+
+void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
+                                 unsigned RegID) const {
+  assert(RegID && RegID < RegisterMappings.size());
+  LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register "
+                    << MRI.getName(RegID) << '\n');
+  const WriteRef &WR = RegisterMappings[RegID].first;
+  if (WR.isValid())
+    Writes.push_back(WR);
+
+  // Handle potential partial register updates.
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
+    const WriteRef &WR = RegisterMappings[*I].first;
+    if (WR.isValid())
+      Writes.push_back(WR);
+  }
+
+  // Remove duplicate entries and resize the input vector.
+  llvm::sort(Writes.begin(), Writes.end(),
+             [](const WriteRef &Lhs, const WriteRef &Rhs) {
+               return Lhs.getWriteState() < Rhs.getWriteState();
+             });
+  auto It = std::unique(Writes.begin(), Writes.end());
+  Writes.resize(std::distance(Writes.begin(), It));
+
+  LLVM_DEBUG({
+    for (const WriteRef &WR : Writes) {
+      const WriteState &WS = *WR.getWriteState();
+      dbgs() << "[PRF] Found a dependent use of Register "
+             << MRI.getName(WS.getRegisterID()) << " (defined by instruction #"
+             << WR.getSourceIndex() << ")\n";
+    }
+  });
+}
+
+unsigned RegisterFile::isAvailable(ArrayRef<unsigned> Regs) const {
+  SmallVector<unsigned, 4> NumPhysRegs(getNumRegisterFiles());
+
+  // Find how many new mappings must be created for each register file.
+  for (const unsigned RegID : Regs) {
+    const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+    const IndexPlusCostPairTy &Entry = RRI.IndexPlusCost;
+    if (Entry.first)
+      NumPhysRegs[Entry.first] += Entry.second;
+    NumPhysRegs[0] += Entry.second;
+  }
+
+  unsigned Response = 0;
+  for (unsigned I = 0, E = getNumRegisterFiles(); I < E; ++I) {
+    unsigned NumRegs = NumPhysRegs[I];
+    if (!NumRegs)
+      continue;
+
+    const RegisterMappingTracker &RMT = RegisterFiles[I];
+    if (!RMT.NumPhysRegs) {
+      // The register file has an unbounded number of microarchitectural
+      // registers.
+      continue;
+    }
+
+    if (RMT.NumPhysRegs < NumRegs) {
+      // The current register file is too small. This may occur if the number of
+      // microarchitectural registers in register file #0 was changed by the
+      // users via flag -reg-file-size. Alternatively, the scheduling model
+      // specified a too small number of registers for this register file.
+      LLVM_DEBUG(dbgs() << "Not enough registers in the register file.\n");
+
+      // FIXME: Normalize the instruction register count to match the
+      // NumPhysRegs value.  This is a highly unusual case, and is not expected
+      // to occur.  This normalization is hiding an inconsistency in either the
+      // scheduling model or in the value that the user might have specified
+      // for NumPhysRegs.
+      NumRegs = RMT.NumPhysRegs;
+    }
+
+    if (RMT.NumPhysRegs < (RMT.NumUsedPhysRegs + NumRegs))
+      Response |= (1U << I);
+  }
+
+  return Response;
+}
+
+#ifndef NDEBUG
+void RegisterFile::dump() const {
+  for (unsigned I = 0, E = MRI.getNumRegs(); I < E; ++I) {
+    const RegisterMapping &RM = RegisterMappings[I];
+    if (!RM.first.getWriteState())
+      continue;
+    const RegisterRenamingInfo &RRI = RM.second;
+    dbgs() << MRI.getName(I) << ", " << I << ", PRF=" << RRI.IndexPlusCost.first
+           << ", Cost=" << RRI.IndexPlusCost.second
+           << ", RenameAs=" << RRI.RenameAs << ", ";
+    RM.first.dump();
+    dbgs() << '\n';
+  }
+
+  for (unsigned I = 0, E = getNumRegisterFiles(); I < E; ++I) {
+    dbgs() << "Register File #" << I;
+    const RegisterMappingTracker &RMT = RegisterFiles[I];
+    dbgs() << "\n  TotalMappings:        " << RMT.NumPhysRegs
+           << "\n  NumUsedMappings:      " << RMT.NumUsedPhysRegs << '\n';
+  }
+}
+#endif
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp b/llvm/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
new file mode 100644
index 0000000..18d0e74
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
@@ -0,0 +1,309 @@
+//===--------------------- ResourceManager.cpp ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// The classes here represent processor resource units and their management
+/// strategy.  These classes are managed by the Scheduler.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HardwareUnits/ResourceManager.h"
+#include "Support.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+ResourceStrategy::~ResourceStrategy() = default;
+
+void DefaultResourceStrategy::skipMask(uint64_t Mask) {
+  NextInSequenceMask &= (~Mask);
+  if (!NextInSequenceMask) {
+    NextInSequenceMask = ResourceUnitMask ^ RemovedFromNextInSequence;
+    RemovedFromNextInSequence = 0;
+  }
+}
+
+uint64_t DefaultResourceStrategy::select(uint64_t ReadyMask) {
+  // This method assumes that ReadyMask cannot be zero.
+  uint64_t CandidateMask = llvm::PowerOf2Floor(NextInSequenceMask);
+  while (!(ReadyMask & CandidateMask)) {
+    skipMask(CandidateMask);
+    CandidateMask = llvm::PowerOf2Floor(NextInSequenceMask);
+  }
+  return CandidateMask;
+}
+
+void DefaultResourceStrategy::used(uint64_t Mask) {
+  if (Mask > NextInSequenceMask) {
+    RemovedFromNextInSequence |= Mask;
+    return;
+  }
+  skipMask(Mask);
+}
+
+ResourceState::ResourceState(const MCProcResourceDesc &Desc, unsigned Index,
+                             uint64_t Mask)
+    : ProcResourceDescIndex(Index), ResourceMask(Mask),
+      BufferSize(Desc.BufferSize) {
+  if (llvm::countPopulation(ResourceMask) > 1)
+    ResourceSizeMask = ResourceMask ^ llvm::PowerOf2Floor(ResourceMask);
+  else
+    ResourceSizeMask = (1ULL << Desc.NumUnits) - 1;
+  ReadyMask = ResourceSizeMask;
+  AvailableSlots = BufferSize == -1 ? 0U : static_cast<unsigned>(BufferSize);
+  Unavailable = false;
+}
+
+bool ResourceState::isReady(unsigned NumUnits) const {
+  return (!isReserved() || isADispatchHazard()) &&
+         llvm::countPopulation(ReadyMask) >= NumUnits;
+}
+
+ResourceStateEvent ResourceState::isBufferAvailable() const {
+  if (isADispatchHazard() && isReserved())
+    return RS_RESERVED;
+  if (!isBuffered() || AvailableSlots)
+    return RS_BUFFER_AVAILABLE;
+  return RS_BUFFER_UNAVAILABLE;
+}
+
+#ifndef NDEBUG
+void ResourceState::dump() const {
+  dbgs() << "MASK: " << ResourceMask << ", SIZE_MASK: " << ResourceSizeMask
+         << ", RDYMASK: " << ReadyMask << ", BufferSize=" << BufferSize
+         << ", AvailableSlots=" << AvailableSlots
+         << ", Reserved=" << Unavailable << '\n';
+}
+#endif
+
+static unsigned getResourceStateIndex(uint64_t Mask) {
+  return std::numeric_limits<uint64_t>::digits - llvm::countLeadingZeros(Mask);
+}
+
+static std::unique_ptr<ResourceStrategy>
+getStrategyFor(const ResourceState &RS) {
+  if (RS.isAResourceGroup() || RS.getNumUnits() > 1)
+    return llvm::make_unique<DefaultResourceStrategy>(RS.getReadyMask());
+  return std::unique_ptr<ResourceStrategy>(nullptr);
+}
+
+ResourceManager::ResourceManager(const MCSchedModel &SM)
+    : ProcResID2Mask(SM.getNumProcResourceKinds()) {
+  computeProcResourceMasks(SM, ProcResID2Mask);
+  Resources.resize(SM.getNumProcResourceKinds());
+  Strategies.resize(SM.getNumProcResourceKinds());
+
+  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    uint64_t Mask = ProcResID2Mask[I];
+    unsigned Index = getResourceStateIndex(Mask);
+    Resources[Index] =
+        llvm::make_unique<ResourceState>(*SM.getProcResource(I), I, Mask);
+    Strategies[Index] = getStrategyFor(*Resources[Index]);
+  }
+}
+
+void ResourceManager::setCustomStrategyImpl(std::unique_ptr<ResourceStrategy> S,
+                                            uint64_t ResourceMask) {
+  unsigned Index = getResourceStateIndex(ResourceMask);
+  assert(Index < Resources.size() && "Invalid processor resource index!");
+  assert(S && "Unexpected null strategy in input!");
+  Strategies[Index] = std::move(S);
+}
+
+unsigned ResourceManager::resolveResourceMask(uint64_t Mask) const {
+  return Resources[getResourceStateIndex(Mask)]->getProcResourceID();
+}
+
+unsigned ResourceManager::getNumUnits(uint64_t ResourceID) const {
+  return Resources[getResourceStateIndex(ResourceID)]->getNumUnits();
+}
+
+// Returns the actual resource consumed by this Use.
+// The first element is the primary resource ID.
+// The second element is the specific sub-resource ID.
+ResourceRef ResourceManager::selectPipe(uint64_t ResourceID) {
+  unsigned Index = getResourceStateIndex(ResourceID);
+  ResourceState &RS = *Resources[Index];
+  assert(RS.isReady() && "No available units to select!");
+
+  // Special case where RS is not a group, and it only declares a single
+  // resource unit.
+  if (!RS.isAResourceGroup() && RS.getNumUnits() == 1)
+    return std::make_pair(ResourceID, RS.getReadyMask());
+
+  uint64_t SubResourceID = Strategies[Index]->select(RS.getReadyMask());
+  if (RS.isAResourceGroup())
+    return selectPipe(SubResourceID);
+  return std::make_pair(ResourceID, SubResourceID);
+}
+
+void ResourceManager::use(const ResourceRef &RR) {
+  // Mark the sub-resource referenced by RR as used.
+  ResourceState &RS = *Resources[getResourceStateIndex(RR.first)];
+  RS.markSubResourceAsUsed(RR.second);
+  // If there are still available units in RR.first,
+  // then we are done.
+  if (RS.isReady())
+    return;
+
+  // Notify other resources that RR.first is no longer available.
+  for (std::unique_ptr<ResourceState> &Res : Resources) {
+    ResourceState &Current = *Res;
+    if (!Current.isAResourceGroup() || Current.getResourceMask() == RR.first)
+      continue;
+
+    if (Current.containsResource(RR.first)) {
+      unsigned Index = getResourceStateIndex(Current.getResourceMask());
+      Current.markSubResourceAsUsed(RR.first);
+      Strategies[Index]->used(RR.first);
+    }
+  }
+}
+
+void ResourceManager::release(const ResourceRef &RR) {
+  ResourceState &RS = *Resources[getResourceStateIndex(RR.first)];
+  bool WasFullyUsed = !RS.isReady();
+  RS.releaseSubResource(RR.second);
+  if (!WasFullyUsed)
+    return;
+
+  for (std::unique_ptr<ResourceState> &Res : Resources) {
+    ResourceState &Current = *Res;
+    if (!Current.isAResourceGroup() || Current.getResourceMask() == RR.first)
+      continue;
+
+    if (Current.containsResource(RR.first))
+      Current.releaseSubResource(RR.first);
+  }
+}
+
+ResourceStateEvent
+ResourceManager::canBeDispatched(ArrayRef<uint64_t> Buffers) const {
+  ResourceStateEvent Result = ResourceStateEvent::RS_BUFFER_AVAILABLE;
+  for (uint64_t Buffer : Buffers) {
+    ResourceState &RS = *Resources[getResourceStateIndex(Buffer)];
+    Result = RS.isBufferAvailable();
+    if (Result != ResourceStateEvent::RS_BUFFER_AVAILABLE)
+      break;
+  }
+  return Result;
+}
+
+void ResourceManager::reserveBuffers(ArrayRef<uint64_t> Buffers) {
+  for (const uint64_t Buffer : Buffers) {
+    ResourceState &RS = *Resources[getResourceStateIndex(Buffer)];
+    assert(RS.isBufferAvailable() == ResourceStateEvent::RS_BUFFER_AVAILABLE);
+    RS.reserveBuffer();
+
+    if (RS.isADispatchHazard()) {
+      assert(!RS.isReserved());
+      RS.setReserved();
+    }
+  }
+}
+
+void ResourceManager::releaseBuffers(ArrayRef<uint64_t> Buffers) {
+  for (const uint64_t R : Buffers)
+    Resources[getResourceStateIndex(R)]->releaseBuffer();
+}
+
+bool ResourceManager::canBeIssued(const InstrDesc &Desc) const {
+  return std::all_of(Desc.Resources.begin(), Desc.Resources.end(),
+                     [&](const std::pair<uint64_t, const ResourceUsage> &E) {
+                       unsigned NumUnits =
+                           E.second.isReserved() ? 0U : E.second.NumUnits;
+                       unsigned Index = getResourceStateIndex(E.first);
+                       return Resources[Index]->isReady(NumUnits);
+                     });
+}
+
+// Returns true if all resources are in-order, and there is at least one
+// resource which is a dispatch hazard (BufferSize = 0).
+bool ResourceManager::mustIssueImmediately(const InstrDesc &Desc) const {
+  if (!canBeIssued(Desc))
+    return false;
+  bool AllInOrderResources = all_of(Desc.Buffers, [&](uint64_t BufferMask) {
+    unsigned Index = getResourceStateIndex(BufferMask);
+    const ResourceState &Resource = *Resources[Index];
+    return Resource.isInOrder() || Resource.isADispatchHazard();
+  });
+  if (!AllInOrderResources)
+    return false;
+
+  return any_of(Desc.Buffers, [&](uint64_t BufferMask) {
+    return Resources[getResourceStateIndex(BufferMask)]->isADispatchHazard();
+  });
+}
+
+void ResourceManager::issueInstruction(
+    const InstrDesc &Desc,
+    SmallVectorImpl<std::pair<ResourceRef, double>> &Pipes) {
+  for (const std::pair<uint64_t, ResourceUsage> &R : Desc.Resources) {
+    const CycleSegment &CS = R.second.CS;
+    if (!CS.size()) {
+      releaseResource(R.first);
+      continue;
+    }
+
+    assert(CS.begin() == 0 && "Invalid {Start, End} cycles!");
+    if (!R.second.isReserved()) {
+      ResourceRef Pipe = selectPipe(R.first);
+      use(Pipe);
+      BusyResources[Pipe] += CS.size();
+      // Replace the resource mask with a valid processor resource index.
+      const ResourceState &RS = *Resources[getResourceStateIndex(Pipe.first)];
+      Pipe.first = RS.getProcResourceID();
+      Pipes.emplace_back(
+          std::pair<ResourceRef, double>(Pipe, static_cast<double>(CS.size())));
+    } else {
+      assert((countPopulation(R.first) > 1) && "Expected a group!");
+      // Mark this group as reserved.
+      assert(R.second.isReserved());
+      reserveResource(R.first);
+      BusyResources[ResourceRef(R.first, R.first)] += CS.size();
+    }
+  }
+}
+
+void ResourceManager::cycleEvent(SmallVectorImpl<ResourceRef> &ResourcesFreed) {
+  for (std::pair<ResourceRef, unsigned> &BR : BusyResources) {
+    if (BR.second)
+      BR.second--;
+    if (!BR.second) {
+      // Release this resource.
+      const ResourceRef &RR = BR.first;
+
+      if (countPopulation(RR.first) == 1)
+        release(RR);
+
+      releaseResource(RR.first);
+      ResourcesFreed.push_back(RR);
+    }
+  }
+
+  for (const ResourceRef &RF : ResourcesFreed)
+    BusyResources.erase(RF);
+}
+
+void ResourceManager::reserveResource(uint64_t ResourceID) {
+  ResourceState &Resource = *Resources[getResourceStateIndex(ResourceID)];
+  assert(!Resource.isReserved());
+  Resource.setReserved();
+}
+
+void ResourceManager::releaseResource(uint64_t ResourceID) {
+  ResourceState &Resource = *Resources[getResourceStateIndex(ResourceID)];
+  Resource.clearReserved();
+}
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp b/llvm/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
new file mode 100644
index 0000000..205fe84
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
@@ -0,0 +1,87 @@
+//===---------------------- RetireControlUnit.cpp ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file simulates the hardware responsible for retiring instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HardwareUnits/RetireControlUnit.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+RetireControlUnit::RetireControlUnit(const llvm::MCSchedModel &SM)
+    : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0),
+      AvailableSlots(SM.MicroOpBufferSize), MaxRetirePerCycle(0) {
+  // Check if the scheduling model provides extra information about the machine
+  // processor. If so, then use that information to set the reorder buffer size
+  // and the maximum number of instructions retired per cycle.
+  if (SM.hasExtraProcessorInfo()) {
+    const MCExtraProcessorInfo &EPI = SM.getExtraProcessorInfo();
+    if (EPI.ReorderBufferSize)
+      AvailableSlots = EPI.ReorderBufferSize;
+    MaxRetirePerCycle = EPI.MaxRetirePerCycle;
+  }
+
+  assert(AvailableSlots && "Invalid reorder buffer size!");
+  Queue.resize(AvailableSlots);
+}
+
+// Reserves a number of slots, and returns a new token.
+unsigned RetireControlUnit::reserveSlot(const InstRef &IR,
+                                        unsigned NumMicroOps) {
+  assert(isAvailable(NumMicroOps));
+  unsigned NormalizedQuantity =
+      std::min(NumMicroOps, static_cast<unsigned>(Queue.size()));
+  // Zero latency instructions may have zero mOps. Artificially bump this
+  // value to 1. Although zero latency instructions don't consume scheduler
+  // resources, they still consume one slot in the retire queue.
+  NormalizedQuantity = std::max(NormalizedQuantity, 1U);
+  unsigned TokenID = NextAvailableSlotIdx;
+  Queue[NextAvailableSlotIdx] = {IR, NormalizedQuantity, false};
+  NextAvailableSlotIdx += NormalizedQuantity;
+  NextAvailableSlotIdx %= Queue.size();
+  AvailableSlots -= NormalizedQuantity;
+  return TokenID;
+}
+
+const RetireControlUnit::RUToken &RetireControlUnit::peekCurrentToken() const {
+  return Queue[CurrentInstructionSlotIdx];
+}
+
+void RetireControlUnit::consumeCurrentToken() {
+  const RetireControlUnit::RUToken &Current = peekCurrentToken();
+  assert(Current.NumSlots && "Reserved zero slots?");
+  assert(Current.IR.isValid() && "Invalid RUToken in the RCU queue.");
+
+  // Update the slot index to be the next item in the circular queue.
+  CurrentInstructionSlotIdx += Current.NumSlots;
+  CurrentInstructionSlotIdx %= Queue.size();
+  AvailableSlots += Current.NumSlots;
+}
+
+void RetireControlUnit::onInstructionExecuted(unsigned TokenID) {
+  assert(Queue.size() > TokenID);
+  assert(Queue[TokenID].Executed == false && Queue[TokenID].IR.isValid());
+  Queue[TokenID].Executed = true;
+}
+
+#ifndef NDEBUG
+void RetireControlUnit::dump() const {
+  dbgs() << "Retire Unit: { Total Slots=" << Queue.size()
+         << ", Available Slots=" << AvailableSlots << " }\n";
+}
+#endif
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp b/llvm/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
new file mode 100644
index 0000000..151af23
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
@@ -0,0 +1,244 @@
+//===--------------------- Scheduler.cpp ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A scheduler for processor resource units and processor resource groups.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HardwareUnits/Scheduler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+void Scheduler::initializeStrategy(std::unique_ptr<SchedulerStrategy> S) {
+  // Ensure we have a valid (non-null) strategy object.
+  Strategy = S ? std::move(S) : llvm::make_unique<DefaultSchedulerStrategy>();
+}
+
+// Anchor the vtable of SchedulerStrategy and DefaultSchedulerStrategy.
+SchedulerStrategy::~SchedulerStrategy() = default;
+DefaultSchedulerStrategy::~DefaultSchedulerStrategy() = default;
+
+#ifndef NDEBUG
+void Scheduler::dump() const {
+  dbgs() << "[SCHEDULER]: WaitSet size is: " << WaitSet.size() << '\n';
+  dbgs() << "[SCHEDULER]: ReadySet size is: " << ReadySet.size() << '\n';
+  dbgs() << "[SCHEDULER]: IssuedSet size is: " << IssuedSet.size() << '\n';
+  Resources->dump();
+}
+#endif
+
+Scheduler::Status Scheduler::isAvailable(const InstRef &IR) const {
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+
+  switch (Resources->canBeDispatched(Desc.Buffers)) {
+  case ResourceStateEvent::RS_BUFFER_UNAVAILABLE:
+    return Scheduler::SC_BUFFERS_FULL;
+  case ResourceStateEvent::RS_RESERVED:
+    return Scheduler::SC_DISPATCH_GROUP_STALL;
+  case ResourceStateEvent::RS_BUFFER_AVAILABLE:
+    break;
+  }
+
+  // Give lower priority to LSUnit stall events.
+  switch (LSU->isAvailable(IR)) {
+  case LSUnit::LSU_LQUEUE_FULL:
+    return Scheduler::SC_LOAD_QUEUE_FULL;
+  case LSUnit::LSU_SQUEUE_FULL:
+    return Scheduler::SC_STORE_QUEUE_FULL;
+  case LSUnit::LSU_AVAILABLE:
+    return Scheduler::SC_AVAILABLE;
+  }
+
+  llvm_unreachable("Don't know how to process this LSU state result!");
+}
+
+void Scheduler::issueInstructionImpl(
+    InstRef &IR,
+    SmallVectorImpl<std::pair<ResourceRef, double>> &UsedResources) {
+  Instruction *IS = IR.getInstruction();
+  const InstrDesc &D = IS->getDesc();
+
+  // Issue the instruction and collect all the consumed resources
+  // into a vector. That vector is then used to notify the listener.
+  Resources->issueInstruction(D, UsedResources);
+
+  // Notify the instruction that it started executing.
+  // This updates the internal state of each write.
+  IS->execute();
+
+  if (IS->isExecuting())
+    IssuedSet.emplace_back(IR);
+  else if (IS->isExecuted())
+    LSU->onInstructionExecuted(IR);
+}
+
+// Release the buffered resources and issue the instruction.
+void Scheduler::issueInstruction(
+    InstRef &IR, SmallVectorImpl<std::pair<ResourceRef, double>> &UsedResources,
+    SmallVectorImpl<InstRef> &ReadyInstructions) {
+  const Instruction &Inst = *IR.getInstruction();
+  bool HasDependentUsers = Inst.hasDependentUsers();
+
+  Resources->releaseBuffers(Inst.getDesc().Buffers);
+  issueInstructionImpl(IR, UsedResources);
+  // Instructions that have been issued during this cycle might have unblocked
+  // other dependent instructions. Dependent instructions may be issued during
+  // this same cycle if operands have ReadAdvance entries.  Promote those
+  // instructions to the ReadySet and notify the caller that those are ready.
+  if (HasDependentUsers)
+    promoteToReadySet(ReadyInstructions);
+}
+
+void Scheduler::promoteToReadySet(SmallVectorImpl<InstRef> &Ready) {
+  // Scan the set of waiting instructions and promote them to the
+  // ready queue if operands are all ready.
+  unsigned RemovedElements = 0;
+  for (auto I = WaitSet.begin(), E = WaitSet.end(); I != E;) {
+    InstRef &IR = *I;
+    if (!IR.isValid())
+      break;
+
+    // Check if this instruction is now ready. If it is not, force a
+    // transition in state using method 'update()'.
+    Instruction &IS = *IR.getInstruction();
+    if (!IS.isReady())
+      IS.update();
+
+    // Check if there are still unsolved data dependencies.
+    if (!isReady(IR)) {
+      ++I;
+      continue;
+    }
+
+    Ready.emplace_back(IR);
+    ReadySet.emplace_back(IR);
+
+    IR.invalidate();
+    ++RemovedElements;
+    std::iter_swap(I, E - RemovedElements);
+  }
+
+  WaitSet.resize(WaitSet.size() - RemovedElements);
+}
+
+InstRef Scheduler::select() {
+  unsigned QueueIndex = ReadySet.size();
+  for (unsigned I = 0, E = ReadySet.size(); I != E; ++I) {
+    const InstRef &IR = ReadySet[I];
+    if (QueueIndex == ReadySet.size() ||
+        Strategy->compare(IR, ReadySet[QueueIndex])) {
+      const InstrDesc &D = IR.getInstruction()->getDesc();
+      if (Resources->canBeIssued(D))
+        QueueIndex = I;
+    }
+  }
+
+  if (QueueIndex == ReadySet.size())
+    return InstRef();
+
+  // We found an instruction to issue.
+  InstRef IR = ReadySet[QueueIndex];
+  std::swap(ReadySet[QueueIndex], ReadySet[ReadySet.size() - 1]);
+  ReadySet.pop_back();
+  return IR;
+}
+
+void Scheduler::updateIssuedSet(SmallVectorImpl<InstRef> &Executed) {
+  unsigned RemovedElements = 0;
+  for (auto I = IssuedSet.begin(), E = IssuedSet.end(); I != E;) {
+    InstRef &IR = *I;
+    if (!IR.isValid())
+      break;
+    Instruction &IS = *IR.getInstruction();
+    if (!IS.isExecuted()) {
+      LLVM_DEBUG(dbgs() << "[SCHEDULER]: Instruction #" << IR
+                        << " is still executing.\n");
+      ++I;
+      continue;
+    }
+
+    // Instruction IR has completed execution.
+    LSU->onInstructionExecuted(IR);
+    Executed.emplace_back(IR);
+    ++RemovedElements;
+    IR.invalidate();
+    std::iter_swap(I, E - RemovedElements);
+  }
+
+  IssuedSet.resize(IssuedSet.size() - RemovedElements);
+}
+
+void Scheduler::cycleEvent(SmallVectorImpl<ResourceRef> &Freed,
+                           SmallVectorImpl<InstRef> &Executed,
+                           SmallVectorImpl<InstRef> &Ready) {
+  // Release consumed resources.
+  Resources->cycleEvent(Freed);
+
+  // Propagate the cycle event to the 'Issued' and 'Wait' sets.
+  for (InstRef &IR : IssuedSet)
+    IR.getInstruction()->cycleEvent();
+
+  updateIssuedSet(Executed);
+
+  for (InstRef &IR : WaitSet)
+    IR.getInstruction()->cycleEvent();
+
+  promoteToReadySet(Ready);
+}
+
+bool Scheduler::mustIssueImmediately(const InstRef &IR) const {
+  // Instructions that use an in-order dispatch/issue processor resource must be
+  // issued immediately to the pipeline(s). Any other in-order buffered
+  // resource (i.e. BufferSize=1) is consumed.
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  return Desc.isZeroLatency() || Resources->mustIssueImmediately(Desc);
+}
+
+void Scheduler::dispatch(const InstRef &IR) {
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  Resources->reserveBuffers(Desc.Buffers);
+
+  // If necessary, reserve queue entries in the load-store unit (LSU).
+  bool IsMemOp = Desc.MayLoad || Desc.MayStore;
+  if (IsMemOp)
+    LSU->dispatch(IR);
+
+  if (!isReady(IR)) {
+    LLVM_DEBUG(dbgs() << "[SCHEDULER] Adding #" << IR << " to the WaitSet\n");
+    WaitSet.push_back(IR);
+    return;
+  }
+
+  // Don't add a zero-latency instruction to the Ready queue.
+  // A zero-latency instruction doesn't consume any scheduler resources. That is
+  // because it doesn't need to be executed, and it is often removed at register
+  // renaming stage. For example, register-register moves are often optimized at
+  // register renaming stage by simply updating register aliases. On some
+  // targets, zero-idiom instructions (for example: a xor that clears the value
+  // of a register) are treated specially, and are often eliminated at register
+  // renaming stage.
+  if (!mustIssueImmediately(IR)) {
+    LLVM_DEBUG(dbgs() << "[SCHEDULER] Adding #" << IR << " to the ReadySet\n");
+    ReadySet.push_back(IR);
+  }
+}
+
+bool Scheduler::isReady(const InstRef &IR) const {
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  bool IsMemOp = Desc.MayLoad || Desc.MayStore;
+  return IR.getInstruction()->isReady() && (!IsMemOp || LSU->isReady(IR));
+}
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/InstrBuilder.cpp b/llvm/tools/llvm-mca/lib/InstrBuilder.cpp
new file mode 100644
index 0000000..1688a56
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/InstrBuilder.cpp
@@ -0,0 +1,485 @@
+//===--------------------- InstrBuilder.cpp ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the InstrBuilder interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "InstrBuilder.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+using namespace llvm;
+
+static void initializeUsedResources(InstrDesc &ID,
+                                    const MCSchedClassDesc &SCDesc,
+                                    const MCSubtargetInfo &STI,
+                                    ArrayRef<uint64_t> ProcResourceMasks) {
+  const MCSchedModel &SM = STI.getSchedModel();
+
+  // Populate resources consumed.
+  using ResourcePlusCycles = std::pair<uint64_t, ResourceUsage>;
+  std::vector<ResourcePlusCycles> Worklist;
+
+  // Track cycles contributed by resources that are in a "Super" relationship.
+  // This is required if we want to correctly match the behavior of method
+  // SubtargetEmitter::ExpandProcResource() in Tablegen. When computing the set
+  // of "consumed" processor resources and resource cycles, the logic in
+  // ExpandProcResource() doesn't update the number of resource cycles
+  // contributed by a "Super" resource to a group.
+  // We need to take this into account when we find that a processor resource is
+  // part of a group, and it is also used as the "Super" of other resources.
+  // This map stores the number of cycles contributed by sub-resources that are
+  // part of a "Super" resource. The key value is the "Super" resource mask ID.
+  DenseMap<uint64_t, unsigned> SuperResources;
+
+  for (unsigned I = 0, E = SCDesc.NumWriteProcResEntries; I < E; ++I) {
+    const MCWriteProcResEntry *PRE = STI.getWriteProcResBegin(&SCDesc) + I;
+    const MCProcResourceDesc &PR = *SM.getProcResource(PRE->ProcResourceIdx);
+    uint64_t Mask = ProcResourceMasks[PRE->ProcResourceIdx];
+    if (PR.BufferSize != -1)
+      ID.Buffers.push_back(Mask);
+    CycleSegment RCy(0, PRE->Cycles, false);
+    Worklist.emplace_back(ResourcePlusCycles(Mask, ResourceUsage(RCy)));
+    if (PR.SuperIdx) {
+      uint64_t Super = ProcResourceMasks[PR.SuperIdx];
+      SuperResources[Super] += PRE->Cycles;
+    }
+  }
+
+  // Sort elements by mask popcount, so that we prioritize resource units over
+  // resource groups, and smaller groups over larger groups.
+  llvm::sort(Worklist.begin(), Worklist.end(),
+             [](const ResourcePlusCycles &A, const ResourcePlusCycles &B) {
+               unsigned popcntA = countPopulation(A.first);
+               unsigned popcntB = countPopulation(B.first);
+               if (popcntA < popcntB)
+                 return true;
+               if (popcntA > popcntB)
+                 return false;
+               return A.first < B.first;
+             });
+
+  uint64_t UsedResourceUnits = 0;
+
+  // Remove cycles contributed by smaller resources.
+  for (unsigned I = 0, E = Worklist.size(); I < E; ++I) {
+    ResourcePlusCycles &A = Worklist[I];
+    if (!A.second.size()) {
+      A.second.NumUnits = 0;
+      A.second.setReserved();
+      ID.Resources.emplace_back(A);
+      continue;
+    }
+
+    ID.Resources.emplace_back(A);
+    uint64_t NormalizedMask = A.first;
+    if (countPopulation(A.first) == 1) {
+      UsedResourceUnits |= A.first;
+    } else {
+      // Remove the leading 1 from the resource group mask.
+      NormalizedMask ^= PowerOf2Floor(NormalizedMask);
+    }
+
+    for (unsigned J = I + 1; J < E; ++J) {
+      ResourcePlusCycles &B = Worklist[J];
+      if ((NormalizedMask & B.first) == NormalizedMask) {
+        B.second.CS.Subtract(A.second.size() - SuperResources[A.first]);
+        if (countPopulation(B.first) > 1)
+          B.second.NumUnits++;
+      }
+    }
+  }
+
+  // A SchedWrite may specify a number of cycles in which a resource group
+  // is reserved. For example (on target x86; cpu Haswell):
+  //
+  //  SchedWriteRes<[HWPort0, HWPort1, HWPort01]> {
+  //    let ResourceCycles = [2, 2, 3];
+  //  }
+  //
+  // This means:
+  // Resource units HWPort0 and HWPort1 are both used for 2cy.
+  // Resource group HWPort01 is the union of HWPort0 and HWPort1.
+  // Since this write touches both HWPort0 and HWPort1 for 2cy, HWPort01
+  // will not be usable for 2 entire cycles from instruction issue.
+  //
+  // On top of those 2cy, SchedWriteRes explicitly specifies an extra latency
+  // of 3 cycles for HWPort01. This tool assumes that the 3cy latency is an
+  // extra delay on top of the 2 cycles latency.
+  // During those extra cycles, HWPort01 is not usable by other instructions.
+  for (ResourcePlusCycles &RPC : ID.Resources) {
+    if (countPopulation(RPC.first) > 1 && !RPC.second.isReserved()) {
+      // Remove the leading 1 from the resource group mask.
+      uint64_t Mask = RPC.first ^ PowerOf2Floor(RPC.first);
+      if ((Mask & UsedResourceUnits) == Mask)
+        RPC.second.setReserved();
+    }
+  }
+
+  LLVM_DEBUG({
+    for (const std::pair<uint64_t, ResourceUsage> &R : ID.Resources)
+      dbgs() << "\t\tMask=" << R.first << ", cy=" << R.second.size() << '\n';
+    for (const uint64_t R : ID.Buffers)
+      dbgs() << "\t\tBuffer Mask=" << R << '\n';
+  });
+}
+
+static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
+                              const MCSchedClassDesc &SCDesc,
+                              const MCSubtargetInfo &STI) {
+  if (MCDesc.isCall()) {
+    // We cannot estimate how long this call will take.
+    // Artificially set an arbitrarily high latency (100cy).
+    ID.MaxLatency = 100U;
+    return;
+  }
+
+  int Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
+  // If latency is unknown, then conservatively assume a MaxLatency of 100cy.
+  ID.MaxLatency = Latency < 0 ? 100U : static_cast<unsigned>(Latency);
+}
+
+Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
+                                   unsigned SchedClassID) {
+  const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode());
+  const MCSchedModel &SM = STI.getSchedModel();
+  const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
+
+  // These are for now the (strong) assumptions made by this algorithm:
+  //  * The number of explicit and implicit register definitions in a MCInst
+  //    matches the number of explicit and implicit definitions according to
+  //    the opcode descriptor (MCInstrDesc).
+  //  * Register definitions take precedence over register uses in the operands
+  //    list.
+  //  * If an opcode specifies an optional definition, then the optional
+  //    definition is always the last operand in the sequence, and it can be
+  //    set to zero (i.e. "no register").
+  //
+  // These assumptions work quite well for most out-of-order in-tree targets
+  // like x86. This is mainly because the vast majority of instructions are
+  // expanded to MCInst using a straightforward lowering logic that preserves
+  // the ordering of the operands.
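+  //
+  // For example, on x86, `add %ebx, %eax` (ADD32rr) has a single explicit
+  // definition (operand 0, i.e. EAX), while EFLAGS is an implicit definition
+  // described by the opcode descriptor rather than by an MCInst operand.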
+  unsigned NumExplicitDefs = MCDesc.getNumDefs();
+  unsigned NumImplicitDefs = MCDesc.getNumImplicitDefs();
+  unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries;
+  unsigned TotalDefs = NumExplicitDefs + NumImplicitDefs;
+  if (MCDesc.hasOptionalDef())
+    TotalDefs++;
+  ID.Writes.resize(TotalDefs);
+  // Iterate over the operands list, and skip non-register operands.
+  // The first NumExplicitDefs register operands are expected to be register
+  // definitions.
+  unsigned CurrentDef = 0;
+  unsigned i = 0;
+  for (; i < MCI.getNumOperands() && CurrentDef < NumExplicitDefs; ++i) {
+    const MCOperand &Op = MCI.getOperand(i);
+    if (!Op.isReg())
+      continue;
+
+    WriteDescriptor &Write = ID.Writes[CurrentDef];
+    Write.OpIndex = i;
+    if (CurrentDef < NumWriteLatencyEntries) {
+      const MCWriteLatencyEntry &WLE =
+          *STI.getWriteLatencyEntry(&SCDesc, CurrentDef);
+      // Conservatively default to MaxLatency.
+      Write.Latency =
+          WLE.Cycles < 0 ? ID.MaxLatency : static_cast<unsigned>(WLE.Cycles);
+      Write.SClassOrWriteResourceID = WLE.WriteResourceID;
+    } else {
+      // Assign a default latency for this write.
+      Write.Latency = ID.MaxLatency;
+      Write.SClassOrWriteResourceID = 0;
+    }
+    Write.IsOptionalDef = false;
+    LLVM_DEBUG({
+      dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex
+             << ", Latency=" << Write.Latency
+             << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
+    });
+    CurrentDef++;
+  }
+
+  if (CurrentDef != NumExplicitDefs) {
+    return make_error<StringError>(
+        "error: Expected more register operand definitions.",
+        inconvertibleErrorCode());
+  }
+
+  for (CurrentDef = 0; CurrentDef < NumImplicitDefs; ++CurrentDef) {
+    unsigned Index = NumExplicitDefs + CurrentDef;
+    WriteDescriptor &Write = ID.Writes[Index];
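+    // Implicit writes are not associated with any MCInst operand. Store the
+    // bitwise complement of the implicit def index, so that this write can
+    // later be identified as implicit (see WriteDescriptor::isImplicitWrite).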
+    Write.OpIndex = ~CurrentDef;
+    Write.RegisterID = MCDesc.getImplicitDefs()[CurrentDef];
+    if (Index < NumWriteLatencyEntries) {
+      const MCWriteLatencyEntry &WLE =
+          *STI.getWriteLatencyEntry(&SCDesc, Index);
+      // Conservatively default to MaxLatency.
+      Write.Latency =
+          WLE.Cycles < 0 ? ID.MaxLatency : static_cast<unsigned>(WLE.Cycles);
+      Write.SClassOrWriteResourceID = WLE.WriteResourceID;
+    } else {
+      // Assign a default latency for this write.
+      Write.Latency = ID.MaxLatency;
+      Write.SClassOrWriteResourceID = 0;
+    }
+
+    Write.IsOptionalDef = false;
+    assert(Write.RegisterID != 0 && "Expected a valid phys register!");
+    LLVM_DEBUG({
+      dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex
+             << ", PhysReg=" << MRI.getName(Write.RegisterID)
+             << ", Latency=" << Write.Latency
+             << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
+    });
+  }
+
+  if (MCDesc.hasOptionalDef()) {
+    // Always assume that the optional definition is the last operand of the
+    // MCInst sequence.
+    const MCOperand &Op = MCI.getOperand(MCI.getNumOperands() - 1);
+    if (i == MCI.getNumOperands() || !Op.isReg())
+      return make_error<StringError>(
+          "error: expected a register operand for an optional "
+          "definition. Instruction has not be correctly analyzed.",
+          inconvertibleErrorCode());
+
+    WriteDescriptor &Write = ID.Writes[TotalDefs - 1];
+    Write.OpIndex = MCI.getNumOperands() - 1;
+    // Assign a default latency for this write.
+    Write.Latency = ID.MaxLatency;
+    Write.SClassOrWriteResourceID = 0;
+    Write.IsOptionalDef = true;
+  }
+
+  return ErrorSuccess();
+}
+
+Error InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
+                                  unsigned SchedClassID) {
+  const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode());
+  unsigned NumExplicitDefs = MCDesc.getNumDefs();
+
+  // Skip explicit definitions.
+  unsigned i = 0;
+  for (; i < MCI.getNumOperands() && NumExplicitDefs; ++i) {
+    const MCOperand &Op = MCI.getOperand(i);
+    if (Op.isReg())
+      NumExplicitDefs--;
+  }
+
+  if (NumExplicitDefs) {
+    return make_error<StringError>(
+        "error: Expected more register operand definitions. ",
+        inconvertibleErrorCode());
+  }
+
+  unsigned NumExplicitUses = MCI.getNumOperands() - i;
+  unsigned NumImplicitUses = MCDesc.getNumImplicitUses();
+  if (MCDesc.hasOptionalDef()) {
+    assert(NumExplicitUses);
+    NumExplicitUses--;
+  }
+  unsigned TotalUses = NumExplicitUses + NumImplicitUses;
+  if (!TotalUses)
+    return ErrorSuccess();
+
+  ID.Reads.resize(TotalUses);
+  for (unsigned CurrentUse = 0; CurrentUse < NumExplicitUses; ++CurrentUse) {
+    ReadDescriptor &Read = ID.Reads[CurrentUse];
+    Read.OpIndex = i + CurrentUse;
+    Read.UseIndex = CurrentUse;
+    Read.SchedClassID = SchedClassID;
+    LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex
+                      << ", UseIndex=" << Read.UseIndex << '\n');
+  }
+
+  for (unsigned CurrentUse = 0; CurrentUse < NumImplicitUses; ++CurrentUse) {
+    ReadDescriptor &Read = ID.Reads[NumExplicitUses + CurrentUse];
+    Read.OpIndex = ~CurrentUse;
+    Read.UseIndex = NumExplicitUses + CurrentUse;
+    Read.RegisterID = MCDesc.getImplicitUses()[CurrentUse];
+    Read.SchedClassID = SchedClassID;
+    LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex << ", RegisterID="
+                      << MRI.getName(Read.RegisterID) << '\n');
+  }
+  return ErrorSuccess();
+}
+
+Expected<const InstrDesc &>
+InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
+  assert(STI.getSchedModel().hasInstrSchedModel() &&
+         "Itineraries are not yet supported!");
+
+  // Obtain the instruction descriptor from the opcode.
+  unsigned short Opcode = MCI.getOpcode();
+  const MCInstrDesc &MCDesc = MCII.get(Opcode);
+  const MCSchedModel &SM = STI.getSchedModel();
+
+  // Then obtain the scheduling class information from the instruction.
+  unsigned SchedClassID = MCDesc.getSchedClass();
+  unsigned CPUID = SM.getProcessorID();
+
+  // Try to solve variant scheduling classes.
+  if (SchedClassID) {
+    while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant())
+      SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &MCI, CPUID);
+
+    if (!SchedClassID) {
+      return make_error<StringError>("unable to resolve this variant class.",
+                                     inconvertibleErrorCode());
+    }
+  }
+
+  // Check if this instruction is supported. Otherwise, report an error.
+  const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
+  if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) {
+    std::string ToString;
+    llvm::raw_string_ostream OS(ToString);
+    WithColor::error() << "found an unsupported instruction in the input"
+                       << " assembly sequence.\n";
+    MCIP.printInst(&MCI, OS, "", STI);
+    OS.flush();
+    WithColor::note() << "instruction: " << ToString << '\n';
+    return make_error<StringError>(
+        "Don't know how to analyze unsupported instructions",
+        inconvertibleErrorCode());
+  }
+
+  // Create a new empty descriptor.
+  std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
+  ID->NumMicroOps = SCDesc.NumMicroOps;
+
+  if (MCDesc.isCall()) {
+    // We don't correctly model calls.
+    WithColor::warning() << "found a call in the input assembly sequence.\n";
+    WithColor::note() << "call instructions are not correctly modeled. "
+                      << "Assume a latency of 100cy.\n";
+  }
+
+  if (MCDesc.isReturn()) {
+    WithColor::warning() << "found a return instruction in the input"
+                         << " assembly sequence.\n";
+    WithColor::note() << "program counter updates are ignored.\n";
+  }
+
+  ID->MayLoad = MCDesc.mayLoad();
+  ID->MayStore = MCDesc.mayStore();
+  ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects();
+
+  initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
+  computeMaxLatency(*ID, MCDesc, SCDesc, STI);
+  if (auto Err = populateWrites(*ID, MCI, SchedClassID))
+    return std::move(Err);
+  if (auto Err = populateReads(*ID, MCI, SchedClassID))
+    return std::move(Err);
+
+  LLVM_DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n');
+  LLVM_DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n');
+
+  // Now add the new descriptor.
+  SchedClassID = MCDesc.getSchedClass();
+  if (!SM.getSchedClassDesc(SchedClassID)->isVariant()) {
+    Descriptors[MCI.getOpcode()] = std::move(ID);
+    return *Descriptors[MCI.getOpcode()];
+  }
+
+  VariantDescriptors[&MCI] = std::move(ID);
+  return *VariantDescriptors[&MCI];
+}
+
+Expected<const InstrDesc &>
+InstrBuilder::getOrCreateInstrDesc(const MCInst &MCI) {
+  if (Descriptors.find_as(MCI.getOpcode()) != Descriptors.end())
+    return *Descriptors[MCI.getOpcode()];
+
+  if (VariantDescriptors.find(&MCI) != VariantDescriptors.end())
+    return *VariantDescriptors[&MCI];
+
+  return createInstrDescImpl(MCI);
+}
+
+Expected<std::unique_ptr<Instruction>>
+InstrBuilder::createInstruction(const MCInst &MCI) {
+  Expected<const InstrDesc &> DescOrErr = getOrCreateInstrDesc(MCI);
+  if (!DescOrErr)
+    return DescOrErr.takeError();
+  const InstrDesc &D = *DescOrErr;
+  std::unique_ptr<Instruction> NewIS = llvm::make_unique<Instruction>(D);
+
+  // Initialize Reads first.
+  for (const ReadDescriptor &RD : D.Reads) {
+    int RegID = -1;
+    if (!RD.isImplicitRead()) {
+      // Explicit read.
+      const MCOperand &Op = MCI.getOperand(RD.OpIndex);
+      // Skip non-register operands.
+      if (!Op.isReg())
+        continue;
+      RegID = Op.getReg();
+    } else {
+      // Implicit read.
+      RegID = RD.RegisterID;
+    }
+
+    // Skip invalid register operands.
+    if (!RegID)
+      continue;
+
+    // Okay, this is a register operand. Create a ReadState for it.
+    assert(RegID > 0 && "Invalid register ID found!");
+    NewIS->getUses().emplace_back(llvm::make_unique<ReadState>(RD, RegID));
+  }
+
+  // Early exit if there are no writes.
+  if (D.Writes.empty())
+    return std::move(NewIS);
+
+  // Track register writes that implicitly clear the upper portion of the
+  // underlying super-registers using an APInt.
+  APInt WriteMask(D.Writes.size(), 0);
+
+  // Now query the MCInstrAnalysis object to obtain information about which
+  // register writes implicitly clear the upper portion of a super-register.
+  MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);
+
+  // Check if this is a dependency breaking instruction.
+  if (MCIA.isDependencyBreaking(STI, MCI))
+    NewIS->setDependencyBreaking();
+
+  // Initialize writes.
+  unsigned WriteIndex = 0;
+  for (const WriteDescriptor &WD : D.Writes) {
+    unsigned RegID = WD.isImplicitWrite() ? WD.RegisterID
+                                          : MCI.getOperand(WD.OpIndex).getReg();
+    // Check if this is an optional definition that references NoReg.
+    if (WD.IsOptionalDef && !RegID) {
+      ++WriteIndex;
+      continue;
+    }
+
+    assert(RegID && "Expected a valid register ID!");
+    NewIS->getDefs().emplace_back(llvm::make_unique<WriteState>(
+        WD, RegID, /* ClearsSuperRegs */ WriteMask[WriteIndex]));
+    ++WriteIndex;
+  }
+
+  return std::move(NewIS);
+}
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/Instruction.cpp b/llvm/tools/llvm-mca/lib/Instruction.cpp
new file mode 100644
index 0000000..0c84767
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/Instruction.cpp
@@ -0,0 +1,177 @@
+//===--------------------- Instruction.cpp ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines abstractions used by the Pipeline to model register reads,
+// register writes and instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Instruction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+using namespace llvm;
+
+void ReadState::writeStartEvent(unsigned Cycles) {
+  assert(DependentWrites);
+  assert(CyclesLeft == UNKNOWN_CYCLES);
+
+  // This read may be dependent on more than one write. This typically occurs
+  // when a definition is the result of multiple writes where at least one
+  // write does a partial register update.
+  // The HW is forced to do some extra bookkeeping to keep track of all the
+  // dependent writes, and to implement a merging scheme for the partial writes.
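+  // A typical x86 example: a read of EAX that follows a partial write to AX
+  // depends both on that partial write and on the earlier write of the upper
+  // bits of EAX.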
+  --DependentWrites;
+  TotalCycles = std::max(TotalCycles, Cycles);
+
+  if (!DependentWrites) {
+    CyclesLeft = TotalCycles;
+    IsReady = !CyclesLeft;
+  }
+}
+
+void WriteState::onInstructionIssued() {
+  assert(CyclesLeft == UNKNOWN_CYCLES);
+  // Update the number of cycles left based on the WriteDescriptor info.
+  CyclesLeft = getLatency();
+
+  // Now that the time left before write-back is known, notify
+  // all the users.
+  for (const std::pair<ReadState *, int> &User : Users) {
+    ReadState *RS = User.first;
+    unsigned ReadCycles = std::max(0, CyclesLeft - User.second);
+    RS->writeStartEvent(ReadCycles);
+  }
+}
+
+void WriteState::addUser(ReadState *User, int ReadAdvance) {
+  // If CyclesLeft is different from -1, then we don't need to
+  // update the list of users. We can just notify the user with
+  // the actual number of cycles left (which may be zero).
+  if (CyclesLeft != UNKNOWN_CYCLES) {
+    unsigned ReadCycles = std::max(0, CyclesLeft - ReadAdvance);
+    User->writeStartEvent(ReadCycles);
+    return;
+  }
+
+  std::pair<ReadState *, int> NewPair(User, ReadAdvance);
+  Users.insert(NewPair);
+}
+
+void WriteState::cycleEvent() {
+  // Note: CyclesLeft can be a negative number. It is an error to
+  // make it an unsigned quantity because users of this write may
+  // specify a negative ReadAdvance.
+  if (CyclesLeft != UNKNOWN_CYCLES)
+    CyclesLeft--;
+}
+
+void ReadState::cycleEvent() {
+  // Update the total number of cycles.
+  if (DependentWrites && TotalCycles) {
+    --TotalCycles;
+    return;
+  }
+
+  // Bail out immediately if we don't know how many cycles are left.
+  if (CyclesLeft == UNKNOWN_CYCLES)
+    return;
+
+  if (CyclesLeft) {
+    --CyclesLeft;
+    IsReady = !CyclesLeft;
+  }
+}
+
+#ifndef NDEBUG
+void WriteState::dump() const {
+  dbgs() << "{ OpIdx=" << WD.OpIndex << ", Lat=" << getLatency() << ", RegID "
+         << getRegisterID() << ", Cycles Left=" << getCyclesLeft() << " }";
+}
+
+void WriteRef::dump() const {
+  dbgs() << "IID=" << getSourceIndex() << ' ';
+  if (isValid())
+    getWriteState()->dump();
+  else
+    dbgs() << "(null)";
+}
+#endif
+
+void Instruction::dispatch(unsigned RCUToken) {
+  assert(Stage == IS_INVALID);
+  Stage = IS_AVAILABLE;
+  RCUTokenID = RCUToken;
+
+  // Check if input operands are already available.
+  update();
+}
+
+void Instruction::execute() {
+  assert(Stage == IS_READY);
+  Stage = IS_EXECUTING;
+
+  // Set the cycles left before the write-back stage.
+  CyclesLeft = Desc.MaxLatency;
+
+  for (UniqueDef &Def : Defs)
+    Def->onInstructionIssued();
+
+  // Transition to the "executed" stage if this is a zero-latency instruction.
+  if (!CyclesLeft)
+    Stage = IS_EXECUTED;
+}
+
+void Instruction::update() {
+  assert(isDispatched() && "Unexpected instruction stage found!");
+
+  if (!llvm::all_of(Uses, [](const UniqueUse &Use) { return Use->isReady(); }))
+    return;
+
+  // A partial register write cannot complete before a dependent write.
+  auto IsDefReady = [&](const UniqueDef &Def) {
+    if (const WriteState *Write = Def->getDependentWrite()) {
+      int WriteLatency = Write->getCyclesLeft();
+      if (WriteLatency == UNKNOWN_CYCLES)
+        return false;
+      return static_cast<unsigned>(WriteLatency) < Desc.MaxLatency;
+    }
+    return true;
+  };
+
+  if (llvm::all_of(Defs, IsDefReady))
+    Stage = IS_READY;
+}
+
+void Instruction::cycleEvent() {
+  if (isReady())
+    return;
+
+  if (isDispatched()) {
+    for (UniqueUse &Use : Uses)
+      Use->cycleEvent();
+
+    update();
+    return;
+  }
+
+  assert(isExecuting() && "Instruction not in-flight?");
+  assert(CyclesLeft && "Instruction already executed?");
+  for (UniqueDef &Def : Defs)
+    Def->cycleEvent();
+  CyclesLeft--;
+  if (!CyclesLeft)
+    Stage = IS_EXECUTED;
+}
+
+const unsigned WriteRef::INVALID_IID = std::numeric_limits<unsigned>::max();
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/LLVMBuild.txt b/llvm/tools/llvm-mca/lib/LLVMBuild.txt
new file mode 100644
index 0000000..e3bf632
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./tools/llvm-mca/lib/LLVMBuild.txt -----------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = MCA
+parent = Libraries
+required_libraries = CodeGen MC Support
diff --git a/llvm/tools/llvm-mca/lib/Pipeline.cpp b/llvm/tools/llvm-mca/lib/Pipeline.cpp
new file mode 100644
index 0000000..a67ae98
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/Pipeline.cpp
@@ -0,0 +1,97 @@
+//===--------------------- Pipeline.cpp -------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements an ordered container of stages that simulate the
+/// pipeline of a hardware backend.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Pipeline.h"
+#include "HWEventListener.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+using namespace llvm;
+
+void Pipeline::addEventListener(HWEventListener *Listener) {
+  if (Listener)
+    Listeners.insert(Listener);
+  for (auto &S : Stages)
+    S->addListener(Listener);
+}
+
+bool Pipeline::hasWorkToProcess() {
+  return llvm::any_of(Stages, [](const std::unique_ptr<Stage> &S) {
+    return S->hasWorkToComplete();
+  });
+}
+
+llvm::Error Pipeline::run() {
+  assert(!Stages.empty() && "Unexpected empty pipeline found!");
+
+  while (hasWorkToProcess()) {
+    notifyCycleBegin();
+    if (llvm::Error Err = runCycle())
+      return Err;
+    notifyCycleEnd();
+    ++Cycles;
+  }
+  return llvm::ErrorSuccess();
+}
+
+llvm::Error Pipeline::runCycle() {
+  llvm::Error Err = llvm::ErrorSuccess();
+  // Update stages before we start processing new instructions.
+  for (auto I = Stages.rbegin(), E = Stages.rend(); I != E && !Err; ++I) {
+    const std::unique_ptr<Stage> &S = *I;
+    Err = S->cycleStart();
+  }
+
+  // Now fetch and execute new instructions.
+  InstRef IR;
+  Stage &FirstStage = *Stages[0];
+  while (!Err && FirstStage.isAvailable(IR))
+    Err = FirstStage.execute(IR);
+
+  // Update stages in preparation for a new cycle.
+  for (auto I = Stages.rbegin(), E = Stages.rend(); I != E && !Err; ++I) {
+    const std::unique_ptr<Stage> &S = *I;
+    Err = S->cycleEnd();
+  }
+
+  return Err;
+}
+
+void Pipeline::appendStage(std::unique_ptr<Stage> S) {
+  assert(S && "Invalid null stage in input!");
+  if (!Stages.empty()) {
+    Stage *Last = Stages.back().get();
+    Last->setNextInSequence(S.get());
+  }
+
+  Stages.push_back(std::move(S));
+}
+
+void Pipeline::notifyCycleBegin() {
+  LLVM_DEBUG(dbgs() << "[E] Cycle begin: " << Cycles << '\n');
+  for (HWEventListener *Listener : Listeners)
+    Listener->onCycleBegin();
+}
+
+void Pipeline::notifyCycleEnd() {
+  LLVM_DEBUG(dbgs() << "[E] Cycle end: " << Cycles << "\n\n");
+  for (HWEventListener *Listener : Listeners)
+    Listener->onCycleEnd();
+}
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/Stages/DispatchStage.cpp b/llvm/tools/llvm-mca/lib/Stages/DispatchStage.cpp
new file mode 100644
index 0000000..81098cb
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/Stages/DispatchStage.cpp
@@ -0,0 +1,160 @@
+//===--------------------- DispatchStage.cpp --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file models the dispatch component of an instruction pipeline.
+///
+/// The DispatchStage is responsible for updating instruction dependencies
+/// and communicating to the simulated instruction scheduler that an instruction
+/// is ready to be scheduled for execution.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Stages/DispatchStage.h"
+#include "HWEventListener.h"
+#include "HardwareUnits/Scheduler.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+void DispatchStage::notifyInstructionDispatched(const InstRef &IR,
+                                                ArrayRef<unsigned> UsedRegs) {
+  LLVM_DEBUG(dbgs() << "[E] Instruction Dispatched: #" << IR << '\n');
+  notifyEvent<HWInstructionEvent>(HWInstructionDispatchedEvent(IR, UsedRegs));
+}
+
+bool DispatchStage::checkPRF(const InstRef &IR) const {
+  SmallVector<unsigned, 4> RegDefs;
+  for (const std::unique_ptr<WriteState> &RegDef :
+       IR.getInstruction()->getDefs())
+    RegDefs.emplace_back(RegDef->getRegisterID());
+
+  const unsigned RegisterMask = PRF.isAvailable(RegDefs);
+  // A mask with all zeroes means: register files are available.
+  if (RegisterMask) {
+    notifyEvent<HWStallEvent>(
+        HWStallEvent(HWStallEvent::RegisterFileStall, IR));
+    return false;
+  }
+
+  return true;
+}
+
+bool DispatchStage::checkRCU(const InstRef &IR) const {
+  const unsigned NumMicroOps = IR.getInstruction()->getDesc().NumMicroOps;
+  if (RCU.isAvailable(NumMicroOps))
+    return true;
+  notifyEvent<HWStallEvent>(
+      HWStallEvent(HWStallEvent::RetireControlUnitStall, IR));
+  return false;
+}
+
+bool DispatchStage::canDispatch(const InstRef &IR) const {
+  return checkRCU(IR) && checkPRF(IR) && checkNextStage(IR);
+}
+
+void DispatchStage::updateRAWDependencies(ReadState &RS,
+                                          const MCSubtargetInfo &STI) {
+  SmallVector<WriteRef, 4> DependentWrites;
+
+  collectWrites(DependentWrites, RS.getRegisterID());
+  RS.setDependentWrites(DependentWrites.size());
+  // We know that this read depends on all the writes in DependentWrites.
+  // For each write, check if we have ReadAdvance information, and use it
+  // to figure out in how many cycles this read becomes available.
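+  // For example, a ReadAdvance of 3 against a write that completes in 5
+  // cycles means this read only waits max(0, 5 - 3) = 2 cycles (see
+  // WriteState::addUser and WriteState::onInstructionIssued).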
+  const ReadDescriptor &RD = RS.getDescriptor();
+  const MCSchedModel &SM = STI.getSchedModel();
+  const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID);
+  for (WriteRef &WR : DependentWrites) {
+    WriteState &WS = *WR.getWriteState();
+    unsigned WriteResID = WS.getWriteResourceID();
+    int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID);
+    WS.addUser(&RS, ReadAdvance);
+  }
+}
+
+llvm::Error DispatchStage::dispatch(InstRef IR) {
+  assert(!CarryOver && "Cannot dispatch another instruction!");
+  Instruction &IS = *IR.getInstruction();
+  const InstrDesc &Desc = IS.getDesc();
+  const unsigned NumMicroOps = Desc.NumMicroOps;
+  if (NumMicroOps > DispatchWidth) {
+    assert(AvailableEntries == DispatchWidth);
+    AvailableEntries = 0;
+    CarryOver = NumMicroOps - DispatchWidth;
+  } else {
+    assert(AvailableEntries >= NumMicroOps);
+    AvailableEntries -= NumMicroOps;
+  }
+
+  // A dependency-breaking instruction doesn't have to wait on the register
+  // input operands, and it is often optimized at the register renaming stage.
+  // Update RAW dependencies if this instruction is not a dependency-breaking
+  // instruction. A dependency-breaking instruction is a zero-latency
+  // instruction that doesn't consume hardware resources.
+  // A zero-idiom XOR on X86 is an example of a dependency-breaking instruction.
+  bool IsDependencyBreaking = IS.isDependencyBreaking();
+  for (std::unique_ptr<ReadState> &RS : IS.getUses())
+    if (RS->isImplicitRead() || !IsDependencyBreaking)
+      updateRAWDependencies(*RS, STI);
+
+  // By default, a dependency-breaking zero-latency instruction is expected to
+  // be optimized at the register renaming stage. That means no physical
+  // register is allocated to the instruction.
+  bool ShouldAllocateRegisters =
+      !(Desc.isZeroLatency() && IsDependencyBreaking);
+  SmallVector<unsigned, 4> RegisterFiles(PRF.getNumRegisterFiles());
+  for (std::unique_ptr<WriteState> &WS : IS.getDefs()) {
+    PRF.addRegisterWrite(WriteRef(IR.first, WS.get()), RegisterFiles,
+                         ShouldAllocateRegisters);
+  }
+
+  // Reserve slots in the RCU, and notify the instruction that it has been
+  // dispatched to the schedulers for execution.
+  IS.dispatch(RCU.reserveSlot(IR, NumMicroOps));
+
+  // Notify listeners of the "instruction dispatched" event,
+  // and move IR to the next stage.
+  notifyInstructionDispatched(IR, RegisterFiles);
+  return moveToTheNextStage(IR);
+}
+
+llvm::Error DispatchStage::cycleStart() {
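+  // Micro-ops that could not be dispatched during the previous cycle
+  // (CarryOver) consume dispatch slots before any new instruction is
+  // considered.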
+  AvailableEntries = CarryOver >= DispatchWidth ? 0 : DispatchWidth - CarryOver;
+  CarryOver = CarryOver >= DispatchWidth ? CarryOver - DispatchWidth : 0U;
+  return llvm::ErrorSuccess();
+}
+
+bool DispatchStage::isAvailable(const InstRef &IR) const {
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  unsigned Required = std::min(Desc.NumMicroOps, DispatchWidth);
+  if (Required > AvailableEntries)
+    return false;
+  // The dispatch logic doesn't internally buffer instructions.  It only accepts
+  // instructions that can be successfully moved to the next stage during this
+  // same cycle.
+  return canDispatch(IR);
+}
+
+llvm::Error DispatchStage::execute(InstRef &IR) {
+  assert(canDispatch(IR) && "Cannot dispatch another instruction!");
+  return dispatch(IR);
+}
+
+#ifndef NDEBUG
+void DispatchStage::dump() const {
+  PRF.dump();
+  RCU.dump();
+}
+#endif
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/Stages/ExecuteStage.cpp b/llvm/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
new file mode 100644
index 0000000..7fff4b8
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
@@ -0,0 +1,195 @@
+//===---------------------- ExecuteStage.cpp --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the execution stage of an instruction pipeline.
+///
+/// The ExecuteStage is responsible for managing the hardware scheduler
+/// and issuing notifications that an instruction has been executed.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Stages/ExecuteStage.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+using namespace llvm;
+
+HWStallEvent::GenericEventType toHWStallEventType(Scheduler::Status Status) {
+  switch (Status) {
+  case Scheduler::SC_LOAD_QUEUE_FULL:
+    return HWStallEvent::LoadQueueFull;
+  case Scheduler::SC_STORE_QUEUE_FULL:
+    return HWStallEvent::StoreQueueFull;
+  case Scheduler::SC_BUFFERS_FULL:
+    return HWStallEvent::SchedulerQueueFull;
+  case Scheduler::SC_DISPATCH_GROUP_STALL:
+    return HWStallEvent::DispatchGroupStall;
+  case Scheduler::SC_AVAILABLE:
+    return HWStallEvent::Invalid;
+  }
+
+  llvm_unreachable("Don't know how to process this StallKind!");
+}
+
+bool ExecuteStage::isAvailable(const InstRef &IR) const {
+  if (Scheduler::Status S = HWS.isAvailable(IR)) {
+    HWStallEvent::GenericEventType ET = toHWStallEventType(S);
+    notifyEvent<HWStallEvent>(HWStallEvent(ET, IR));
+    return false;
+  }
+
+  return true;
+}
+
+Error ExecuteStage::issueInstruction(InstRef &IR) {
+  SmallVector<std::pair<ResourceRef, double>, 4> Used;
+  SmallVector<InstRef, 4> Ready;
+  HWS.issueInstruction(IR, Used, Ready);
+
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  notifyReleasedBuffers(Desc.Buffers);
+  notifyInstructionIssued(IR, Used);
+  if (IR.getInstruction()->isExecuted()) {
+    notifyInstructionExecuted(IR);
+    // FIXME: add a buffer of executed instructions.
+    if (Error S = moveToTheNextStage(IR))
+      return S;
+  }
+
+  for (const InstRef &I : Ready)
+    notifyInstructionReady(I);
+  return ErrorSuccess();
+}
+
+Error ExecuteStage::issueReadyInstructions() {
+  InstRef IR = HWS.select();
+  while (IR.isValid()) {
+    if (Error Err = issueInstruction(IR))
+      return Err;
+
+    // Select the next instruction to issue.
+    IR = HWS.select();
+  }
+
+  return ErrorSuccess();
+}
+
+Error ExecuteStage::cycleStart() {
+  llvm::SmallVector<ResourceRef, 8> Freed;
+  llvm::SmallVector<InstRef, 4> Executed;
+  llvm::SmallVector<InstRef, 4> Ready;
+
+  HWS.cycleEvent(Freed, Executed, Ready);
+
+  for (const ResourceRef &RR : Freed)
+    notifyResourceAvailable(RR);
+
+  for (InstRef &IR : Executed) {
+    notifyInstructionExecuted(IR);
+    // FIXME: add a buffer of executed instructions.
+    if (Error S = moveToTheNextStage(IR))
+      return S;
+  }
+
+  for (const InstRef &IR : Ready)
+    notifyInstructionReady(IR);
+
+  return issueReadyInstructions();
+}
+
+// Schedule the instruction for execution on the hardware.
+Error ExecuteStage::execute(InstRef &IR) {
+  assert(isAvailable(IR) && "Scheduler is not available!");
+
+#ifndef NDEBUG
+  // Ensure that the HWS has not stored this instruction in its queues.
+  HWS.sanityCheck(IR);
+#endif
+  // Reserve a slot in each buffered resource. Also, mark units with
+  // BufferSize=0 as reserved. Resources with a buffer size of zero will only
+  // be released after the instruction is issued, and all the ResourceCycles
+  // for those units have been consumed.
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  HWS.dispatch(IR);
+  notifyReservedBuffers(Desc.Buffers);
+  if (!HWS.isReady(IR))
+    return ErrorSuccess();
+
+  // If we did not return early, then the scheduler is ready for execution.
+  notifyInstructionReady(IR);
+
+  // If we cannot issue immediately, the HWS will add IR to its ready queue for
+  // execution later, so we must return early here.
+  if (!HWS.mustIssueImmediately(IR))
+    return ErrorSuccess();
+
+  // Issue IR to the underlying pipelines.
+  return issueInstruction(IR);
+}
+
+void ExecuteStage::notifyInstructionExecuted(const InstRef &IR) {
+  LLVM_DEBUG(dbgs() << "[E] Instruction Executed: #" << IR << '\n');
+  notifyEvent<HWInstructionEvent>(
+      HWInstructionEvent(HWInstructionEvent::Executed, IR));
+}
+
+void ExecuteStage::notifyInstructionReady(const InstRef &IR) {
+  LLVM_DEBUG(dbgs() << "[E] Instruction Ready: #" << IR << '\n');
+  notifyEvent<HWInstructionEvent>(
+      HWInstructionEvent(HWInstructionEvent::Ready, IR));
+}
+
+void ExecuteStage::notifyResourceAvailable(const ResourceRef &RR) {
+  LLVM_DEBUG(dbgs() << "[E] Resource Available: [" << RR.first << '.'
+                    << RR.second << "]\n");
+  for (HWEventListener *Listener : getListeners())
+    Listener->onResourceAvailable(RR);
+}
+
+void ExecuteStage::notifyInstructionIssued(
+    const InstRef &IR, ArrayRef<std::pair<ResourceRef, double>> Used) {
+  LLVM_DEBUG({
+    dbgs() << "[E] Instruction Issued: #" << IR << '\n';
+    for (const std::pair<ResourceRef, double> &Resource : Used) {
+      dbgs() << "[E] Resource Used: [" << Resource.first.first << '.'
+             << Resource.first.second << "], ";
+      dbgs() << "cycles: " << Resource.second << '\n';
+    }
+  });
+  notifyEvent<HWInstructionEvent>(HWInstructionIssuedEvent(IR, Used));
+}
+
+void ExecuteStage::notifyReservedBuffers(ArrayRef<uint64_t> Buffers) {
+  if (Buffers.empty())
+    return;
+
+  SmallVector<unsigned, 4> BufferIDs(Buffers.begin(), Buffers.end());
+  std::transform(Buffers.begin(), Buffers.end(), BufferIDs.begin(),
+                 [&](uint64_t Op) { return HWS.getResourceID(Op); });
+  for (HWEventListener *Listener : getListeners())
+    Listener->onReservedBuffers(BufferIDs);
+}
+
+void ExecuteStage::notifyReleasedBuffers(ArrayRef<uint64_t> Buffers) {
+  if (Buffers.empty())
+    return;
+
+  SmallVector<unsigned, 4> BufferIDs(Buffers.begin(), Buffers.end());
+  std::transform(Buffers.begin(), Buffers.end(), BufferIDs.begin(),
+                 [&](uint64_t Op) { return HWS.getResourceID(Op); });
+  for (HWEventListener *Listener : getListeners())
+    Listener->onReleasedBuffers(BufferIDs);
+}
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/Stages/FetchStage.cpp b/llvm/tools/llvm-mca/lib/Stages/FetchStage.cpp
new file mode 100644
index 0000000..99c0aab
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/Stages/FetchStage.cpp
@@ -0,0 +1,82 @@
+//===---------------------- FetchStage.cpp ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the Fetch stage of an instruction pipeline.  Its sole
+/// purpose in life is to produce instructions for the rest of the pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Stages/FetchStage.h"
+
+namespace mca {
+
+bool FetchStage::hasWorkToComplete() const {
+  return CurrentInstruction.get() || SM.hasNext();
+}
+
+bool FetchStage::isAvailable(const InstRef & /* unused */) const {
+  if (!CurrentInstruction)
+    return false;
+  assert(SM.hasNext() && "Unexpected internal state!");
+  const SourceRef SR = SM.peekNext();
+  InstRef IR(SR.first, CurrentInstruction.get());
+  return checkNextStage(IR);
+}
+
+llvm::Error FetchStage::getNextInstruction() {
+  assert(!CurrentInstruction && "There is already an instruction to process!");
+  if (!SM.hasNext())
+    return llvm::ErrorSuccess();
+  const SourceRef SR = SM.peekNext();
+  llvm::Expected<std::unique_ptr<Instruction>> InstOrErr =
+      IB.createInstruction(*SR.second);
+  if (!InstOrErr)
+    return InstOrErr.takeError();
+  CurrentInstruction = std::move(InstOrErr.get());
+  return llvm::ErrorSuccess();
+}
+
+llvm::Error FetchStage::execute(InstRef & /* unused */) {
+  assert(CurrentInstruction && "There is no instruction to process!");
+  const SourceRef SR = SM.peekNext();
+  InstRef IR(SR.first, CurrentInstruction.get());
+  assert(checkNextStage(IR) && "Invalid fetch!");
+
+  Instructions[IR.getSourceIndex()] = std::move(CurrentInstruction);
+  if (llvm::Error Val = moveToTheNextStage(IR))
+    return Val;
+
+  SM.updateNext();
+
+  // Move the program counter.
+  return getNextInstruction();
+}
+
+llvm::Error FetchStage::cycleStart() {
+  if (!CurrentInstruction && SM.hasNext())
+    return getNextInstruction();
+  return llvm::ErrorSuccess();
+}
+
+llvm::Error FetchStage::cycleEnd() {
+  // Find the first instruction which hasn't been retired.
+  const InstMap::iterator It =
+      llvm::find_if(Instructions, [](const InstMap::value_type &KeyValuePair) {
+        return !KeyValuePair.second->isRetired();
+      });
+
+  // Erase instructions up to the first that hasn't been retired.
+  if (It != Instructions.begin())
+    Instructions.erase(Instructions.begin(), It);
+
+  return llvm::ErrorSuccess();
+}
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/Stages/InstructionTables.cpp b/llvm/tools/llvm-mca/lib/Stages/InstructionTables.cpp
new file mode 100644
index 0000000..1fdeefa
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/Stages/InstructionTables.cpp
@@ -0,0 +1,70 @@
+//===--------------------- InstructionTables.cpp ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the method InstructionTables::execute().
+/// Method execute() prints a theoretical resource pressure distribution based
+/// on the information available in the scheduling model, and without running
+/// the pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Stages/InstructionTables.h"
+
+namespace mca {
+
+using namespace llvm;
+
+Error InstructionTables::execute(InstRef &IR) {
+  ArrayRef<uint64_t> Masks = IB.getProcResourceMasks();
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  UsedResources.clear();
+
+  // Identify the resources consumed by this instruction.
+  for (const std::pair<uint64_t, ResourceUsage> &Resource : Desc.Resources) {
+    // Skip zero-cycle resources (i.e., unused resources).
+    if (!Resource.second.size())
+      continue;
+    double Cycles = static_cast<double>(Resource.second.size());
+    unsigned Index = std::distance(
+        Masks.begin(), std::find(Masks.begin(), Masks.end(), Resource.first));
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(Index);
+    unsigned NumUnits = ProcResource.NumUnits;
+    if (!ProcResource.SubUnitsIdxBegin) {
+      // The number of cycles consumed by each unit.
+      Cycles /= NumUnits;
+      for (unsigned I = 0, E = NumUnits; I < E; ++I) {
+        ResourceRef ResourceUnit = std::make_pair(Index, 1U << I);
+        UsedResources.emplace_back(std::make_pair(ResourceUnit, Cycles));
+      }
+      continue;
+    }
+
+    // This is a group. Obtain the set of resources contained in this
+    // group. Some of these resources may implement multiple units.
+    // Uniformly distribute Cycles across all of the units.
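+    // Illustrative example: a group with NumUnits = 2 whose sub-resources
+    // each implement a single unit, and which consumes 4 cycles, contributes
+    // 4 / (2 * 1) = 2 cycles to each of the two units.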
+    for (unsigned I1 = 0; I1 < NumUnits; ++I1) {
+      unsigned SubUnitIdx = ProcResource.SubUnitsIdxBegin[I1];
+      const MCProcResourceDesc &SubUnit = *SM.getProcResource(SubUnitIdx);
+      // Compute the number of cycles consumed by each resource unit.
+      double RUCycles = Cycles / (NumUnits * SubUnit.NumUnits);
+      for (unsigned I2 = 0, E2 = SubUnit.NumUnits; I2 < E2; ++I2) {
+        ResourceRef ResourceUnit = std::make_pair(SubUnitIdx, 1U << I2);
+        UsedResources.emplace_back(std::make_pair(ResourceUnit, RUCycles));
+      }
+    }
+  }
+
+  // Send a fake instruction issued event to all the views.
+  HWInstructionIssuedEvent Event(IR, UsedResources);
+  notifyEvent<HWInstructionIssuedEvent>(Event);
+  return ErrorSuccess();
+}
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/Stages/RetireStage.cpp b/llvm/tools/llvm-mca/lib/Stages/RetireStage.cpp
new file mode 100644
index 0000000..768a68e
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/Stages/RetireStage.cpp
@@ -0,0 +1,62 @@
+//===---------------------- RetireStage.cpp ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the retire stage of an instruction pipeline.
+/// The RetireStage represents the process logic that interacts with the
+/// simulated RetireControlUnit hardware.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Stages/RetireStage.h"
+#include "HWEventListener.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+llvm::Error RetireStage::cycleStart() {
+  if (RCU.isEmpty())
+    return llvm::ErrorSuccess();
+
+  const unsigned MaxRetirePerCycle = RCU.getMaxRetirePerCycle();
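+  // A MaxRetirePerCycle of zero means that there is no limit on the number
+  // of instructions retired per cycle.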
+  unsigned NumRetired = 0;
+  while (!RCU.isEmpty()) {
+    if (MaxRetirePerCycle != 0 && NumRetired == MaxRetirePerCycle)
+      break;
+    const RetireControlUnit::RUToken &Current = RCU.peekCurrentToken();
+    if (!Current.Executed)
+      break;
+    RCU.consumeCurrentToken();
+    notifyInstructionRetired(Current.IR);
+    NumRetired++;
+  }
+
+  return llvm::ErrorSuccess();
+}
+
+llvm::Error RetireStage::execute(InstRef &IR) {
+  RCU.onInstructionExecuted(IR.getInstruction()->getRCUTokenID());
+  return llvm::ErrorSuccess();
+}
+
+void RetireStage::notifyInstructionRetired(const InstRef &IR) {
+  LLVM_DEBUG(llvm::dbgs() << "[E] Instruction Retired: #" << IR << '\n');
+  llvm::SmallVector<unsigned, 4> FreedRegs(PRF.getNumRegisterFiles());
+  const Instruction &Inst = *IR.getInstruction();
+  const InstrDesc &Desc = Inst.getDesc();
+
+  bool ShouldFreeRegs = !(Desc.isZeroLatency() && Inst.isDependencyBreaking());
+  for (const std::unique_ptr<WriteState> &WS : Inst.getDefs())
+    PRF.removeRegisterWrite(*WS.get(), FreedRegs, ShouldFreeRegs);
+  notifyEvent<HWInstructionEvent>(HWInstructionRetiredEvent(IR, FreedRegs));
+}
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/Stages/Stage.cpp b/llvm/tools/llvm-mca/lib/Stages/Stage.cpp
new file mode 100644
index 0000000..e8cd74f
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/Stages/Stage.cpp
@@ -0,0 +1,27 @@
+//===---------------------- Stage.cpp ---------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a stage.
+/// A chain of stages compose an instruction pipeline.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Stages/Stage.h"
+
+namespace mca {
+
+// Pin the vtable here in the implementation file.
+Stage::~Stage() = default;
+
+void Stage::addListener(HWEventListener *Listener) {
+  Listeners.insert(Listener);
+}
+
+} // namespace mca
diff --git a/llvm/tools/llvm-mca/lib/Support.cpp b/llvm/tools/llvm-mca/lib/Support.cpp
new file mode 100644
index 0000000..8f6b8a9
--- /dev/null
+++ b/llvm/tools/llvm-mca/lib/Support.cpp
@@ -0,0 +1,79 @@
+//===--------------------- Support.cpp --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements a few helper functions used by various pipeline
+/// components.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Support.h"
+#include "llvm/MC/MCSchedule.h"
+
+namespace mca {
+
+using namespace llvm;
+
+void computeProcResourceMasks(const MCSchedModel &SM,
+                              SmallVectorImpl<uint64_t> &Masks) {
+  unsigned ProcResourceID = 0;
+
+  // Create a unique bitmask for every processor resource unit.
+  // Skip resource at index 0, since it always references 'InvalidUnit'.
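+  // As an illustrative example, a model with two units P0 and P1 plus a
+  // group P01 = {P0, P1} would get the masks P0 = 0b001, P1 = 0b010, and
+  // P01 = 0b100 | 0b001 | 0b010 = 0b111.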
+  Masks.resize(SM.getNumProcResourceKinds());
+  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    const MCProcResourceDesc &Desc = *SM.getProcResource(I);
+    if (Desc.SubUnitsIdxBegin)
+      continue;
+    Masks[I] = 1ULL << ProcResourceID;
+    ProcResourceID++;
+  }
+
+  // Create a unique bitmask for every processor resource group.
+  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    const MCProcResourceDesc &Desc = *SM.getProcResource(I);
+    if (!Desc.SubUnitsIdxBegin)
+      continue;
+    Masks[I] = 1ULL << ProcResourceID;
+    for (unsigned U = 0; U < Desc.NumUnits; ++U) {
+      uint64_t OtherMask = Masks[Desc.SubUnitsIdxBegin[U]];
+      Masks[I] |= OtherMask;
+    }
+    ProcResourceID++;
+  }
+}
+
+double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth,
+                               unsigned NumMicroOps,
+                               ArrayRef<unsigned> ProcResourceUsage) {
+  // The block throughput is bounded from above by the hardware dispatch
+  // throughput. That is because the DispatchWidth is an upper bound on the
+  // number of opcodes that can be part of a single dispatch group.
+  double Max = static_cast<double>(NumMicroOps) / DispatchWidth;
+
+  // The block throughput is also limited by the amount of hardware parallelism.
+  // The number of available resource units affects the resource pressure
+  // distribution, as well as how many blocks can be executed every cycle.
+  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    unsigned ResourceCycles = ProcResourceUsage[I];
+    if (!ResourceCycles)
+      continue;
+
+    const MCProcResourceDesc &MCDesc = *SM.getProcResource(I);
+    double Throughput = static_cast<double>(ResourceCycles) / MCDesc.NumUnits;
+    Max = std::max(Max, Throughput);
+  }
+
+  // The block reciprocal throughput is computed as the MAX of:
+  //  - (NumMicroOps / DispatchWidth)
+  //  - (ResourceCycles / NumUnits)   for every consumed processor resource.
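+  //
+  // Numeric example: with DispatchWidth = 4, NumMicroOps = 2, and 3 cycles
+  // consumed on a single-unit resource, the reciprocal throughput is
+  // max(2 / 4, 3 / 1) = 3 cycles per block.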
+  return Max;
+}
+
+} // namespace mca