[llvm-mca] Improved support for dependency-breaking instructions.

The tool assumes that a zero-latency instruction that doesn't consume hardware
resources is an optimizable dependency-breaking instruction. That means it
doesn't have to wait on its register input operands, and it doesn't consume any
physical registers. The physical register file (PRF) knows how to optimize it
at the register renaming stage.

llvm-svn: 332249
diff --git a/llvm/tools/llvm-mca/Dispatch.cpp b/llvm/tools/llvm-mca/Dispatch.cpp
index 2c0227d..1a3fb7e 100644
--- a/llvm/tools/llvm-mca/Dispatch.cpp
+++ b/llvm/tools/llvm-mca/Dispatch.cpp
@@ -91,8 +91,8 @@
   }
 }
 
-void RegisterFile::createNewMappings(IndexPlusCostPairTy Entry,
-                                     MutableArrayRef<unsigned> UsedPhysRegs) {
+void RegisterFile::allocatePhysRegs(IndexPlusCostPairTy Entry,
+                                    MutableArrayRef<unsigned> UsedPhysRegs) {
   unsigned RegisterFileIndex = Entry.first;
   unsigned Cost = Entry.second;
   if (RegisterFileIndex) {
@@ -106,8 +106,8 @@
   UsedPhysRegs[0] += Cost;
 }
 
-void RegisterFile::removeMappings(IndexPlusCostPairTy Entry,
-                                  MutableArrayRef<unsigned> FreedPhysRegs) {
+void RegisterFile::freePhysRegs(IndexPlusCostPairTy Entry,
+                                MutableArrayRef<unsigned> FreedPhysRegs) {
   unsigned RegisterFileIndex = Entry.first;
   unsigned Cost = Entry.second;
   if (RegisterFileIndex) {
@@ -121,8 +121,9 @@
   FreedPhysRegs[0] += Cost;
 }
 
-void RegisterFile::addRegisterMapping(WriteState &WS,
-                                      MutableArrayRef<unsigned> UsedPhysRegs) {
+void RegisterFile::addRegisterWrite(WriteState &WS,
+                                    MutableArrayRef<unsigned> UsedPhysRegs,
+                                    bool ShouldAllocatePhysRegs) {
   unsigned RegID = WS.getRegisterID();
   assert(RegID && "Adding an invalid register definition?");
 
@@ -131,7 +132,11 @@
   for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
     RegisterMappings[*I].first = &WS;
 
-  createNewMappings(Mapping.second, UsedPhysRegs);
+  // No physical registers are allocated for instructions that are optimized in
+  // hardware. For example, zero-latency data-dependency breaking instructions
+  // don't consume physical registers.
+  if (ShouldAllocatePhysRegs)
+    allocatePhysRegs(Mapping.second, UsedPhysRegs);
 
   // If this is a partial update, then we are done.
   if (!WS.fullyUpdatesSuperRegs())
@@ -141,8 +146,9 @@
     RegisterMappings[*I].first = &WS;
 }
 
-void RegisterFile::invalidateRegisterMapping(
-    const WriteState &WS, MutableArrayRef<unsigned> FreedPhysRegs) {
+void RegisterFile::removeRegisterWrite(
+    const WriteState &WS, MutableArrayRef<unsigned> FreedPhysRegs,
+    bool ShouldFreePhysRegs) {
   unsigned RegID = WS.getRegisterID();
   bool ShouldInvalidateSuperRegs = WS.fullyUpdatesSuperRegs();
 
@@ -154,7 +160,8 @@
   if (!Mapping.first)
     return;
 
-  removeMappings(Mapping.second, FreedPhysRegs);
+  if (ShouldFreePhysRegs)
+    freePhysRegs(Mapping.second, FreedPhysRegs);
 
   if (Mapping.first == &WS)
     Mapping.first = nullptr;
@@ -261,8 +268,10 @@
 void DispatchUnit::notifyInstructionRetired(const InstRef &IR) {
   LLVM_DEBUG(dbgs() << "[E] Instruction Retired: " << IR << '\n');
   SmallVector<unsigned, 4> FreedRegs(RAT->getNumRegisterFiles());
+  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+
   for (const std::unique_ptr<WriteState> &WS : IR.getInstruction()->getDefs())
-    RAT->invalidateRegisterMapping(*WS.get(), FreedRegs);
+    RAT->removeRegisterWrite(*WS.get(), FreedRegs, !Desc.isZeroLatency());
   Owner->notifyInstructionEvent(HWInstructionRetiredEvent(IR, FreedRegs));
   Owner->eraseInstruction(IR);
 }
@@ -339,18 +348,22 @@
     AvailableEntries -= NumMicroOps;
   }
 
-  // Update RAW dependencies if this instruction is not a zero-latency
-  // instruction. The assumption is that a zero-latency instruction doesn't
-  // require to be issued to the scheduler for execution. More importantly, it
-  // doesn't have to wait on the register input operands.
-  if (Desc.MaxLatency || !Desc.Resources.empty())
+  // A dependency-breaking instruction doesn't have to wait on the register
+  // input operands, and it is often optimized at the register renaming stage.
+  // Update RAW dependencies if this instruction is not a dependency-breaking
+  // instruction. A dependency-breaking instruction is a zero-latency
+  // instruction that doesn't consume hardware resources.
+  // An example of a dependency-breaking instruction on X86 is a zero-idiom XOR.
+  if (!Desc.isZeroLatency())
     for (std::unique_ptr<ReadState> &RS : IS.getUses())
       updateRAWDependencies(*RS, STI);
 
-  // Allocate new mappings.
+  // By default, a dependency-breaking zero-latency instruction is expected to
+  // be optimized at the register renaming stage. That means no physical
+  // register is allocated to the instruction.
   SmallVector<unsigned, 4> RegisterFiles(RAT->getNumRegisterFiles());
   for (std::unique_ptr<WriteState> &WS : IS.getDefs())
-    RAT->addRegisterMapping(*WS, RegisterFiles);
+    RAT->addRegisterWrite(*WS, RegisterFiles, !Desc.isZeroLatency());
 
   // Reserve slots in the RCU, and notify the instruction that it has been
   // dispatched to the schedulers for execution.