[llvm-mca][X86] Teach how to identify register writes that implicitly clear the upper portion of a super-register.

This patch teaches llvm-mca how to identify register writes that implicitly zero
the upper portion of a super-register.

On X86-64, a general purpose register is implemented in hardware as a 64-bit
register. Quoting the Intel 64 Software Developer's Manual: "an update to the
lower 32 bits of a 64 bit integer register is architecturally defined to zero
extend the upper 32 bits".  Also, a write to an XMM register performed by an AVX
instruction implicitly zeroes the upper 128 bits of the aliasing YMM register.

This patch adds a new method named clearsSuperRegisters to the MCInstrAnalysis
interface to help identify instructions that implicitly clear the upper portion
of a super-register.  The rest of the patch teaches llvm-mca how to use that new
method to obtain this information and to update register dependencies
accordingly.

I compared the kernels from tests clear-super-register-1.s and
clear-super-register-2.s against the output from perf on btver2.  Previously
there was a large discrepancy between the estimated IPC and the measured IPC.
Now the differences are mostly in the noise.

Differential Revision: https://reviews.llvm.org/D48225

llvm-svn: 335113
diff --git a/llvm/tools/llvm-mca/InstrBuilder.cpp b/llvm/tools/llvm-mca/InstrBuilder.cpp
index 8a66a76..dbdf0ed 100644
--- a/llvm/tools/llvm-mca/InstrBuilder.cpp
+++ b/llvm/tools/llvm-mca/InstrBuilder.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstrBuilder.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/Debug.h"
@@ -158,23 +159,6 @@
                            const MCInstrDesc &MCDesc,
                            const MCSchedClassDesc &SCDesc,
                            const MCSubtargetInfo &STI) {
-  // Set if writes through this opcode may update super registers.
-  // TODO: on x86-64, a 4 byte write of a general purpose register always
-  // fully updates the super-register.
-  // More in general, (at least on x86) not all register writes perform
-  // a partial (super-)register update.
-  // For example, an AVX instruction that writes on a XMM register implicitly
-  // zeroes the upper half of every aliasing super-register.
-  //
-  // For now, we pessimistically assume that writes are all potentially
-  // partial register updates. This is a good default for most targets, execept
-  // for those like x86 which implement a special semantic for certain opcodes.
-  // At least on x86, this may lead to an inaccurate prediction of the
-  // instruction level parallelism.
-  bool FullyUpdatesSuperRegisters = false;
-
-  // Now Populate Writes.
-
   // This algorithm currently works under the strong (and potentially incorrect)
   // assumption that information related to register def/uses can be obtained
   // from MCInstrDesc.
@@ -275,7 +259,6 @@
       Write.Latency = ID.MaxLatency;
       Write.SClassOrWriteResourceID = 0;
     }
-    Write.FullyUpdatesSuperRegs = FullyUpdatesSuperRegisters;
     Write.IsOptionalDef = false;
     LLVM_DEBUG({
       dbgs() << "\t\tOpIdx=" << Write.OpIndex << ", Latency=" << Write.Latency
@@ -488,16 +471,35 @@
     NewIS->getUses().emplace_back(llvm::make_unique<ReadState>(RD, RegID));
   }
 
+  // Early exit if there are no writes.
+  if (D.Writes.empty())
+    return NewIS;
+
+  // Track register writes that implicitly clear the upper portion of the
+  // underlying super-registers using an APInt.
+  APInt WriteMask(D.Writes.size(), 0);
+
+  // Now query the MCInstrAnalysis object to obtain information about which
+  // register writes implicitly clear the upper portion of a super-register.
+  MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);
+
   // Initialize writes.
+  unsigned WriteIndex = 0;
   for (const WriteDescriptor &WD : D.Writes) {
     unsigned RegID =
         WD.OpIndex == -1 ? WD.RegisterID : MCI.getOperand(WD.OpIndex).getReg();
     // Check if this is a optional definition that references NoReg.
-    if (WD.IsOptionalDef && !RegID)
+    if (WD.IsOptionalDef && !RegID) {
+      ++WriteIndex;
       continue;
+    }
 
     assert(RegID && "Expected a valid register ID!");
-    NewIS->getDefs().emplace_back(llvm::make_unique<WriteState>(WD, RegID));
+    // Read the bit via APInt::operator[]; no int-width shift is involved.
+    bool UpdatesSuperRegisters = WriteMask[WriteIndex];
+    NewIS->getDefs().emplace_back(
+        llvm::make_unique<WriteState>(WD, RegID, UpdatesSuperRegisters));
+    ++WriteIndex;
   }
 
   return NewIS;
diff --git a/llvm/tools/llvm-mca/InstrBuilder.h b/llvm/tools/llvm-mca/InstrBuilder.h
index 146e917..1c325d9 100644
--- a/llvm/tools/llvm-mca/InstrBuilder.h
+++ b/llvm/tools/llvm-mca/InstrBuilder.h
@@ -17,7 +17,9 @@
 
 #include "Instruction.h"
 #include "Support.h"
+#include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 
 namespace mca {
@@ -37,6 +39,8 @@
 class InstrBuilder {
   const llvm::MCSubtargetInfo &STI;
   const llvm::MCInstrInfo &MCII;
+  const llvm::MCRegisterInfo &MRI;
+  const llvm::MCInstrAnalysis &MCIA;
   llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
 
   llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
@@ -48,8 +52,10 @@
   InstrBuilder &operator=(const InstrBuilder &) = delete;
 
 public:
-  InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii)
-      : STI(sti), MCII(mcii),
+  InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii,
+               const llvm::MCRegisterInfo &mri,
+               const llvm::MCInstrAnalysis &mcia)
+      : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia),
         ProcResourceMasks(STI.getSchedModel().getNumProcResourceKinds()) {
     computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
   }
diff --git a/llvm/tools/llvm-mca/Instruction.h b/llvm/tools/llvm-mca/Instruction.h
index e0a4504..dc21e88 100644
--- a/llvm/tools/llvm-mca/Instruction.h
+++ b/llvm/tools/llvm-mca/Instruction.h
@@ -70,11 +70,6 @@
   // This field is set to a value different than zero only if this
   // is an implicit definition.
   unsigned RegisterID;
-  // True if this write generates a partial update of a super-registers.
-  // On X86, this flag is set by byte/word writes on GPR registers. Also,
-  // a write of an XMM register only partially updates the corresponding
-  // YMM super-register if the write is associated to a legacy SSE instruction.
-  bool FullyUpdatesSuperRegs;
   // Instruction itineraries would set this field to the SchedClass ID.
   // Otherwise, it defaults to the WriteResourceID from the MCWriteLatencyEntry
   // element associated to this write.
@@ -129,6 +124,10 @@
   // field RegisterID from WD.
   unsigned RegisterID;
 
+  // True if this write implicitly clears the upper portion of RegisterID's
+  // super-registers.
+  bool ClearsSuperRegs;
+
   // A list of dependent reads. Users is a set of dependent
   // reads. A dependent read is added to the set only if CyclesLeft
   // is "unknown". As soon as CyclesLeft is 'known', each user in the set
@@ -138,8 +137,10 @@
   std::set<std::pair<ReadState *, int>> Users;
 
 public:
-  WriteState(const WriteDescriptor &Desc, unsigned RegID)
-      : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID) {}
+  WriteState(const WriteDescriptor &Desc, unsigned RegID,
+             bool clearsSuperRegs = false)
+      : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
+        ClearsSuperRegs(clearsSuperRegs) {}
   WriteState(const WriteState &Other) = delete;
   WriteState &operator=(const WriteState &Other) = delete;
 
@@ -148,7 +149,7 @@
   unsigned getRegisterID() const { return RegisterID; }
 
   void addUser(ReadState *Use, int ReadAdvance);
-  bool fullyUpdatesSuperRegs() const { return WD.FullyUpdatesSuperRegs; }
+  bool clearsSuperRegisters() const { return ClearsSuperRegs; }
 
   // On every cycle, update CyclesLeft and notify dependent users.
   void cycleEvent();
diff --git a/llvm/tools/llvm-mca/RegisterFile.cpp b/llvm/tools/llvm-mca/RegisterFile.cpp
index b12c7a4..9679bb4 100644
--- a/llvm/tools/llvm-mca/RegisterFile.cpp
+++ b/llvm/tools/llvm-mca/RegisterFile.cpp
@@ -138,7 +138,7 @@
     allocatePhysRegs(Mapping.second, UsedPhysRegs);
 
   // If this is a partial update, then we are done.
-  if (!WS.fullyUpdatesSuperRegs())
+  if (!WS.clearsSuperRegisters())
     return;
 
   for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
@@ -149,7 +149,7 @@
                                        MutableArrayRef<unsigned> FreedPhysRegs,
                                        bool ShouldFreePhysRegs) {
   unsigned RegID = WS.getRegisterID();
-  bool ShouldInvalidateSuperRegs = WS.fullyUpdatesSuperRegs();
+  bool ShouldInvalidateSuperRegs = WS.clearsSuperRegisters();
 
   assert(RegID != 0 && "Invalidating an already invalid register?");
   assert(WS.getCyclesLeft() != -512 &&
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 4d10704..372be3e 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -388,6 +388,9 @@
 
   std::unique_ptr<MCInstrInfo> MCII(TheTarget->createMCInstrInfo());
 
+  std::unique_ptr<MCInstrAnalysis> MCIA(
+      TheTarget->createMCInstrAnalysis(MCII.get()));
+
   if (!MCPU.compare("native"))
     MCPU = llvm::sys::getHostCPUName();
 
@@ -457,7 +460,7 @@
     Width = DispatchWidth;
 
   // Create an instruction builder.
-  mca::InstrBuilder IB(*STI, *MCII);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, *MCIA);
 
   // Number each region in the sequence.
   unsigned RegionIdx = 0;