[llvm-mca][X86] Teach llvm-mca how to identify register writes that implicitly clear the upper portion of a super-register.
This patch teaches llvm-mca how to identify register writes that implicitly zero
the upper portion of a super-register.
On X86-64, a general purpose register is implemented in hardware as a 64-bit
register. Quoting the Intel 64 Software Developer's Manual: "an update to the
lower 32 bits of a 64 bit integer register is architecturally defined to zero
extend the upper 32 bits". Also, a write to an XMM register performed by an AVX
instruction implicitly zeroes the upper 128 bits of the aliasing YMM register.
This patch adds a new method named clearsSuperRegisters to the MCInstrAnalysis
interface to help identify instructions that implicitly clear the upper portion
of a super-register. The rest of the patch teaches llvm-mca how to use that new
method to obtain this information and to update the register dependencies
accordingly.
I compared the kernels from tests clear-super-register-1.s and
clear-super-register-2.s against the output from perf on btver2. Previously
there was a large discrepancy between the estimated IPC and the measured IPC.
Now the differences are mostly in the noise.
Differential Revision: https://reviews.llvm.org/D48225
llvm-svn: 335113
diff --git a/llvm/tools/llvm-mca/InstrBuilder.cpp b/llvm/tools/llvm-mca/InstrBuilder.cpp
index 8a66a76..dbdf0ed 100644
--- a/llvm/tools/llvm-mca/InstrBuilder.cpp
+++ b/llvm/tools/llvm-mca/InstrBuilder.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "InstrBuilder.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/Debug.h"
@@ -158,23 +159,6 @@
const MCInstrDesc &MCDesc,
const MCSchedClassDesc &SCDesc,
const MCSubtargetInfo &STI) {
- // Set if writes through this opcode may update super registers.
- // TODO: on x86-64, a 4 byte write of a general purpose register always
- // fully updates the super-register.
- // More in general, (at least on x86) not all register writes perform
- // a partial (super-)register update.
- // For example, an AVX instruction that writes on a XMM register implicitly
- // zeroes the upper half of every aliasing super-register.
- //
- // For now, we pessimistically assume that writes are all potentially
- // partial register updates. This is a good default for most targets, execept
- // for those like x86 which implement a special semantic for certain opcodes.
- // At least on x86, this may lead to an inaccurate prediction of the
- // instruction level parallelism.
- bool FullyUpdatesSuperRegisters = false;
-
- // Now Populate Writes.
-
// This algorithm currently works under the strong (and potentially incorrect)
// assumption that information related to register def/uses can be obtained
// from MCInstrDesc.
@@ -275,7 +259,6 @@
Write.Latency = ID.MaxLatency;
Write.SClassOrWriteResourceID = 0;
}
- Write.FullyUpdatesSuperRegs = FullyUpdatesSuperRegisters;
Write.IsOptionalDef = false;
LLVM_DEBUG({
dbgs() << "\t\tOpIdx=" << Write.OpIndex << ", Latency=" << Write.Latency
@@ -488,16 +471,35 @@
NewIS->getUses().emplace_back(llvm::make_unique<ReadState>(RD, RegID));
}
+ // Early exit if there are no writes.
+ if (D.Writes.empty())
+ return NewIS;
+
+ // Track register writes that implicitly clear the upper portion of the
+ // underlying super-registers using an APInt.
+ APInt WriteMask(D.Writes.size(), 0);
+
+ // Now query the MCInstrAnalysis object to obtain information about which
+ // register writes implicitly clear the upper portion of a super-register.
+ MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);
+
// Initialize writes.
+ unsigned WriteIndex = 0;
for (const WriteDescriptor &WD : D.Writes) {
unsigned RegID =
WD.OpIndex == -1 ? WD.RegisterID : MCI.getOperand(WD.OpIndex).getReg();
// Check if this is a optional definition that references NoReg.
- if (WD.IsOptionalDef && !RegID)
+ if (WD.IsOptionalDef && !RegID) {
+ ++WriteIndex;
continue;
+ }
assert(RegID && "Expected a valid register ID!");
- NewIS->getDefs().emplace_back(llvm::make_unique<WriteState>(WD, RegID));
+ // Test bit WriteIndex directly; `1 << WriteIndex` overflows int at 32+ writes.
+ bool UpdatesSuperRegisters = WriteMask[WriteIndex];
+ NewIS->getDefs().emplace_back(
+ llvm::make_unique<WriteState>(WD, RegID, UpdatesSuperRegisters));
+ ++WriteIndex;
}
return NewIS;
diff --git a/llvm/tools/llvm-mca/InstrBuilder.h b/llvm/tools/llvm-mca/InstrBuilder.h
index 146e917..1c325d9 100644
--- a/llvm/tools/llvm-mca/InstrBuilder.h
+++ b/llvm/tools/llvm-mca/InstrBuilder.h
@@ -17,7 +17,9 @@
#include "Instruction.h"
#include "Support.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
namespace mca {
@@ -37,6 +39,8 @@
class InstrBuilder {
const llvm::MCSubtargetInfo &STI;
const llvm::MCInstrInfo &MCII;
+ const llvm::MCRegisterInfo &MRI;
+ const llvm::MCInstrAnalysis &MCIA;
llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
@@ -48,8 +52,10 @@
InstrBuilder &operator=(const InstrBuilder &) = delete;
public:
- InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii)
- : STI(sti), MCII(mcii),
+ InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii,
+ const llvm::MCRegisterInfo &mri,
+ const llvm::MCInstrAnalysis &mcia)
+ : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia),
ProcResourceMasks(STI.getSchedModel().getNumProcResourceKinds()) {
computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
}
diff --git a/llvm/tools/llvm-mca/Instruction.h b/llvm/tools/llvm-mca/Instruction.h
index e0a4504..dc21e88 100644
--- a/llvm/tools/llvm-mca/Instruction.h
+++ b/llvm/tools/llvm-mca/Instruction.h
@@ -70,11 +70,6 @@
// This field is set to a value different than zero only if this
// is an implicit definition.
unsigned RegisterID;
- // True if this write generates a partial update of a super-registers.
- // On X86, this flag is set by byte/word writes on GPR registers. Also,
- // a write of an XMM register only partially updates the corresponding
- // YMM super-register if the write is associated to a legacy SSE instruction.
- bool FullyUpdatesSuperRegs;
// Instruction itineraries would set this field to the SchedClass ID.
// Otherwise, it defaults to the WriteResourceID from the MCWriteLatencyEntry
// element associated to this write.
@@ -129,6 +124,10 @@
// field RegisterID from WD.
unsigned RegisterID;
+ // True if this write implicitly clears the upper portion of RegisterID's
+ // super-registers.
+ bool ClearsSuperRegs;
+
// A list of dependent reads. Users is a set of dependent
// reads. A dependent read is added to the set only if CyclesLeft
// is "unknown". As soon as CyclesLeft is 'known', each user in the set
@@ -138,8 +137,10 @@
std::set<std::pair<ReadState *, int>> Users;
public:
- WriteState(const WriteDescriptor &Desc, unsigned RegID)
- : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID) {}
+ WriteState(const WriteDescriptor &Desc, unsigned RegID,
+ bool clearsSuperRegs = false)
+ : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
+ ClearsSuperRegs(clearsSuperRegs) {}
WriteState(const WriteState &Other) = delete;
WriteState &operator=(const WriteState &Other) = delete;
@@ -148,7 +149,7 @@
unsigned getRegisterID() const { return RegisterID; }
void addUser(ReadState *Use, int ReadAdvance);
- bool fullyUpdatesSuperRegs() const { return WD.FullyUpdatesSuperRegs; }
+ bool clearsSuperRegisters() const { return ClearsSuperRegs; }
// On every cycle, update CyclesLeft and notify dependent users.
void cycleEvent();
diff --git a/llvm/tools/llvm-mca/RegisterFile.cpp b/llvm/tools/llvm-mca/RegisterFile.cpp
index b12c7a4..9679bb4 100644
--- a/llvm/tools/llvm-mca/RegisterFile.cpp
+++ b/llvm/tools/llvm-mca/RegisterFile.cpp
@@ -138,7 +138,7 @@
allocatePhysRegs(Mapping.second, UsedPhysRegs);
// If this is a partial update, then we are done.
- if (!WS.fullyUpdatesSuperRegs())
+ if (!WS.clearsSuperRegisters())
return;
for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
@@ -149,7 +149,7 @@
MutableArrayRef<unsigned> FreedPhysRegs,
bool ShouldFreePhysRegs) {
unsigned RegID = WS.getRegisterID();
- bool ShouldInvalidateSuperRegs = WS.fullyUpdatesSuperRegs();
+ bool ShouldInvalidateSuperRegs = WS.clearsSuperRegisters();
assert(RegID != 0 && "Invalidating an already invalid register?");
assert(WS.getCyclesLeft() != -512 &&
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 4d10704..372be3e 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -388,6 +388,9 @@
std::unique_ptr<MCInstrInfo> MCII(TheTarget->createMCInstrInfo());
+ std::unique_ptr<MCInstrAnalysis> MCIA(
+ TheTarget->createMCInstrAnalysis(MCII.get()));
+
if (!MCPU.compare("native"))
MCPU = llvm::sys::getHostCPUName();
@@ -457,7 +460,7 @@
Width = DispatchWidth;
// Create an instruction builder.
- mca::InstrBuilder IB(*STI, *MCII);
+ mca::InstrBuilder IB(*STI, *MCII, *MRI, *MCIA);
// Number each region in the sequence.
unsigned RegionIdx = 0;