[llvm-exegeis] Computing Latency configuration upfront so we can generate many CodeTemplates at once.

Summary: LatencyGenerator now computes all possible mode of serial execution for an Instruction upfront and generates CodeTemplate for the ones that give the best results (e.g. no need to generate a two instructions snippet when repeating a single one would do). The next step is to generate even more configurations for cases (e.g. for XOR we should generate "XOR EAX, EAX, EAX" and "XOR EAX, EAX, EBX")

Reviewers: courbet

Reviewed By: courbet

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D53320

llvm-svn: 344689
diff --git a/llvm/tools/llvm-exegesis/lib/CodeTemplate.cpp b/llvm/tools/llvm-exegesis/lib/CodeTemplate.cpp
index 34433da..df9d18b 100644
--- a/llvm/tools/llvm-exegesis/lib/CodeTemplate.cpp
+++ b/llvm/tools/llvm-exegesis/lib/CodeTemplate.cpp
@@ -65,4 +65,54 @@
   return Result;
 }
 
+bool isEnumValue(ExecutionMode Execution) {
+  return llvm::isPowerOf2_32(static_cast<uint32_t>(Execution));
+}
+
+llvm::StringRef getName(ExecutionMode Bit) {
+  assert(isEnumValue(Bit) && "Bit must be a power of two");
+  switch (Bit) {
+  case ExecutionMode::UNKNOWN:
+    return "UNKNOWN";
+  case ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS:
+    return "ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS";
+  case ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS:
+    return "ALWAYS_SERIAL_TIED_REGS_ALIAS";
+  case ExecutionMode::SERIAL_VIA_MEMORY_INSTR:
+    return "SERIAL_VIA_MEMORY_INSTR";
+  case ExecutionMode::SERIAL_VIA_EXPLICIT_REGS:
+    return "SERIAL_VIA_EXPLICIT_REGS";
+  case ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR:
+    return "SERIAL_VIA_NON_MEMORY_INSTR";
+  case ExecutionMode::ALWAYS_PARALLEL_MISSING_USE_OR_DEF:
+    return "ALWAYS_PARALLEL_MISSING_USE_OR_DEF";
+  case ExecutionMode::PARALLEL_VIA_EXPLICIT_REGS:
+    return "PARALLEL_VIA_EXPLICIT_REGS";
+  }
+  llvm_unreachable("Missing enum case");
+}
+
+static const ExecutionMode kAllExecutionModeBits[] = {
+    ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS,
+    ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS,
+    ExecutionMode::SERIAL_VIA_MEMORY_INSTR,
+    ExecutionMode::SERIAL_VIA_EXPLICIT_REGS,
+    ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR,
+    ExecutionMode::ALWAYS_PARALLEL_MISSING_USE_OR_DEF,
+    ExecutionMode::PARALLEL_VIA_EXPLICIT_REGS,
+};
+
+llvm::ArrayRef<ExecutionMode> getAllExecutionBits() {
+  return kAllExecutionModeBits;
+}
+
+llvm::SmallVector<ExecutionMode, 4>
+getExecutionModeBits(ExecutionMode Execution) {
+  llvm::SmallVector<ExecutionMode, 4> Result;
+  for (const auto Bit : getAllExecutionBits())
+    if ((Execution & Bit) == Bit)
+      Result.push_back(Bit);
+  return Result;
+}
+
 } // namespace exegesis
diff --git a/llvm/tools/llvm-exegesis/lib/CodeTemplate.h b/llvm/tools/llvm-exegesis/lib/CodeTemplate.h
index e5006eb..734992f 100644
--- a/llvm/tools/llvm-exegesis/lib/CodeTemplate.h
+++ b/llvm/tools/llvm-exegesis/lib/CodeTemplate.h
@@ -17,6 +17,7 @@
 #define LLVM_TOOLS_LLVM_EXEGESIS_CODETEMPLATE_H
 
 #include "MCInstrDescView.h"
+#include "llvm/ADT/BitmaskEnum.h"
 
 namespace exegesis {
 
@@ -45,9 +46,65 @@
   llvm::SmallVector<llvm::MCOperand, 4> VariableValues;
 };
 
+enum class ExecutionMode : uint8_t {
+  UNKNOWN = 0U,
+  // The instruction is always serial because implicit Use and Def alias.
+  // e.g. AAA (alias via EFLAGS)
+  ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS = 1u << 0,
+
+  // The instruction is always serial because one Def is tied to a Use.
+  // e.g. AND32ri (alias via tied GR32)
+  ALWAYS_SERIAL_TIED_REGS_ALIAS = 1u << 1,
+
+  // The execution can be made serial by inserting a second instruction that
+  // clobbers/reads memory.
+  // e.g. MOV8rm
+  SERIAL_VIA_MEMORY_INSTR = 1u << 2,
+
+  // The execution can be made serial by picking one Def that aliases with one
+  // Use.
+  // e.g. VXORPSrr XMM1, XMM1, XMM2
+  SERIAL_VIA_EXPLICIT_REGS = 1u << 3,
+
+  // The execution can be made serial by inserting a second instruction that
+  // uses one of the Defs and defs one of the Uses.
+  // e.g.
+  // 1st instruction: MMX_PMOVMSKBrr ECX, MM7
+  // 2nd instruction: MMX_MOVD64rr MM7, ECX
+  //  or instruction: MMX_MOVD64to64rr MM7, ECX
+  //  or instruction: MMX_PINSRWrr MM7, MM7, ECX, 1
+  SERIAL_VIA_NON_MEMORY_INSTR = 1u << 4,
+
+  // The execution is always parallel because the instruction is missing Use or
+  // Def operands.
+  ALWAYS_PARALLEL_MISSING_USE_OR_DEF = 1u << 5,
+
+  // The execution can be made parallel by repeating the same instruction but
+  // making sure that Defs of one instruction do not alias with Uses of the
+  // second one.
+  PARALLEL_VIA_EXPLICIT_REGS = 1u << 6,
+
+  LLVM_MARK_AS_BITMASK_ENUM(/*Largest*/ PARALLEL_VIA_EXPLICIT_REGS)
+};
+
+// Returns whether Execution is one of the values defined in the enum above.
+bool isEnumValue(ExecutionMode Execution);
+
+// Returns a human readable string for the enum.
+llvm::StringRef getName(ExecutionMode Execution);
+
+// Returns a sequence of increasing powers of two corresponding to all the
+// Execution flags.
+llvm::ArrayRef<ExecutionMode> getAllExecutionBits();
+
+// Decomposes Execution into individual set bits.
+llvm::SmallVector<ExecutionMode, 4> getExecutionModeBits(ExecutionMode);
+
+LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
+
 // A CodeTemplate is a set of InstructionTemplates that may not be fully
 // specified (i.e. some variables are not yet set). This allows the
-// BenchmarkRunner to instantiate it many times with specific values to study
+// SnippetGenerator to instantiate it many times with specific values to study
 // their impact on instruction's performance.
 struct CodeTemplate {
   CodeTemplate() = default;
@@ -57,6 +114,7 @@
   CodeTemplate(const CodeTemplate &) = delete;
   CodeTemplate &operator=(const CodeTemplate &) = delete;
 
+  ExecutionMode Execution = ExecutionMode::UNKNOWN;
   // Some information about how this template has been created.
   std::string Info;
   // The list of the instructions for this template.
diff --git a/llvm/tools/llvm-exegesis/lib/Latency.cpp b/llvm/tools/llvm-exegesis/lib/Latency.cpp
index 040b42b..7b991a4 100644
--- a/llvm/tools/llvm-exegesis/lib/Latency.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Latency.cpp
@@ -20,53 +20,148 @@
 
 namespace exegesis {
 
-LatencySnippetGenerator::~LatencySnippetGenerator() = default;
+struct ExecutionClass {
+  ExecutionMode Mask;
+  const char *Description;
+} static const kExecutionClasses[] = {
+    {ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS |
+         ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS,
+     "Repeating a single implicitly serial instruction"},
+    {ExecutionMode::SERIAL_VIA_EXPLICIT_REGS,
+     "Repeating a single explicitly serial instruction"},
+    {ExecutionMode::SERIAL_VIA_MEMORY_INSTR |
+         ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR,
+     "Repeating two instructions"},
+};
 
-llvm::Expected<std::vector<CodeTemplate>>
-generateTwoInstructionPrototypes(const LLVMState &State,
-                                 const Instruction &Instr) {
+static constexpr size_t kMaxAliasingInstructions = 10;
+
+static std::vector<Instruction>
+computeAliasingInstructions(const LLVMState &State, const Instruction &Instr,
+                            size_t MaxAliasingInstructions) {
+  // Randomly iterate the set of instructions.
   std::vector<unsigned> Opcodes;
   Opcodes.resize(State.getInstrInfo().getNumOpcodes());
   std::iota(Opcodes.begin(), Opcodes.end(), 0U);
   std::shuffle(Opcodes.begin(), Opcodes.end(), randomGenerator());
+
+  std::vector<Instruction> AliasingInstructions;
   for (const unsigned OtherOpcode : Opcodes) {
-    if (OtherOpcode == Instr.Description->Opcode)
+    if (OtherOpcode == Instr.Description->getOpcode())
       continue;
     const Instruction OtherInstr(State, OtherOpcode);
     if (OtherInstr.hasMemoryOperands())
       continue;
-    const AliasingConfigurations Forward(Instr, OtherInstr);
-    const AliasingConfigurations Back(OtherInstr, Instr);
-    if (Forward.empty() || Back.empty())
-      continue;
-    InstructionTemplate ThisIT(Instr);
-    InstructionTemplate OtherIT(OtherInstr);
-    if (!Forward.hasImplicitAliasing())
-      setRandomAliasing(Forward, ThisIT, OtherIT);
-    if (!Back.hasImplicitAliasing())
-      setRandomAliasing(Back, OtherIT, ThisIT);
-    CodeTemplate CT;
-    CT.Info = llvm::formatv("creating cycle through {0}.",
-                            State.getInstrInfo().getName(OtherOpcode));
-    CT.Instructions.push_back(std::move(ThisIT));
-    CT.Instructions.push_back(std::move(OtherIT));
-    return getSingleton(CT);
+    if (Instr.hasAliasingRegistersThrough(OtherInstr))
+      AliasingInstructions.push_back(std::move(OtherInstr));
+    if (AliasingInstructions.size() >= MaxAliasingInstructions)
+      break;
   }
-  return llvm::make_error<BenchmarkFailure>(
-      "Infeasible : Didn't find any scheme to make the instruction serial");
+  return AliasingInstructions;
 }
 
+static ExecutionMode getExecutionModes(const Instruction &Instr) {
+  ExecutionMode EM;
+  if (Instr.hasAliasingImplicitRegisters())
+    EM |= ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS;
+  if (Instr.hasTiedRegisters())
+    EM |= ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS;
+  if (Instr.hasMemoryOperands())
+    EM |= ExecutionMode::SERIAL_VIA_MEMORY_INSTR;
+  else {
+    if (Instr.hasAliasingRegisters())
+      EM |= ExecutionMode::SERIAL_VIA_EXPLICIT_REGS;
+    if (Instr.hasOneUseOrOneDef())
+      EM |= ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR;
+  }
+  return EM;
+}
+
+static void appendCodeTemplates(const LLVMState &State,
+                                const Instruction &Instr,
+                                ExecutionMode ExecutionModeBit,
+                                llvm::StringRef ExecutionClassDescription,
+                                std::vector<CodeTemplate> &CodeTemplates) {
+  assert(isEnumValue(ExecutionModeBit) && "Bit must be a power of two");
+  switch (ExecutionModeBit) {
+  case ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS:
+    // Nothing to do, the instruction is always serial.
+    LLVM_FALLTHROUGH;
+  case ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS: {
+    // Picking whatever value for the tied variable will make the instruction
+    // serial.
+    CodeTemplate CT;
+    CT.Execution = ExecutionModeBit;
+    CT.Info = ExecutionClassDescription;
+    CT.Instructions.push_back(Instr);
+    CodeTemplates.push_back(std::move(CT));
+    return;
+  }
+  case ExecutionMode::SERIAL_VIA_MEMORY_INSTR: {
+    // Select back-to-back memory instruction.
+    // TODO: Implement me.
+    return;
+  }
+  case ExecutionMode::SERIAL_VIA_EXPLICIT_REGS: {
+    // Making the execution of this instruction serial by selecting one def
+    // register to alias with one use register.
+    const AliasingConfigurations SelfAliasing(Instr, Instr);
+    assert(!SelfAliasing.empty() && !SelfAliasing.hasImplicitAliasing() &&
+           "Instr must alias itself explicitly");
+    InstructionTemplate IT(Instr);
+    // This is a self aliasing instruction so defs and uses are from the same
+    // instance, hence twice IT in the following call.
+    setRandomAliasing(SelfAliasing, IT, IT);
+    CodeTemplate CT;
+    CT.Execution = ExecutionModeBit;
+    CT.Info = ExecutionClassDescription;
+    CT.Instructions.push_back(std::move(IT));
+    CodeTemplates.push_back(std::move(CT));
+    return;
+  }
+  case ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR: {
+    // Select back-to-back non-memory instruction.
+    for (const auto OtherInstr :
+         computeAliasingInstructions(State, Instr, kMaxAliasingInstructions)) {
+      const AliasingConfigurations Forward(Instr, OtherInstr);
+      const AliasingConfigurations Back(OtherInstr, Instr);
+      InstructionTemplate ThisIT(Instr);
+      InstructionTemplate OtherIT(OtherInstr);
+      if (!Forward.hasImplicitAliasing())
+        setRandomAliasing(Forward, ThisIT, OtherIT);
+      if (!Back.hasImplicitAliasing())
+        setRandomAliasing(Back, OtherIT, ThisIT);
+      CodeTemplate CT;
+      CT.Execution = ExecutionModeBit;
+      CT.Info = ExecutionClassDescription;
+      CT.Instructions.push_back(std::move(ThisIT));
+      CT.Instructions.push_back(std::move(OtherIT));
+      CodeTemplates.push_back(std::move(CT));
+    }
+    return;
+  }
+  default:
+    llvm_unreachable("Unhandled enum value");
+  }
+}
+
+LatencySnippetGenerator::~LatencySnippetGenerator() = default;
+
 llvm::Expected<std::vector<CodeTemplate>>
 LatencySnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
-  if (Instr.hasMemoryOperands())
+  std::vector<CodeTemplate> Results;
+  const ExecutionMode EM = getExecutionModes(Instr);
+  for (const auto EC : kExecutionClasses) {
+    for (const auto ExecutionModeBit : getExecutionModeBits(EM & EC.Mask))
+      appendCodeTemplates(State, Instr, ExecutionModeBit, EC.Description,
+                          Results);
+    if (!Results.empty())
+      break;
+  }
+  if (Results.empty())
     return llvm::make_error<BenchmarkFailure>(
-        "Infeasible : has memory operands");
-  return llvm::handleExpected( //
-      generateSelfAliasingCodeTemplates(Instr),
-      [this, &Instr]() {
-        return generateTwoInstructionPrototypes(State, Instr);
-      },
-      [](const BenchmarkFailure &) { /*Consume Error*/ });
+        "No strategy found to make the execution serial");
+  return std::move(Results);
 }
 
 const char *LatencyBenchmarkRunner::getCounterName() const {
diff --git a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp
index fa93788..59f5652 100644
--- a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp
+++ b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp
@@ -27,7 +27,14 @@
   return TiedOperands[0];
 }
 
-bool Variable::hasTiedOperands() const { return TiedOperands.size() > 1; }
+bool Variable::hasTiedOperands() const {
+  assert(TiedOperands.size() <= 2 &&
+         "No more than two operands can be tied together");
+  // By definition only Use and Def operands can be tied together.
+  // TiedOperands[0] is the Def operand (LLVM stores defs first).
+  // TiedOperands[1] is the Use operand.
+  return TiedOperands.size() > 1;
+}
 
 unsigned Operand::getIndex() const {
   assert(Index >= 0 && "Index must be set");
@@ -197,6 +204,10 @@
   return AllDefRegs.anyCommon(AllUseRegs);
 }
 
+bool Instruction::hasOneUseOrOneDef() const {
+  return AllDefRegs.count() || AllUseRegs.count();
+}
+
 void Instruction::dump(const llvm::MCRegisterInfo &RegInfo,
                        llvm::raw_ostream &Stream) const {
   Stream << "- " << Name << "\n";
@@ -288,8 +299,7 @@
 }
 
 AliasingConfigurations::AliasingConfigurations(
-    const Instruction &DefInstruction, const Instruction &UseInstruction)
-    : DefInstruction(DefInstruction), UseInstruction(UseInstruction) {
+    const Instruction &DefInstruction, const Instruction &UseInstruction) {
   if (UseInstruction.AllUseRegs.anyCommon(DefInstruction.AllDefRegs)) {
     auto CommonRegisters = UseInstruction.AllUseRegs;
     CommonRegisters &= DefInstruction.AllDefRegs;
diff --git a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h
index 6910538..17f3e2b 100644
--- a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h
+++ b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h
@@ -125,6 +125,11 @@
   // reads or write the same memory region.
   bool hasMemoryOperands() const;
 
+  // Returns whether this instruction as at least one use or one def.
+  // Repeating this instruction may execute sequentially by adding an
+  // instruction that aliases one of these.
+  bool hasOneUseOrOneDef() const;
+
   // Convenient function to help with debugging.
   void dump(const llvm::MCRegisterInfo &RegInfo,
             llvm::raw_ostream &Stream) const;
@@ -174,10 +179,7 @@
 
   bool empty() const; // True if no aliasing configuration is found.
   bool hasImplicitAliasing() const;
-  void setExplicitAliasing() const;
 
-  const Instruction &DefInstruction;
-  const Instruction &UseInstruction;
   llvm::SmallVector<AliasingRegisterOperands, 32> Configurations;
 };
 
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp
index feee61d..cdf54a3 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp
@@ -22,7 +22,7 @@
 
 namespace exegesis {
 
-std::vector<CodeTemplate> getSingleton(CodeTemplate &CT) {
+std::vector<CodeTemplate> getSingleton(CodeTemplate &&CT) {
   std::vector<CodeTemplate> Result;
   Result.push_back(std::move(CT));
   return Result;
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h
index e48cf0c..4b307fd 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h
+++ b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.h
@@ -30,7 +30,7 @@
 
 namespace exegesis {
 
-std::vector<CodeTemplate> getSingleton(CodeTemplate &CT);
+std::vector<CodeTemplate> getSingleton(CodeTemplate &&CT);
 
 // Generates code templates that has a self-dependency.
 llvm::Expected<std::vector<CodeTemplate>>
diff --git a/llvm/tools/llvm-exegesis/lib/Uops.cpp b/llvm/tools/llvm-exegesis/lib/Uops.cpp
index a3ada77..d8065ad 100644
--- a/llvm/tools/llvm-exegesis/lib/Uops.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Uops.cpp
@@ -153,13 +153,13 @@
     CT.Info = "instruction is parallel, repeating a random one.";
     CT.Instructions.push_back(std::move(IT));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return getSingleton(CT);
+    return getSingleton(std::move(CT));
   }
   if (SelfAliasing.hasImplicitAliasing()) {
     CT.Info = "instruction is serial, repeating a random one.";
     CT.Instructions.push_back(std::move(IT));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return getSingleton(CT);
+    return getSingleton(std::move(CT));
   }
   const auto TiedVariables = getVariablesWithTiedOperands(Instr);
   if (!TiedVariables.empty()) {
@@ -181,7 +181,7 @@
       CT.Instructions.push_back(std::move(TmpIT));
     }
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return getSingleton(CT);
+    return getSingleton(std::move(CT));
   }
   const auto &ReservedRegisters = State.getRATC().reservedRegisters();
   // No tied variables, we pick random values for defs.
@@ -218,7 +218,7 @@
       "instruction has no tied variables picking Uses different from defs";
   CT.Instructions.push_back(std::move(IT));
   instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-  return getSingleton(CT);
+  return getSingleton(std::move(CT));
 }
 
 std::vector<BenchmarkMeasure>