blob: 59d2fc408fb8d348a99df75a73b898375252f4e1 [file] [log] [blame]
Clement Courbetac74acd2018-04-04 11:37:06 +00001//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "Uops.h"
11#include "BenchmarkResult.h"
12#include "InstructionSnippetGenerator.h"
13#include "PerfHelper.h"
14#include "llvm/ADT/StringExtras.h"
15#include "llvm/MC/MCInstrDesc.h"
16#include "llvm/MC/MCSchedule.h"
17#include "llvm/Support/Error.h"
18#include <algorithm>
19#include <random>
20#include <unordered_map>
21#include <unordered_set>
22
23namespace exegesis {
24
25// FIXME: Handle memory (see PR36906)
26static bool isInvalidOperand(const llvm::MCOperandInfo &OpInfo) {
27 switch (OpInfo.OperandType) {
28 default:
29 return true;
30 case llvm::MCOI::OPERAND_IMMEDIATE:
31 case llvm::MCOI::OPERAND_REGISTER:
32 return false;
33 }
34}
35
36static llvm::Error makeError(llvm::Twine Msg) {
37 return llvm::make_error<llvm::StringError>(Msg,
38 llvm::inconvertibleErrorCode());
39}
40
Clement Courbetac74acd2018-04-04 11:37:06 +000041static std::vector<llvm::MCInst> generateIndependentAssignments(
42 const LLVMState &State, const llvm::MCInstrDesc &InstrDesc,
43 llvm::SmallVector<Variable, 8> Vars, int MaxAssignments) {
44 std::unordered_set<llvm::MCPhysReg> IsUsedByAnyVar;
45 for (const Variable &Var : Vars) {
46 if (Var.IsUse) {
47 IsUsedByAnyVar.insert(Var.PossibleRegisters.begin(),
48 Var.PossibleRegisters.end());
49 }
50 }
51
52 std::vector<llvm::MCInst> Pattern;
53 for (int A = 0; A < MaxAssignments; ++A) {
54 // FIXME: This is a bit pessimistic. We should get away with an
55 // assignment that ensures that the set of assigned registers for uses and
56 // the set of assigned registers for defs do not intersect (registers
57 // for uses (resp defs) do not have to be all distinct).
58 const std::vector<llvm::MCPhysReg> Regs = getExclusiveAssignment(Vars);
59 if (Regs.empty())
60 break;
61 // Remove all assigned registers defs that are used by at least one other
62 // variable from the list of possible variable registers. This ensures that
63 // we never create a RAW hazard that would lead to serialization.
64 for (size_t I = 0, E = Vars.size(); I < E; ++I) {
65 llvm::MCPhysReg Reg = Regs[I];
66 if (Vars[I].IsDef && IsUsedByAnyVar.count(Reg)) {
67 Vars[I].PossibleRegisters.remove(Reg);
68 }
69 }
70 // Create an MCInst and check assembly.
71 llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
72 if (!State.canAssemble(Inst))
73 continue;
74 Pattern.push_back(std::move(Inst));
75 }
76 return Pattern;
77}
78
79UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
80
81const char *UopsBenchmarkRunner::getDisplayName() const { return "uops"; }
82
83llvm::Expected<std::vector<llvm::MCInst>> UopsBenchmarkRunner::createCode(
84 const LLVMState &State, const unsigned OpcodeIndex,
85 const unsigned NumRepetitions, const JitFunctionContext &Context) const {
86 const auto &InstrInfo = State.getInstrInfo();
87 const auto &RegInfo = State.getRegInfo();
88 const llvm::MCInstrDesc &InstrDesc = InstrInfo.get(OpcodeIndex);
89 for (const llvm::MCOperandInfo &OpInfo : InstrDesc.operands()) {
90 if (isInvalidOperand(OpInfo))
91 return makeError("Only registers and immediates are supported");
92 }
93
94 // FIXME: Load constants into registers (e.g. with fld1) to not break
95 // instructions like x87.
96
97 // Ideally we would like the only limitation on executing uops to be the issue
98 // ports. Maximizing port pressure increases the likelihood that the load is
99 // distributed evenly across possible ports.
100
101 // To achieve that, one approach is to generate instructions that do not have
102 // data dependencies between them.
103 //
104 // For some instructions, this is trivial:
105 // mov rax, qword ptr [rsi]
106 // mov rax, qword ptr [rsi]
107 // mov rax, qword ptr [rsi]
108 // mov rax, qword ptr [rsi]
109 // For the above snippet, haswell just renames rax four times and executes the
110 // four instructions two at a time on P23 and P0126.
111 //
112 // For some instructions, we just need to make sure that the source is
113 // different from the destination. For example, IDIV8r reads from GPR and
114 // writes to AX. We just need to ensure that the variable is assigned a
115 // register which is different from AX:
116 // idiv bx
117 // idiv bx
118 // idiv bx
119 // idiv bx
120 // The above snippet will be able to fully saturate the ports, while the same
121 // with ax would issue one uop every `latency(IDIV8r)` cycles.
122 //
123 // Some instructions make this harder because they both read and write from
124 // the same register:
125 // inc rax
126 // inc rax
127 // inc rax
128 // inc rax
129 // This has a data dependency from each instruction to the next, limit the
130 // number of instructions that can be issued in parallel.
131 // It turns out that this is not a big issue on recent Intel CPUs because they
132 // have heuristics to balance port pressure. In the snippet above, subsequent
133 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
134 // might end up executing them all on P0 (just because they can), or try
135 // avoiding P5 because it's usually under high pressure from vector
136 // instructions.
137 // This issue is even more important for high-latency instructions because
138 // they increase the idle time of the CPU, e.g. :
139 // imul rax, rbx
140 // imul rax, rbx
141 // imul rax, rbx
142 // imul rax, rbx
143 //
144 // To avoid that, we do the renaming statically by generating as many
145 // independent exclusive assignments as possible (until all possible registers
146 // are exhausted) e.g.:
147 // imul rax, rbx
148 // imul rcx, rbx
149 // imul rdx, rbx
150 // imul r8, rbx
151 //
152 // Some instruction even make the above static renaming impossible because
153 // they implicitly read and write from the same operand, e.g. ADC16rr reads
154 // and writes from EFLAGS.
155 // In that case we just use a greedy register assignment and hope for the
156 // best.
157
158 const auto Vars = getVariables(RegInfo, InstrDesc, Context.getReservedRegs());
159
160 // Generate as many independent exclusive assignments as possible.
161 constexpr const int MaxStaticRenames = 20;
162 std::vector<llvm::MCInst> Pattern =
163 generateIndependentAssignments(State, InstrDesc, Vars, MaxStaticRenames);
164 if (Pattern.empty()) {
165 // We don't even have a single exclusive assignment, fallback to a greedy
166 // assignment.
167 // FIXME: Tell the user about this decision to help debugging.
168 const std::vector<llvm::MCPhysReg> Regs = getGreedyAssignment(Vars);
169 if (!Vars.empty() && Regs.empty())
170 return makeError("No feasible greedy assignment");
171 llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
172 if (!State.canAssemble(Inst))
173 return makeError("Cannot assemble greedy assignment");
174 Pattern.push_back(std::move(Inst));
175 }
176
177 // Generate repetitions of the pattern until benchmark_iterations is reached.
178 std::vector<llvm::MCInst> Result;
179 Result.reserve(NumRepetitions);
180 for (unsigned I = 0; I < NumRepetitions; ++I)
181 Result.push_back(Pattern[I % Pattern.size()]);
182 return Result;
183}
184
185std::vector<BenchmarkMeasure>
186UopsBenchmarkRunner::runMeasurements(const LLVMState &State,
187 const JitFunction &Function,
188 const unsigned NumRepetitions) const {
189 const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
190
191 std::vector<BenchmarkMeasure> Result;
192 for (unsigned ProcResIdx = 1;
193 ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
Clement Courbetb4493792018-04-10 08:16:37 +0000194 const char *const PfmCounters = SchedModel.getExtraProcessorInfo()
195 .PfmCounters.IssueCounters[ProcResIdx];
196 if (!PfmCounters)
Clement Courbetac74acd2018-04-04 11:37:06 +0000197 continue;
Clement Courbetb4493792018-04-10 08:16:37 +0000198 // FIXME: Sum results when there are several counters for a single ProcRes
199 // (e.g. P23 on SandyBridge).
200 pfm::Counter Counter{pfm::PerfEvent(PfmCounters)};
Clement Courbetac74acd2018-04-04 11:37:06 +0000201 Counter.start();
202 Function();
203 Counter.stop();
204 Result.push_back({llvm::itostr(ProcResIdx),
205 static_cast<double>(Counter.read()) / NumRepetitions,
Clement Courbetb4493792018-04-10 08:16:37 +0000206 SchedModel.getProcResource(ProcResIdx)->Name});
Clement Courbetac74acd2018-04-04 11:37:06 +0000207 }
208 return Result;
209}
210
211} // namespace exegesis