blob: bcfc5572a1f6f2e5b54ee50718114fcfbc125c15 [file] [log] [blame]
Clement Courbetac74acd2018-04-04 11:37:06 +00001//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "Uops.h"
11#include "BenchmarkResult.h"
12#include "InstructionSnippetGenerator.h"
13#include "PerfHelper.h"
14#include "llvm/ADT/StringExtras.h"
15#include "llvm/MC/MCInstrDesc.h"
16#include "llvm/MC/MCSchedule.h"
17#include "llvm/Support/Error.h"
18#include <algorithm>
19#include <random>
20#include <unordered_map>
21#include <unordered_set>
22
23namespace exegesis {
24
25// FIXME: Handle memory (see PR36906)
26static bool isInvalidOperand(const llvm::MCOperandInfo &OpInfo) {
27 switch (OpInfo.OperandType) {
28 default:
29 return true;
30 case llvm::MCOI::OPERAND_IMMEDIATE:
31 case llvm::MCOI::OPERAND_REGISTER:
32 return false;
33 }
34}
35
36static llvm::Error makeError(llvm::Twine Msg) {
37 return llvm::make_error<llvm::StringError>(Msg,
38 llvm::inconvertibleErrorCode());
39}
40
41// FIXME: Read the counter names from the ProcResourceUnits when PR36984 is
42// fixed.
43static const std::string *getEventNameFromProcResName(const char *ProcResName) {
44 static const std::unordered_map<std::string, std::string> Entries = {
45 {"SBPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
46 {"SBPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
47 {"SBPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
48 {"SBPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
49 {"HWPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
50 {"HWPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
51 {"HWPort2", "UOPS_DISPATCHED_PORT:PORT_2"},
52 {"HWPort3", "UOPS_DISPATCHED_PORT:PORT_3"},
53 {"HWPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
54 {"HWPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
55 {"HWPort6", "UOPS_DISPATCHED_PORT:PORT_6"},
56 {"HWPort7", "UOPS_DISPATCHED_PORT:PORT_7"},
57 {"SKLPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
58 {"SKLPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
59 {"SKLPort2", "UOPS_DISPATCHED_PORT:PORT_2"},
60 {"SKLPort3", "UOPS_DISPATCHED_PORT:PORT_3"},
61 {"SKLPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
62 {"SKLPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
63 {"SKLPort6", "UOPS_DISPATCHED_PORT:PORT_6"},
64 {"SKXPort7", "UOPS_DISPATCHED_PORT:PORT_7"},
65 {"SKXPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
66 {"SKXPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
67 {"SKXPort2", "UOPS_DISPATCHED_PORT:PORT_2"},
68 {"SKXPort3", "UOPS_DISPATCHED_PORT:PORT_3"},
69 {"SKXPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
70 {"SKXPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
71 {"SKXPort6", "UOPS_DISPATCHED_PORT:PORT_6"},
72 {"SKXPort7", "UOPS_DISPATCHED_PORT:PORT_7"},
73 };
74 const auto It = Entries.find(ProcResName);
75 return It == Entries.end() ? nullptr : &It->second;
76}
77
78static std::vector<llvm::MCInst> generateIndependentAssignments(
79 const LLVMState &State, const llvm::MCInstrDesc &InstrDesc,
80 llvm::SmallVector<Variable, 8> Vars, int MaxAssignments) {
81 std::unordered_set<llvm::MCPhysReg> IsUsedByAnyVar;
82 for (const Variable &Var : Vars) {
83 if (Var.IsUse) {
84 IsUsedByAnyVar.insert(Var.PossibleRegisters.begin(),
85 Var.PossibleRegisters.end());
86 }
87 }
88
89 std::vector<llvm::MCInst> Pattern;
90 for (int A = 0; A < MaxAssignments; ++A) {
91 // FIXME: This is a bit pessimistic. We should get away with an
92 // assignment that ensures that the set of assigned registers for uses and
93 // the set of assigned registers for defs do not intersect (registers
94 // for uses (resp defs) do not have to be all distinct).
95 const std::vector<llvm::MCPhysReg> Regs = getExclusiveAssignment(Vars);
96 if (Regs.empty())
97 break;
98 // Remove all assigned registers defs that are used by at least one other
99 // variable from the list of possible variable registers. This ensures that
100 // we never create a RAW hazard that would lead to serialization.
101 for (size_t I = 0, E = Vars.size(); I < E; ++I) {
102 llvm::MCPhysReg Reg = Regs[I];
103 if (Vars[I].IsDef && IsUsedByAnyVar.count(Reg)) {
104 Vars[I].PossibleRegisters.remove(Reg);
105 }
106 }
107 // Create an MCInst and check assembly.
108 llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
109 if (!State.canAssemble(Inst))
110 continue;
111 Pattern.push_back(std::move(Inst));
112 }
113 return Pattern;
114}
115
116UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
117
118const char *UopsBenchmarkRunner::getDisplayName() const { return "uops"; }
119
120llvm::Expected<std::vector<llvm::MCInst>> UopsBenchmarkRunner::createCode(
121 const LLVMState &State, const unsigned OpcodeIndex,
122 const unsigned NumRepetitions, const JitFunctionContext &Context) const {
123 const auto &InstrInfo = State.getInstrInfo();
124 const auto &RegInfo = State.getRegInfo();
125 const llvm::MCInstrDesc &InstrDesc = InstrInfo.get(OpcodeIndex);
126 for (const llvm::MCOperandInfo &OpInfo : InstrDesc.operands()) {
127 if (isInvalidOperand(OpInfo))
128 return makeError("Only registers and immediates are supported");
129 }
130
131 // FIXME: Load constants into registers (e.g. with fld1) to not break
132 // instructions like x87.
133
134 // Ideally we would like the only limitation on executing uops to be the issue
135 // ports. Maximizing port pressure increases the likelihood that the load is
136 // distributed evenly across possible ports.
137
138 // To achieve that, one approach is to generate instructions that do not have
139 // data dependencies between them.
140 //
141 // For some instructions, this is trivial:
142 // mov rax, qword ptr [rsi]
143 // mov rax, qword ptr [rsi]
144 // mov rax, qword ptr [rsi]
145 // mov rax, qword ptr [rsi]
146 // For the above snippet, haswell just renames rax four times and executes the
147 // four instructions two at a time on P23 and P0126.
148 //
149 // For some instructions, we just need to make sure that the source is
150 // different from the destination. For example, IDIV8r reads from GPR and
151 // writes to AX. We just need to ensure that the variable is assigned a
152 // register which is different from AX:
153 // idiv bx
154 // idiv bx
155 // idiv bx
156 // idiv bx
157 // The above snippet will be able to fully saturate the ports, while the same
158 // with ax would issue one uop every `latency(IDIV8r)` cycles.
159 //
160 // Some instructions make this harder because they both read and write from
161 // the same register:
162 // inc rax
163 // inc rax
164 // inc rax
165 // inc rax
166 // This has a data dependency from each instruction to the next, limit the
167 // number of instructions that can be issued in parallel.
168 // It turns out that this is not a big issue on recent Intel CPUs because they
169 // have heuristics to balance port pressure. In the snippet above, subsequent
170 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
171 // might end up executing them all on P0 (just because they can), or try
172 // avoiding P5 because it's usually under high pressure from vector
173 // instructions.
174 // This issue is even more important for high-latency instructions because
175 // they increase the idle time of the CPU, e.g. :
176 // imul rax, rbx
177 // imul rax, rbx
178 // imul rax, rbx
179 // imul rax, rbx
180 //
181 // To avoid that, we do the renaming statically by generating as many
182 // independent exclusive assignments as possible (until all possible registers
183 // are exhausted) e.g.:
184 // imul rax, rbx
185 // imul rcx, rbx
186 // imul rdx, rbx
187 // imul r8, rbx
188 //
189 // Some instruction even make the above static renaming impossible because
190 // they implicitly read and write from the same operand, e.g. ADC16rr reads
191 // and writes from EFLAGS.
192 // In that case we just use a greedy register assignment and hope for the
193 // best.
194
195 const auto Vars = getVariables(RegInfo, InstrDesc, Context.getReservedRegs());
196
197 // Generate as many independent exclusive assignments as possible.
198 constexpr const int MaxStaticRenames = 20;
199 std::vector<llvm::MCInst> Pattern =
200 generateIndependentAssignments(State, InstrDesc, Vars, MaxStaticRenames);
201 if (Pattern.empty()) {
202 // We don't even have a single exclusive assignment, fallback to a greedy
203 // assignment.
204 // FIXME: Tell the user about this decision to help debugging.
205 const std::vector<llvm::MCPhysReg> Regs = getGreedyAssignment(Vars);
206 if (!Vars.empty() && Regs.empty())
207 return makeError("No feasible greedy assignment");
208 llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
209 if (!State.canAssemble(Inst))
210 return makeError("Cannot assemble greedy assignment");
211 Pattern.push_back(std::move(Inst));
212 }
213
214 // Generate repetitions of the pattern until benchmark_iterations is reached.
215 std::vector<llvm::MCInst> Result;
216 Result.reserve(NumRepetitions);
217 for (unsigned I = 0; I < NumRepetitions; ++I)
218 Result.push_back(Pattern[I % Pattern.size()]);
219 return Result;
220}
221
222std::vector<BenchmarkMeasure>
223UopsBenchmarkRunner::runMeasurements(const LLVMState &State,
224 const JitFunction &Function,
225 const unsigned NumRepetitions) const {
226 const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
227
228 std::vector<BenchmarkMeasure> Result;
229 for (unsigned ProcResIdx = 1;
230 ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
231 const llvm::MCProcResourceDesc &ProcRes =
232 *SchedModel.getProcResource(ProcResIdx);
233 const std::string *const EventName =
234 getEventNameFromProcResName(ProcRes.Name);
235 if (!EventName)
236 continue;
237 pfm::Counter Counter{pfm::PerfEvent(*EventName)};
238 Counter.start();
239 Function();
240 Counter.stop();
241 Result.push_back({llvm::itostr(ProcResIdx),
242 static_cast<double>(Counter.read()) / NumRepetitions,
243 ProcRes.Name});
244 }
245 return Result;
246}
247
248} // namespace exegesis