Blame - llvm/tools/llvm-exegesis/lib/Uops.cpp - toolchain/llvm-project

blob: 59d2fc408fb8d348a99df75a73b898375252f4e1 [file] [log] [blame]

Clement Courbet	ac74acd	2018-04-04 11:37:06 +0000	[diff] [blame]	1	//===-- Uops.cpp ------------------------------------------------- C++ --===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9
				10	#include "Uops.h"
				11	#include "BenchmarkResult.h"
				12	#include "InstructionSnippetGenerator.h"
				13	#include "PerfHelper.h"
				14	#include "llvm/ADT/StringExtras.h"
				15	#include "llvm/MC/MCInstrDesc.h"
				16	#include "llvm/MC/MCSchedule.h"
				17	#include "llvm/Support/Error.h"
				18	#include <algorithm>
				19	#include <random>
				20	#include <unordered_map>
				21	#include <unordered_set>
				22
				23	namespace exegesis {
				24
				25	// FIXME: Handle memory (see PR36906)
				26	static bool isInvalidOperand(const llvm::MCOperandInfo &OpInfo) {
				27	switch (OpInfo.OperandType) {
				28	default:
				29	return true;
				30	case llvm::MCOI::OPERAND_IMMEDIATE:
				31	case llvm::MCOI::OPERAND_REGISTER:
				32	return false;
				33	}
				34	}
				35
				36	static llvm::Error makeError(llvm::Twine Msg) {
				37	return llvm::make_error<llvm::StringError>(Msg,
				38	llvm::inconvertibleErrorCode());
				39	}
				40
Clement Courbet	ac74acd	2018-04-04 11:37:06 +0000	[diff] [blame]	41	static std::vector<llvm::MCInst> generateIndependentAssignments(
				42	const LLVMState &State, const llvm::MCInstrDesc &InstrDesc,
				43	llvm::SmallVector<Variable, 8> Vars, int MaxAssignments) {
				44	std::unordered_set<llvm::MCPhysReg> IsUsedByAnyVar;
				45	for (const Variable &Var : Vars) {
				46	if (Var.IsUse) {
				47	IsUsedByAnyVar.insert(Var.PossibleRegisters.begin(),
				48	Var.PossibleRegisters.end());
				49	}
				50	}
				51
				52	std::vector<llvm::MCInst> Pattern;
				53	for (int A = 0; A < MaxAssignments; ++A) {
				54	// FIXME: This is a bit pessimistic. We should get away with an
				55	// assignment that ensures that the set of assigned registers for uses and
				56	// the set of assigned registers for defs do not intersect (registers
				57	// for uses (resp defs) do not have to be all distinct).
				58	const std::vector<llvm::MCPhysReg> Regs = getExclusiveAssignment(Vars);
				59	if (Regs.empty())
				60	break;
				61	// Remove all assigned registers defs that are used by at least one other
				62	// variable from the list of possible variable registers. This ensures that
				63	// we never create a RAW hazard that would lead to serialization.
				64	for (size_t I = 0, E = Vars.size(); I < E; ++I) {
				65	llvm::MCPhysReg Reg = Regs[I];
				66	if (Vars[I].IsDef && IsUsedByAnyVar.count(Reg)) {
				67	Vars[I].PossibleRegisters.remove(Reg);
				68	}
				69	}
				70	// Create an MCInst and check assembly.
				71	llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
				72	if (!State.canAssemble(Inst))
				73	continue;
				74	Pattern.push_back(std::move(Inst));
				75	}
				76	return Pattern;
				77	}
				78
				79	UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
				80
				81	const char *UopsBenchmarkRunner::getDisplayName() const { return "uops"; }
				82
				83	llvm::Expected<std::vector<llvm::MCInst>> UopsBenchmarkRunner::createCode(
				84	const LLVMState &State, const unsigned OpcodeIndex,
				85	const unsigned NumRepetitions, const JitFunctionContext &Context) const {
				86	const auto &InstrInfo = State.getInstrInfo();
				87	const auto &RegInfo = State.getRegInfo();
				88	const llvm::MCInstrDesc &InstrDesc = InstrInfo.get(OpcodeIndex);
				89	for (const llvm::MCOperandInfo &OpInfo : InstrDesc.operands()) {
				90	if (isInvalidOperand(OpInfo))
				91	return makeError("Only registers and immediates are supported");
				92	}
				93
				94	// FIXME: Load constants into registers (e.g. with fld1) to not break
				95	// instructions like x87.
				96
				97	// Ideally we would like the only limitation on executing uops to be the issue
				98	// ports. Maximizing port pressure increases the likelihood that the load is
				99	// distributed evenly across possible ports.
				100
				101	// To achieve that, one approach is to generate instructions that do not have
				102	// data dependencies between them.
				103	//
				104	// For some instructions, this is trivial:
				105	// mov rax, qword ptr [rsi]
				106	// mov rax, qword ptr [rsi]
				107	// mov rax, qword ptr [rsi]
				108	// mov rax, qword ptr [rsi]
				109	// For the above snippet, haswell just renames rax four times and executes the
				110	// four instructions two at a time on P23 and P0126.
				111	//
				112	// For some instructions, we just need to make sure that the source is
				113	// different from the destination. For example, IDIV8r reads from GPR and
				114	// writes to AX. We just need to ensure that the variable is assigned a
				115	// register which is different from AX:
				116	// idiv bx
				117	// idiv bx
				118	// idiv bx
				119	// idiv bx
				120	// The above snippet will be able to fully saturate the ports, while the same
				121	// with ax would issue one uop every `latency(IDIV8r)` cycles.
				122	//
				123	// Some instructions make this harder because they both read and write from
				124	// the same register:
				125	// inc rax
				126	// inc rax
				127	// inc rax
				128	// inc rax
				129	// This has a data dependency from each instruction to the next, limit the
				130	// number of instructions that can be issued in parallel.
				131	// It turns out that this is not a big issue on recent Intel CPUs because they
				132	// have heuristics to balance port pressure. In the snippet above, subsequent
				133	// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
				134	// might end up executing them all on P0 (just because they can), or try
				135	// avoiding P5 because it's usually under high pressure from vector
				136	// instructions.
				137	// This issue is even more important for high-latency instructions because
				138	// they increase the idle time of the CPU, e.g. :
				139	// imul rax, rbx
				140	// imul rax, rbx
				141	// imul rax, rbx
				142	// imul rax, rbx
				143	//
				144	// To avoid that, we do the renaming statically by generating as many
				145	// independent exclusive assignments as possible (until all possible registers
				146	// are exhausted) e.g.:
				147	// imul rax, rbx
				148	// imul rcx, rbx
				149	// imul rdx, rbx
				150	// imul r8, rbx
				151	//
				152	// Some instruction even make the above static renaming impossible because
				153	// they implicitly read and write from the same operand, e.g. ADC16rr reads
				154	// and writes from EFLAGS.
				155	// In that case we just use a greedy register assignment and hope for the
				156	// best.
				157
				158	const auto Vars = getVariables(RegInfo, InstrDesc, Context.getReservedRegs());
				159
				160	// Generate as many independent exclusive assignments as possible.
				161	constexpr const int MaxStaticRenames = 20;
				162	std::vector<llvm::MCInst> Pattern =
				163	generateIndependentAssignments(State, InstrDesc, Vars, MaxStaticRenames);
				164	if (Pattern.empty()) {
				165	// We don't even have a single exclusive assignment, fallback to a greedy
				166	// assignment.
				167	// FIXME: Tell the user about this decision to help debugging.
				168	const std::vector<llvm::MCPhysReg> Regs = getGreedyAssignment(Vars);
				169	if (!Vars.empty() && Regs.empty())
				170	return makeError("No feasible greedy assignment");
				171	llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
				172	if (!State.canAssemble(Inst))
				173	return makeError("Cannot assemble greedy assignment");
				174	Pattern.push_back(std::move(Inst));
				175	}
				176
				177	// Generate repetitions of the pattern until benchmark_iterations is reached.
				178	std::vector<llvm::MCInst> Result;
				179	Result.reserve(NumRepetitions);
				180	for (unsigned I = 0; I < NumRepetitions; ++I)
				181	Result.push_back(Pattern[I % Pattern.size()]);
				182	return Result;
				183	}
				184
				185	std::vector<BenchmarkMeasure>
				186	UopsBenchmarkRunner::runMeasurements(const LLVMState &State,
				187	const JitFunction &Function,
				188	const unsigned NumRepetitions) const {
				189	const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
				190
				191	std::vector<BenchmarkMeasure> Result;
				192	for (unsigned ProcResIdx = 1;
				193	ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
Clement Courbet	b449379	2018-04-10 08:16:37 +0000	[diff] [blame^]	194	const char *const PfmCounters = SchedModel.getExtraProcessorInfo()
				195	.PfmCounters.IssueCounters[ProcResIdx];
				196	if (!PfmCounters)
Clement Courbet	ac74acd	2018-04-04 11:37:06 +0000	[diff] [blame]	197	continue;
Clement Courbet	b449379	2018-04-10 08:16:37 +0000	[diff] [blame^]	198	// FIXME: Sum results when there are several counters for a single ProcRes
				199	// (e.g. P23 on SandyBridge).
				200	pfm::Counter Counter{pfm::PerfEvent(PfmCounters)};
Clement Courbet	ac74acd	2018-04-04 11:37:06 +0000	[diff] [blame]	201	Counter.start();
				202	Function();
				203	Counter.stop();
				204	Result.push_back({llvm::itostr(ProcResIdx),
				205	static_cast<double>(Counter.read()) / NumRepetitions,
Clement Courbet	b449379	2018-04-10 08:16:37 +0000	[diff] [blame^]	206	SchedModel.getProcResource(ProcResIdx)->Name});
Clement Courbet	ac74acd	2018-04-04 11:37:06 +0000	[diff] [blame]	207	}
				208	return Result;
				209	}
				210
				211	} // namespace exegesis