//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "Uops.h"
#include "BenchmarkResult.h"
#include "InstructionSnippetGenerator.h"
#include "PerfHelper.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/Error.h"
#include <algorithm>
#include <random>
#include <unordered_map>
#include <unordered_set>

namespace exegesis {

// FIXME: Handle memory (see PR36906)
static bool isInvalidOperand(const llvm::MCOperandInfo &OpInfo) {
  switch (OpInfo.OperandType) {
  default:
    return true;
  case llvm::MCOI::OPERAND_IMMEDIATE:
  case llvm::MCOI::OPERAND_REGISTER:
    return false;
  }
}

static llvm::Error makeError(llvm::Twine Msg) {
  return llvm::make_error<llvm::StringError>(Msg,
                                             llvm::inconvertibleErrorCode());
}

// FIXME: Read the counter names from the ProcResourceUnits when PR36984 is
// fixed.
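// Maps the port names of the Sandy Bridge (SB), Haswell (HW), Skylake client
// (SKL) and Skylake server (SKX) scheduling models to the corresponding libpfm
// counter names. Returns nullptr for processor resources without a known
// counter.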
static const std::string *getEventNameFromProcResName(const char *ProcResName) {
  static const std::unordered_map<std::string, std::string> Entries = {
      {"SBPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
      {"SBPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
      {"SBPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
      {"SBPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
      {"HWPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
      {"HWPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
      {"HWPort2", "UOPS_DISPATCHED_PORT:PORT_2"},
      {"HWPort3", "UOPS_DISPATCHED_PORT:PORT_3"},
      {"HWPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
      {"HWPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
      {"HWPort6", "UOPS_DISPATCHED_PORT:PORT_6"},
      {"HWPort7", "UOPS_DISPATCHED_PORT:PORT_7"},
      {"SKLPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
      {"SKLPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
      {"SKLPort2", "UOPS_DISPATCHED_PORT:PORT_2"},
      {"SKLPort3", "UOPS_DISPATCHED_PORT:PORT_3"},
      {"SKLPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
      {"SKLPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
      {"SKLPort6", "UOPS_DISPATCHED_PORT:PORT_6"},
      {"SKLPort7", "UOPS_DISPATCHED_PORT:PORT_7"},
      {"SKXPort0", "UOPS_DISPATCHED_PORT:PORT_0"},
      {"SKXPort1", "UOPS_DISPATCHED_PORT:PORT_1"},
      {"SKXPort2", "UOPS_DISPATCHED_PORT:PORT_2"},
      {"SKXPort3", "UOPS_DISPATCHED_PORT:PORT_3"},
      {"SKXPort4", "UOPS_DISPATCHED_PORT:PORT_4"},
      {"SKXPort5", "UOPS_DISPATCHED_PORT:PORT_5"},
      {"SKXPort6", "UOPS_DISPATCHED_PORT:PORT_6"},
      {"SKXPort7", "UOPS_DISPATCHED_PORT:PORT_7"},
  };
  const auto It = Entries.find(ProcResName);
  return It == Entries.end() ? nullptr : &It->second;
}

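// Generates up to MaxAssignments variants of the instruction whose variables
// are assigned mutually exclusive registers, taking care not to introduce RAW
// hazards between the generated instructions.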
static std::vector<llvm::MCInst> generateIndependentAssignments(
    const LLVMState &State, const llvm::MCInstrDesc &InstrDesc,
    llvm::SmallVector<Variable, 8> Vars, int MaxAssignments) {
  std::unordered_set<llvm::MCPhysReg> IsUsedByAnyVar;
  for (const Variable &Var : Vars) {
    if (Var.IsUse) {
      IsUsedByAnyVar.insert(Var.PossibleRegisters.begin(),
                            Var.PossibleRegisters.end());
    }
  }

  std::vector<llvm::MCInst> Pattern;
  for (int A = 0; A < MaxAssignments; ++A) {
    // FIXME: This is a bit pessimistic. We should get away with an
    // assignment that ensures that the set of assigned registers for uses and
    // the set of assigned registers for defs do not intersect (registers
    // for uses (resp defs) do not have to be all distinct).
    const std::vector<llvm::MCPhysReg> Regs = getExclusiveAssignment(Vars);
    if (Regs.empty())
      break;
    // If a def was assigned a register that may also be used by some use
    // variable, remove that register from the def's list of possible
    // registers. This ensures that we never create a RAW hazard that would
    // lead to serialization.
    for (size_t I = 0, E = Vars.size(); I < E; ++I) {
      llvm::MCPhysReg Reg = Regs[I];
      if (Vars[I].IsDef && IsUsedByAnyVar.count(Reg)) {
        Vars[I].PossibleRegisters.remove(Reg);
      }
    }
    // Create an MCInst and check assembly.
    llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
    if (!State.canAssemble(Inst))
      continue;
    Pattern.push_back(std::move(Inst));
  }
  return Pattern;
}

UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;

const char *UopsBenchmarkRunner::getDisplayName() const { return "uops"; }

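// Creates the code to benchmark: a pattern of instruction instances with as
// few data dependencies as possible (see the discussion below), repeated until
// NumRepetitions instructions have been emitted.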
llvm::Expected<std::vector<llvm::MCInst>> UopsBenchmarkRunner::createCode(
    const LLVMState &State, const unsigned OpcodeIndex,
    const unsigned NumRepetitions, const JitFunctionContext &Context) const {
  const auto &InstrInfo = State.getInstrInfo();
  const auto &RegInfo = State.getRegInfo();
  const llvm::MCInstrDesc &InstrDesc = InstrInfo.get(OpcodeIndex);
  for (const llvm::MCOperandInfo &OpInfo : InstrDesc.operands()) {
    if (isInvalidOperand(OpInfo))
      return makeError("Only registers and immediates are supported");
  }

  // FIXME: Load constants into registers (e.g. with fld1) so as not to break
  // instructions such as the x87 ones.

  // Ideally we would like the only limitation on executing uops to be the issue
  // ports. Maximizing port pressure increases the likelihood that the load is
  // distributed evenly across possible ports.

  // To achieve that, one approach is to generate instructions that do not have
  // data dependencies between them.
  //
  // For some instructions, this is trivial:
  //    mov rax, qword ptr [rsi]
  //    mov rax, qword ptr [rsi]
  //    mov rax, qword ptr [rsi]
  //    mov rax, qword ptr [rsi]
  // For the above snippet, Haswell just renames rax four times and executes the
  // four instructions two at a time on P23 and P0126.
  //
  // For some instructions, we just need to make sure that the source is
  // different from the destination. For example, IDIV8r reads from GPR and
  // writes to AX. We just need to ensure that the variable is assigned a
  // register which is different from AX:
  //    idiv bx
  //    idiv bx
  //    idiv bx
  //    idiv bx
  // The above snippet will be able to fully saturate the ports, while the same
  // with ax would issue one uop every `latency(IDIV8r)` cycles.
  //
  // Some instructions make this harder because they both read from and write to
  // the same register:
  //    inc rax
  //    inc rax
  //    inc rax
  //    inc rax
  // This creates a data dependency from each instruction to the next, limiting
  // the number of instructions that can be issued in parallel.
  // It turns out that this is not a big issue on recent Intel CPUs because they
  // have heuristics to balance port pressure. In the snippet above, subsequent
  // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
  // might end up executing them all on P0 (just because they can), or try
  // avoiding P5 because it's usually under high pressure from vector
  // instructions.
  // This issue is even more important for high-latency instructions because
  // they increase the idle time of the CPU, e.g.:
  //    imul rax, rbx
  //    imul rax, rbx
  //    imul rax, rbx
  //    imul rax, rbx
  //
  // To avoid that, we do the renaming statically by generating as many
  // independent exclusive assignments as possible (until all possible registers
  // are exhausted), e.g.:
  //    imul rax, rbx
  //    imul rcx, rbx
  //    imul rdx, rbx
  //    imul r8, rbx
  //
  // Some instructions even make the above static renaming impossible because
  // they implicitly read from and write to the same operand, e.g. ADC16rr reads
  // from and writes to EFLAGS.
  // In that case we just use a greedy register assignment and hope for the
  // best.

  const auto Vars = getVariables(RegInfo, InstrDesc, Context.getReservedRegs());

  // Generate as many independent exclusive assignments as possible.
  constexpr const int MaxStaticRenames = 20;
  std::vector<llvm::MCInst> Pattern =
      generateIndependentAssignments(State, InstrDesc, Vars, MaxStaticRenames);
  if (Pattern.empty()) {
    // We don't even have a single exclusive assignment; fall back to a greedy
    // assignment.
    // FIXME: Tell the user about this decision to help debugging.
    const std::vector<llvm::MCPhysReg> Regs = getGreedyAssignment(Vars);
    if (!Vars.empty() && Regs.empty())
      return makeError("No feasible greedy assignment");
    llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
    if (!State.canAssemble(Inst))
      return makeError("Cannot assemble greedy assignment");
    Pattern.push_back(std::move(Inst));
  }

  // Generate repetitions of the pattern until benchmark_iterations is reached.
  std::vector<llvm::MCInst> Result;
  Result.reserve(NumRepetitions);
  for (unsigned I = 0; I < NumRepetitions; ++I)
    Result.push_back(Pattern[I % Pattern.size()]);
  return Result;
}

std::vector<BenchmarkMeasure>
UopsBenchmarkRunner::runMeasurements(const LLVMState &State,
                                     const JitFunction &Function,
                                     const unsigned NumRepetitions) const {
  const auto &SchedModel = State.getSubtargetInfo().getSchedModel();

  std::vector<BenchmarkMeasure> Result;
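  // Walk the processor resources of the scheduling model (starting at index 1,
  // since index 0 does not name a real resource) and measure the number of
  // uops dispatched to each port that maps to a known perf counter.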
  for (unsigned ProcResIdx = 1;
       ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
    const llvm::MCProcResourceDesc &ProcRes =
        *SchedModel.getProcResource(ProcResIdx);
    const std::string *const EventName =
        getEventNameFromProcResName(ProcRes.Name);
    if (!EventName)
      continue;
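    // Count the port's dispatched uops over one run of the benchmarked
    // function and normalize by the number of repetitions it contains.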
    pfm::Counter Counter{pfm::PerfEvent(*EventName)};
    Counter.start();
    Function();
    Counter.stop();
    Result.push_back({llvm::itostr(ProcResIdx),
                      static_cast<double>(Counter.read()) / NumRepetitions,
                      ProcRes.Name});
  }
  return Result;
}

} // namespace exegesis