Blame - llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp - toolchain/llvm-project

blob: a4e9ead00709eeb998cc9f418667fb0713924514 [file] [log] [blame]

Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	1	//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	/// \file
				10	///
				11	/// Provide a pass which mitigates speculative execution attacks which operate
				12	/// by speculating incorrectly past some predicate (a type check, bounds check,
				13	/// or other condition) to reach a load with invalid inputs and leak the data
				14	/// accessed by that load using a side channel out of the speculative domain.
				15	///
				16	/// For details on the attacks, see the first variant in both the Project Zero
				17	/// writeup and the Spectre paper:
				18	/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
				19	/// https://spectreattack.com/spectre.pdf
				20	///
				21	//===----------------------------------------------------------------------===//
				22
				23	#include "X86.h"
				24	#include "X86InstrBuilder.h"
				25	#include "X86InstrInfo.h"
				26	#include "X86Subtarget.h"
				27	#include "llvm/ADT/ArrayRef.h"
				28	#include "llvm/ADT/DenseMap.h"
				29	#include "llvm/ADT/STLExtras.h"
				30	#include "llvm/ADT/ScopeExit.h"
				31	#include "llvm/ADT/SmallPtrSet.h"
				32	#include "llvm/ADT/SmallSet.h"
				33	#include "llvm/ADT/SmallVector.h"
				34	#include "llvm/ADT/SparseBitVector.h"
				35	#include "llvm/ADT/Statistic.h"
				36	#include "llvm/CodeGen/MachineBasicBlock.h"
				37	#include "llvm/CodeGen/MachineConstantPool.h"
				38	#include "llvm/CodeGen/MachineFunction.h"
				39	#include "llvm/CodeGen/MachineFunctionPass.h"
				40	#include "llvm/CodeGen/MachineInstr.h"
				41	#include "llvm/CodeGen/MachineInstrBuilder.h"
				42	#include "llvm/CodeGen/MachineModuleInfo.h"
				43	#include "llvm/CodeGen/MachineOperand.h"
				44	#include "llvm/CodeGen/MachineRegisterInfo.h"
				45	#include "llvm/CodeGen/MachineSSAUpdater.h"
				46	#include "llvm/CodeGen/TargetInstrInfo.h"
				47	#include "llvm/CodeGen/TargetRegisterInfo.h"
				48	#include "llvm/CodeGen/TargetSchedule.h"
				49	#include "llvm/CodeGen/TargetSubtargetInfo.h"
				50	#include "llvm/IR/DebugLoc.h"
				51	#include "llvm/MC/MCSchedule.h"
				52	#include "llvm/Pass.h"
				53	#include "llvm/Support/CommandLine.h"
				54	#include "llvm/Support/Debug.h"
				55	#include "llvm/Support/raw_ostream.h"
				56	#include <algorithm>
				57	#include <cassert>
				58	#include <iterator>
				59	#include <utility>
				60
				61	using namespace llvm;
				62
				63	#define PASS_KEY "x86-speculative-load-hardening"
				64	#define DEBUG_TYPE PASS_KEY
				65
				66	STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
				67	STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
				68	STATISTIC(NumAddrRegsHardened,
				69	"Number of address mode used registers hardaned");
				70	STATISTIC(NumPostLoadRegsHardened,
				71	"Number of post-load register values hardened");
				72	STATISTIC(NumInstsInserted, "Number of instructions inserted");
				73	STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
				74
				75	static cl::opt<bool> HardenEdgesWithLFENCE(
				76	PASS_KEY "-lfence",
				77	cl::desc(
				78	"Use LFENCE along each conditional edge to harden against speculative "
				79	"loads rather than conditional movs and poisoned pointers."),
				80	cl::init(false), cl::Hidden);
				81
				82	static cl::opt<bool> EnablePostLoadHardening(
				83	PASS_KEY "-post-load",
				84	cl::desc("Harden the value loaded after it is loaded by "
				85	"flushing the loaded bits to 1. This is hard to do "
				86	"in general but can be done easily for GPRs."),
				87	cl::init(true), cl::Hidden);
				88
				89	static cl::opt<bool> FenceCallAndRet(
				90	PASS_KEY "-fence-call-and-ret",
				91	cl::desc("Use a full speculation fence to harden both call and ret edges "
				92	"rather than a lighter weight mitigation."),
				93	cl::init(false), cl::Hidden);
				94
				95	static cl::opt<bool> HardenInterprocedurally(
				96	PASS_KEY "-ip",
				97	cl::desc("Harden interprocedurally by passing our state in and out of "
				98	"functions in the high bits of the stack pointer."),
				99	cl::init(true), cl::Hidden);
				100
				101	static cl::opt<bool>
				102	HardenLoads(PASS_KEY "-loads",
				103	cl::desc("Sanitize loads from memory. When disable, no "
				104	"significant security is provided."),
				105	cl::init(true), cl::Hidden);
				106
				107	namespace llvm {
				108
				109	void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
				110
				111	} // end namespace llvm
				112
				113	namespace {
				114
				115	class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
				116	public:
				117	X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
				118	initializeX86SpeculativeLoadHardeningPassPass(
				119	*PassRegistry::getPassRegistry());
				120	}
				121
				122	StringRef getPassName() const override {
				123	return "X86 speculative load hardening";
				124	}
				125	bool runOnMachineFunction(MachineFunction &MF) override;
				126	void getAnalysisUsage(AnalysisUsage &AU) const override;
				127
				128	/// Pass identification, replacement for typeid.
				129	static char ID;
				130
				131	private:
				132	/// The information about a block's conditional terminators needed to trace
				133	/// our predicate state through the exiting edges.
				134	struct BlockCondInfo {
				135	MachineBasicBlock *MBB;
				136
				137	// We mostly have one conditional branch, and in extremely rare cases have
				138	// two. Three and more are so rare as to be unimportant for compile time.
				139	SmallVector<MachineInstr *, 2> CondBrs;
				140
				141	MachineInstr *UncondBr;
				142	};
				143
				144	const X86Subtarget *Subtarget;
				145	MachineRegisterInfo *MRI;
				146	const X86InstrInfo *TII;
				147	const TargetRegisterInfo *TRI;
				148	const TargetRegisterClass *PredStateRC;
				149
				150	void hardenEdgesWithLFENCE(MachineFunction &MF);
				151
				152	SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
				153
				154	void checkAllLoads(MachineFunction &MF, MachineSSAUpdater &PredStateSSA);
				155
				156	unsigned saveEFLAGS(MachineBasicBlock &MBB,
				157	MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
				158	void restoreEFLAGS(MachineBasicBlock &MBB,
				159	MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
				160	unsigned OFReg);
				161
				162	void mergePredStateIntoSP(MachineBasicBlock &MBB,
				163	MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
				164	unsigned PredStateReg);
				165	unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
				166	MachineBasicBlock::iterator InsertPt,
				167	DebugLoc Loc);
				168
				169	void
				170	hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
				171	MachineOperand &IndexMO, MachineSSAUpdater &PredStateSSA,
				172	SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
				173	MachineInstr *
				174	sinkPostLoadHardenedInst(MachineInstr &MI,
				175	SmallPtrSetImpl<MachineInstr *> &HardenedLoads);
Chandler Carruth	e66a6f4	2018-07-16 11:38:48 +0000	[diff] [blame]	176	bool canHardenPostLoad(MachineInstr &MI);
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	177	void hardenPostLoad(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
				178	void checkReturnInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
				179	void checkCallInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
				180	};
				181
				182	} // end anonymous namespace
				183
				184	char X86SpeculativeLoadHardeningPass::ID = 0;
				185
				186	void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
				187	AnalysisUsage &AU) const {
				188	MachineFunctionPass::getAnalysisUsage(AU);
				189	}
				190
				191	static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
				192	MachineBasicBlock &Succ, int SuccCount,
				193	MachineInstr Br, MachineInstr &UncondBr,
				194	const X86InstrInfo &TII) {
				195	assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
				196
				197	MachineFunction &MF = *MBB.getParent();
				198
				199	MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
				200
				201	// We have to insert the new block immediately after the current one as we
				202	// don't know what layout-successor relationships the successor has and we
				203	// may not be able to (and generally don't want to) try to fix those up.
				204	MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
				205
				206	// Update the branch instruction if necessary.
				207	if (Br) {
				208	assert(Br->getOperand(0).getMBB() == &Succ &&
				209	"Didn't start with the right target!");
				210	Br->getOperand(0).setMBB(&NewMBB);
				211
				212	// If this successor was reached through a branch rather than fallthrough,
				213	// we might have broken fallthrough and so need to inject a new
				214	// unconditional branch.
				215	if (!UncondBr) {
				216	MachineBasicBlock &OldLayoutSucc =
				217	*std::next(MachineFunction::iterator(&NewMBB));
				218	assert(MBB.isSuccessor(&OldLayoutSucc) &&
				219	"Without an unconditional branch, the old layout successor should "
				220	"be an actual successor!");
				221	auto BrBuilder =
				222	BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
				223	// Update the unconditional branch now that we've added one.
				224	UncondBr = &*BrBuilder;
				225	}
				226
				227	// Insert unconditional "jump Succ" instruction in the new block if
				228	// necessary.
				229	if (!NewMBB.isLayoutSuccessor(&Succ)) {
				230	SmallVector<MachineOperand, 4> Cond;
				231	TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
				232	}
				233	} else {
				234	assert(!UncondBr &&
				235	"Cannot have a branchless successor and an unconditional branch!");
				236	assert(NewMBB.isLayoutSuccessor(&Succ) &&
				237	"A non-branch successor must have been a layout successor before "
				238	"and now is a layout successor of the new block.");
				239	}
				240
				241	// If this is the only edge to the successor, we can just replace it in the
				242	// CFG. Otherwise we need to add a new entry in the CFG for the new
				243	// successor.
				244	if (SuccCount == 1) {
				245	MBB.replaceSuccessor(&Succ, &NewMBB);
				246	} else {
				247	MBB.splitSuccessor(&Succ, &NewMBB);
				248	}
				249
				250	// Hook up the edge from the new basic block to the old successor in the CFG.
				251	NewMBB.addSuccessor(&Succ);
				252
				253	// Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
				254	for (MachineInstr &MI : Succ) {
				255	if (!MI.isPHI())
				256	break;
				257	for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
				258	OpIdx += 2) {
				259	MachineOperand &OpV = MI.getOperand(OpIdx);
				260	MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
				261	assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
				262	if (OpMBB.getMBB() != &MBB)
				263	continue;
				264
				265	// If this is the last edge to the succesor, just replace MBB in the PHI
				266	if (SuccCount == 1) {
				267	OpMBB.setMBB(&NewMBB);
				268	break;
				269	}
				270
				271	// Otherwise, append a new pair of operands for the new incoming edge.
				272	MI.addOperand(MF, OpV);
				273	MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
				274	break;
				275	}
				276	}
				277
				278	// Inherit live-ins from the successor
				279	for (auto &LI : Succ.liveins())
				280	NewMBB.addLiveIn(LI);
				281
				282	LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
				283	<< Succ.getName() << "'.\n");
				284	return NewMBB;
				285	}
				286
Chandler Carruth	3ffcc03	2018-07-15 23:46:36 +0000	[diff] [blame]	287	/// Removing duplicate PHI operands to leave the PHI in a canonical and
				288	/// predictable form.
				289	///
				290	/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
				291	/// isn't what you might expect. We may have multiple entries in PHI nodes for
				292	/// a single predecessor. This makes CFG-updating extremely complex, so here we
				293	/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
				294	/// one entry per predecessor, regardless of how many edges there are.
				295	static void canonicalizePHIOperands(MachineFunction &MF) {
				296	SmallPtrSet<MachineBasicBlock *, 4> Preds;
				297	SmallVector<int, 4> DupIndices;
				298	for (auto &MBB : MF)
				299	for (auto &MI : MBB) {
				300	if (!MI.isPHI())
				301	break;
				302
				303	// First we scan the operands of the PHI looking for duplicate entries
				304	// a particular predecessor. We retain the operand index of each duplicate
				305	// entry found.
				306	for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
				307	OpIdx += 2)
				308	if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
				309	DupIndices.push_back(OpIdx);
				310
				311	// Now walk the duplicate indices, removing both the block and value. Note
				312	// that these are stored as a vector making this element-wise removal
				313	// :w
				314	// potentially quadratic.
				315	//
				316	// FIXME: It is really frustrating that we have to use a quadratic
				317	// removal algorithm here. There should be a better way, but the use-def
				318	// updates required make that impossible using the public API.
				319	//
				320	// Note that we have to process these backwards so that we don't
				321	// invalidate other indices with each removal.
				322	while (!DupIndices.empty()) {
				323	int OpIdx = DupIndices.pop_back_val();
				324	// Remove both the block and value operand, again in reverse order to
				325	// preserve indices.
				326	MI.RemoveOperand(OpIdx + 1);
				327	MI.RemoveOperand(OpIdx);
				328	}
				329
				330	Preds.clear();
				331	}
				332	}
				333
Chandler Carruth	3620b99	2018-07-16 10:46:16 +0000	[diff] [blame]	334	/// Helper to scan a function for loads vulnerable to misspeculation that we
				335	/// want to harden.
				336	///
				337	/// We use this to avoid making changes to functions where there is nothing we
				338	/// need to do to harden against misspeculation.
				339	static bool hasVulnerableLoad(MachineFunction &MF) {
				340	for (MachineBasicBlock &MBB : MF) {
				341	for (MachineInstr &MI : MBB) {
				342	// Loads within this basic block after an LFENCE are not at risk of
				343	// speculatively executing with invalid predicates from prior control
				344	// flow. So break out of this block but continue scanning the function.
				345	if (MI.getOpcode() == X86::LFENCE)
				346	break;
				347
				348	// Looking for loads only.
				349	if (!MI.mayLoad())
				350	continue;
				351
				352	// An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
				353	if (MI.getOpcode() == X86::MFENCE)
				354	continue;
				355
				356	// We found a load.
				357	return true;
				358	}
				359	}
				360
				361	// No loads found.
				362	return false;
				363	}
				364
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	365	bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
				366	MachineFunction &MF) {
				367	LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
				368	<< " **********\n");
				369
				370	Subtarget = &MF.getSubtarget<X86Subtarget>();
				371	MRI = &MF.getRegInfo();
				372	TII = Subtarget->getInstrInfo();
				373	TRI = Subtarget->getRegisterInfo();
				374	// FIXME: Support for 32-bit.
				375	PredStateRC = &X86::GR64_NOSPRegClass;
				376
				377	if (MF.begin() == MF.end())
				378	// Nothing to do for a degenerate empty function...
				379	return false;
				380
				381	// We support an alternative hardening technique based on a debug flag.
				382	if (HardenEdgesWithLFENCE) {
				383	hardenEdgesWithLFENCE(MF);
				384	return true;
				385	}
				386
				387	// Create a dummy debug loc to use for all the generated code here.
				388	DebugLoc Loc;
				389
				390	MachineBasicBlock &Entry = *MF.begin();
				391	auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
				392
				393	// Do a quick scan to see if we have any checkable loads.
Chandler Carruth	3620b99	2018-07-16 10:46:16 +0000	[diff] [blame]	394	bool HasVulnerableLoad = hasVulnerableLoad(MF);
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	395
				396	// See if we have any conditional branching blocks that we will need to trace
				397	// predicate state through.
				398	SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
				399
				400	// If we have no interesting conditions or loads, nothing to do here.
Chandler Carruth	3620b99	2018-07-16 10:46:16 +0000	[diff] [blame]	401	if (!HasVulnerableLoad && Infos.empty())
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	402	return true;
				403
				404	unsigned PredStateReg;
				405	unsigned PredStateSizeInBytes = TRI->getRegSizeInBits(*PredStateRC) / 8;
				406
				407	// The poison value is required to be an all-ones value for many aspects of
				408	// this mitigation.
				409	const int PoisonVal = -1;
				410	unsigned PoisonReg = MRI->createVirtualRegister(PredStateRC);
				411	BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PoisonReg)
				412	.addImm(PoisonVal);
				413	++NumInstsInserted;
				414
				415	// If we have loads being hardened and we've asked for call and ret edges to
				416	// get a full fence-based mitigation, inject that fence.
Chandler Carruth	3620b99	2018-07-16 10:46:16 +0000	[diff] [blame]	417	if (HasVulnerableLoad && FenceCallAndRet) {
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	418	// We need to insert an LFENCE at the start of the function to suspend any
				419	// incoming misspeculation from the caller. This helps two-fold: the caller
				420	// may not have been protected as this code has been, and this code gets to
				421	// not take any specific action to protect across calls.
				422	// FIXME: We could skip this for functions which unconditionally return
				423	// a constant.
				424	BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
				425	++NumInstsInserted;
				426	++NumLFENCEsInserted;
				427	}
				428
Chandler Carruth	fb503ac	2018-07-14 09:32:37 +0000	[diff] [blame]	429	// If we guarded the entry with an LFENCE and have no conditionals to protect
				430	// in blocks, then we're done.
				431	if (FenceCallAndRet && Infos.empty())
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	432	// We may have changed the function's code at this point to insert fences.
				433	return true;
				434
				435	// For every basic block in the function which can b
				436	if (HardenInterprocedurally && !FenceCallAndRet) {
				437	// Set up the predicate state by extracting it from the incoming stack
				438	// pointer so we pick up any misspeculation in our caller.
				439	PredStateReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
				440	} else {
				441	// Otherwise, just build the predicate state itself by zeroing a register
				442	// as we don't need any initial state.
				443	PredStateReg = MRI->createVirtualRegister(PredStateRC);
				444	unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
				445	auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
				446	PredStateSubReg);
				447	++NumInstsInserted;
				448	MachineOperand *ZeroEFLAGSDefOp =
				449	ZeroI->findRegisterDefOperand(X86::EFLAGS);
				450	assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
				451	"Must have an implicit def of EFLAGS!");
				452	ZeroEFLAGSDefOp->setIsDead(true);
				453	BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
				454	PredStateReg)
				455	.addImm(0)
				456	.addReg(PredStateSubReg)
				457	.addImm(X86::sub_32bit);
				458	}
				459
				460	// We're going to need to trace predicate state throughout the function's
				461	// CFG. Prepare for this by setting up our initial state of PHIs with unique
				462	// predecessor entries and all the initial predicate state.
Chandler Carruth	3ffcc03	2018-07-15 23:46:36 +0000	[diff] [blame]	463	canonicalizePHIOperands(MF);
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	464
				465	// Track the updated values in an SSA updater to rewrite into SSA form at the
				466	// end.
				467	MachineSSAUpdater PredStateSSA(MF);
				468	PredStateSSA.Initialize(PredStateReg);
				469	PredStateSSA.AddAvailableValue(&Entry, PredStateReg);
				470	// Collect the inserted instructions so we can rewrite their uses of the
				471	// predicate state into SSA form.
				472	SmallVector<MachineInstr *, 16> CMovs;
				473
				474	// Now walk all of the basic blocks looking for ones that end in conditional
				475	// jumps where we need to update this register along each edge.
				476	for (BlockCondInfo &Info : Infos) {
				477	MachineBasicBlock &MBB = *Info.MBB;
				478	SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
				479	MachineInstr *UncondBr = Info.UncondBr;
				480
				481	LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
				482	<< "\n");
				483	++NumCondBranchesTraced;
				484
				485	// Compute the non-conditional successor as either the target of any
				486	// unconditional branch or the layout successor.
				487	MachineBasicBlock *UncondSucc =
				488	UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
				489	? UncondBr->getOperand(0).getMBB()
				490	: nullptr)
				491	: &*std::next(MachineFunction::iterator(&MBB));
				492
				493	// Count how many edges there are to any given successor.
				494	SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
				495	if (UncondSucc)
				496	++SuccCounts[UncondSucc];
				497	for (auto *CondBr : CondBrs)
				498	++SuccCounts[CondBr->getOperand(0).getMBB()];
				499
				500	// A lambda to insert cmov instructions into a block checking all of the
				501	// condition codes in a sequence.
				502	auto BuildCheckingBlockForSuccAndConds =
				503	[&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
				504	MachineInstr Br, MachineInstr &UncondBr,
				505	ArrayRef<X86::CondCode> Conds) {
				506	// First, we split the edge to insert the checking block into a safe
				507	// location.
				508	auto &CheckingMBB =
				509	(SuccCount == 1 && Succ.pred_size() == 1)
				510	? Succ
				511	: splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
				512
				513	bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
				514	if (!LiveEFLAGS)
				515	CheckingMBB.addLiveIn(X86::EFLAGS);
				516
				517	// Now insert the cmovs to implement the checks.
				518	auto InsertPt = CheckingMBB.begin();
Erich Keane	ac8cb22	2018-07-13 14:43:20 +0000	[diff] [blame]	519	assert((InsertPt == CheckingMBB.end() \|\| !InsertPt->isPHI()) &&
				520	"Should never have a PHI in the initial checking block as it "
				521	"always has a single predecessor!");
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	522
				523	// We will wire each cmov to each other, but need to start with the
				524	// incoming pred state.
				525	unsigned CurStateReg = PredStateReg;
				526
				527	for (X86::CondCode Cond : Conds) {
				528	auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
				529
				530	unsigned UpdatedStateReg = MRI->createVirtualRegister(PredStateRC);
				531	auto CMovI = BuildMI(CheckingMBB, InsertPt, Loc, TII->get(CMovOp),
				532	UpdatedStateReg)
				533	.addReg(CurStateReg)
				534	.addReg(PoisonReg);
				535	// If this is the last cmov and the EFLAGS weren't originally
				536	// live-in, mark them as killed.
				537	if (!LiveEFLAGS && Cond == Conds.back())
				538	CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
				539
				540	++NumInstsInserted;
				541	LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
				542	dbgs() << "\n");
				543
				544	// The first one of the cmovs will be using the top level
				545	// `PredStateReg` and need to get rewritten into SSA form.
				546	if (CurStateReg == PredStateReg)
				547	CMovs.push_back(&*CMovI);
				548
				549	// The next cmov should start from this one's def.
				550	CurStateReg = UpdatedStateReg;
				551	}
				552
				553	// And put the last one into the available values for PredStateSSA.
				554	PredStateSSA.AddAvailableValue(&CheckingMBB, CurStateReg);
				555	};
				556
				557	std::vector<X86::CondCode> UncondCodeSeq;
				558	for (auto *CondBr : CondBrs) {
				559	MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
				560	int &SuccCount = SuccCounts[&Succ];
				561
				562	X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
				563	X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
				564	UncondCodeSeq.push_back(Cond);
				565
				566	BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
				567	{InvCond});
				568
				569	// Decrement the successor count now that we've split one of the edges.
				570	// We need to keep the count of edges to the successor accurate in order
				571	// to know above when to replace the successor in the CFG vs. just
				572	// adding the new successor.
				573	--SuccCount;
				574	}
				575
				576	// Since we may have split edges and changed the number of successors,
				577	// normalize the probabilities. This avoids doing it each time we split an
				578	// edge.
				579	MBB.normalizeSuccProbs();
				580
				581	// Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
				582	// need to intersect the other condition codes. We can do this by just
				583	// doing a cmov for each one.
				584	if (!UncondSucc)
				585	// If we have no fallthrough to protect (perhaps it is an indirect jump?)
				586	// just skip this and continue.
				587	continue;
				588
				589	assert(SuccCounts[UncondSucc] == 1 &&
				590	"We should never have more than one edge to the unconditional "
				591	"successor at this point because every other edge must have been "
				592	"split above!");
				593
				594	// Sort and unique the codes to minimize them.
				595	llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
				596	UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
				597	UncondCodeSeq.end());
				598
				599	// Build a checking version of the successor.
				600	BuildCheckingBlockForSuccAndConds(MBB, UncondSucc, /SuccCount*/ 1,
				601	UncondBr, UncondBr, UncondCodeSeq);
				602	}
				603
				604	// We may also enter basic blocks in this function via exception handling
				605	// control flow. Here, if we are hardening interprocedurally, we need to
				606	// re-capture the predicate state from the throwing code. In the Itanium ABI,
				607	// the throw will always look like a call to __cxa_throw and will have the
				608	// predicate state in the stack pointer, so extract fresh predicate state from
				609	// the stack pointer and make it available in SSA.
				610	// FIXME: Handle non-itanium ABI EH models.
				611	if (HardenInterprocedurally) {
				612	for (MachineBasicBlock &MBB : MF) {
				613	assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
				614	assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
				615	assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
				616	if (!MBB.isEHPad())
				617	continue;
				618	PredStateSSA.AddAvailableValue(
				619	&MBB,
				620	extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
				621	}
				622	}
				623
				624	// Now check all of the loads using the predicate state.
				625	checkAllLoads(MF, PredStateSSA);
				626
				627	// Now rewrite all the uses of the pred state using the SSA updater so that
				628	// we track updates through the CFG.
				629	for (MachineInstr *CMovI : CMovs)
				630	for (MachineOperand &Op : CMovI->operands()) {
				631	if (!Op.isReg() \|\| Op.getReg() != PredStateReg)
				632	continue;
				633
				634	PredStateSSA.RewriteUse(Op);
				635	}
				636
				637	// If we are hardening interprocedurally, find each returning block and
				638	// protect the caller from being returned to through misspeculation.
				639	if (HardenInterprocedurally)
				640	for (MachineBasicBlock &MBB : MF) {
				641	if (MBB.empty())
				642	continue;
				643
				644	MachineInstr &MI = MBB.back();
				645	if (!MI.isReturn())
				646	continue;
				647
				648	checkReturnInstr(MI, PredStateSSA);
				649	}
				650
				651	LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
				652	dbgs() << "\n"; MF.verify(this));
				653	return true;
				654	}
				655
				656	/// Implements the naive hardening approach of putting an LFENCE after every
				657	/// potentially mis-predicted control flow construct.
				658	///
				659	/// We include this as an alternative mostly for the purpose of comparison. The
				660	/// performance impact of this is expected to be extremely severe and not
				661	/// practical for any real-world users.
				662	void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
				663	MachineFunction &MF) {
				664	// First, we scan the function looking for blocks that are reached along edges
				665	// that we might want to harden.
				666	SmallSetVector<MachineBasicBlock *, 8> Blocks;
				667	for (MachineBasicBlock &MBB : MF) {
				668	// If there are no or only one successor, nothing to do here.
				669	if (MBB.succ_size() <= 1)
				670	continue;
				671
				672	// Skip blocks unless their terminators start with a branch. Other
				673	// terminators don't seem interesting for guarding against misspeculation.
				674	auto TermIt = MBB.getFirstTerminator();
				675	if (TermIt == MBB.end() \|\| !TermIt->isBranch())
				676	continue;
				677
				678	// Add all the non-EH-pad succossors to the blocks we want to harden. We
				679	// skip EH pads because there isn't really a condition of interest on
				680	// entering.
				681	for (MachineBasicBlock *SuccMBB : MBB.successors())
				682	if (!SuccMBB->isEHPad())
				683	Blocks.insert(SuccMBB);
				684	}
				685
				686	for (MachineBasicBlock *MBB : Blocks) {
				687	auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
				688	BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
				689	++NumInstsInserted;
				690	++NumLFENCEsInserted;
				691	}
				692	}
				693
				694	SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
				695	X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
				696	SmallVector<BlockCondInfo, 16> Infos;
				697
				698	// Walk the function and build up a summary for each block's conditions that
				699	// we need to trace through.
				700	for (MachineBasicBlock &MBB : MF) {
				701	// If there are no or only one successor, nothing to do here.
				702	if (MBB.succ_size() <= 1)
				703	continue;
				704
				705	// We want to reliably handle any conditional branch terminators in the
				706	// MBB, so we manually analyze the branch. We can handle all of the
				707	// permutations here, including ones that analyze branch cannot.
				708	//
				709	// The approach is to walk backwards across the terminators, resetting at
				710	// any unconditional non-indirect branch, and track all conditional edges
				711	// to basic blocks as well as the fallthrough or unconditional successor
				712	// edge. For each conditional edge, we track the target and the opposite
				713	// condition code in order to inject a "no-op" cmov into that successor
				714	// that will harden the predicate. For the fallthrough/unconditional
				715	// edge, we inject a separate cmov for each conditional branch with
				716	// matching condition codes. This effectively implements an "and" of the
				717	// condition flags, even if there isn't a single condition flag that would
				718	// directly implement that. We don't bother trying to optimize either of
				719	// these cases because if such an optimization is possible, LLVM should
				720	// have optimized the conditional branches in that way already to reduce
				721	// instruction count. This late, we simply assume the minimal number of
				722	// branch instructions is being emitted and use that to guide our cmov
				723	// insertion.
				724
				725	BlockCondInfo Info = {&MBB, {}, nullptr};
				726
				727	// Now walk backwards through the terminators and build up successors they
				728	// reach and the conditions.
				729	for (MachineInstr &MI : llvm::reverse(MBB)) {
				730	// Once we've handled all the terminators, we're done.
				731	if (!MI.isTerminator())
				732	break;
				733
				734	// If we see a non-branch terminator, we can't handle anything so bail.
				735	if (!MI.isBranch()) {
				736	Info.CondBrs.clear();
				737	break;
				738	}
				739
				740	// If we see an unconditional branch, reset our state, clear any
				741	// fallthrough, and set this is the "else" successor.
				742	if (MI.getOpcode() == X86::JMP_1) {
				743	Info.CondBrs.clear();
				744	Info.UncondBr = &MI;
				745	continue;
				746	}
				747
				748	// If we get an invalid condition, we have an indirect branch or some
				749	// other unanalyzable "fallthrough" case. We model this as a nullptr for
				750	// the destination so we can still guard any conditional successors.
				751	// Consider code sequences like:
				752	// ```
				753	// jCC L1
				754	// jmpq *%rax
				755	// ```
				756	// We still want to harden the edge to `L1`.
				757	if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
				758	Info.CondBrs.clear();
				759	Info.UncondBr = &MI;
				760	continue;
				761	}
				762
				763	// We have a vanilla conditional branch, add it to our list.
				764	Info.CondBrs.push_back(&MI);
				765	}
				766	if (Info.CondBrs.empty()) {
				767	++NumBranchesUntraced;
				768	LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
				769	MBB.dump());
				770	continue;
				771	}
				772
				773	Infos.push_back(Info);
				774	}
				775
				776	return Infos;
				777	}
				778
				779	/// Returns true if the instruction has no behavior (specified or otherwise)
				780	/// that is based on the value of any of its register operands
				781	///
				782	/// A classical example of something that is inherently not data invariant is an
				783	/// indirect jump -- the destination is loaded into icache based on the bits set
				784	/// in the jump destination register.
				785	///
				786	/// FIXME: This should become part of our instruction tables.
				787	static bool isDataInvariant(MachineInstr &MI) {
				788	switch (MI.getOpcode()) {
				789	default:
				790	// By default, assume that the instruction is not data invariant.
				791	return false;
				792
				793	// FIXME: For now, we just use a very boring, conservative set of unary
				794	// instructions because we're mostly interested in handling simple
				795	// transformations.
				796	case TargetOpcode::COPY:
				797	return true;
				798	}
				799	}
				800
				801	/// Returns true if the instruction has no behavior (specified or otherwise)
				802	/// that is based on the value loaded from memory or the value of any
				803	/// non-address register operands.
				804	///
				805	/// For example, if the latency of the instruction is dependent on the
				806	/// particular bits set in any of the registers or any of the bits loaded from
				807	/// memory.
				808	///
				809	/// A classical example of something that is inherently not data invariant is an
				810	/// indirect jump -- the destination is loaded into icache based on the bits set
				811	/// in the jump destination register.
				812	///
				813	/// FIXME: This should become part of our instruction tables.
				814	static bool isDataInvariantLoad(MachineInstr &MI) {
				815	switch (MI.getOpcode()) {
				816	default:
				817	// By default, assume that the load will immediately leak.
				818	return false;
				819
Craig Topper	445abf7	2018-07-13 22:41:46 +0000	[diff] [blame]	820	// On x86 it is believed that imul is constant time w.r.t. the loaded data.
				821	// However, they set flags and are perhaps the most surprisingly constant
				822	// time operations so we call them out here separately.
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	823	case X86::IMUL16rm:
				824	case X86::IMUL16rmi8:
				825	case X86::IMUL16rmi:
				826	case X86::IMUL32rm:
				827	case X86::IMUL32rmi8:
				828	case X86::IMUL32rmi:
				829	case X86::IMUL64rm:
				830	case X86::IMUL64rmi32:
				831	case X86::IMUL64rmi8:
				832
Craig Topper	445abf7	2018-07-13 22:41:46 +0000	[diff] [blame]	833	// Bit scanning and counting instructions that are somewhat surprisingly
				834	// constant time as they scan across bits and do other fairly complex
				835	// operations like popcnt, but are believed to be constant time on x86.
				836	// However, these set flags.
				837	case X86::BSF16rm:
				838	case X86::BSF32rm:
				839	case X86::BSF64rm:
				840	case X86::BSR16rm:
				841	case X86::BSR32rm:
				842	case X86::BSR64rm:
				843	case X86::LZCNT16rm:
				844	case X86::LZCNT32rm:
				845	case X86::LZCNT64rm:
				846	case X86::POPCNT16rm:
				847	case X86::POPCNT32rm:
				848	case X86::POPCNT64rm:
				849	case X86::TZCNT16rm:
				850	case X86::TZCNT32rm:
				851	case X86::TZCNT64rm:
				852
				853	// Bit manipulation instructions are effectively combinations of basic
				854	// arithmetic ops, and should still execute in constant time. These also
				855	// set flags.
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	856	case X86::BLCFILL32rm:
				857	case X86::BLCFILL64rm:
				858	case X86::BLCI32rm:
				859	case X86::BLCI64rm:
				860	case X86::BLCIC32rm:
				861	case X86::BLCIC64rm:
				862	case X86::BLCMSK32rm:
				863	case X86::BLCMSK64rm:
				864	case X86::BLCS32rm:
				865	case X86::BLCS64rm:
				866	case X86::BLSFILL32rm:
				867	case X86::BLSFILL64rm:
				868	case X86::BLSI32rm:
				869	case X86::BLSI64rm:
				870	case X86::BLSIC32rm:
				871	case X86::BLSIC64rm:
				872	case X86::BLSMSK32rm:
				873	case X86::BLSMSK64rm:
				874	case X86::BLSR32rm:
				875	case X86::BLSR64rm:
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	876	case X86::TZMSK32rm:
				877	case X86::TZMSK64rm:
				878
Craig Topper	445abf7	2018-07-13 22:41:46 +0000	[diff] [blame]	879	// Bit extracting and clearing instructions should execute in constant time,
				880	// and set flags.
				881	case X86::BEXTR32rm:
				882	case X86::BEXTR64rm:
				883	case X86::BEXTRI32mi:
				884	case X86::BEXTRI64mi:
				885	case X86::BZHI32rm:
				886	case X86::BZHI64rm:
				887
				888	// Basic arithmetic is constant time on the input but does set flags.
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	889	case X86::ADC8rm:
				890	case X86::ADC16rm:
				891	case X86::ADC32rm:
				892	case X86::ADC64rm:
				893	case X86::ADCX32rm:
				894	case X86::ADCX64rm:
				895	case X86::ADD8rm:
				896	case X86::ADD16rm:
				897	case X86::ADD32rm:
				898	case X86::ADD64rm:
				899	case X86::ADOX32rm:
				900	case X86::ADOX64rm:
				901	case X86::AND8rm:
				902	case X86::AND16rm:
				903	case X86::AND32rm:
				904	case X86::AND64rm:
				905	case X86::ANDN32rm:
				906	case X86::ANDN64rm:
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	907	case X86::OR8rm:
				908	case X86::OR16rm:
				909	case X86::OR32rm:
				910	case X86::OR64rm:
				911	case X86::SBB8rm:
				912	case X86::SBB16rm:
				913	case X86::SBB32rm:
				914	case X86::SBB64rm:
				915	case X86::SUB8rm:
				916	case X86::SUB16rm:
				917	case X86::SUB32rm:
				918	case X86::SUB64rm:
				919	case X86::XOR8rm:
				920	case X86::XOR16rm:
				921	case X86::XOR32rm:
				922	case X86::XOR64rm:
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	923	// Check whether the EFLAGS implicit-def is dead. We assume that this will
				924	// always find the implicit-def because this code should only be reached
				925	// for instructions that do in fact implicitly def this.
				926	if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
				927	// If we would clobber EFLAGS that are used, just bail for now.
				928	LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
				929	MI.dump(); dbgs() << "\n");
				930	return false;
				931	}
				932
				933	// Otherwise, fallthrough to handle these the same as instructions that
				934	// don't set EFLAGS.
				935	LLVM_FALLTHROUGH;
				936
Craig Topper	445abf7	2018-07-13 22:41:46 +0000	[diff] [blame]	937	// Integer multiply w/o affecting flags is still believed to be constant
				938	// time on x86. Called out separately as this is among the most surprising
				939	// instructions to exhibit that behavior.
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	940	case X86::MULX32rm:
				941	case X86::MULX64rm:
				942
Craig Topper	445abf7	2018-07-13 22:41:46 +0000	[diff] [blame]	943	// Arithmetic instructions that are both constant time and don't set flags.
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	944	case X86::RORX32mi:
				945	case X86::RORX64mi:
				946	case X86::SARX32rm:
				947	case X86::SARX64rm:
				948	case X86::SHLX32rm:
				949	case X86::SHLX64rm:
				950	case X86::SHRX32rm:
				951	case X86::SHRX64rm:
				952
Craig Topper	445abf7	2018-07-13 22:41:46 +0000	[diff] [blame]	953	// Conversions are believed to be constant time and don't set flags.
Craig Topper	41fa858	2018-07-13 22:41:50 +0000	[diff] [blame]	954	case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm:
				955	case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm:
				956	case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm:
				957	case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm:
				958	case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm:
				959	case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm:
				960	case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm:
				961	case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm:
				962	case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm:
				963	case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm:
				964	// AVX512 added unsigned integer conversions.
				965	case X86::VCVTTSD2USI64Zrm:
				966	case X86::VCVTTSD2USIZrm:
				967	case X86::VCVTTSS2USI64Zrm:
				968	case X86::VCVTTSS2USIZrm:
				969	case X86::VCVTUSI2SDZrm:
				970	case X86::VCVTUSI642SDZrm:
				971	case X86::VCVTUSI2SSZrm:
				972	case X86::VCVTUSI642SSZrm:
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	973
Craig Topper	445abf7	2018-07-13 22:41:46 +0000	[diff] [blame]	974	// Loads to register don't set flags.
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	975	case X86::MOV8rm:
				976	case X86::MOV8rm_NOREX:
				977	case X86::MOV16rm:
				978	case X86::MOV32rm:
				979	case X86::MOV64rm:
				980	case X86::MOVSX16rm8:
				981	case X86::MOVSX32rm16:
				982	case X86::MOVSX32rm8:
				983	case X86::MOVSX32rm8_NOREX:
				984	case X86::MOVSX64rm16:
				985	case X86::MOVSX64rm32:
				986	case X86::MOVSX64rm8:
				987	case X86::MOVZX16rm8:
				988	case X86::MOVZX32rm16:
				989	case X86::MOVZX32rm8:
				990	case X86::MOVZX32rm8_NOREX:
				991	case X86::MOVZX64rm16:
				992	case X86::MOVZX64rm8:
				993	return true;
				994	}
				995	}
				996
				997	static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
				998	const TargetRegisterInfo &TRI) {
				999	// Check if EFLAGS are alive by seeing if there is a def of them or they
				1000	// live-in, and then seeing if that def is in turn used.
				1001	for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
				1002	if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
				1003	// If the def is dead, then EFLAGS is not live.
				1004	if (DefOp->isDead())
				1005	return false;
				1006
				1007	// Otherwise we've def'ed it, and it is live.
				1008	return true;
				1009	}
				1010	// While at this instruction, also check if we use and kill EFLAGS
				1011	// which means it isn't live.
				1012	if (MI.killsRegister(X86::EFLAGS, &TRI))
				1013	return false;
				1014	}
				1015
				1016	// If we didn't find anything conclusive (neither definitely alive or
				1017	// definitely dead) return whether it lives into the block.
				1018	return MBB.isLiveIn(X86::EFLAGS);
				1019	}
				1020
				1021	void X86SpeculativeLoadHardeningPass::checkAllLoads(
				1022	MachineFunction &MF, MachineSSAUpdater &PredStateSSA) {
				1023	// If the actual checking of loads is disabled, skip doing anything here.
				1024	if (!HardenLoads)
				1025	return;
				1026
				1027	SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
				1028	SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
				1029
				1030	SmallSet<unsigned, 16> HardenedAddrRegs;
				1031
				1032	SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
				1033
				1034	// Track the set of load-dependent registers through the basic block. Because
				1035	// the values of these registers have an existing data dependency on a loaded
				1036	// value which we would have checked, we can omit any checks on them.
				1037	SparseBitVector<> LoadDepRegs;
				1038
				1039	for (MachineBasicBlock &MBB : MF) {
				1040	// We harden the loads of a basic block in several passes:
				1041	//
				1042	// 1) Collect all the loads which can have their loaded value hardened
				1043	// and all the loads that instead need their address hardened. During
				1044	// this walk we propagate load dependence for address hardened loads and
				1045	// also look for LFENCE to stop hardening wherever possible. When
				1046	// deciding whether or not to harden the loaded value or not, we check
				1047	// to see if any registers used in the address will have been hardened
				1048	// at this point and if so, harden any remaining address registers as
				1049	// that often successfully re-uses hardened addresses and minimizes
				1050	// instructions. FIXME: We should consider an aggressive mode where we
				1051	// continue to keep as many loads value hardened even when some address
				1052	// register hardening would be free (due to reuse).
				1053	for (MachineInstr &MI : MBB) {
				1054	// We naively assume that all def'ed registers of an instruction have
				1055	// a data dependency on all of their operands.
				1056	// FIXME: Do a more careful analysis of x86 to build a conservative model
				1057	// here.
				1058	if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
				1059	return Op.isReg() && LoadDepRegs.test(Op.getReg());
				1060	}))
				1061	for (MachineOperand &Def : MI.defs())
				1062	if (Def.isReg())
				1063	LoadDepRegs.set(Def.getReg());
				1064
				1065	// Both Intel and AMD are guiding that they will change the semantics of
				1066	// LFENCE to be a speculation barrier, so if we see an LFENCE, there is
				1067	// no more need to guard things in this block.
				1068	if (MI.getOpcode() == X86::LFENCE)
				1069	break;
				1070
				1071	// If this instruction cannot load, nothing to do.
				1072	if (!MI.mayLoad())
				1073	continue;
				1074
				1075	// Some instructions which "load" are trivially safe or unimportant.
				1076	if (MI.getOpcode() == X86::MFENCE)
				1077	continue;
				1078
				1079	// Extract the memory operand information about this instruction.
				1080	// FIXME: This doesn't handle loading pseudo instructions which we often
				1081	// could handle with similarly generic logic. We probably need to add an
				1082	// MI-layer routine similar to the MC-layer one we use here which maps
				1083	// pseudos much like this maps real instructions.
				1084	const MCInstrDesc &Desc = MI.getDesc();
				1085	int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
				1086	if (MemRefBeginIdx < 0) {
				1087	LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: ";
				1088	MI.dump());
				1089	continue;
				1090	}
				1091
				1092	MemRefBeginIdx += X86II::getOperandBias(Desc);
				1093
				1094	MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
				1095	MachineOperand &IndexMO =
				1096	MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
				1097
				1098	// If we have at least one (non-frame-index, non-RIP) register operand,
				1099	// and neither operand is load-dependent, we need to check the load.
				1100	unsigned BaseReg = 0, IndexReg = 0;
				1101	if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
				1102	BaseMO.getReg() != X86::NoRegister)
				1103	BaseReg = BaseMO.getReg();
				1104	if (IndexMO.getReg() != X86::NoRegister)
				1105	IndexReg = IndexMO.getReg();
				1106
				1107	if (!BaseReg && !IndexReg)
				1108	// No register operands!
				1109	continue;
				1110
				1111	// If any register operand is dependent, this load is dependent and we
				1112	// needn't check it.
				1113	// FIXME: Is this true in the case where we are hardening loads after
				1114	// they complete? Unclear, need to investigate.
				1115	if ((BaseReg && LoadDepRegs.test(BaseReg)) \|\|
				1116	(IndexReg && LoadDepRegs.test(IndexReg)))
				1117	continue;
				1118
Chandler Carruth	e66a6f4	2018-07-16 11:38:48 +0000	[diff] [blame]	1119	// If post-load hardening is enabled, this load is compatible with
				1120	// post-load hardening, and we aren't already going to harden one of the
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	1121	// address registers, queue it up to be hardened post-load. Notably, even
				1122	// once hardened this won't introduce a useful dependency that could prune
				1123	// out subsequent loads.
Chandler Carruth	e66a6f4	2018-07-16 11:38:48 +0000	[diff] [blame]	1124	if (EnablePostLoadHardening && canHardenPostLoad(MI) &&
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	1125	!HardenedAddrRegs.count(BaseReg) &&
				1126	!HardenedAddrRegs.count(IndexReg)) {
				1127	HardenPostLoad.insert(&MI);
				1128	HardenedAddrRegs.insert(MI.getOperand(0).getReg());
				1129	continue;
				1130	}
				1131
				1132	// Record this instruction for address hardening and record its register
				1133	// operands as being address-hardened.
				1134	HardenLoadAddr.insert(&MI);
				1135	if (BaseReg)
				1136	HardenedAddrRegs.insert(BaseReg);
				1137	if (IndexReg)
				1138	HardenedAddrRegs.insert(IndexReg);
				1139
				1140	for (MachineOperand &Def : MI.defs())
				1141	if (Def.isReg())
				1142	LoadDepRegs.set(Def.getReg());
				1143	}
				1144
				1145	// Now re-walk the instructions in the basic block, and apply whichever
				1146	// hardening strategy we have elected. Note that we do this in a second
				1147	// pass specifically so that we have the complete set of instructions for
				1148	// which we will do post-load hardening and can defer it in certain
				1149	// circumstances.
				1150	//
				1151	// FIXME: This could probably be made even more effective by doing it
				1152	// across the entire function. Rather than just walking the flat list
				1153	// backwards here, we could walk the function in PO and each block bottom
				1154	// up, allowing us to in some cases sink hardening across block blocks. As
				1155	// long as the in-block predicate state is used at the eventual hardening
				1156	// site, this remains safe.
				1157	for (MachineInstr &MI : MBB) {
				1158	// We cannot both require hardening the def of a load and its address.
				1159	assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
				1160	"Requested to harden both the address and def of a load!");
				1161
				1162	// Check if this is a load whose address needs to be hardened.
				1163	if (HardenLoadAddr.erase(&MI)) {
				1164	const MCInstrDesc &Desc = MI.getDesc();
				1165	int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
				1166	assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
				1167
				1168	MemRefBeginIdx += X86II::getOperandBias(Desc);
				1169
				1170	MachineOperand &BaseMO =
				1171	MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
				1172	MachineOperand &IndexMO =
				1173	MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
				1174	hardenLoadAddr(MI, BaseMO, IndexMO, PredStateSSA, AddrRegToHardenedReg);
				1175	continue;
				1176	}
				1177
				1178	// Test if this instruction is one of our post load instructions (and
				1179	// remove it from the set if so).
				1180	if (HardenPostLoad.erase(&MI)) {
				1181	assert(!MI.isCall() && "Must not try to post-load harden a call!");
				1182
				1183	// If this is a data-invariant load, we want to try and sink any
				1184	// hardening as far as possible.
				1185	if (isDataInvariantLoad(MI)) {
				1186	// Sink the instruction we'll need to harden as far as we can down the
				1187	// graph.
				1188	MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
				1189
				1190	// If we managed to sink this instruction, update everything so we
				1191	// harden that instruction when we reach it in the instruction
				1192	// sequence.
				1193	if (SunkMI != &MI) {
				1194	// If in sinking there was no instruction needing to be hardened,
				1195	// we're done.
				1196	if (!SunkMI)
				1197	continue;
				1198
				1199	// Otherwise, add this to the set of defs we harden.
				1200	HardenPostLoad.insert(SunkMI);
				1201	continue;
				1202	}
				1203	}
				1204
				1205	// The register def'ed by this instruction is trivially hardened so map
				1206	// it to itself.
				1207	AddrRegToHardenedReg[MI.getOperand(0).getReg()] =
				1208	MI.getOperand(0).getReg();
				1209
				1210	hardenPostLoad(MI, PredStateSSA);
				1211	continue;
				1212	}
				1213
				1214	// After we finish processing the instruction and doing any hardening
				1215	// necessary for it, we need to handle transferring the predicate state
				1216	// into a call and recovering it after the call returns (if it returns).
				1217	if (!MI.isCall())
				1218	continue;
				1219
				1220	// If we're not hardening interprocedurally, we can just skip calls.
				1221	if (!HardenInterprocedurally)
				1222	continue;
				1223
				1224	auto InsertPt = MI.getIterator();
				1225	DebugLoc Loc = MI.getDebugLoc();
				1226
				1227	// First, we transfer the predicate state into the called function by
				1228	// merging it into the stack pointer. This will kill the current def of
				1229	// the state.
				1230	unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
				1231	mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
				1232
				1233	// If this call is also a return (because it is a tail call) we're done.
				1234	if (MI.isReturn())
				1235	continue;
				1236
				1237	// Otherwise we need to step past the call and recover the predicate
				1238	// state from SP after the return, and make this new state available.
				1239	++InsertPt;
				1240	unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
				1241	PredStateSSA.AddAvailableValue(&MBB, NewStateReg);
				1242	}
				1243
				1244	HardenPostLoad.clear();
				1245	HardenLoadAddr.clear();
				1246	HardenedAddrRegs.clear();
				1247	AddrRegToHardenedReg.clear();
				1248
				1249	// Currently, we only track data-dependent loads within a basic block.
				1250	// FIXME: We should see if this is necessary or if we could be more
				1251	// aggressive here without opening up attack avenues.
				1252	LoadDepRegs.clear();
				1253	}
				1254	}
				1255
				1256	/// Save EFLAGS into the returned GPR. This can in turn be restored with
				1257	/// `restoreEFLAGS`.
				1258	///
				1259	/// Note that LLVM can only lower very simple patterns of saved and restored
				1260	/// EFLAGS registers. The restore should always be within the same basic block
				1261	/// as the save so that no PHI nodes are inserted.
				1262	unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
				1263	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
				1264	DebugLoc Loc) {
				1265	// FIXME: Hard coding this to a 32-bit register class seems weird, but matches
				1266	// what instruction selection does.
				1267	unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
				1268	// We directly copy the FLAGS register and rely on later lowering to clean
				1269	// this up into the appropriate setCC instructions.
				1270	BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
				1271	++NumInstsInserted;
				1272	return Reg;
				1273	}
				1274
				1275	/// Restore EFLAGS from the provided GPR. This should be produced by
				1276	/// `saveEFLAGS`.
				1277	///
				1278	/// This must be done within the same basic block as the save in order to
				1279	/// reliably lower.
				1280	void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
				1281	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
				1282	unsigned Reg) {
				1283	BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
				1284	++NumInstsInserted;
				1285	}
				1286
				1287	/// Takes the current predicate state (in a register) and merges it into the
				1288	/// stack pointer. The state is essentially a single bit, but we merge this in
				1289	/// a way that won't form non-canonical pointers and also will be preserved
				1290	/// across normal stack adjustments.
				1291	void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
				1292	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
				1293	unsigned PredStateReg) {
				1294	unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
				1295	// FIXME: This hard codes a shift distance based on the number of bits needed
				1296	// to stay canonical on 64-bit. We should compute this somehow and support
				1297	// 32-bit as part of that.
				1298	auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
				1299	.addReg(PredStateReg, RegState::Kill)
				1300	.addImm(47);
				1301	ShiftI->addRegisterDead(X86::EFLAGS, TRI);
				1302	++NumInstsInserted;
				1303	auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
				1304	.addReg(X86::RSP)
				1305	.addReg(TmpReg, RegState::Kill);
				1306	OrI->addRegisterDead(X86::EFLAGS, TRI);
				1307	++NumInstsInserted;
				1308	}
				1309
				1310	/// Extracts the predicate state stored in the high bits of the stack pointer.
				1311	unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
				1312	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
				1313	DebugLoc Loc) {
				1314	unsigned PredStateReg = MRI->createVirtualRegister(PredStateRC);
				1315	unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
				1316
				1317	// We know that the stack pointer will have any preserved predicate state in
				1318	// its high bit. We just want to smear this across the other bits. Turns out,
				1319	// this is exactly what an arithmetic right shift does.
				1320	BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
				1321	.addReg(X86::RSP);
				1322	auto ShiftI =
				1323	BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
				1324	.addReg(TmpReg, RegState::Kill)
				1325	.addImm(TRI->getRegSizeInBits(*PredStateRC) - 1);
				1326	ShiftI->addRegisterDead(X86::EFLAGS, TRI);
				1327	++NumInstsInserted;
				1328
				1329	return PredStateReg;
				1330	}
				1331
				1332	void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
				1333	MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
				1334	MachineSSAUpdater &PredStateSSA,
				1335	SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
				1336	MachineBasicBlock &MBB = *MI.getParent();
				1337	DebugLoc Loc = MI.getDebugLoc();
				1338
				1339	// Check if EFLAGS are alive by seeing if there is a def of them or they
				1340	// live-in, and then seeing if that def is in turn used.
				1341	bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
				1342
				1343	SmallVector<MachineOperand *, 2> HardenOpRegs;
				1344
				1345	if (BaseMO.isFI()) {
				1346	// A frame index is never a dynamically controllable load, so only
				1347	// harden it if we're covering fixed address loads as well.
				1348	LLVM_DEBUG(
				1349	dbgs() << " Skipping hardening base of explicit stack frame load: ";
				1350	MI.dump(); dbgs() << "\n");
				1351	} else if (BaseMO.getReg() == X86::RIP \|\|
				1352	BaseMO.getReg() == X86::NoRegister) {
				1353	// For both RIP-relative addressed loads or absolute loads, we cannot
				1354	// meaningfully harden them because the address being loaded has no
				1355	// dynamic component.
				1356	//
				1357	// FIXME: When using a segment base (like TLS does) we end up with the
				1358	// dynamic address being the base plus -1 because we can't mutate the
				1359	// segment register here. This allows the signed 32-bit offset to point at
				1360	// valid segment-relative addresses and load them successfully.
				1361	LLVM_DEBUG(
				1362	dbgs() << " Cannot harden base of "
				1363	<< (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
				1364	<< " address in a load!");
				1365	} else {
				1366	assert(BaseMO.isReg() &&
				1367	"Only allowed to have a frame index or register base.");
				1368	HardenOpRegs.push_back(&BaseMO);
				1369	}
				1370
				1371	if (IndexMO.getReg() != X86::NoRegister &&
				1372	(HardenOpRegs.empty() \|\|
				1373	HardenOpRegs.front()->getReg() != IndexMO.getReg()))
				1374	HardenOpRegs.push_back(&IndexMO);
				1375
				1376	assert((HardenOpRegs.size() == 1 \|\| HardenOpRegs.size() == 2) &&
				1377	"Should have exactly one or two registers to harden!");
				1378	assert((HardenOpRegs.size() == 1 \|\|
				1379	HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
				1380	"Should not have two of the same registers!");
				1381
				1382	// Remove any registers that have alreaded been checked.
				1383	llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
				1384	// See if this operand's register has already been checked.
				1385	auto It = AddrRegToHardenedReg.find(Op->getReg());
				1386	if (It == AddrRegToHardenedReg.end())
				1387	// Not checked, so retain this one.
				1388	return false;
				1389
				1390	// Otherwise, we can directly update this operand and remove it.
				1391	Op->setReg(It->second);
				1392	return true;
				1393	});
				1394	// If there are none left, we're done.
				1395	if (HardenOpRegs.empty())
				1396	return;
				1397
				1398	// Compute the current predicate state.
				1399	unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
				1400
				1401	auto InsertPt = MI.getIterator();
				1402
				1403	// If EFLAGS are live and we don't have access to instructions that avoid
				1404	// clobbering EFLAGS we need to save and restore them. This in turn makes
				1405	// the EFLAGS no longer live.
				1406	unsigned FlagsReg = 0;
				1407	if (EFLAGSLive && !Subtarget->hasBMI2()) {
				1408	EFLAGSLive = false;
				1409	FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
				1410	}
				1411
				1412	for (MachineOperand *Op : HardenOpRegs) {
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	1413	unsigned OpReg = Op->getReg();
Chandler Carruth	cdf0add	2018-07-16 04:17:51 +0000	[diff] [blame]	1414	auto *OpRC = MRI->getRegClass(OpReg);
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	1415	unsigned TmpReg = MRI->createVirtualRegister(OpRC);
				1416
Chandler Carruth	cdf0add	2018-07-16 04:17:51 +0000	[diff] [blame]	1417	// If this is a vector register, we'll need somewhat custom logic to handle
				1418	// hardening it.
				1419	if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) \|\|
				1420	OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
				1421	assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
				1422	bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);
				1423
				1424	// Move our state into a vector register.
				1425	// FIXME: We could skip this at the cost of longer encodings with AVX-512
				1426	// but that doesn't seem likely worth it.
				1427	unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
				1428	auto MovI =
				1429	BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
				1430	.addReg(StateReg);
				1431	(void)MovI;
				1432	++NumInstsInserted;
				1433	LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
				1434
				1435	// Broadcast it across the vector register.
				1436	unsigned VBStateReg = MRI->createVirtualRegister(OpRC);
				1437	auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
				1438	TII->get(Is128Bit ? X86::VPBROADCASTQrr
				1439	: X86::VPBROADCASTQYrr),
				1440	VBStateReg)
				1441	.addReg(VStateReg);
				1442	(void)BroadcastI;
				1443	++NumInstsInserted;
				1444	LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
				1445	dbgs() << "\n");
				1446
				1447	// Merge our potential poison state into the value with a vector or.
				1448	auto OrI =
				1449	BuildMI(MBB, InsertPt, Loc,
				1450	TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
				1451	.addReg(VBStateReg)
				1452	.addReg(OpReg);
				1453	(void)OrI;
				1454	++NumInstsInserted;
				1455	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
				1456	} else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) \|\|
				1457	OpRC->hasSuperClassEq(&X86::VR256XRegClass) \|\|
				1458	OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
				1459	assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
				1460	bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
				1461	bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
				1462	if (Is128Bit \|\| Is256Bit)
				1463	assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
				1464
				1465	// Broadcast our state into a vector register.
				1466	unsigned VStateReg = MRI->createVirtualRegister(OpRC);
				1467	unsigned BroadcastOp =
				1468	Is128Bit ? X86::VPBROADCASTQrZ128r
				1469	: Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
				1470	auto BroadcastI =
				1471	BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
				1472	.addReg(StateReg);
				1473	(void)BroadcastI;
				1474	++NumInstsInserted;
				1475	LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
				1476	dbgs() << "\n");
				1477
				1478	// Merge our potential poison state into the value with a vector or.
				1479	unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
				1480	: Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
				1481	auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
				1482	.addReg(VStateReg)
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	1483	.addReg(OpReg);
Chandler Carruth	bc46bca	2018-07-16 04:42:27 +0000	[diff] [blame]	1484	(void)OrI;
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	1485	++NumInstsInserted;
				1486	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
				1487	} else {
Chandler Carruth	cdf0add	2018-07-16 04:17:51 +0000	[diff] [blame]	1488	// FIXME: Need to support GR32 here for 32-bit code.
				1489	assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
				1490	"Not a supported register class for address hardening!");
				1491
				1492	if (!EFLAGSLive) {
				1493	// Merge our potential poison state into the value with an or.
				1494	auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
				1495	.addReg(StateReg)
				1496	.addReg(OpReg);
				1497	OrI->addRegisterDead(X86::EFLAGS, TRI);
				1498	++NumInstsInserted;
				1499	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
				1500	} else {
				1501	// We need to avoid touching EFLAGS so shift out all but the least
				1502	// significant bit using the instruction that doesn't update flags.
				1503	auto ShiftI =
				1504	BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
				1505	.addReg(OpReg)
				1506	.addReg(StateReg);
				1507	(void)ShiftI;
				1508	++NumInstsInserted;
				1509	LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
				1510	dbgs() << "\n");
				1511	}
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	1512	}
				1513
				1514	// Record this register as checked and update the operand.
				1515	assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
				1516	"Should not have checked this register yet!");
				1517	AddrRegToHardenedReg[Op->getReg()] = TmpReg;
				1518	Op->setReg(TmpReg);
				1519	++NumAddrRegsHardened;
				1520	}
				1521
				1522	// And restore the flags if needed.
				1523	if (FlagsReg)
				1524	restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
				1525	}
				1526
				1527	MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
				1528	MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedLoads) {
				1529	assert(isDataInvariantLoad(InitialMI) &&
				1530	"Cannot get here with a non-invariant load!");
				1531
				1532	// See if we can sink hardening the loaded value.
				1533	auto SinkCheckToSingleUse =
				1534	[&](MachineInstr &MI) -> Optional<MachineInstr *> {
				1535	unsigned DefReg = MI.getOperand(0).getReg();
				1536
				1537	// We need to find a single use which we can sink the check. We can
				1538	// primarily do this because many uses may already end up checked on their
				1539	// own.
				1540	MachineInstr *SingleUseMI = nullptr;
				1541	for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
				1542	// If we're already going to harden this use, it is data invariant and
				1543	// within our block and we just need to check that the use isn't in an
				1544	// address.
				1545	if (HardenedLoads.count(&UseMI)) {
				1546	const MCInstrDesc &Desc = UseMI.getDesc();
				1547	int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
				1548	assert(MemRefBeginIdx >= 0 &&
				1549	"Should always have mem references here!");
				1550	MemRefBeginIdx += X86II::getOperandBias(Desc);
				1551
				1552	MachineOperand &BaseMO =
				1553	UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
				1554	MachineOperand &IndexMO =
				1555	UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
				1556	if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) \|\|
				1557	(IndexMO.isReg() && IndexMO.getReg() == DefReg))
				1558	// The load uses the register as part of its address making it not
				1559	// invariant.
				1560	return {};
				1561
				1562	continue;
				1563	}
				1564
				1565	if (SingleUseMI)
				1566	// We already have a single use, this would make two. Bail.
				1567	return {};
				1568
				1569	// If this single use isn't data invariant, isn't in this block, or has
				1570	// interfering EFLAGS, we can't sink the hardening to it.
				1571	if (!isDataInvariant(UseMI) \|\| UseMI.getParent() != MI.getParent())
				1572	return {};
				1573
				1574	// If this instruction defines multiple registers bail as we won't harden
				1575	// all of them.
				1576	if (UseMI.getDesc().getNumDefs() > 1)
				1577	return {};
				1578
				1579	// If this register isn't a virtual register we can't walk uses of sanely,
				1580	// just bail. Also check that its register class is one of the ones we
				1581	// can harden.
				1582	unsigned UseDefReg = UseMI.getOperand(0).getReg();
				1583	if (!TRI->isVirtualRegister(UseDefReg) \|\|
				1584	!MRI->getRegClass(UseDefReg)->hasSubClassEq(&X86::GR64RegClass))
				1585	return {};
				1586
				1587	SingleUseMI = &UseMI;
				1588	}
				1589
				1590	// If SingleUseMI is still null, there is no use that needs its own
				1591	// checking. Otherwise, it is the single use that needs checking.
				1592	return {SingleUseMI};
				1593	};
				1594
				1595	MachineInstr *MI = &InitialMI;
				1596	while (Optional<MachineInstr > SingleUse = SinkCheckToSingleUse(MI)) {
				1597	// Update which MI we're checking now.
				1598	MI = *SingleUse;
				1599	if (!MI)
				1600	break;
				1601	}
				1602
				1603	return MI;
				1604	}
				1605
Chandler Carruth	e66a6f4	2018-07-16 11:38:48 +0000	[diff] [blame]	1606	bool X86SpeculativeLoadHardeningPass::canHardenPostLoad(MachineInstr &MI) {
				1607	if (!isDataInvariantLoad(MI))
				1608	return false;
				1609
				1610	auto &DefOp = MI.getOperand(0);
				1611	unsigned OldDefReg = DefOp.getReg();
				1612
				1613	auto *DefRC = MRI->getRegClass(OldDefReg);
				1614	int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;
				1615	if (DefRegBytes > 8)
				1616	// We don't support post-load hardening of vectors.
				1617	return false;
				1618
				1619	const TargetRegisterClass *GPRRegClasses[] = {
				1620	&X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
				1621	&X86::GR64RegClass};
				1622	return DefRC->hasSuperClassEq(GPRRegClasses[Log2_32(DefRegBytes)]);
				1623	}
				1624
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	1625	// We can harden non-leaking loads into register without touching the address
				1626	// by just hiding all of the loaded bits. We use an `or` instruction to do
				1627	// this because having the poison value be all ones allows us to use the same
				1628	// value below. And the goal is just for the loaded bits to not be exposed to
				1629	// execution and coercing them to one is sufficient.
				1630	void X86SpeculativeLoadHardeningPass::hardenPostLoad(
				1631	MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
Chandler Carruth	e66a6f4	2018-07-16 11:38:48 +0000	[diff] [blame]	1632	assert(canHardenPostLoad(MI) &&
				1633	"Invalid instruction for post-load hardening!");
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	1634
				1635	MachineBasicBlock &MBB = *MI.getParent();
				1636	DebugLoc Loc = MI.getDebugLoc();
				1637
				1638	// For all of these, the def'ed register operand is operand zero.
				1639	auto &DefOp = MI.getOperand(0);
				1640	unsigned OldDefReg = DefOp.getReg();
				1641
				1642	auto *DefRC = MRI->getRegClass(OldDefReg);
				1643	int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;
				1644
				1645	unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
				1646	unsigned OrOpCode = OrOpCodes[Log2_32(DefRegBytes)];
				1647
Chandler Carruth	90358e1	2018-07-13 11:13:58 +0000	[diff] [blame]	1648	unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
				1649
				1650	auto GetStateRegInRC = [&](const TargetRegisterClass &RC) {
				1651	unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
				1652
				1653	int Bytes = TRI->getRegSizeInBits(RC) / 8;
				1654	// FIXME: Need to teach this about 32-bit mode.
				1655	if (Bytes != 8) {
				1656	unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
				1657	unsigned NarrowStateReg = MRI->createVirtualRegister(&RC);
				1658	BuildMI(MBB, MI.getIterator(), Loc, TII->get(TargetOpcode::COPY),
				1659	NarrowStateReg)
				1660	.addReg(StateReg, 0, SubRegImm);
				1661	StateReg = NarrowStateReg;
				1662	}
				1663	return StateReg;
				1664	};
				1665
				1666	auto InsertPt = std::next(MI.getIterator());
				1667	unsigned FlagsReg = 0;
				1668	bool EFLAGSLive = isEFLAGSLive(MBB, InsertPt, *TRI);
				1669	if (EFLAGSLive && !Subtarget->hasBMI2()) {
				1670	FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
				1671	EFLAGSLive = false;
				1672	}
				1673
				1674	if (!EFLAGSLive) {
				1675	unsigned StateReg = GetStateRegInRC(*DefRC);
				1676	unsigned NewDefReg = MRI->createVirtualRegister(DefRC);
				1677	DefOp.setReg(NewDefReg);
				1678	auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg)
				1679	.addReg(StateReg)
				1680	.addReg(NewDefReg);
				1681	OrI->addRegisterDead(X86::EFLAGS, TRI);
				1682	++NumInstsInserted;
				1683	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
				1684	} else {
				1685	assert(Subtarget->hasBMI2() &&
				1686	"Cannot harden loads and preserve EFLAGS without BMI2!");
				1687
				1688	unsigned ShiftOpCode = DefRegBytes < 4 ? X86::SHRX32rr : X86::SHRX64rr;
				1689	auto &ShiftRC =
				1690	DefRegBytes < 4 ? X86::GR32_NOSPRegClass : X86::GR64_NOSPRegClass;
				1691	int ShiftRegBytes = TRI->getRegSizeInBits(ShiftRC) / 8;
				1692	unsigned DefSubRegImm = SubRegImms[Log2_32(DefRegBytes)];
				1693
				1694	unsigned StateReg = GetStateRegInRC(ShiftRC);
				1695
				1696	// First have the def instruction def a temporary register.
				1697	unsigned TmpReg = MRI->createVirtualRegister(DefRC);
				1698	DefOp.setReg(TmpReg);
				1699	// Now copy it into a register of the shift RC.
				1700	unsigned ShiftInputReg = TmpReg;
				1701	if (DefRegBytes != ShiftRegBytes) {
				1702	unsigned UndefReg = MRI->createVirtualRegister(&ShiftRC);
				1703	BuildMI(MBB, InsertPt, Loc, TII->get(X86::IMPLICIT_DEF), UndefReg);
				1704	ShiftInputReg = MRI->createVirtualRegister(&ShiftRC);
				1705	BuildMI(MBB, InsertPt, Loc, TII->get(X86::INSERT_SUBREG), ShiftInputReg)
				1706	.addReg(UndefReg)
				1707	.addReg(TmpReg)
				1708	.addImm(DefSubRegImm);
				1709	}
				1710
				1711	// We shift this once if the shift is wider than the def and thus we can
				1712	// shift all of the def'ed bytes out. Otherwise we need to do two shifts.
				1713
				1714	unsigned ShiftedReg = MRI->createVirtualRegister(&ShiftRC);
				1715	auto Shift1I =
				1716	BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), ShiftedReg)
				1717	.addReg(ShiftInputReg)
				1718	.addReg(StateReg);
				1719	(void)Shift1I;
				1720	++NumInstsInserted;
				1721	LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift1I->dump(); dbgs() << "\n");
				1722
				1723	// The only way we have a bit left is if all 8 bytes were defined. Do an
				1724	// extra shift to get the last bit in this case.
				1725	if (DefRegBytes == ShiftRegBytes) {
				1726	// We can just directly def the old def register as its the same size.
				1727	ShiftInputReg = ShiftedReg;
				1728	auto Shift2I =
				1729	BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), OldDefReg)
				1730	.addReg(ShiftInputReg)
				1731	.addReg(StateReg);
				1732	(void)Shift2I;
				1733	++NumInstsInserted;
				1734	LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift2I->dump();
				1735	dbgs() << "\n");
				1736	} else {
				1737	// When we have different size shift register we need to fix up the
				1738	// class. We can do that as we copy into the old def register.
				1739	BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), OldDefReg)
				1740	.addReg(ShiftedReg, 0, DefSubRegImm);
				1741	}
				1742	}
				1743
				1744	if (FlagsReg)
				1745	restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
				1746
				1747	++NumPostLoadRegsHardened;
				1748	}
				1749
				1750	void X86SpeculativeLoadHardeningPass::checkReturnInstr(
				1751	MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
				1752	MachineBasicBlock &MBB = *MI.getParent();
				1753	DebugLoc Loc = MI.getDebugLoc();
				1754	auto InsertPt = MI.getIterator();
				1755
				1756	if (FenceCallAndRet) {
				1757	// Simply forcibly block speculation of loads out of the function by using
				1758	// an LFENCE. This is potentially a heavy-weight mitigation strategy, but
				1759	// should be secure, is simple from an ABI perspective, and the cost can be
				1760	// minimized through inlining.
				1761	//
				1762	// FIXME: We should investigate ways to establish a strong data-dependency
				1763	// on the return. However, poisoning the stack pointer is unlikely to work
				1764	// because the return is predicted rather than relying on the load of the
				1765	// return address to actually resolve.
				1766	BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
				1767	++NumInstsInserted;
				1768	++NumLFENCEsInserted;
				1769	return;
				1770	}
				1771
				1772	// Take our predicate state, shift it to the high 17 bits (so that we keep
				1773	// pointers canonical) and merge it into RSP. This will allow the caller to
				1774	// extract it when we return (speculatively).
				1775	mergePredStateIntoSP(MBB, InsertPt, Loc,
				1776	PredStateSSA.GetValueAtEndOfBlock(&MBB));
				1777	}
				1778
				1779	INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
				1780	"X86 speculative load hardener", false, false)
				1781	INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
				1782	"X86 speculative load hardener", false, false)
				1783
				1784	FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
				1785	return new X86SpeculativeLoadHardeningPass();
				1786	}