1//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9/// \file
10///
11/// Provide a pass which mitigates speculative execution attacks which operate
12/// by speculating incorrectly past some predicate (a type check, bounds check,
13/// or other condition) to reach a load with invalid inputs and leak the data
14/// accessed by that load using a side channel out of the speculative domain.
15///
16/// For details on the attacks, see the first variant in both the Project Zero
17/// writeup and the Spectre paper:
18/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
19/// https://spectreattack.com/spectre.pdf
20///
21//===----------------------------------------------------------------------===//
22
23#include "X86.h"
24#include "X86InstrBuilder.h"
25#include "X86InstrInfo.h"
26#include "X86Subtarget.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/ScopeExit.h"
31#include "llvm/ADT/SmallPtrSet.h"
32#include "llvm/ADT/SmallSet.h"
33#include "llvm/ADT/SmallVector.h"
34#include "llvm/ADT/SparseBitVector.h"
35#include "llvm/ADT/Statistic.h"
36#include "llvm/CodeGen/MachineBasicBlock.h"
37#include "llvm/CodeGen/MachineConstantPool.h"
38#include "llvm/CodeGen/MachineFunction.h"
39#include "llvm/CodeGen/MachineFunctionPass.h"
40#include "llvm/CodeGen/MachineInstr.h"
41#include "llvm/CodeGen/MachineInstrBuilder.h"
42#include "llvm/CodeGen/MachineModuleInfo.h"
43#include "llvm/CodeGen/MachineOperand.h"
44#include "llvm/CodeGen/MachineRegisterInfo.h"
45#include "llvm/CodeGen/MachineSSAUpdater.h"
46#include "llvm/CodeGen/TargetInstrInfo.h"
47#include "llvm/CodeGen/TargetRegisterInfo.h"
48#include "llvm/CodeGen/TargetSchedule.h"
49#include "llvm/CodeGen/TargetSubtargetInfo.h"
50#include "llvm/IR/DebugLoc.h"
51#include "llvm/MC/MCSchedule.h"
52#include "llvm/Pass.h"
53#include "llvm/Support/CommandLine.h"
54#include "llvm/Support/Debug.h"
55#include "llvm/Support/raw_ostream.h"
56#include <algorithm>
57#include <cassert>
58#include <iterator>
59#include <utility>
60
61using namespace llvm;
62
63#define PASS_KEY "x86-speculative-load-hardening"
64#define DEBUG_TYPE PASS_KEY
65
66STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
67STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
68STATISTIC(NumAddrRegsHardened,
69 "Number of address mode used registers hardaned");
70STATISTIC(NumPostLoadRegsHardened,
71 "Number of post-load register values hardened");
72STATISTIC(NumInstsInserted, "Number of instructions inserted");
73STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
74
75static cl::opt<bool> HardenEdgesWithLFENCE(
76 PASS_KEY "-lfence",
77 cl::desc(
78 "Use LFENCE along each conditional edge to harden against speculative "
79 "loads rather than conditional movs and poisoned pointers."),
80 cl::init(false), cl::Hidden);
81
82static cl::opt<bool> EnablePostLoadHardening(
83 PASS_KEY "-post-load",
84 cl::desc("Harden the value loaded *after* it is loaded by "
85 "flushing the loaded bits to 1. This is hard to do "
86 "in general but can be done easily for GPRs."),
87 cl::init(true), cl::Hidden);
88
89static cl::opt<bool> FenceCallAndRet(
90 PASS_KEY "-fence-call-and-ret",
91 cl::desc("Use a full speculation fence to harden both call and ret edges "
92 "rather than a lighter weight mitigation."),
93 cl::init(false), cl::Hidden);
94
95static cl::opt<bool> HardenInterprocedurally(
96 PASS_KEY "-ip",
97 cl::desc("Harden interprocedurally by passing our state in and out of "
98 "functions in the high bits of the stack pointer."),
99 cl::init(true), cl::Hidden);
100
101static cl::opt<bool>
102 HardenLoads(PASS_KEY "-loads",
103 cl::desc("Sanitize loads from memory. When disable, no "
104 "significant security is provided."),
105 cl::init(true), cl::Hidden);
106
107namespace llvm {
108
109void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
110
111} // end namespace llvm
112
113namespace {
114
115class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
116public:
117 X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
118 initializeX86SpeculativeLoadHardeningPassPass(
119 *PassRegistry::getPassRegistry());
120 }
121
122 StringRef getPassName() const override {
123 return "X86 speculative load hardening";
124 }
125 bool runOnMachineFunction(MachineFunction &MF) override;
126 void getAnalysisUsage(AnalysisUsage &AU) const override;
127
128 /// Pass identification, replacement for typeid.
129 static char ID;
130
131private:
132 /// The information about a block's conditional terminators needed to trace
133 /// our predicate state through the exiting edges.
134 struct BlockCondInfo {
135 MachineBasicBlock *MBB;
136
137 // We mostly have one conditional branch, and in extremely rare cases have
138 // two. Three and more are so rare as to be unimportant for compile time.
139 SmallVector<MachineInstr *, 2> CondBrs;
140
141 MachineInstr *UncondBr;
142 };
143
144 const X86Subtarget *Subtarget;
145 MachineRegisterInfo *MRI;
146 const X86InstrInfo *TII;
147 const TargetRegisterInfo *TRI;
148 const TargetRegisterClass *PredStateRC;
149
150 void hardenEdgesWithLFENCE(MachineFunction &MF);
151
152 SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
153
154 void checkAllLoads(MachineFunction &MF, MachineSSAUpdater &PredStateSSA);
155
156 unsigned saveEFLAGS(MachineBasicBlock &MBB,
157 MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
158 void restoreEFLAGS(MachineBasicBlock &MBB,
159 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
160 unsigned OFReg);
161
162 void mergePredStateIntoSP(MachineBasicBlock &MBB,
163 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
164 unsigned PredStateReg);
165 unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
166 MachineBasicBlock::iterator InsertPt,
167 DebugLoc Loc);
168
169 void
170 hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
171 MachineOperand &IndexMO, MachineSSAUpdater &PredStateSSA,
172 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
173 MachineInstr *
174 sinkPostLoadHardenedInst(MachineInstr &MI,
175 SmallPtrSetImpl<MachineInstr *> &HardenedLoads);
176 void hardenPostLoad(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
177 void checkReturnInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
178 void checkCallInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
179};
180
181} // end anonymous namespace
182
183char X86SpeculativeLoadHardeningPass::ID = 0;
184
185void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
186 AnalysisUsage &AU) const {
187 MachineFunctionPass::getAnalysisUsage(AU);
188}
189
190static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
191 MachineBasicBlock &Succ, int SuccCount,
192 MachineInstr *Br, MachineInstr *&UncondBr,
193 const X86InstrInfo &TII) {
194 assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
195
196 MachineFunction &MF = *MBB.getParent();
197
198 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
199
200 // We have to insert the new block immediately after the current one as we
201 // don't know what layout-successor relationships the successor has and we
202 // may not be able to (and generally don't want to) try to fix those up.
203 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
204
205 // Update the branch instruction if necessary.
206 if (Br) {
207 assert(Br->getOperand(0).getMBB() == &Succ &&
208 "Didn't start with the right target!");
209 Br->getOperand(0).setMBB(&NewMBB);
210
211 // If this successor was reached through a branch rather than fallthrough,
212 // we might have *broken* fallthrough and so need to inject a new
213 // unconditional branch.
214 if (!UncondBr) {
215 MachineBasicBlock &OldLayoutSucc =
216 *std::next(MachineFunction::iterator(&NewMBB));
217 assert(MBB.isSuccessor(&OldLayoutSucc) &&
218 "Without an unconditional branch, the old layout successor should "
219 "be an actual successor!");
220 auto BrBuilder =
221 BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
222 // Update the unconditional branch now that we've added one.
223 UncondBr = &*BrBuilder;
224 }
225
226 // Insert unconditional "jump Succ" instruction in the new block if
227 // necessary.
228 if (!NewMBB.isLayoutSuccessor(&Succ)) {
229 SmallVector<MachineOperand, 4> Cond;
230 TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
231 }
232 } else {
233 assert(!UncondBr &&
234 "Cannot have a branchless successor and an unconditional branch!");
235 assert(NewMBB.isLayoutSuccessor(&Succ) &&
236 "A non-branch successor must have been a layout successor before "
237 "and now is a layout successor of the new block.");
238 }
239
240 // If this is the only edge to the successor, we can just replace it in the
241 // CFG. Otherwise we need to add a new entry in the CFG for the new
242 // successor.
243 if (SuccCount == 1) {
244 MBB.replaceSuccessor(&Succ, &NewMBB);
245 } else {
246 MBB.splitSuccessor(&Succ, &NewMBB);
247 }
248
249 // Hook up the edge from the new basic block to the old successor in the CFG.
250 NewMBB.addSuccessor(&Succ);
251
252 // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
253 for (MachineInstr &MI : Succ) {
254 if (!MI.isPHI())
255 break;
256 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
257 OpIdx += 2) {
258 MachineOperand &OpV = MI.getOperand(OpIdx);
259 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
260 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
261 if (OpMBB.getMBB() != &MBB)
262 continue;
263
264 // If this is the last edge to the successor, just replace MBB in the PHI
265 if (SuccCount == 1) {
266 OpMBB.setMBB(&NewMBB);
267 break;
268 }
269
270 // Otherwise, append a new pair of operands for the new incoming edge.
271 MI.addOperand(MF, OpV);
272 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
273 break;
274 }
275 }
276
277 // Inherit live-ins from the successor
278 for (auto &LI : Succ.liveins())
279 NewMBB.addLiveIn(LI);
280
281 LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
282 << Succ.getName() << "'.\n");
283 return NewMBB;
284}
285
286bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
287 MachineFunction &MF) {
288 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
289 << " **********\n");
290
291 Subtarget = &MF.getSubtarget<X86Subtarget>();
292 MRI = &MF.getRegInfo();
293 TII = Subtarget->getInstrInfo();
294 TRI = Subtarget->getRegisterInfo();
295 // FIXME: Support for 32-bit.
296 PredStateRC = &X86::GR64_NOSPRegClass;
297
298 if (MF.begin() == MF.end())
299 // Nothing to do for a degenerate empty function...
300 return false;
301
302 // We support an alternative hardening technique based on a debug flag.
303 if (HardenEdgesWithLFENCE) {
304 hardenEdgesWithLFENCE(MF);
305 return true;
306 }
307
308 // Create a dummy debug loc to use for all the generated code here.
309 DebugLoc Loc;
310
311 MachineBasicBlock &Entry = *MF.begin();
312 auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
313
314 // Do a quick scan to see if we have any checkable loads.
315 bool HasCheckableLoad = false;
316 for (MachineBasicBlock &MBB : MF) {
317 for (MachineInstr &MI : MBB) {
318 // Stop searching blocks at an LFENCE.
319 if (MI.getOpcode() == X86::LFENCE)
320 break;
321
322 // Looking for loads only.
323 if (!MI.mayLoad())
324 continue;
325
326 // An MFENCE is modeled as a load but doesn't require hardening.
327 if (MI.getOpcode() == X86::MFENCE)
328 continue;
329
330 HasCheckableLoad = true;
331 break;
332 }
333 if (HasCheckableLoad)
334 break;
335 }
336
337 // See if we have any conditional branching blocks that we will need to trace
338 // predicate state through.
339 SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
340
341 // If we have no interesting conditions or loads, nothing to do here.
342 if (!HasCheckableLoad && Infos.empty())
343 return true;
344
345 unsigned PredStateReg;
346 unsigned PredStateSizeInBytes = TRI->getRegSizeInBits(*PredStateRC) / 8;
347
348 // The poison value is required to be an all-ones value for many aspects of
349 // this mitigation.
350 const int PoisonVal = -1;
351 unsigned PoisonReg = MRI->createVirtualRegister(PredStateRC);
352 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PoisonReg)
353 .addImm(PoisonVal);
354 ++NumInstsInserted;
355
356 // If we have loads being hardened and we've asked for call and ret edges to
357 // get a full fence-based mitigation, inject that fence.
358 if (HasCheckableLoad && FenceCallAndRet) {
359 // We need to insert an LFENCE at the start of the function to suspend any
360 // incoming misspeculation from the caller. This helps two-fold: the caller
361 // may not have been protected as this code has been, and this code gets to
362 // not take any specific action to protect across calls.
363 // FIXME: We could skip this for functions which unconditionally return
364 // a constant.
365 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
366 ++NumInstsInserted;
367 ++NumLFENCEsInserted;
368 }
369
370 // If we have no conditionals to protect in blocks, then all we needed to do
371 // was protect the entry and so we're done.
372 if (Infos.empty())
373 // We may have changed the function's code at this point to insert fences.
374 return true;
375
377 if (HardenInterprocedurally && !FenceCallAndRet) {
378 // Set up the predicate state by extracting it from the incoming stack
379 // pointer so we pick up any misspeculation in our caller.
380 PredStateReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
381 } else {
382 // Otherwise, just build the predicate state itself by zeroing a register
383 // as we don't need any initial state.
384 PredStateReg = MRI->createVirtualRegister(PredStateRC);
385 unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
386 auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
387 PredStateSubReg);
388 ++NumInstsInserted;
389 MachineOperand *ZeroEFLAGSDefOp =
390 ZeroI->findRegisterDefOperand(X86::EFLAGS);
391 assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
392 "Must have an implicit def of EFLAGS!");
393 ZeroEFLAGSDefOp->setIsDead(true);
394 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
395 PredStateReg)
396 .addImm(0)
397 .addReg(PredStateSubReg)
398 .addImm(X86::sub_32bit);
399 }
400
401 // We're going to need to trace predicate state throughout the function's
402 // CFG. Prepare for this by setting up our initial state of PHIs with unique
403 // predecessor entries and all the initial predicate state.
404
405 // FIXME: It's really frustrating that we have to do this, but SSA-form in
406 // MIR isn't what you might expect. We may have multiple entries in PHI nodes
407 // for a single predecessor. This makes CFG-updating extremely complex, so
408 // here we simplify all PHI nodes to a model even simpler than the IR's
409 // model: exactly one entry per predecessor, regardless of how many edges
410 // there are.
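  // For example, a PHI with operands (%a, %bb1), (%a, %bb1), (%b, %bb2),
  // where %bb1 appears twice because two edges come from it, is rewritten
  // here to carry a single %bb1 entry. (Illustrative shape, not literal MIR.)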
411 SmallPtrSet<MachineBasicBlock *, 4> Preds;
412 SmallVector<int, 4> DupIndices;
413 for (auto &MBB : MF)
414 for (auto &MI : MBB) {
415 if (!MI.isPHI())
416 break;
417
418 // First we scan the operands of the PHI looking for duplicate entries
419 // for a particular predecessor. We retain the operand index of each duplicate
420 // entry found.
421 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
422 OpIdx += 2)
423 if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
424 DupIndices.push_back(OpIdx);
425
426 // Now walk the duplicate indices, removing both the block and value. Note
427 // that these are stored as a vector making this element-wise removal
429 // potentially quadratic.
430 //
431 // FIXME: It is really frustrating that we have to use a quadratic
432 // removal algorithm here. There should be a better way, but the use-def
433 // updates required make that impossible using the public API.
434 //
435 // Note that we have to process these backwards so that we don't
436 // invalidate other indices with each removal.
437 while (!DupIndices.empty()) {
438 int OpIdx = DupIndices.pop_back_val();
439 // Remove both the block and value operand, again in reverse order to
440 // preserve indices.
441 MI.RemoveOperand(OpIdx + 1);
442 MI.RemoveOperand(OpIdx);
443 }
444
445 Preds.clear();
446 }
447
448 // Track the updated values in an SSA updater to rewrite into SSA form at the
449 // end.
450 MachineSSAUpdater PredStateSSA(MF);
451 PredStateSSA.Initialize(PredStateReg);
452 PredStateSSA.AddAvailableValue(&Entry, PredStateReg);
453 // Collect the inserted instructions so we can rewrite their uses of the
454 // predicate state into SSA form.
455 SmallVector<MachineInstr *, 16> CMovs;
456
457 // Now walk all of the basic blocks looking for ones that end in conditional
458 // jumps where we need to update this register along each edge.
459 for (BlockCondInfo &Info : Infos) {
460 MachineBasicBlock &MBB = *Info.MBB;
461 SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
462 MachineInstr *UncondBr = Info.UncondBr;
463
464 LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
465 << "\n");
466 ++NumCondBranchesTraced;
467
468 // Compute the non-conditional successor as either the target of any
469 // unconditional branch or the layout successor.
470 MachineBasicBlock *UncondSucc =
471 UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
472 ? UncondBr->getOperand(0).getMBB()
473 : nullptr)
474 : &*std::next(MachineFunction::iterator(&MBB));
475
476 // Count how many edges there are to any given successor.
477 SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
478 if (UncondSucc)
479 ++SuccCounts[UncondSucc];
480 for (auto *CondBr : CondBrs)
481 ++SuccCounts[CondBr->getOperand(0).getMBB()];
482
483 // A lambda to insert cmov instructions into a block checking all of the
484 // condition codes in a sequence.
485 auto BuildCheckingBlockForSuccAndConds =
486 [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
487 MachineInstr *Br, MachineInstr *&UncondBr,
488 ArrayRef<X86::CondCode> Conds) {
489 // First, we split the edge to insert the checking block into a safe
490 // location.
491 auto &CheckingMBB =
492 (SuccCount == 1 && Succ.pred_size() == 1)
493 ? Succ
494 : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
495
496 bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
497 if (!LiveEFLAGS)
498 CheckingMBB.addLiveIn(X86::EFLAGS);
499
500 // Now insert the cmovs to implement the checks.
501 auto InsertPt = CheckingMBB.begin();
502 assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
503 "Should never have a PHI in the initial checking block as it "
504 "always has a single predecessor!");
505
506 // We will wire each cmov to each other, but need to start with the
507 // incoming pred state.
508 unsigned CurStateReg = PredStateReg;
509
510 for (X86::CondCode Cond : Conds) {
511 auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
512
513 unsigned UpdatedStateReg = MRI->createVirtualRegister(PredStateRC);
514 auto CMovI = BuildMI(CheckingMBB, InsertPt, Loc, TII->get(CMovOp),
515 UpdatedStateReg)
516 .addReg(CurStateReg)
517 .addReg(PoisonReg);
518 // If this is the last cmov and the EFLAGS weren't originally
519 // live-in, mark them as killed.
520 if (!LiveEFLAGS && Cond == Conds.back())
521 CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
522
523 ++NumInstsInserted;
524 LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
525 dbgs() << "\n");
526
527 // The first one of the cmovs will be using the top level
528 // `PredStateReg` and need to get rewritten into SSA form.
529 if (CurStateReg == PredStateReg)
530 CMovs.push_back(&*CMovI);
531
532 // The next cmov should start from this one's def.
533 CurStateReg = UpdatedStateReg;
534 }
535
536 // And put the last one into the available values for PredStateSSA.
537 PredStateSSA.AddAvailableValue(&CheckingMBB, CurStateReg);
538 };
539
540 std::vector<X86::CondCode> UncondCodeSeq;
541 for (auto *CondBr : CondBrs) {
542 MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
543 int &SuccCount = SuccCounts[&Succ];
544
545 X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
546 X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
547 UncondCodeSeq.push_back(Cond);
548
549 BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
550 {InvCond});
551
552 // Decrement the successor count now that we've split one of the edges.
553 // We need to keep the count of edges to the successor accurate in order
554 // to know above when to *replace* the successor in the CFG vs. just
555 // adding the new successor.
556 --SuccCount;
557 }
558
559 // Since we may have split edges and changed the number of successors,
560 // normalize the probabilities. This avoids doing it each time we split an
561 // edge.
562 MBB.normalizeSuccProbs();
563
564 // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
565 // need to intersect the other condition codes. We can do this by just
566 // doing a cmov for each one.
567 if (!UncondSucc)
568 // If we have no fallthrough to protect (perhaps it is an indirect jump?)
569 // just skip this and continue.
570 continue;
571
572 assert(SuccCounts[UncondSucc] == 1 &&
573 "We should never have more than one edge to the unconditional "
574 "successor at this point because every other edge must have been "
575 "split above!");
576
577 // Sort and unique the codes to minimize them.
578 llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
579 UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
580 UncondCodeSeq.end());
581
582 // Build a checking version of the successor.
583 BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
584 UncondBr, UncondBr, UncondCodeSeq);
585 }
586
587 // We may also enter basic blocks in this function via exception handling
588 // control flow. Here, if we are hardening interprocedurally, we need to
589 // re-capture the predicate state from the throwing code. In the Itanium ABI,
590 // the throw will always look like a call to __cxa_throw and will have the
591 // predicate state in the stack pointer, so extract fresh predicate state from
592 // the stack pointer and make it available in SSA.
593 // FIXME: Handle non-itanium ABI EH models.
594 if (HardenInterprocedurally) {
595 for (MachineBasicBlock &MBB : MF) {
596 assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
597 assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
598 assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
599 if (!MBB.isEHPad())
600 continue;
601 PredStateSSA.AddAvailableValue(
602 &MBB,
603 extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
604 }
605 }
606
607 // Now check all of the loads using the predicate state.
608 checkAllLoads(MF, PredStateSSA);
609
610 // Now rewrite all the uses of the pred state using the SSA updater so that
611 // we track updates through the CFG.
612 for (MachineInstr *CMovI : CMovs)
613 for (MachineOperand &Op : CMovI->operands()) {
614 if (!Op.isReg() || Op.getReg() != PredStateReg)
615 continue;
616
617 PredStateSSA.RewriteUse(Op);
618 }
619
620 // If we are hardening interprocedurally, find each returning block and
621 // protect the caller from being returned to through misspeculation.
622 if (HardenInterprocedurally)
623 for (MachineBasicBlock &MBB : MF) {
624 if (MBB.empty())
625 continue;
626
627 MachineInstr &MI = MBB.back();
628 if (!MI.isReturn())
629 continue;
630
631 checkReturnInstr(MI, PredStateSSA);
632 }
633
634 LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
635 dbgs() << "\n"; MF.verify(this));
636 return true;
637}
638
639/// Implements the naive hardening approach of putting an LFENCE after every
640/// potentially mis-predicted control flow construct.
641///
642/// We include this as an alternative mostly for the purpose of comparison. The
643/// performance impact of this is expected to be extremely severe and not
644/// practical for any real-world users.
645void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
646 MachineFunction &MF) {
647 // First, we scan the function looking for blocks that are reached along edges
648 // that we might want to harden.
649 SmallSetVector<MachineBasicBlock *, 8> Blocks;
650 for (MachineBasicBlock &MBB : MF) {
651 // If there are no or only one successor, nothing to do here.
652 if (MBB.succ_size() <= 1)
653 continue;
654
655 // Skip blocks unless their terminators start with a branch. Other
656 // terminators don't seem interesting for guarding against misspeculation.
657 auto TermIt = MBB.getFirstTerminator();
658 if (TermIt == MBB.end() || !TermIt->isBranch())
659 continue;
660
661 // Add all the non-EH-pad successors to the blocks we want to harden. We
662 // skip EH pads because there isn't really a condition of interest on
663 // entering.
664 for (MachineBasicBlock *SuccMBB : MBB.successors())
665 if (!SuccMBB->isEHPad())
666 Blocks.insert(SuccMBB);
667 }
668
669 for (MachineBasicBlock *MBB : Blocks) {
670 auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
671 BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
672 ++NumInstsInserted;
673 ++NumLFENCEsInserted;
674 }
675}
676
677SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
678X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
679 SmallVector<BlockCondInfo, 16> Infos;
680
681 // Walk the function and build up a summary for each block's conditions that
682 // we need to trace through.
683 for (MachineBasicBlock &MBB : MF) {
684 // If there are no or only one successor, nothing to do here.
685 if (MBB.succ_size() <= 1)
686 continue;
687
688 // We want to reliably handle any conditional branch terminators in the
689 // MBB, so we manually analyze the branch. We can handle all of the
690 // permutations here, including ones that analyze branch cannot.
691 //
692 // The approach is to walk backwards across the terminators, resetting at
693 // any unconditional non-indirect branch, and track all conditional edges
694 // to basic blocks as well as the fallthrough or unconditional successor
695 // edge. For each conditional edge, we track the target and the opposite
696 // condition code in order to inject a "no-op" cmov into that successor
697 // that will harden the predicate. For the fallthrough/unconditional
698 // edge, we inject a separate cmov for each conditional branch with
699 // matching condition codes. This effectively implements an "and" of the
700 // condition flags, even if there isn't a single condition flag that would
701 // directly implement that. We don't bother trying to optimize either of
702 // these cases because if such an optimization is possible, LLVM should
703 // have optimized the conditional *branches* in that way already to reduce
704 // instruction count. This late, we simply assume the minimal number of
705 // branch instructions is being emitted and use that to guide our cmov
706 // insertion.
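    //
    // As a rough illustration (not literal syntax): for a block ending in
    //
    //   jne .Ltaken
    //   jmp .Lfallthrough
    //
    // the edge to .Ltaken gets a `cmove poison, state` (poison the state if
    // the condition was actually equal) and the edge to .Lfallthrough gets a
    // `cmovne poison, state` (poison it if the condition was actually
    // not-equal).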
707
708 BlockCondInfo Info = {&MBB, {}, nullptr};
709
710 // Now walk backwards through the terminators and build up successors they
711 // reach and the conditions.
712 for (MachineInstr &MI : llvm::reverse(MBB)) {
713 // Once we've handled all the terminators, we're done.
714 if (!MI.isTerminator())
715 break;
716
717 // If we see a non-branch terminator, we can't handle anything so bail.
718 if (!MI.isBranch()) {
719 Info.CondBrs.clear();
720 break;
721 }
722
723 // If we see an unconditional branch, reset our state, clear any
724 // fallthrough, and set this as the "else" successor.
725 if (MI.getOpcode() == X86::JMP_1) {
726 Info.CondBrs.clear();
727 Info.UncondBr = &MI;
728 continue;
729 }
730
731 // If we get an invalid condition, we have an indirect branch or some
732 // other unanalyzable "fallthrough" case. We model this as a nullptr for
733 // the destination so we can still guard any conditional successors.
734 // Consider code sequences like:
735 // ```
736 // jCC L1
737 // jmpq *%rax
738 // ```
739 // We still want to harden the edge to `L1`.
740 if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
741 Info.CondBrs.clear();
742 Info.UncondBr = &MI;
743 continue;
744 }
745
746 // We have a vanilla conditional branch, add it to our list.
747 Info.CondBrs.push_back(&MI);
748 }
749 if (Info.CondBrs.empty()) {
750 ++NumBranchesUntraced;
751 LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
752 MBB.dump());
753 continue;
754 }
755
756 Infos.push_back(Info);
757 }
758
759 return Infos;
760}
761
762/// Returns true if the instruction has no behavior (specified or otherwise)
763/// that is based on the value of any of its register operands
764///
765/// A classical example of something that is inherently not data invariant is an
766/// indirect jump -- the destination is loaded into icache based on the bits set
767/// in the jump destination register.
768///
769/// FIXME: This should become part of our instruction tables.
770static bool isDataInvariant(MachineInstr &MI) {
771 switch (MI.getOpcode()) {
772 default:
773 // By default, assume that the instruction is not data invariant.
774 return false;
775
776 // FIXME: For now, we just use a very boring, conservative set of unary
777 // instructions because we're mostly interested in handling simple
778 // transformations.
779 case TargetOpcode::COPY:
780 return true;
781 }
782}
783
784/// Returns true if the instruction has no behavior (specified or otherwise)
785/// that is based on the value loaded from memory or the value of any
786/// non-address register operands.
787///
788 /// For example, the latency of the instruction must not depend on the
789 /// particular bits set in any of the registers *or* any of the bits loaded
790 /// from memory.
791///
792/// A classical example of something that is inherently not data invariant is an
793/// indirect jump -- the destination is loaded into icache based on the bits set
794/// in the jump destination register.
795///
796/// FIXME: This should become part of our instruction tables.
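///
/// For example (illustrative only), an `add` whose source operand is loaded
/// from memory executes in the same time regardless of the loaded value, so
/// its result can be hardened after the fact; by contrast, a division fed
/// from memory has data-dependent latency on many implementations and is not
/// on this list.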
797static bool isDataInvariantLoad(MachineInstr &MI) {
798 switch (MI.getOpcode()) {
799 default:
800 // By default, assume that the load will immediately leak.
801 return false;
802
803 // On x86 it is believed that the imul instructions are constant time w.r.t.
804 // the loaded data. However, they set flags and are perhaps the most
805 // surprisingly constant-time operations, so we call them out here separately.
806 case X86::IMUL16rm:
807 case X86::IMUL16rmi8:
808 case X86::IMUL16rmi:
809 case X86::IMUL32rm:
810 case X86::IMUL32rmi8:
811 case X86::IMUL32rmi:
812 case X86::IMUL64rm:
813 case X86::IMUL64rmi32:
814 case X86::IMUL64rmi8:
815
816 // Bit scanning and counting instructions scan across bits and do other
817 // fairly complex operations (like popcnt), yet are somewhat surprisingly
818 // believed to be constant time on x86. However, these set flags.
820 case X86::BSF16rm:
821 case X86::BSF32rm:
822 case X86::BSF64rm:
823 case X86::BSR16rm:
824 case X86::BSR32rm:
825 case X86::BSR64rm:
826 case X86::LZCNT16rm:
827 case X86::LZCNT32rm:
828 case X86::LZCNT64rm:
829 case X86::POPCNT16rm:
830 case X86::POPCNT32rm:
831 case X86::POPCNT64rm:
832 case X86::TZCNT16rm:
833 case X86::TZCNT32rm:
834 case X86::TZCNT64rm:
835
836 // Bit manipulation instructions are effectively combinations of basic
837 // arithmetic ops, and should still execute in constant time. These also
838 // set flags.
839 case X86::BLCFILL32rm:
840 case X86::BLCFILL64rm:
841 case X86::BLCI32rm:
842 case X86::BLCI64rm:
843 case X86::BLCIC32rm:
844 case X86::BLCIC64rm:
845 case X86::BLCMSK32rm:
846 case X86::BLCMSK64rm:
847 case X86::BLCS32rm:
848 case X86::BLCS64rm:
849 case X86::BLSFILL32rm:
850 case X86::BLSFILL64rm:
851 case X86::BLSI32rm:
852 case X86::BLSI64rm:
853 case X86::BLSIC32rm:
854 case X86::BLSIC64rm:
855 case X86::BLSMSK32rm:
856 case X86::BLSMSK64rm:
857 case X86::BLSR32rm:
858 case X86::BLSR64rm:
859 case X86::TZMSK32rm:
860 case X86::TZMSK64rm:
861
862 // Bit extracting and clearing instructions should execute in constant time,
863 // and set flags.
864 case X86::BEXTR32rm:
865 case X86::BEXTR64rm:
866 case X86::BEXTRI32mi:
867 case X86::BEXTRI64mi:
868 case X86::BZHI32rm:
869 case X86::BZHI64rm:
870
871 // Basic arithmetic is constant time on the input but does set flags.
872 case X86::ADC8rm:
873 case X86::ADC16rm:
874 case X86::ADC32rm:
875 case X86::ADC64rm:
876 case X86::ADCX32rm:
877 case X86::ADCX64rm:
878 case X86::ADD8rm:
879 case X86::ADD16rm:
880 case X86::ADD32rm:
881 case X86::ADD64rm:
882 case X86::ADOX32rm:
883 case X86::ADOX64rm:
884 case X86::AND8rm:
885 case X86::AND16rm:
886 case X86::AND32rm:
887 case X86::AND64rm:
888 case X86::ANDN32rm:
889 case X86::ANDN64rm:
890 case X86::OR8rm:
891 case X86::OR16rm:
892 case X86::OR32rm:
893 case X86::OR64rm:
894 case X86::SBB8rm:
895 case X86::SBB16rm:
896 case X86::SBB32rm:
897 case X86::SBB64rm:
898 case X86::SUB8rm:
899 case X86::SUB16rm:
900 case X86::SUB32rm:
901 case X86::SUB64rm:
902 case X86::XOR8rm:
903 case X86::XOR16rm:
904 case X86::XOR32rm:
905 case X86::XOR64rm:
906 // Check whether the EFLAGS implicit-def is dead. We assume that this will
907 // always find the implicit-def because this code should only be reached
908 // for instructions that do in fact implicitly def this.
909 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
910 // If we would clobber EFLAGS that are used, just bail for now.
911 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
912 MI.dump(); dbgs() << "\n");
913 return false;
914 }
915
916 // Otherwise, fallthrough to handle these the same as instructions that
917 // don't set EFLAGS.
918 LLVM_FALLTHROUGH;
919
920 // Integer multiply w/o affecting flags is still believed to be constant
921 // time on x86. Called out separately as this is among the most surprising
922 // instructions to exhibit that behavior.
923 case X86::MULX32rm:
924 case X86::MULX64rm:
925
926 // Arithmetic instructions that are both constant time and don't set flags.
927 case X86::PDEP32rm:
928 case X86::PDEP64rm:
929 case X86::PEXT32rm:
930 case X86::PEXT64rm:
931 case X86::RORX32mi:
932 case X86::RORX64mi:
933 case X86::SARX32rm:
934 case X86::SARX64rm:
935 case X86::SHLX32rm:
936 case X86::SHLX64rm:
937 case X86::SHRX32rm:
938 case X86::SHRX64rm:
939
940 // Conversions are believed to be constant time and don't set flags.
941 // FIXME: Add AVX versions.
942 case X86::CVTSD2SI64rm_Int:
943 case X86::CVTSD2SIrm_Int:
944 case X86::CVTSS2SI64rm_Int:
945 case X86::CVTSS2SIrm_Int:
946 case X86::CVTTSD2SI64rm:
947 case X86::CVTTSD2SI64rm_Int:
948 case X86::CVTTSD2SIrm:
949 case X86::CVTTSD2SIrm_Int:
950 case X86::CVTTSS2SI64rm:
951 case X86::CVTTSS2SI64rm_Int:
952 case X86::CVTTSS2SIrm:
953 case X86::CVTTSS2SIrm_Int:
954
955 // Loads to register don't set flags.
956 case X86::MOV8rm:
957 case X86::MOV8rm_NOREX:
958 case X86::MOV16rm:
959 case X86::MOV32rm:
960 case X86::MOV64rm:
961 case X86::MOVSX16rm8:
962 case X86::MOVSX32rm16:
963 case X86::MOVSX32rm8:
964 case X86::MOVSX32rm8_NOREX:
965 case X86::MOVSX64rm16:
966 case X86::MOVSX64rm32:
967 case X86::MOVSX64rm8:
968 case X86::MOVZX16rm8:
969 case X86::MOVZX32rm16:
970 case X86::MOVZX32rm8:
971 case X86::MOVZX32rm8_NOREX:
972 case X86::MOVZX64rm16:
973 case X86::MOVZX64rm8:
974 return true;
975 }
976}
977
978static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
979 const TargetRegisterInfo &TRI) {
980 // Check if EFLAGS are alive by seeing if there is a def of them or they
981 // live-in, and then seeing if that def is in turn used.
982 for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
983 if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
984 // If the def is dead, then EFLAGS is not live.
985 if (DefOp->isDead())
986 return false;
987
988 // Otherwise we've def'ed it, and it is live.
989 return true;
990 }
991 // While at this instruction, also check if we use and kill EFLAGS
992 // which means it isn't live.
993 if (MI.killsRegister(X86::EFLAGS, &TRI))
994 return false;
995 }
996
997 // If we didn't find anything conclusive (neither definitely alive or
998 // definitely dead) return whether it lives into the block.
999 return MBB.isLiveIn(X86::EFLAGS);
1000}
1001
1002void X86SpeculativeLoadHardeningPass::checkAllLoads(
1003 MachineFunction &MF, MachineSSAUpdater &PredStateSSA) {
1004 // If the actual checking of loads is disabled, skip doing anything here.
1005 if (!HardenLoads)
1006 return;
1007
1008 SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
1009 SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
1010
1011 SmallSet<unsigned, 16> HardenedAddrRegs;
1012
1013 SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
1014
1015 // Track the set of load-dependent registers through the basic block. Because
1016 // the values of these registers have an existing data dependency on a loaded
1017 // value which we would have checked, we can omit any checks on them.
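  // For example (illustrative): if %r1 is defined by a load whose address we
  // hardened and %r2 is computed from %r1 by adding a constant, a later load
  // through %r2 is already data-dependent on the checked load and needn't be
  // checked again within this block.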
1018 SparseBitVector<> LoadDepRegs;
1019
1020 for (MachineBasicBlock &MBB : MF) {
1021 // We harden the loads of a basic block in several passes:
1022 //
1023 // 1) Collect all the loads which can have their loaded value hardened
1024 // and all the loads that instead need their address hardened. During
1025 // this walk we propagate load dependence for address hardened loads and
1026 // also look for LFENCE to stop hardening wherever possible. When
1027 // deciding whether or not to harden the loaded value, we check
1028 // to see if any registers used in the address will have been hardened
1029 // at this point and if so, harden any remaining address registers as
1030 // that often successfully re-uses hardened addresses and minimizes
1031 // instructions. FIXME: We should consider an aggressive mode where we
1032 // continue to keep as many loads value-hardened as possible even when some address
1033 // register hardening would be free (due to reuse).
1034 for (MachineInstr &MI : MBB) {
1035 // We naively assume that all def'ed registers of an instruction have
1036 // a data dependency on all of their operands.
1037 // FIXME: Do a more careful analysis of x86 to build a conservative model
1038 // here.
1039 if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
1040 return Op.isReg() && LoadDepRegs.test(Op.getReg());
1041 }))
1042 for (MachineOperand &Def : MI.defs())
1043 if (Def.isReg())
1044 LoadDepRegs.set(Def.getReg());
1045
1046 // Both Intel and AMD are guiding that they will change the semantics of
1047 // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1048 // no more need to guard things in this block.
1049 if (MI.getOpcode() == X86::LFENCE)
1050 break;
1051
1052 // If this instruction cannot load, nothing to do.
1053 if (!MI.mayLoad())
1054 continue;
1055
1056 // Some instructions which "load" are trivially safe or unimportant.
1057 if (MI.getOpcode() == X86::MFENCE)
1058 continue;
1059
1060 // Extract the memory operand information about this instruction.
1061 // FIXME: This doesn't handle loading pseudo instructions which we often
1062 // could handle with similarly generic logic. We probably need to add an
1063 // MI-layer routine similar to the MC-layer one we use here which maps
1064 // pseudos much like this maps real instructions.
1065 const MCInstrDesc &Desc = MI.getDesc();
1066 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1067 if (MemRefBeginIdx < 0) {
1068 LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: ";
1069 MI.dump());
1070 continue;
1071 }
1072
1073 MemRefBeginIdx += X86II::getOperandBias(Desc);
1074
1075 MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1076 MachineOperand &IndexMO =
1077 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1078
1079 // If we have at least one (non-frame-index, non-RIP) register operand,
1080 // and neither operand is load-dependent, we need to check the load.
1081 unsigned BaseReg = 0, IndexReg = 0;
1082 if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1083 BaseMO.getReg() != X86::NoRegister)
1084 BaseReg = BaseMO.getReg();
1085 if (IndexMO.getReg() != X86::NoRegister)
1086 IndexReg = IndexMO.getReg();
1087
1088 if (!BaseReg && !IndexReg)
1089 // No register operands!
1090 continue;
1091
1092 // If any register operand is dependent, this load is dependent and we
1093 // needn't check it.
1094 // FIXME: Is this true in the case where we are hardening loads after
1095 // they complete? Unclear, need to investigate.
1096 if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
1097 (IndexReg && LoadDepRegs.test(IndexReg)))
1098 continue;
1099
1100 // If post-load hardening is enabled, this load is known to be
1101 // data-invariant, and we aren't already going to harden one of the
1102 // address registers, queue it up to be hardened post-load. Notably, even
1103 // once hardened this won't introduce a useful dependency that could prune
1104 // out subsequent loads.
1105 if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
1106 !HardenedAddrRegs.count(BaseReg) &&
1107 !HardenedAddrRegs.count(IndexReg)) {
1108 HardenPostLoad.insert(&MI);
1109 HardenedAddrRegs.insert(MI.getOperand(0).getReg());
1110 continue;
1111 }
1112
1113 // Record this instruction for address hardening and record its register
1114 // operands as being address-hardened.
1115 HardenLoadAddr.insert(&MI);
1116 if (BaseReg)
1117 HardenedAddrRegs.insert(BaseReg);
1118 if (IndexReg)
1119 HardenedAddrRegs.insert(IndexReg);
1120
1121 for (MachineOperand &Def : MI.defs())
1122 if (Def.isReg())
1123 LoadDepRegs.set(Def.getReg());
1124 }
1125
1126 // Now re-walk the instructions in the basic block, and apply whichever
1127 // hardening strategy we have elected. Note that we do this in a second
1128 // pass specifically so that we have the complete set of instructions for
1129 // which we will do post-load hardening and can defer it in certain
1130 // circumstances.
1131 //
1132 // FIXME: This could probably be made even more effective by doing it
1133 // across the entire function. Rather than just walking the flat list
1134 // backwards here, we could walk the function in PO and each block bottom
1135 // up, allowing us to in some cases sink hardening across blocks. As
1136 // long as the in-block predicate state is used at the eventual hardening
1137 // site, this remains safe.
1138 for (MachineInstr &MI : MBB) {
1139 // We cannot both require hardening the def of a load and its address.
1140 assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
1141 "Requested to harden both the address and def of a load!");
1142
1143 // Check if this is a load whose address needs to be hardened.
1144 if (HardenLoadAddr.erase(&MI)) {
1145 const MCInstrDesc &Desc = MI.getDesc();
1146 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1147 assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
1148
1149 MemRefBeginIdx += X86II::getOperandBias(Desc);
1150
1151 MachineOperand &BaseMO =
1152 MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1153 MachineOperand &IndexMO =
1154 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1155 hardenLoadAddr(MI, BaseMO, IndexMO, PredStateSSA, AddrRegToHardenedReg);
1156 continue;
1157 }
1158
1159 // Test if this instruction is one of our post load instructions (and
1160 // remove it from the set if so).
1161 if (HardenPostLoad.erase(&MI)) {
1162 assert(!MI.isCall() && "Must not try to post-load harden a call!");
1163
1164 // If this is a data-invariant load, we want to try and sink any
1165 // hardening as far as possible.
1166 if (isDataInvariantLoad(MI)) {
1167 // Sink the instruction we'll need to harden as far as we can down the
1168 // graph.
1169 MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
1170
1171 // If we managed to sink this instruction, update everything so we
1172 // harden that instruction when we reach it in the instruction
1173 // sequence.
1174 if (SunkMI != &MI) {
1175 // If in sinking there was no instruction needing to be hardened,
1176 // we're done.
1177 if (!SunkMI)
1178 continue;
1179
1180 // Otherwise, add this to the set of defs we harden.
1181 HardenPostLoad.insert(SunkMI);
1182 continue;
1183 }
1184 }
1185
1186 // The register def'ed by this instruction is trivially hardened so map
1187 // it to itself.
1188 AddrRegToHardenedReg[MI.getOperand(0).getReg()] =
1189 MI.getOperand(0).getReg();
1190
1191 hardenPostLoad(MI, PredStateSSA);
1192 continue;
1193 }
1194
1195 // After we finish processing the instruction and doing any hardening
1196 // necessary for it, we need to handle transferring the predicate state
1197 // into a call and recovering it after the call returns (if it returns).
1198 if (!MI.isCall())
1199 continue;
1200
1201 // If we're not hardening interprocedurally, we can just skip calls.
1202 if (!HardenInterprocedurally)
1203 continue;
1204
1205 auto InsertPt = MI.getIterator();
1206 DebugLoc Loc = MI.getDebugLoc();
1207
1208 // First, we transfer the predicate state into the called function by
1209 // merging it into the stack pointer. This will kill the current def of
1210 // the state.
1211 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1212 mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
1213
1214 // If this call is also a return (because it is a tail call) we're done.
1215 if (MI.isReturn())
1216 continue;
1217
1218 // Otherwise we need to step past the call and recover the predicate
1219 // state from SP after the return, and make this new state available.
1220 ++InsertPt;
1221 unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
1222 PredStateSSA.AddAvailableValue(&MBB, NewStateReg);
1223 }
1224
1225 HardenPostLoad.clear();
1226 HardenLoadAddr.clear();
1227 HardenedAddrRegs.clear();
1228 AddrRegToHardenedReg.clear();
1229
1230 // Currently, we only track data-dependent loads within a basic block.
1231 // FIXME: We should see if this is necessary or if we could be more
1232 // aggressive here without opening up attack avenues.
1233 LoadDepRegs.clear();
1234 }
1235}
1236
1237/// Save EFLAGS into the returned GPR. This can in turn be restored with
1238/// `restoreEFLAGS`.
1239///
1240/// Note that LLVM can only lower very simple patterns of saved and restored
1241/// EFLAGS registers. The restore should always be within the same basic block
1242/// as the save so that no PHI nodes are inserted.
1243unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1244 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1245 DebugLoc Loc) {
1246 // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1247 // what instruction selection does.
1248 unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
1249 // We directly copy the FLAGS register and rely on later lowering to clean
1250 // this up into the appropriate setCC instructions.
1251 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
1252 ++NumInstsInserted;
1253 return Reg;
1254}
1255
1256/// Restore EFLAGS from the provided GPR. This should be produced by
1257/// `saveEFLAGS`.
1258///
1259/// This must be done within the same basic block as the save in order to
1260/// reliably lower.
1261void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1262 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1263 unsigned Reg) {
1264 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
1265 ++NumInstsInserted;
1266}
1267
1268/// Takes the current predicate state (in a register) and merges it into the
1269/// stack pointer. The state is essentially a single bit, but we merge this in
1270/// a way that won't form non-canonical pointers and also will be preserved
1271/// across normal stack adjustments.
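/// Roughly (a sketch, not the exact lowering), the sequence emitted here is:
///
///   shlq $47, %state           # move the interesting bit up high
///   orq  %state, %rsp          # all-ones state taints RSP; zero is a no-op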
1272void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
1273 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1274 unsigned PredStateReg) {
1275 unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
1276 // FIXME: This hard codes a shift distance based on the number of bits needed
1277 // to stay canonical on 64-bit. We should compute this somehow and support
1278 // 32-bit as part of that.
1279 auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
1280 .addReg(PredStateReg, RegState::Kill)
1281 .addImm(47);
1282 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1283 ++NumInstsInserted;
1284 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
1285 .addReg(X86::RSP)
1286 .addReg(TmpReg, RegState::Kill);
1287 OrI->addRegisterDead(X86::EFLAGS, TRI);
1288 ++NumInstsInserted;
1289}
1290
1291/// Extracts the predicate state stored in the high bits of the stack pointer.
1292unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
1293 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1294 DebugLoc Loc) {
1295 unsigned PredStateReg = MRI->createVirtualRegister(PredStateRC);
1296 unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
1297
1298 // We know that the stack pointer will have any preserved predicate state in
1299 // its high bit. We just want to smear this across the other bits. Turns out,
1300 // this is exactly what an arithmetic right shift does.
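  // For example, if the high bit of RSP was poisoned, an arithmetic shift
  // right by 63 yields all-ones; otherwise it yields zero.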
1301 BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
1302 .addReg(X86::RSP);
1303 auto ShiftI =
1304 BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
1305 .addReg(TmpReg, RegState::Kill)
1306 .addImm(TRI->getRegSizeInBits(*PredStateRC) - 1);
1307 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1308 ++NumInstsInserted;
1309
1310 return PredStateReg;
1311}
1312
1313void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
1314 MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
1315 MachineSSAUpdater &PredStateSSA,
1316 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
1317 MachineBasicBlock &MBB = *MI.getParent();
1318 DebugLoc Loc = MI.getDebugLoc();
1319
1320 // Check if EFLAGS are alive by seeing if there is a def of them or they
1321 // live-in, and then seeing if that def is in turn used.
1322 bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
1323
1324 SmallVector<MachineOperand *, 2> HardenOpRegs;
1325
1326 if (BaseMO.isFI()) {
1327 // A frame index is never a dynamically controllable load, so only
1328 // harden it if we're covering fixed address loads as well.
1329 LLVM_DEBUG(
1330 dbgs() << " Skipping hardening base of explicit stack frame load: ";
1331 MI.dump(); dbgs() << "\n");
1332 } else if (BaseMO.getReg() == X86::RIP ||
1333 BaseMO.getReg() == X86::NoRegister) {
1334 // For both RIP-relative addressed loads or absolute loads, we cannot
1335 // meaningfully harden them because the address being loaded has no
1336 // dynamic component.
1337 //
1338 // FIXME: When using a segment base (like TLS does) we end up with the
1339 // dynamic address being the base plus -1 because we can't mutate the
1340 // segment register here. This allows the signed 32-bit offset to point at
1341 // valid segment-relative addresses and load them successfully.
1342 LLVM_DEBUG(
1343 dbgs() << " Cannot harden base of "
1344 << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
1345 << " address in a load!");
1346 } else {
1347 assert(BaseMO.isReg() &&
1348 "Only allowed to have a frame index or register base.");
1349 HardenOpRegs.push_back(&BaseMO);
1350 }
1351
1352 if (IndexMO.getReg() != X86::NoRegister &&
1353 (HardenOpRegs.empty() ||
1354 HardenOpRegs.front()->getReg() != IndexMO.getReg()))
1355 HardenOpRegs.push_back(&IndexMO);
1356
1357 assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
1358 "Should have exactly one or two registers to harden!");
1359 assert((HardenOpRegs.size() == 1 ||
1360 HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
1361 "Should not have two of the same registers!");
1362
1363 // Remove any registers that have already been checked.
1364 llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
1365 // See if this operand's register has already been checked.
1366 auto It = AddrRegToHardenedReg.find(Op->getReg());
1367 if (It == AddrRegToHardenedReg.end())
1368 // Not checked, so retain this one.
1369 return false;
1370
1371 // Otherwise, we can directly update this operand and remove it.
1372 Op->setReg(It->second);
1373 return true;
1374 });
1375 // If there are none left, we're done.
1376 if (HardenOpRegs.empty())
1377 return;
1378
1379 // Compute the current predicate state.
1380 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1381
1382 auto InsertPt = MI.getIterator();
1383
1384 // If EFLAGS are live and we don't have access to instructions that avoid
1385 // clobbering EFLAGS we need to save and restore them. This in turn makes
1386 // the EFLAGS no longer live.
1387 unsigned FlagsReg = 0;
1388 if (EFLAGSLive && !Subtarget->hasBMI2()) {
1389 EFLAGSLive = false;
1390 FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1391 }
1392
1393 for (MachineOperand *Op : HardenOpRegs) {
1394 auto *OpRC = MRI->getRegClass(Op->getReg());
1395
1396 unsigned OpReg = Op->getReg();
1397 unsigned TmpReg = MRI->createVirtualRegister(OpRC);
1398
1399 if (!EFLAGSLive) {
1400 // Merge our potential poison state into the value with an or.
1401 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
1402 .addReg(StateReg)
1403 .addReg(OpReg);
1404 OrI->addRegisterDead(X86::EFLAGS, TRI);
1405 ++NumInstsInserted;
1406 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1407 } else {
1408 // We need to avoid touching EFLAGS so shift out all but the least
1409 // significant bit using the instruction that doesn't update flags.
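      // Note: the state is either zero or all-ones, so this shifts the address
      // by 0 (leaving it intact) or, with the count masked to 6 bits, by 63
      // (collapsing it to a single bit) under misspeculation.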
1410 auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
1411 .addReg(OpReg)
1412 .addReg(StateReg);
1413 (void)ShiftI;
1414 ++NumInstsInserted;
1415 LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
1416 dbgs() << "\n");
1417 }
1418
1419 // Record this register as checked and update the operand.
1420 assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
1421 "Should not have checked this register yet!");
1422 AddrRegToHardenedReg[Op->getReg()] = TmpReg;
1423 Op->setReg(TmpReg);
1424 ++NumAddrRegsHardened;
1425 }
1426
1427 // And restore the flags if needed.
1428 if (FlagsReg)
1429 restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
1430}
1431
1432MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
1433 MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedLoads) {
1434 assert(isDataInvariantLoad(InitialMI) &&
1435 "Cannot get here with a non-invariant load!");
1436
1437 // See if we can sink hardening the loaded value.
1438 auto SinkCheckToSingleUse =
1439 [&](MachineInstr &MI) -> Optional<MachineInstr *> {
1440 unsigned DefReg = MI.getOperand(0).getReg();
1441
1442 // We need to find a single use to which we can sink the check. We can
1443 // primarily do this because many uses may already end up checked on their
1444 // own.
    MachineInstr *SingleUseMI = nullptr;
    for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
      // If we're already going to harden this use, it is data invariant and
      // within our block and we just need to check that the use isn't in an
      // address.
      if (HardenedLoads.count(&UseMI)) {
        const MCInstrDesc &Desc = UseMI.getDesc();
        int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
        assert(MemRefBeginIdx >= 0 &&
               "Should always have mem references here!");
        MemRefBeginIdx += X86II::getOperandBias(Desc);

        MachineOperand &BaseMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
        MachineOperand &IndexMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
        if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
            (IndexMO.isReg() && IndexMO.getReg() == DefReg))
          // The load uses the register as part of its address making it not
          // invariant.
          return {};

        continue;
      }

      if (SingleUseMI)
        // We already have a single use, this would make two. Bail.
        return {};

      // If this single use isn't data invariant, isn't in this block, or has
      // interfering EFLAGS, we can't sink the hardening to it.
      if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
        return {};

      // If this instruction defines multiple registers bail as we won't harden
      // all of them.
      if (UseMI.getDesc().getNumDefs() > 1)
        return {};

      // If this register isn't a virtual register, we can't sanely walk its
      // uses, so just bail. Also check that its register class is one of the
      // ones we can harden.
      unsigned UseDefReg = UseMI.getOperand(0).getReg();
      if (!TRI->isVirtualRegister(UseDefReg) ||
          !MRI->getRegClass(UseDefReg)->hasSubClassEq(&X86::GR64RegClass))
        return {};

      SingleUseMI = &UseMI;
    }

    // If SingleUseMI is still null, there is no use that needs its own
    // checking. Otherwise, it is the single use that needs checking.
    return {SingleUseMI};
  };

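  // Walk the chain of single uses, sinking the hardening point as far as we
  // can. An empty result stops the walk at the current instruction; a null
  // instruction means no remaining use needs its own check, and null is
  // returned to the caller.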
  MachineInstr *MI = &InitialMI;
  while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
    // Update which MI we're checking now.
    MI = *SingleUse;
    if (!MI)
      break;
  }

  return MI;
}

// We can harden non-leaking loads into registers without touching the address
// by just hiding all of the loaded bits. We use an `or` instruction to do
// this because having the poison value be all ones allows us to use the same
// value below. The goal is just that the loaded bits are not exposed to
// speculative execution, and coercing them all to one is sufficient for that.
void X86SpeculativeLoadHardeningPass::hardenPostLoad(
    MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
  assert(isDataInvariantLoad(MI) &&
         "Cannot get here with a non-invariant load!");

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();

  // For all of these, the def'ed register operand is operand zero.
  auto &DefOp = MI.getOperand(0);
  unsigned OldDefReg = DefOp.getReg();

  auto *DefRC = MRI->getRegClass(OldDefReg);
  int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;

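  // Select the OR opcode and sub-register index by the byte width of the
  // defined register (1, 2, 4, or 8 bytes).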
  unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
  unsigned OrOpCode = OrOpCodes[Log2_32(DefRegBytes)];

  unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};

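  // Materialize the predicate state in the given register class, copying it
  // into a narrower virtual register through a sub-register when the class is
  // smaller than 8 bytes.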
  auto GetStateRegInRC = [&](const TargetRegisterClass &RC) {
    unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);

    int Bytes = TRI->getRegSizeInBits(RC) / 8;
    // FIXME: Need to teach this about 32-bit mode.
    if (Bytes != 8) {
      unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
      unsigned NarrowStateReg = MRI->createVirtualRegister(&RC);
      BuildMI(MBB, MI.getIterator(), Loc, TII->get(TargetOpcode::COPY),
              NarrowStateReg)
          .addReg(StateReg, 0, SubRegImm);
      StateReg = NarrowStateReg;
    }
    return StateReg;
  };

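  // The hardening instructions go immediately after the load. If EFLAGS are
  // live at that point and we can't use the flag-preserving BMI2 shifts, save
  // the flags now and restore them at the end.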
  auto InsertPt = std::next(MI.getIterator());
  unsigned FlagsReg = 0;
  bool EFLAGSLive = isEFLAGSLive(MBB, InsertPt, *TRI);
  if (EFLAGSLive && !Subtarget->hasBMI2()) {
    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
    EFLAGSLive = false;
  }

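  // When EFLAGS are dead (or have been saved), OR the all-ones poison state
  // into the loaded value. Otherwise fall back to the SHRX-based sequence
  // below, which leaves the flags untouched.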
  if (!EFLAGSLive) {
    unsigned StateReg = GetStateRegInRC(*DefRC);
    unsigned NewDefReg = MRI->createVirtualRegister(DefRC);
    DefOp.setReg(NewDefReg);
    auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg)
                   .addReg(StateReg)
                   .addReg(NewDefReg);
    OrI->addRegisterDead(X86::EFLAGS, TRI);
    ++NumInstsInserted;
    LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
  } else {
    assert(Subtarget->hasBMI2() &&
           "Cannot harden loads and preserve EFLAGS without BMI2!");

    unsigned ShiftOpCode = DefRegBytes < 4 ? X86::SHRX32rr : X86::SHRX64rr;
    auto &ShiftRC =
        DefRegBytes < 4 ? X86::GR32_NOSPRegClass : X86::GR64_NOSPRegClass;
    int ShiftRegBytes = TRI->getRegSizeInBits(ShiftRC) / 8;
    unsigned DefSubRegImm = SubRegImms[Log2_32(DefRegBytes)];

    unsigned StateReg = GetStateRegInRC(ShiftRC);

    // First have the def instruction def a temporary register.
    unsigned TmpReg = MRI->createVirtualRegister(DefRC);
    DefOp.setReg(TmpReg);
    // Now copy it into a register of the shift RC.
    unsigned ShiftInputReg = TmpReg;
    if (DefRegBytes != ShiftRegBytes) {
      unsigned UndefReg = MRI->createVirtualRegister(&ShiftRC);
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::IMPLICIT_DEF), UndefReg);
      ShiftInputReg = MRI->createVirtualRegister(&ShiftRC);
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::INSERT_SUBREG), ShiftInputReg)
          .addReg(UndefReg)
          .addReg(TmpReg)
          .addImm(DefSubRegImm);
    }

    // We shift this once if the shift is wider than the def and thus we can
    // shift *all* of the def'ed bytes out. Otherwise we need to do two shifts.

    unsigned ShiftedReg = MRI->createVirtualRegister(&ShiftRC);
    auto Shift1I =
        BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), ShiftedReg)
            .addReg(ShiftInputReg)
            .addReg(StateReg);
    (void)Shift1I;
    ++NumInstsInserted;
    LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift1I->dump(); dbgs() << "\n");

    // The only way we have a bit left is if all 8 bytes were defined. Do an
    // extra shift to get the last bit in this case.
    if (DefRegBytes == ShiftRegBytes) {
      // We can just directly def the old def register as it's the same size.
      ShiftInputReg = ShiftedReg;
      auto Shift2I =
          BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), OldDefReg)
              .addReg(ShiftInputReg)
              .addReg(StateReg);
      (void)Shift2I;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift2I->dump();
                 dbgs() << "\n");
    } else {
      // When the shift register is a different size we need to fix up the
      // register class. We can do that as we copy into the old def register.
      BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), OldDefReg)
          .addReg(ShiftedReg, 0, DefSubRegImm);
    }
  }

  if (FlagsReg)
    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);

  ++NumPostLoadRegsHardened;
}

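// Harden a return instruction. Depending on the strategy, either fence so
// that no load can speculate past the return, or fold the predicate state
// into RSP so the caller can recover it after a (possibly mispredicted)
// return.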
void X86SpeculativeLoadHardeningPass::checkReturnInstr(
    MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();
  auto InsertPt = MI.getIterator();

  if (FenceCallAndRet) {
    // Simply forcibly block speculation of loads out of the function by using
    // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
    // should be secure, is simple from an ABI perspective, and the cost can be
    // minimized through inlining.
    //
    // FIXME: We should investigate ways to establish a strong data-dependency
    // on the return. However, poisoning the stack pointer is unlikely to work
    // because the return is *predicted* rather than relying on the load of the
    // return address to actually resolve.
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
    ++NumInstsInserted;
    ++NumLFENCEsInserted;
    return;
  }

  // Take our predicate state, shift it to the high 17 bits (so that we keep
  // pointers canonical) and merge it into RSP. This will allow the caller to
  // extract it when we return (speculatively).
  mergePredStateIntoSP(MBB, InsertPt, Loc,
                       PredStateSSA.GetValueAtEndOfBlock(&MBB));
}

INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
                      "X86 speculative load hardener", false, false)
INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
                    "X86 speculative load hardener", false, false)

FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
  return new X86SpeculativeLoadHardeningPass();
}