1//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9/// \file
10///
11/// Provide a pass which mitigates speculative execution attacks which operate
12/// by speculating incorrectly past some predicate (a type check, bounds check,
13/// or other condition) to reach a load with invalid inputs and leak the data
14/// accessed by that load using a side channel out of the speculative domain.
15///
16/// For details on the attacks, see the first variant in both the Project Zero
17/// writeup and the Spectre paper:
18/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
19/// https://spectreattack.com/spectre.pdf
20///
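/// As a purely illustrative sketch (the names here are invented for
/// exposition), the classic gadget this pass mitigates looks roughly like:
///
///   if (idx < array_len) {          // Predicted taken even when idx is
///                                   // out of bounds.
///     char v = array[idx];          // Speculative out-of-bounds load.
///     tmp = probe_table[v * 4096];  // Dependent load leaks `v` through the
///                                   // cache side channel.
///   }
///
/// The hardening below traces a "predicate state" value that becomes all-ones
/// whenever a branch is mispredicted, and uses it to poison either the address
/// registers feeding a load or the loaded value itself, so that misspeculated
/// loads cannot feed a dependent, leaking access.
///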
21//===----------------------------------------------------------------------===//
22
23#include "X86.h"
24#include "X86InstrBuilder.h"
25#include "X86InstrInfo.h"
26#include "X86Subtarget.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/ScopeExit.h"
31#include "llvm/ADT/SmallPtrSet.h"
32#include "llvm/ADT/SmallSet.h"
33#include "llvm/ADT/SmallVector.h"
34#include "llvm/ADT/SparseBitVector.h"
35#include "llvm/ADT/Statistic.h"
36#include "llvm/CodeGen/MachineBasicBlock.h"
37#include "llvm/CodeGen/MachineConstantPool.h"
38#include "llvm/CodeGen/MachineFunction.h"
39#include "llvm/CodeGen/MachineFunctionPass.h"
40#include "llvm/CodeGen/MachineInstr.h"
41#include "llvm/CodeGen/MachineInstrBuilder.h"
42#include "llvm/CodeGen/MachineModuleInfo.h"
43#include "llvm/CodeGen/MachineOperand.h"
44#include "llvm/CodeGen/MachineRegisterInfo.h"
45#include "llvm/CodeGen/MachineSSAUpdater.h"
46#include "llvm/CodeGen/TargetInstrInfo.h"
47#include "llvm/CodeGen/TargetRegisterInfo.h"
48#include "llvm/CodeGen/TargetSchedule.h"
49#include "llvm/CodeGen/TargetSubtargetInfo.h"
50#include "llvm/IR/DebugLoc.h"
51#include "llvm/MC/MCSchedule.h"
52#include "llvm/Pass.h"
53#include "llvm/Support/CommandLine.h"
54#include "llvm/Support/Debug.h"
55#include "llvm/Support/raw_ostream.h"
56#include <algorithm>
57#include <cassert>
58#include <iterator>
59#include <utility>
60
61using namespace llvm;
62
63#define PASS_KEY "x86-speculative-load-hardening"
64#define DEBUG_TYPE PASS_KEY
65
66STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
67STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
68STATISTIC(NumAddrRegsHardened,
69 "Number of address mode used registers hardaned");
70STATISTIC(NumPostLoadRegsHardened,
71 "Number of post-load register values hardened");
72STATISTIC(NumInstsInserted, "Number of instructions inserted");
73STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
74
75static cl::opt<bool> HardenEdgesWithLFENCE(
76 PASS_KEY "-lfence",
77 cl::desc(
78 "Use LFENCE along each conditional edge to harden against speculative "
79 "loads rather than conditional movs and poisoned pointers."),
80 cl::init(false), cl::Hidden);
81
82static cl::opt<bool> EnablePostLoadHardening(
83 PASS_KEY "-post-load",
84 cl::desc("Harden the value loaded *after* it is loaded by "
85 "flushing the loaded bits to 1. This is hard to do "
86 "in general but can be done easily for GPRs."),
87 cl::init(true), cl::Hidden);
88
89static cl::opt<bool> FenceCallAndRet(
90 PASS_KEY "-fence-call-and-ret",
91 cl::desc("Use a full speculation fence to harden both call and ret edges "
92 "rather than a lighter weight mitigation."),
93 cl::init(false), cl::Hidden);
94
95static cl::opt<bool> HardenInterprocedurally(
96 PASS_KEY "-ip",
97 cl::desc("Harden interprocedurally by passing our state in and out of "
98 "functions in the high bits of the stack pointer."),
99 cl::init(true), cl::Hidden);
100
101static cl::opt<bool>
102 HardenLoads(PASS_KEY "-loads",
103 cl::desc("Sanitize loads from memory. When disable, no "
104 "significant security is provided."),
105 cl::init(true), cl::Hidden);
106
107namespace llvm {
108
109void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
110
111} // end namespace llvm
112
113namespace {
114
115class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
116public:
117 X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
118 initializeX86SpeculativeLoadHardeningPassPass(
119 *PassRegistry::getPassRegistry());
120 }
121
122 StringRef getPassName() const override {
123 return "X86 speculative load hardening";
124 }
125 bool runOnMachineFunction(MachineFunction &MF) override;
126 void getAnalysisUsage(AnalysisUsage &AU) const override;
127
128 /// Pass identification, replacement for typeid.
129 static char ID;
130
131private:
132 /// The information about a block's conditional terminators needed to trace
133 /// our predicate state through the exiting edges.
134 struct BlockCondInfo {
135 MachineBasicBlock *MBB;
136
137 // We mostly have one conditional branch, and in extremely rare cases have
138 // two. Three or more are so rare as to be unimportant for compile time.
139 SmallVector<MachineInstr *, 2> CondBrs;
140
141 MachineInstr *UncondBr;
142 };
143
144 const X86Subtarget *Subtarget;
145 MachineRegisterInfo *MRI;
146 const X86InstrInfo *TII;
147 const TargetRegisterInfo *TRI;
148 const TargetRegisterClass *PredStateRC;
149
150 void hardenEdgesWithLFENCE(MachineFunction &MF);
151
152 SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
153
154 void checkAllLoads(MachineFunction &MF, MachineSSAUpdater &PredStateSSA);
155
156 unsigned saveEFLAGS(MachineBasicBlock &MBB,
157 MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
158 void restoreEFLAGS(MachineBasicBlock &MBB,
159 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
160 unsigned OFReg);
161
162 void mergePredStateIntoSP(MachineBasicBlock &MBB,
163 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
164 unsigned PredStateReg);
165 unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
166 MachineBasicBlock::iterator InsertPt,
167 DebugLoc Loc);
168
169 void
170 hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
171 MachineOperand &IndexMO, MachineSSAUpdater &PredStateSSA,
172 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
173 MachineInstr *
174 sinkPostLoadHardenedInst(MachineInstr &MI,
175 SmallPtrSetImpl<MachineInstr *> &HardenedLoads);
176 void hardenPostLoad(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
177 void checkReturnInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
178 void checkCallInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
179};
180
181} // end anonymous namespace
182
183char X86SpeculativeLoadHardeningPass::ID = 0;
184
185void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
186 AnalysisUsage &AU) const {
187 MachineFunctionPass::getAnalysisUsage(AU);
188}
189
190static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
191 MachineBasicBlock &Succ, int SuccCount,
192 MachineInstr *Br, MachineInstr *&UncondBr,
193 const X86InstrInfo &TII) {
194 assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
195
196 MachineFunction &MF = *MBB.getParent();
197
198 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
199
200 // We have to insert the new block immediately after the current one as we
201 // don't know what layout-successor relationships the successor has and we
202 // may not be able to (and generally don't want to) try to fix those up.
203 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
204
205 // Update the branch instruction if necessary.
206 if (Br) {
207 assert(Br->getOperand(0).getMBB() == &Succ &&
208 "Didn't start with the right target!");
209 Br->getOperand(0).setMBB(&NewMBB);
210
211 // If this successor was reached through a branch rather than fallthrough,
212 // we might have *broken* fallthrough and so need to inject a new
213 // unconditional branch.
214 if (!UncondBr) {
215 MachineBasicBlock &OldLayoutSucc =
216 *std::next(MachineFunction::iterator(&NewMBB));
217 assert(MBB.isSuccessor(&OldLayoutSucc) &&
218 "Without an unconditional branch, the old layout successor should "
219 "be an actual successor!");
220 auto BrBuilder =
221 BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
222 // Update the unconditional branch now that we've added one.
223 UncondBr = &*BrBuilder;
224 }
225
226 // Insert unconditional "jump Succ" instruction in the new block if
227 // necessary.
228 if (!NewMBB.isLayoutSuccessor(&Succ)) {
229 SmallVector<MachineOperand, 4> Cond;
230 TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
231 }
232 } else {
233 assert(!UncondBr &&
234 "Cannot have a branchless successor and an unconditional branch!");
235 assert(NewMBB.isLayoutSuccessor(&Succ) &&
236 "A non-branch successor must have been a layout successor before "
237 "and now is a layout successor of the new block.");
238 }
239
240 // If this is the only edge to the successor, we can just replace it in the
241 // CFG. Otherwise we need to add a new entry in the CFG for the new
242 // successor.
243 if (SuccCount == 1) {
244 MBB.replaceSuccessor(&Succ, &NewMBB);
245 } else {
246 MBB.splitSuccessor(&Succ, &NewMBB);
247 }
248
249 // Hook up the edge from the new basic block to the old successor in the CFG.
250 NewMBB.addSuccessor(&Succ);
251
252 // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
253 for (MachineInstr &MI : Succ) {
254 if (!MI.isPHI())
255 break;
256 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
257 OpIdx += 2) {
258 MachineOperand &OpV = MI.getOperand(OpIdx);
259 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
260 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
261 if (OpMBB.getMBB() != &MBB)
262 continue;
263
264 // If this is the last edge to the successor, just replace MBB in the PHI.
265 if (SuccCount == 1) {
266 OpMBB.setMBB(&NewMBB);
267 break;
268 }
269
270 // Otherwise, append a new pair of operands for the new incoming edge.
271 MI.addOperand(MF, OpV);
272 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
273 break;
274 }
275 }
276
277 // Inherit live-ins from the successor
278 for (auto &LI : Succ.liveins())
279 NewMBB.addLiveIn(LI);
280
281 LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
282 << Succ.getName() << "'.\n");
283 return NewMBB;
284}
285
286bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
287 MachineFunction &MF) {
288 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
289 << " **********\n");
290
291 Subtarget = &MF.getSubtarget<X86Subtarget>();
292 MRI = &MF.getRegInfo();
293 TII = Subtarget->getInstrInfo();
294 TRI = Subtarget->getRegisterInfo();
295 // FIXME: Support for 32-bit.
296 PredStateRC = &X86::GR64_NOSPRegClass;
297
298 if (MF.begin() == MF.end())
299 // Nothing to do for a degenerate empty function...
300 return false;
301
302 // We support an alternative hardening technique based on a debug flag.
303 if (HardenEdgesWithLFENCE) {
304 hardenEdgesWithLFENCE(MF);
305 return true;
306 }
307
308 // Create a dummy debug loc to use for all the generated code here.
309 DebugLoc Loc;
310
311 MachineBasicBlock &Entry = *MF.begin();
312 auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
313
314 // Do a quick scan to see if we have any checkable loads.
315 bool HasCheckableLoad = false;
316 for (MachineBasicBlock &MBB : MF) {
317 for (MachineInstr &MI : MBB) {
318 // Stop searching blocks at an LFENCE.
319 if (MI.getOpcode() == X86::LFENCE)
320 break;
321
322 // Looking for loads only.
323 if (!MI.mayLoad())
324 continue;
325
326 // An MFENCE is modeled as a load but doesn't require hardening.
327 if (MI.getOpcode() == X86::MFENCE)
328 continue;
329
330 HasCheckableLoad = true;
331 break;
332 }
333 if (HasCheckableLoad)
334 break;
335 }
336
337 // See if we have any conditional branching blocks that we will need to trace
338 // predicate state through.
339 SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
340
341 // If we have no interesting conditions or loads, nothing to do here.
342 if (!HasCheckableLoad && Infos.empty())
343 return true;
344
345 unsigned PredStateReg;
346 unsigned PredStateSizeInBytes = TRI->getRegSizeInBits(*PredStateRC) / 8;
347
348 // The poison value is required to be an all-ones value for many aspects of
349 // this mitigation.
350 const int PoisonVal = -1;
351 unsigned PoisonReg = MRI->createVirtualRegister(PredStateRC);
352 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PoisonReg)
353 .addImm(PoisonVal);
354 ++NumInstsInserted;
355
356 // If we have loads being hardened and we've asked for call and ret edges to
357 // get a full fence-based mitigation, inject that fence.
358 if (HasCheckableLoad && FenceCallAndRet) {
359 // We need to insert an LFENCE at the start of the function to suspend any
360 // incoming misspeculation from the caller. This helps two-fold: the caller
361 // may not have been protected as this code has been, and this code then
362 // doesn't need to take any specific action to protect across calls.
363 // FIXME: We could skip this for functions which unconditionally return
364 // a constant.
365 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
366 ++NumInstsInserted;
367 ++NumLFENCEsInserted;
368 }
369
370 // If we have no conditionals to protect in blocks, then all we needed to do
371 // was protect the entry and so we're done.
372 if (Infos.empty())
373 // We may have changed the function's code at this point to insert fences.
374 return true;
375
376 // Establish the initial predicate state for the entry block.
377 if (HardenInterprocedurally && !FenceCallAndRet) {
378 // Set up the predicate state by extracting it from the incoming stack
379 // pointer so we pick up any misspeculation in our caller.
380 PredStateReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
381 } else {
382 // Otherwise, just build the predicate state itself by zeroing a register
383 // as we don't need any initial state.
384 PredStateReg = MRI->createVirtualRegister(PredStateRC);
385 unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
386 auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
387 PredStateSubReg);
388 ++NumInstsInserted;
389 MachineOperand *ZeroEFLAGSDefOp =
390 ZeroI->findRegisterDefOperand(X86::EFLAGS);
391 assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
392 "Must have an implicit def of EFLAGS!");
393 ZeroEFLAGSDefOp->setIsDead(true);
394 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
395 PredStateReg)
396 .addImm(0)
397 .addReg(PredStateSubReg)
398 .addImm(X86::sub_32bit);
399 }
400
401 // We're going to need to trace predicate state throughout the function's
402 // CFG. Prepare for this by setting up our initial state of PHIs with unique
403 // predecessor entries and all the initial predicate state.
404
405 // FIXME: It's really frustrating that we have to do this, but SSA-form in
406 // MIR isn't what you might expect. We may have multiple entries in PHI nodes
407 // for a single predecessor. This makes CFG-updating extremely complex, so
408 // here we simplify all PHI nodes to a model even simpler than the IR's
409 // model: exactly one entry per predecessor, regardless of how many edges
410 // there are.
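  //
  // As a rough illustration (MIR syntax approximated), a PHI such as:
  //
  //   %v = PHI %a, %bb.1, %b, %bb.2, %b, %bb.2
  //
  // where %bb.2 appears twice because more than one edge reaches this block
  // from that predecessor, is collapsed below so that each predecessor keeps
  // exactly one entry:
  //
  //   %v = PHI %a, %bb.1, %b, %bb.2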
411 SmallPtrSet<MachineBasicBlock *, 4> Preds;
412 SmallVector<int, 4> DupIndices;
413 for (auto &MBB : MF)
414 for (auto &MI : MBB) {
415 if (!MI.isPHI())
416 break;
417
418 // First we scan the operands of the PHI looking for duplicate entries
419 // for a particular predecessor. We retain the operand index of each duplicate
420 // entry found.
421 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
422 OpIdx += 2)
423 if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
424 DupIndices.push_back(OpIdx);
425
426 // Now walk the duplicate indices, removing both the block and value. Note
427 // that these are stored as a vector making this element-wise removal
429 // potentially quadratic.
430 //
431 // FIXME: It is really frustrating that we have to use a quadratic
432 // removal algorithm here. There should be a better way, but the use-def
433 // updates required make that impossible using the public API.
434 //
435 // Note that we have to process these backwards so that we don't
436 // invalidate other indices with each removal.
437 while (!DupIndices.empty()) {
438 int OpIdx = DupIndices.pop_back_val();
439 // Remove both the block and value operand, again in reverse order to
440 // preserve indices.
441 MI.RemoveOperand(OpIdx + 1);
442 MI.RemoveOperand(OpIdx);
443 }
444
445 Preds.clear();
446 }
447
448 // Track the updated values in an SSA updater to rewrite into SSA form at the
449 // end.
450 MachineSSAUpdater PredStateSSA(MF);
451 PredStateSSA.Initialize(PredStateReg);
452 PredStateSSA.AddAvailableValue(&Entry, PredStateReg);
453 // Collect the inserted instructions so we can rewrite their uses of the
454 // predicate state into SSA form.
455 SmallVector<MachineInstr *, 16> CMovs;
456
457 // Now walk all of the basic blocks looking for ones that end in conditional
458 // jumps where we need to update this register along each edge.
459 for (BlockCondInfo &Info : Infos) {
460 MachineBasicBlock &MBB = *Info.MBB;
461 SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
462 MachineInstr *UncondBr = Info.UncondBr;
463
464 LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
465 << "\n");
466 ++NumCondBranchesTraced;
467
468 // Compute the non-conditional successor as either the target of any
469 // unconditional branch or the layout successor.
470 MachineBasicBlock *UncondSucc =
471 UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
472 ? UncondBr->getOperand(0).getMBB()
473 : nullptr)
474 : &*std::next(MachineFunction::iterator(&MBB));
475
476 // Count how many edges there are to any given successor.
477 SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
478 if (UncondSucc)
479 ++SuccCounts[UncondSucc];
480 for (auto *CondBr : CondBrs)
481 ++SuccCounts[CondBr->getOperand(0).getMBB()];
482
483 // A lambda to insert cmov instructions into a block checking all of the
484 // condition codes in a sequence.
485 auto BuildCheckingBlockForSuccAndConds =
486 [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
487 MachineInstr *Br, MachineInstr *&UncondBr,
488 ArrayRef<X86::CondCode> Conds) {
489 // First, we split the edge to insert the checking block into a safe
490 // location.
491 auto &CheckingMBB =
492 (SuccCount == 1 && Succ.pred_size() == 1)
493 ? Succ
494 : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
495
496 bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
497 if (!LiveEFLAGS)
498 CheckingMBB.addLiveIn(X86::EFLAGS);
499
500 // Now insert the cmovs to implement the checks.
501 auto InsertPt = CheckingMBB.begin();
502 assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
503 "Should never have a PHI in the initial checking block as it "
504 "always has a single predecessor!");
505
506 // We will chain each cmov into the next, but need to start with the
507 // incoming pred state.
508 unsigned CurStateReg = PredStateReg;
509
510 for (X86::CondCode Cond : Conds) {
511 auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
512
513 unsigned UpdatedStateReg = MRI->createVirtualRegister(PredStateRC);
514 auto CMovI = BuildMI(CheckingMBB, InsertPt, Loc, TII->get(CMovOp),
515 UpdatedStateReg)
516 .addReg(CurStateReg)
517 .addReg(PoisonReg);
518 // If this is the last cmov and the EFLAGS weren't originally
519 // live-in, mark them as killed.
520 if (!LiveEFLAGS && Cond == Conds.back())
521 CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
522
523 ++NumInstsInserted;
524 LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
525 dbgs() << "\n");
526
527 // The first one of the cmovs will be using the top level
528 // `PredStateReg` and needs to get rewritten into SSA form.
529 if (CurStateReg == PredStateReg)
530 CMovs.push_back(&*CMovI);
531
532 // The next cmov should start from this one's def.
533 CurStateReg = UpdatedStateReg;
534 }
535
536 // And put the last one into the available values for PredStateSSA.
537 PredStateSSA.AddAvailableValue(&CheckingMBB, CurStateReg);
538 };
539
540 std::vector<X86::CondCode> UncondCodeSeq;
541 for (auto *CondBr : CondBrs) {
542 MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
543 int &SuccCount = SuccCounts[&Succ];
544
545 X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
546 X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
547 UncondCodeSeq.push_back(Cond);
548
549 BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
550 {InvCond});
551
552 // Decrement the successor count now that we've split one of the edges.
553 // We need to keep the count of edges to the successor accurate in order
554 // to know above when to *replace* the successor in the CFG vs. just
555 // adding the new successor.
556 --SuccCount;
557 }
558
559 // Since we may have split edges and changed the number of successors,
560 // normalize the probabilities. This avoids doing it each time we split an
561 // edge.
562 MBB.normalizeSuccProbs();
563
564 // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
565 // need to intersect the other condition codes. We can do this by just
566 // doing a cmov for each one.
567 if (!UncondSucc)
568 // If we have no fallthrough to protect (perhaps it is an indirect jump?)
569 // just skip this and continue.
570 continue;
571
572 assert(SuccCounts[UncondSucc] == 1 &&
573 "We should never have more than one edge to the unconditional "
574 "successor at this point because every other edge must have been "
575 "split above!");
576
577 // Sort and unique the codes to minimize them.
578 llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
579 UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
580 UncondCodeSeq.end());
581
582 // Build a checking version of the successor.
583 BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
584 UncondBr, UncondBr, UncondCodeSeq);
585 }
586
587 // We may also enter basic blocks in this function via exception handling
588 // control flow. Here, if we are hardening interprocedurally, we need to
589 // re-capture the predicate state from the throwing code. In the Itanium ABI,
590 // the throw will always look like a call to __cxa_throw and will have the
591 // predicate state in the stack pointer, so extract fresh predicate state from
592 // the stack pointer and make it available in SSA.
593 // FIXME: Handle non-Itanium ABI EH models.
594 if (HardenInterprocedurally) {
595 for (MachineBasicBlock &MBB : MF) {
596 assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
597 assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
598 assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
599 if (!MBB.isEHPad())
600 continue;
601 PredStateSSA.AddAvailableValue(
602 &MBB,
603 extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
604 }
605 }
606
607 // Now check all of the loads using the predicate state.
608 checkAllLoads(MF, PredStateSSA);
609
610 // Now rewrite all the uses of the pred state using the SSA updater so that
611 // we track updates through the CFG.
612 for (MachineInstr *CMovI : CMovs)
613 for (MachineOperand &Op : CMovI->operands()) {
614 if (!Op.isReg() || Op.getReg() != PredStateReg)
615 continue;
616
617 PredStateSSA.RewriteUse(Op);
618 }
619
620 // If we are hardening interprocedurally, find each returning block and
621 // protect the caller from being returned to through misspeculation.
622 if (HardenInterprocedurally)
623 for (MachineBasicBlock &MBB : MF) {
624 if (MBB.empty())
625 continue;
626
627 MachineInstr &MI = MBB.back();
628 if (!MI.isReturn())
629 continue;
630
631 checkReturnInstr(MI, PredStateSSA);
632 }
633
634 LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
635 dbgs() << "\n"; MF.verify(this));
636 return true;
637}
638
639/// Implements the naive hardening approach of putting an LFENCE after every
640/// potentially mis-predicted control flow construct.
641///
642/// We include this as an alternative mostly for the purpose of comparison. The
643/// performance impact of this is expected to be extremely severe and not
644/// practical for any real-world users.
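///
/// As a rough sketch (the assembly and labels are illustrative only), a block
/// ending in:
///
///   cmpq %rsi, %rdi
///   jae  .LBB0_2
/// .LBB0_1:                          # fallthrough successor
///   ...
/// .LBB0_2:                          # branch target
///   ...
///
/// simply gets an `lfence` inserted at the top of both `.LBB0_1` and
/// `.LBB0_2` (EH pads are skipped), stopping speculation outright rather than
/// tracing predicate state through the CFG.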
645void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
646 MachineFunction &MF) {
647 // First, we scan the function looking for blocks that are reached along edges
648 // that we might want to harden.
649 SmallSetVector<MachineBasicBlock *, 8> Blocks;
650 for (MachineBasicBlock &MBB : MF) {
651 // If the block has zero or one successors, there is nothing to do here.
652 if (MBB.succ_size() <= 1)
653 continue;
654
655 // Skip blocks unless their terminators start with a branch. Other
656 // terminators don't seem interesting for guarding against misspeculation.
657 auto TermIt = MBB.getFirstTerminator();
658 if (TermIt == MBB.end() || !TermIt->isBranch())
659 continue;
660
661 // Add all the non-EH-pad successors to the blocks we want to harden. We
662 // skip EH pads because there isn't really a condition of interest on
663 // entering.
664 for (MachineBasicBlock *SuccMBB : MBB.successors())
665 if (!SuccMBB->isEHPad())
666 Blocks.insert(SuccMBB);
667 }
668
669 for (MachineBasicBlock *MBB : Blocks) {
670 auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
671 BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
672 ++NumInstsInserted;
673 ++NumLFENCEsInserted;
674 }
675}
676
677SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
678X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
679 SmallVector<BlockCondInfo, 16> Infos;
680
681 // Walk the function and build up a summary for each block's conditions that
682 // we need to trace through.
683 for (MachineBasicBlock &MBB : MF) {
684 // If there are no or only one successor, nothing to do here.
685 if (MBB.succ_size() <= 1)
686 continue;
687
688 // We want to reliably handle any conditional branch terminators in the
689 // MBB, so we manually analyze the branch. We can handle all of the
690 // permutations here, including ones that analyzeBranch cannot.
691 //
692 // The approach is to walk backwards across the terminators, resetting at
693 // any unconditional non-indirect branch, and track all conditional edges
694 // to basic blocks as well as the fallthrough or unconditional successor
695 // edge. For each conditional edge, we track the target and the opposite
696 // condition code in order to inject a "no-op" cmov into that successor
697 // that will harden the predicate. For the fallthrough/unconditional
698 // edge, we inject a separate cmov for each conditional branch with
699 // matching condition codes. This effectively implements an "and" of the
700 // condition flags, even if there isn't a single condition flag that would
701 // directly implement that. We don't bother trying to optimize either of
702 // these cases because if such an optimization is possible, LLVM should
703 // have optimized the conditional *branches* in that way already to reduce
704 // instruction count. This late, we simply assume the minimal number of
705 // branch instructions is being emitted and use that to guide our cmov
706 // insertion.
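  //
  // As a rough, purely illustrative example, a block ending in:
  //
  //   jne .LBB0_1
  //   jmp .LBB0_2
  //
  // is summarized here so that the predicate-state tracing code can later
  // insert roughly:
  //
  //   .LBB0_1:                    # Reached when "ne"; misspeculated if "e".
  //     cmoveq  %poison, %pred_state
  //   .LBB0_2:                    # Reached when "e"; misspeculated if "ne".
  //     cmovneq %poison, %pred_state
  //
  // where %poison and %pred_state stand in for the virtual registers the
  // pass creates.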
707
708 BlockCondInfo Info = {&MBB, {}, nullptr};
709
710 // Now walk backwards through the terminators and build up successors they
711 // reach and the conditions.
712 for (MachineInstr &MI : llvm::reverse(MBB)) {
713 // Once we've handled all the terminators, we're done.
714 if (!MI.isTerminator())
715 break;
716
717 // If we see a non-branch terminator, we can't handle anything so bail.
718 if (!MI.isBranch()) {
719 Info.CondBrs.clear();
720 break;
721 }
722
723 // If we see an unconditional branch, reset our state, clear any
724 // fallthrough, and set this as the "else" successor.
725 if (MI.getOpcode() == X86::JMP_1) {
726 Info.CondBrs.clear();
727 Info.UncondBr = &MI;
728 continue;
729 }
730
731 // If we get an invalid condition, we have an indirect branch or some
732 // other unanalyzable "fallthrough" case. We model this as a nullptr for
733 // the destination so we can still guard any conditional successors.
734 // Consider code sequences like:
735 // ```
736 // jCC L1
737 // jmpq *%rax
738 // ```
739 // We still want to harden the edge to `L1`.
740 if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
741 Info.CondBrs.clear();
742 Info.UncondBr = &MI;
743 continue;
744 }
745
746 // We have a vanilla conditional branch, add it to our list.
747 Info.CondBrs.push_back(&MI);
748 }
749 if (Info.CondBrs.empty()) {
750 ++NumBranchesUntraced;
751 LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
752 MBB.dump());
753 continue;
754 }
755
756 Infos.push_back(Info);
757 }
758
759 return Infos;
760}
761
762/// Returns true if the instruction has no behavior (specified or otherwise)
763/// that is based on the value of any of its register operands.
764///
765/// A classical example of something that is inherently not data invariant is an
766/// indirect jump -- the destination is loaded into icache based on the bits set
767/// in the jump destination register.
768///
769/// FIXME: This should become part of our instruction tables.
770static bool isDataInvariant(MachineInstr &MI) {
771 switch (MI.getOpcode()) {
772 default:
773 // By default, assume that the instruction is not data invariant.
774 return false;
775
776 // FIXME: For now, we just use a very boring, conservative set of unary
777 // instructions because we're mostly interested in handling simple
778 // transformations.
779 case TargetOpcode::COPY:
780 return true;
781 }
782}
783
784/// Returns true if the instruction has no behavior (specified or otherwise)
785/// that is based on the value loaded from memory or the value of any
786/// non-address register operands.
787///
788/// For example, this excludes any instruction whose latency depends on the
789/// particular bits set in any of the registers *or* any of the bits loaded from
790/// memory.
791///
792/// A classical example of something that is inherently not data invariant is an
793/// indirect jump -- the destination is loaded into icache based on the bits set
794/// in the jump destination register.
795///
796/// FIXME: This should become part of our instruction tables.
797static bool isDataInvariantLoad(MachineInstr &MI) {
798 switch (MI.getOpcode()) {
799 default:
800 // By default, assume that the load will immediately leak.
801 return false;
802
803 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
804 // However, these forms set flags and are perhaps the most surprising of the
805 // constant-time operations, so we call them out here separately.
806 case X86::IMUL16rm:
807 case X86::IMUL16rmi8:
808 case X86::IMUL16rmi:
809 case X86::IMUL32rm:
810 case X86::IMUL32rmi8:
811 case X86::IMUL32rmi:
812 case X86::IMUL64rm:
813 case X86::IMUL64rmi32:
814 case X86::IMUL64rmi8:
815
816 // Bit-field and bit-scanning instructions that scan across bits and do
817 // other fairly complex operations like popcnt, yet are somewhat
818 // surprisingly believed to be constant time on x86.
819 // However, these set flags.
820 case X86::BLCFILL32rm:
821 case X86::BLCFILL64rm:
822 case X86::BLCI32rm:
823 case X86::BLCI64rm:
824 case X86::BLCIC32rm:
825 case X86::BLCIC64rm:
826 case X86::BLCMSK32rm:
827 case X86::BLCMSK64rm:
828 case X86::BLCS32rm:
829 case X86::BLCS64rm:
830 case X86::BLSFILL32rm:
831 case X86::BLSFILL64rm:
832 case X86::BLSI32rm:
833 case X86::BLSI64rm:
834 case X86::BLSIC32rm:
835 case X86::BLSIC64rm:
836 case X86::BLSMSK32rm:
837 case X86::BLSMSK64rm:
838 case X86::BLSR32rm:
839 case X86::BLSR64rm:
840 case X86::BZHI32rm:
841 case X86::BZHI64rm:
842 case X86::LZCNT16rm:
843 case X86::LZCNT32rm:
844 case X86::LZCNT64rm:
845 case X86::POPCNT16rm:
846 case X86::POPCNT32rm:
847 case X86::POPCNT64rm:
848 case X86::TZCNT16rm:
849 case X86::TZCNT32rm:
850 case X86::TZCNT64rm:
851 case X86::TZMSK32rm:
852 case X86::TZMSK64rm:
853
854 // Basic arithmetic is constant time on the input but does set flags.
855 case X86::ADC8rm:
856 case X86::ADC16rm:
857 case X86::ADC32rm:
858 case X86::ADC64rm:
859 case X86::ADCX32rm:
860 case X86::ADCX64rm:
861 case X86::ADD8rm:
862 case X86::ADD16rm:
863 case X86::ADD32rm:
864 case X86::ADD64rm:
865 case X86::ADOX32rm:
866 case X86::ADOX64rm:
867 case X86::AND8rm:
868 case X86::AND16rm:
869 case X86::AND32rm:
870 case X86::AND64rm:
871 case X86::ANDN32rm:
872 case X86::ANDN64rm:
873 case X86::BSF16rm:
874 case X86::BSF32rm:
875 case X86::BSF64rm:
876 case X86::BSR16rm:
877 case X86::BSR32rm:
878 case X86::BSR64rm:
879 case X86::OR8rm:
880 case X86::OR16rm:
881 case X86::OR32rm:
882 case X86::OR64rm:
883 case X86::SBB8rm:
884 case X86::SBB16rm:
885 case X86::SBB32rm:
886 case X86::SBB64rm:
887 case X86::SUB8rm:
888 case X86::SUB16rm:
889 case X86::SUB32rm:
890 case X86::SUB64rm:
891 case X86::XOR8rm:
892 case X86::XOR16rm:
893 case X86::XOR32rm:
894 case X86::XOR64rm:
895 case X86::BEXTR32rm:
896 case X86::BEXTR64rm:
897 case X86::BEXTRI32mi:
898 case X86::BEXTRI64mi:
899 // Check whether the EFLAGS implicit-def is dead. We assume that this will
900 // always find the implicit-def because this code should only be reached
901 // for instructions that do in fact implicitly def this.
902 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
903 // If we would clobber EFLAGS that are used, just bail for now.
904 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
905 MI.dump(); dbgs() << "\n");
906 return false;
907 }
908
909 // Otherwise, fallthrough to handle these the same as instructions that
910 // don't set EFLAGS.
911 LLVM_FALLTHROUGH;
912
913 // Integer multiply w/o affecting flags is still believed to be constant
914 // time on x86. Called out separately as this is among the most surprising
915 // instructions to exhibit that behavior.
916 case X86::MULX32rm:
917 case X86::MULX64rm:
918
919 // Arithmetic instructions that are both constant time and don't set flags.
920 case X86::PDEP32rm:
921 case X86::PDEP64rm:
922 case X86::PEXT32rm:
923 case X86::PEXT64rm:
924 case X86::RORX32mi:
925 case X86::RORX64mi:
926 case X86::SARX32rm:
927 case X86::SARX64rm:
928 case X86::SHLX32rm:
929 case X86::SHLX64rm:
930 case X86::SHRX32rm:
931 case X86::SHRX64rm:
932
933 // Conversions are believed to be constant time and don't set flags.
934 // FIXME: Add AVX versions.
935 case X86::CVTSD2SI64rm_Int:
936 case X86::CVTSD2SIrm_Int:
937 case X86::CVTSS2SI64rm_Int:
938 case X86::CVTSS2SIrm_Int:
939 case X86::CVTTSD2SI64rm:
940 case X86::CVTTSD2SI64rm_Int:
941 case X86::CVTTSD2SIrm:
942 case X86::CVTTSD2SIrm_Int:
943 case X86::CVTTSS2SI64rm:
944 case X86::CVTTSS2SI64rm_Int:
945 case X86::CVTTSS2SIrm:
946 case X86::CVTTSS2SIrm_Int:
947
948 // Loads to register don't set flags.
949 case X86::MOV8rm:
950 case X86::MOV8rm_NOREX:
951 case X86::MOV16rm:
952 case X86::MOV32rm:
953 case X86::MOV64rm:
954 case X86::MOVSX16rm8:
955 case X86::MOVSX32rm16:
956 case X86::MOVSX32rm8:
957 case X86::MOVSX32rm8_NOREX:
958 case X86::MOVSX64rm16:
959 case X86::MOVSX64rm32:
960 case X86::MOVSX64rm8:
961 case X86::MOVZX16rm8:
962 case X86::MOVZX32rm16:
963 case X86::MOVZX32rm8:
964 case X86::MOVZX32rm8_NOREX:
965 case X86::MOVZX64rm16:
966 case X86::MOVZX64rm8:
967 return true;
968 }
969}
970
971static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
972 const TargetRegisterInfo &TRI) {
973 // Check if EFLAGS are alive by seeing if there is a def of them or they are
974 // live-in, and then seeing if that def is in turn used.
975 for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
976 if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
977 // If the def is dead, then EFLAGS is not live.
978 if (DefOp->isDead())
979 return false;
980
981 // Otherwise we've def'ed it, and it is live.
982 return true;
983 }
984 // While at this instruction, also check if we use and kill EFLAGS
985 // which means it isn't live.
986 if (MI.killsRegister(X86::EFLAGS, &TRI))
987 return false;
988 }
989
990 // If we didn't find anything conclusive (neither definitely alive or
991 // definitely dead) return whether it lives into the block.
992 return MBB.isLiveIn(X86::EFLAGS);
993}
994
995void X86SpeculativeLoadHardeningPass::checkAllLoads(
996 MachineFunction &MF, MachineSSAUpdater &PredStateSSA) {
997 // If the actual checking of loads is disabled, skip doing anything here.
998 if (!HardenLoads)
999 return;
1000
1001 SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
1002 SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
1003
1004 SmallSet<unsigned, 16> HardenedAddrRegs;
1005
1006 SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
1007
1008 // Track the set of load-dependent registers through the basic block. Because
1009 // the values of these registers have an existing data dependency on a loaded
1010 // value which we would have checked, we can omit any checks on them.
1011 SparseBitVector<> LoadDepRegs;
1012
1013 for (MachineBasicBlock &MBB : MF) {
1014 // We harden the loads of a basic block in several passes:
1015 //
1016 // 1) Collect all the loads which can have their loaded value hardened
1017 // and all the loads that instead need their address hardened. During
1018 // this walk we propagate load dependence for address hardened loads and
1019 // also look for LFENCE to stop hardening wherever possible. When
1020 // deciding whether or not to harden the loaded value or not, we check
1021 // to see if any registers used in the address will have been hardened
1022 // at this point and if so, harden any remaining address registers as
1023 // that often successfully re-uses hardened addresses and minimizes
1024 // instructions. FIXME: We should consider an aggressive mode where we
1025 // continue to keep as many loads value hardened even when some address
1026 // register hardening would be free (due to reuse).
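    //
    // For illustration (registers and syntax are placeholders), in a
    // sequence such as:
    //
    //   movq (%rdi,%rsi,8), %rax   # First load: either %rdi and %rsi are
    //                              # hardened, or %rax is hardened post-load.
    //   movq (%rax), %rcx          # Second load: %rax already comes from a
    //                              # hardened load, so typically no extra
    //                              # hardening instructions are needed here.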
1027 for (MachineInstr &MI : MBB) {
1028 // We naively assume that all def'ed registers of an instruction have
1029 // a data dependency on all of their operands.
1030 // FIXME: Do a more careful analysis of x86 to build a conservative model
1031 // here.
1032 if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
1033 return Op.isReg() && LoadDepRegs.test(Op.getReg());
1034 }))
1035 for (MachineOperand &Def : MI.defs())
1036 if (Def.isReg())
1037 LoadDepRegs.set(Def.getReg());
1038
1039 // Both Intel and AMD are guiding that they will change the semantics of
1040 // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1041 // no more need to guard things in this block.
1042 if (MI.getOpcode() == X86::LFENCE)
1043 break;
1044
1045 // If this instruction cannot load, nothing to do.
1046 if (!MI.mayLoad())
1047 continue;
1048
1049 // Some instructions which "load" are trivially safe or unimportant.
1050 if (MI.getOpcode() == X86::MFENCE)
1051 continue;
1052
1053 // Extract the memory operand information about this instruction.
1054 // FIXME: This doesn't handle loading pseudo instructions which we often
1055 // could handle with similarly generic logic. We probably need to add an
1056 // MI-layer routine similar to the MC-layer one we use here which maps
1057 // pseudos much like this maps real instructions.
1058 const MCInstrDesc &Desc = MI.getDesc();
1059 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1060 if (MemRefBeginIdx < 0) {
1061 LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: ";
1062 MI.dump());
1063 continue;
1064 }
1065
1066 MemRefBeginIdx += X86II::getOperandBias(Desc);
1067
1068 MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1069 MachineOperand &IndexMO =
1070 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1071
1072 // If we have at least one (non-frame-index, non-RIP) register operand,
1073 // and neither operand is load-dependent, we need to check the load.
1074 unsigned BaseReg = 0, IndexReg = 0;
1075 if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1076 BaseMO.getReg() != X86::NoRegister)
1077 BaseReg = BaseMO.getReg();
1078 if (IndexMO.getReg() != X86::NoRegister)
1079 IndexReg = IndexMO.getReg();
1080
1081 if (!BaseReg && !IndexReg)
1082 // No register operands!
1083 continue;
1084
1085 // If any register operand is dependent, this load is dependent and we
1086 // needn't check it.
1087 // FIXME: Is this true in the case where we are hardening loads after
1088 // they complete? Unclear, need to investigate.
1089 if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
1090 (IndexReg && LoadDepRegs.test(IndexReg)))
1091 continue;
1092
1093 // If post-load hardening is enabled, this load is known to be
1094 // data-invariant, and we aren't already going to harden one of the
1095 // address registers, queue it up to be hardened post-load. Notably, even
1096 // once hardened this won't introduce a useful dependency that could prune
1097 // out subsequent loads.
1098 if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
1099 !HardenedAddrRegs.count(BaseReg) &&
1100 !HardenedAddrRegs.count(IndexReg)) {
1101 HardenPostLoad.insert(&MI);
1102 HardenedAddrRegs.insert(MI.getOperand(0).getReg());
1103 continue;
1104 }
1105
1106 // Record this instruction for address hardening and record its register
1107 // operands as being address-hardened.
1108 HardenLoadAddr.insert(&MI);
1109 if (BaseReg)
1110 HardenedAddrRegs.insert(BaseReg);
1111 if (IndexReg)
1112 HardenedAddrRegs.insert(IndexReg);
1113
1114 for (MachineOperand &Def : MI.defs())
1115 if (Def.isReg())
1116 LoadDepRegs.set(Def.getReg());
1117 }
1118
1119 // Now re-walk the instructions in the basic block, and apply whichever
1120 // hardening strategy we have elected. Note that we do this in a second
1121 // pass specifically so that we have the complete set of instructions for
1122 // which we will do post-load hardening and can defer it in certain
1123 // circumstances.
1124 //
1125 // FIXME: This could probably be made even more effective by doing it
1126 // across the entire function. Rather than just walking the flat list
1127 // backwards here, we could walk the function in PO and each block bottom
1128 // up, allowing us in some cases to sink hardening across basic blocks. As
1129 // long as the in-block predicate state is used at the eventual hardening
1130 // site, this remains safe.
1131 for (MachineInstr &MI : MBB) {
1132 // We cannot both require hardening the def of a load and its address.
1133 assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
1134 "Requested to harden both the address and def of a load!");
1135
1136 // Check if this is a load whose address needs to be hardened.
1137 if (HardenLoadAddr.erase(&MI)) {
1138 const MCInstrDesc &Desc = MI.getDesc();
1139 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1140 assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
1141
1142 MemRefBeginIdx += X86II::getOperandBias(Desc);
1143
1144 MachineOperand &BaseMO =
1145 MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1146 MachineOperand &IndexMO =
1147 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1148 hardenLoadAddr(MI, BaseMO, IndexMO, PredStateSSA, AddrRegToHardenedReg);
1149 continue;
1150 }
1151
1152 // Test if this instruction is one of our post load instructions (and
1153 // remove it from the set if so).
1154 if (HardenPostLoad.erase(&MI)) {
1155 assert(!MI.isCall() && "Must not try to post-load harden a call!");
1156
1157 // If this is a data-invariant load, we want to try to sink any
1158 // hardening as far as possible.
1159 if (isDataInvariantLoad(MI)) {
1160 // Sink the instruction we'll need to harden as far as we can down the
1161 // graph.
1162 MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
1163
1164 // If we managed to sink this instruction, update everything so we
1165 // harden that instruction when we reach it in the instruction
1166 // sequence.
1167 if (SunkMI != &MI) {
1168 // If in sinking there was no instruction needing to be hardened,
1169 // we're done.
1170 if (!SunkMI)
1171 continue;
1172
1173 // Otherwise, add this to the set of defs we harden.
1174 HardenPostLoad.insert(SunkMI);
1175 continue;
1176 }
1177 }
1178
1179 // The register def'ed by this instruction is trivially hardened so map
1180 // it to itself.
1181 AddrRegToHardenedReg[MI.getOperand(0).getReg()] =
1182 MI.getOperand(0).getReg();
1183
1184 hardenPostLoad(MI, PredStateSSA);
1185 continue;
1186 }
1187
1188 // After we finish processing the instruction and doing any hardening
1189 // necessary for it, we need to handle transferring the predicate state
1190 // into a call and recovering it after the call returns (if it returns).
1191 if (!MI.isCall())
1192 continue;
1193
1194 // If we're not hardening interprocedurally, we can just skip calls.
1195 if (!HardenInterprocedurally)
1196 continue;
1197
1198 auto InsertPt = MI.getIterator();
1199 DebugLoc Loc = MI.getDebugLoc();
1200
1201 // First, we transfer the predicate state into the called function by
1202 // merging it into the stack pointer. This will kill the current def of
1203 // the state.
1204 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1205 mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
1206
1207 // If this call is also a return (because it is a tail call) we're done.
1208 if (MI.isReturn())
1209 continue;
1210
1211 // Otherwise we need to step past the call and recover the predicate
1212 // state from SP after the return, and make this new state available.
1213 ++InsertPt;
1214 unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
1215 PredStateSSA.AddAvailableValue(&MBB, NewStateReg);
1216 }
1217
1218 HardenPostLoad.clear();
1219 HardenLoadAddr.clear();
1220 HardenedAddrRegs.clear();
1221 AddrRegToHardenedReg.clear();
1222
1223 // Currently, we only track data-dependent loads within a basic block.
1224 // FIXME: We should see if this is necessary or if we could be more
1225 // aggressive here without opening up attack avenues.
1226 LoadDepRegs.clear();
1227 }
1228}
1229
1230/// Save EFLAGS into the returned GPR. This can in turn be restored with
1231/// `restoreEFLAGS`.
1232///
1233/// Note that LLVM can only lower very simple patterns of saved and restored
1234/// EFLAGS registers. The restore should always be within the same basic block
1235/// as the save so that no PHI nodes are inserted.
1236unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1237 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1238 DebugLoc Loc) {
1239 // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1240 // what instruction selection does.
1241 unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
1242 // We directly copy the FLAGS register and rely on later lowering to clean
1243 // this up into the appropriate setCC instructions.
1244 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
1245 ++NumInstsInserted;
1246 return Reg;
1247}
1248
1249/// Restore EFLAGS from the provided GPR. This should be produced by
1250/// `saveEFLAGS`.
1251///
1252/// This must be done within the same basic block as the save in order to
1253/// reliably lower.
1254void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1255 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1256 unsigned Reg) {
1257 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
1258 ++NumInstsInserted;
1259}
1260
1261/// Takes the current predicate state (in a register) and merges it into the
1262/// stack pointer. The state is essentially a single bit, but we merge this in
1263/// a way that won't form non-canonical pointers and also will be preserved
1264/// across normal stack adjustments.
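///
/// Concretely, the code emitted below amounts to roughly the following, with
/// placeholder names for the virtual registers involved:
///
///   shlq $47, %pred_state_tmp    # Move the zero / all-ones state up into
///                                # the high bits of a scratch copy.
///   orq  %pred_state_tmp, %rsp   # Fold it into %rsp so the callee (or the
///                                # caller, on return) can recover it.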
1265void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
1266 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1267 unsigned PredStateReg) {
1268 unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
1269 // FIXME: This hard codes a shift distance based on the number of bits needed
1270 // to stay canonical on 64-bit. We should compute this somehow and support
1271 // 32-bit as part of that.
1272 auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
1273 .addReg(PredStateReg, RegState::Kill)
1274 .addImm(47);
1275 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1276 ++NumInstsInserted;
1277 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
1278 .addReg(X86::RSP)
1279 .addReg(TmpReg, RegState::Kill);
1280 OrI->addRegisterDead(X86::EFLAGS, TRI);
1281 ++NumInstsInserted;
1282}
1283
1284/// Extracts the predicate state stored in the high bits of the stack pointer.
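///
/// Roughly, with placeholder register names, this emits:
///
///   movq %rsp, %tmp
///   sarq $63, %tmp      # The arithmetic shift smears the sign bit, yielding
///                       # an all-zeros or all-ones predicate state again.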
1285unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
1286 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1287 DebugLoc Loc) {
1288 unsigned PredStateReg = MRI->createVirtualRegister(PredStateRC);
1289 unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
1290
1291 // We know that the stack pointer will have any preserved predicate state in
1292 // its high bit. We just want to smear this across the other bits. Turns out,
1293 // this is exactly what an arithmetic right shift does.
1294 BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
1295 .addReg(X86::RSP);
1296 auto ShiftI =
1297 BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
1298 .addReg(TmpReg, RegState::Kill)
1299 .addImm(TRI->getRegSizeInBits(*PredStateRC) - 1);
1300 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1301 ++NumInstsInserted;
1302
1303 return PredStateReg;
1304}
1305
1306void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
1307 MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
1308 MachineSSAUpdater &PredStateSSA,
1309 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
1310 MachineBasicBlock &MBB = *MI.getParent();
1311 DebugLoc Loc = MI.getDebugLoc();
1312
1313 // Check if EFLAGS are alive by seeing if there is a def of them or they are
1314 // live-in, and then seeing if that def is in turn used.
1315 bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
1316
1317 SmallVector<MachineOperand *, 2> HardenOpRegs;
1318
1319 if (BaseMO.isFI()) {
1320 // A frame index is never a dynamically controllable load, so only
1321 // harden it if we're covering fixed address loads as well.
1322 LLVM_DEBUG(
1323 dbgs() << " Skipping hardening base of explicit stack frame load: ";
1324 MI.dump(); dbgs() << "\n");
1325 } else if (BaseMO.getReg() == X86::RIP ||
1326 BaseMO.getReg() == X86::NoRegister) {
1327 // For both RIP-relative addressed loads or absolute loads, we cannot
1328 // meaningfully harden them because the address being loaded has no
1329 // dynamic component.
1330 //
1331 // FIXME: When using a segment base (like TLS does) we end up with the
1332 // dynamic address being the base plus -1 because we can't mutate the
1333 // segment register here. This allows the signed 32-bit offset to point at
1334 // valid segment-relative addresses and load them successfully.
1335 LLVM_DEBUG(
1336 dbgs() << " Cannot harden base of "
1337 << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
1338 << " address in a load!");
1339 } else {
1340 assert(BaseMO.isReg() &&
1341 "Only allowed to have a frame index or register base.");
1342 HardenOpRegs.push_back(&BaseMO);
1343 }
1344
1345 if (IndexMO.getReg() != X86::NoRegister &&
1346 (HardenOpRegs.empty() ||
1347 HardenOpRegs.front()->getReg() != IndexMO.getReg()))
1348 HardenOpRegs.push_back(&IndexMO);
1349
1350 assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
1351 "Should have exactly one or two registers to harden!");
1352 assert((HardenOpRegs.size() == 1 ||
1353 HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
1354 "Should not have two of the same registers!");
1355
1356 // Remove any registers that have already been checked.
1357 llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
1358 // See if this operand's register has already been checked.
1359 auto It = AddrRegToHardenedReg.find(Op->getReg());
1360 if (It == AddrRegToHardenedReg.end())
1361 // Not checked, so retain this one.
1362 return false;
1363
1364 // Otherwise, we can directly update this operand and remove it.
1365 Op->setReg(It->second);
1366 return true;
1367 });
1368 // If there are none left, we're done.
1369 if (HardenOpRegs.empty())
1370 return;
1371
1372 // Compute the current predicate state.
1373 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1374
1375 auto InsertPt = MI.getIterator();
1376
1377 // If EFLAGS are live and we don't have access to instructions that avoid
1378 // clobbering EFLAGS we need to save and restore them. This in turn makes
1379 // the EFLAGS no longer live.
1380 unsigned FlagsReg = 0;
1381 if (EFLAGSLive && !Subtarget->hasBMI2()) {
1382 EFLAGSLive = false;
1383 FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1384 }
1385
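  // For each register we harden, the loop below emits (roughly, and with
  // placeholder names) one of two forms:
  //
  //   orq  %pred_state, %addr_reg             # EFLAGS dead: an all-ones
  //                                           # state turns the pointer
  //                                           # into -1.
  //   shrxq %pred_state, %addr_reg, %hardened # EFLAGS live (BMI2): an
  //                                           # all-ones state shifts the
  //                                           # address down to 0 or 1.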
1386 for (MachineOperand *Op : HardenOpRegs) {
1387 auto *OpRC = MRI->getRegClass(Op->getReg());
1388
1389 unsigned OpReg = Op->getReg();
1390 unsigned TmpReg = MRI->createVirtualRegister(OpRC);
1391
1392 if (!EFLAGSLive) {
1393 // Merge our potential poison state into the value with an or.
1394 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
1395 .addReg(StateReg)
1396 .addReg(OpReg);
1397 OrI->addRegisterDead(X86::EFLAGS, TRI);
1398 ++NumInstsInserted;
1399 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1400 } else {
1401 // We need to avoid touching EFLAGS so shift out all but the least
1402 // significant bit using the instruction that doesn't update flags.
1403 auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
1404 .addReg(OpReg)
1405 .addReg(StateReg);
1406 (void)ShiftI;
1407 ++NumInstsInserted;
1408 LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
1409 dbgs() << "\n");
1410 }
1411
1412 // Record this register as checked and update the operand.
1413 assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
1414 "Should not have checked this register yet!");
1415 AddrRegToHardenedReg[Op->getReg()] = TmpReg;
1416 Op->setReg(TmpReg);
1417 ++NumAddrRegsHardened;
1418 }
1419
1420 // And restore the flags if needed.
1421 if (FlagsReg)
1422 restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
1423}
1424
1425MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
1426 MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedLoads) {
1427 assert(isDataInvariantLoad(InitialMI) &&
1428 "Cannot get here with a non-invariant load!");
1429
1430 // See if we can sink hardening the loaded value.
1431 auto SinkCheckToSingleUse =
1432 [&](MachineInstr &MI) -> Optional<MachineInstr *> {
1433 unsigned DefReg = MI.getOperand(0).getReg();
1434
1435 // We need to find a single use to which we can sink the check. We can
1436 // primarily do this because many uses may already end up checked on their
1437 // own.
1438 MachineInstr *SingleUseMI = nullptr;
1439 for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
1440 // If we're already going to harden this use, it is data invariant and
1441 // within our block and we just need to check that the use isn't in an
1442 // address.
1443 if (HardenedLoads.count(&UseMI)) {
1444 const MCInstrDesc &Desc = UseMI.getDesc();
1445 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1446 assert(MemRefBeginIdx >= 0 &&
1447 "Should always have mem references here!");
1448 MemRefBeginIdx += X86II::getOperandBias(Desc);
1449
1450 MachineOperand &BaseMO =
1451 UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1452 MachineOperand &IndexMO =
1453 UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1454 if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
1455 (IndexMO.isReg() && IndexMO.getReg() == DefReg))
1456 // The load uses the register as part of its address making it not
1457 // invariant.
1458 return {};
1459
1460 continue;
1461 }
1462
1463 if (SingleUseMI)
1464 // We already have a single use, this would make two. Bail.
1465 return {};
1466
1467 // If this single use isn't data invariant, isn't in this block, or has
1468 // interfering EFLAGS, we can't sink the hardening to it.
1469 if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
1470 return {};
1471
1472 // If this instruction defines multiple registers bail as we won't harden
1473 // all of them.
1474 if (UseMI.getDesc().getNumDefs() > 1)
1475 return {};
1476
1477 // If this register isn't a virtual register we can't walk uses of sanely,
1478 // just bail. Also check that its register class is one of the ones we
1479 // can harden.
1480 unsigned UseDefReg = UseMI.getOperand(0).getReg();
1481 if (!TRI->isVirtualRegister(UseDefReg) ||
1482 !MRI->getRegClass(UseDefReg)->hasSubClassEq(&X86::GR64RegClass))
1483 return {};
1484
1485 SingleUseMI = &UseMI;
1486 }
1487
1488 // If SingleUseMI is still null, there is no use that needs its own
1489 // checking. Otherwise, it is the single use that needs checking.
1490 return {SingleUseMI};
1491 };
1492
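  // Walk down the chain: starting from the hardened load, keep handing the
  // check off to the single qualifying use. For example (illustrative names,
  // not real output):
  //   %a = MOV64rm ...       ; the load being hardened
  //   %b = ADD64rr %a, %c    ; sole use, data invariant, same block
  // sinks the check from %a to %b, so only %b's value needs poisoning.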
  MachineInstr *MI = &InitialMI;
  while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
    // Update which MI we're checking now.
    MI = *SingleUse;
    if (!MI)
      break;
  }

  return MI;
}

// We can harden non-leaking loads into register without touching the address
// by just hiding all of the loaded bits. We use an `or` instruction to do
// this because having the poison value be all ones allows us to use the same
// value below. And the goal is just for the loaded bits to not be exposed to
// execution, so coercing them all to one is sufficient.
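//
// As a sketch (not the exact MIR), an 8-byte load
//   %val = MOV64rm ...
// is rewritten so that the OR defines the original value:
//   %tmp = MOV64rm ...
//   %val = OR64rr %state, %tmp
// and a mis-speculated load therefore yields all ones rather than secret data.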
void X86SpeculativeLoadHardeningPass::hardenPostLoad(
    MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
  assert(isDataInvariantLoad(MI) &&
         "Cannot get here with a non-invariant load!");

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();

  // For all of these, the def'ed register operand is operand zero.
  auto &DefOp = MI.getOperand(0);
  unsigned OldDefReg = DefOp.getReg();

  auto *DefRC = MRI->getRegClass(OldDefReg);
  int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;

  unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
  unsigned OrOpCode = OrOpCodes[Log2_32(DefRegBytes)];
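  // Log2_32 of the def's byte width (1, 2, 4, or 8) indexes the table above;
  // for example, a 4-byte def gives Log2_32(4) == 2 and selects X86::OR32rr.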

  unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};

  auto GetStateRegInRC = [&](const TargetRegisterClass &RC) {
    unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);

    int Bytes = TRI->getRegSizeInBits(RC) / 8;
    // FIXME: Need to teach this about 32-bit mode.
    if (Bytes != 8) {
      unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
      unsigned NarrowStateReg = MRI->createVirtualRegister(&RC);
      BuildMI(MBB, MI.getIterator(), Loc, TII->get(TargetOpcode::COPY),
              NarrowStateReg)
          .addReg(StateReg, 0, SubRegImm);
      StateReg = NarrowStateReg;
    }
    return StateReg;
  };
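  // For example, when called with a 32-bit class this copies the sub_32bit
  // piece of the 64-bit predicate state into a fresh virtual register of that
  // class so the narrower OR or SHRX below can consume it directly.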

  auto InsertPt = std::next(MI.getIterator());
  unsigned FlagsReg = 0;
  bool EFLAGSLive = isEFLAGSLive(MBB, InsertPt, *TRI);
  if (EFLAGSLive && !Subtarget->hasBMI2()) {
    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
    EFLAGSLive = false;
  }

  if (!EFLAGSLive) {
    unsigned StateReg = GetStateRegInRC(*DefRC);
    unsigned NewDefReg = MRI->createVirtualRegister(DefRC);
    DefOp.setReg(NewDefReg);
    auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg)
                   .addReg(StateReg)
                   .addReg(NewDefReg);
    OrI->addRegisterDead(X86::EFLAGS, TRI);
    ++NumInstsInserted;
    LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
  } else {
    assert(Subtarget->hasBMI2() &&
           "Cannot harden loads and preserve EFLAGS without BMI2!");

    unsigned ShiftOpCode = DefRegBytes < 4 ? X86::SHRX32rr : X86::SHRX64rr;
    auto &ShiftRC =
        DefRegBytes < 4 ? X86::GR32_NOSPRegClass : X86::GR64_NOSPRegClass;
    int ShiftRegBytes = TRI->getRegSizeInBits(ShiftRC) / 8;
    unsigned DefSubRegImm = SubRegImms[Log2_32(DefRegBytes)];

    unsigned StateReg = GetStateRegInRC(ShiftRC);

    // First have the def instruction def a temporary register.
    unsigned TmpReg = MRI->createVirtualRegister(DefRC);
    DefOp.setReg(TmpReg);
    // Now copy it into a register of the shift RC.
    unsigned ShiftInputReg = TmpReg;
    if (DefRegBytes != ShiftRegBytes) {
      unsigned UndefReg = MRI->createVirtualRegister(&ShiftRC);
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::IMPLICIT_DEF), UndefReg);
      ShiftInputReg = MRI->createVirtualRegister(&ShiftRC);
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::INSERT_SUBREG), ShiftInputReg)
          .addReg(UndefReg)
          .addReg(TmpReg)
          .addImm(DefSubRegImm);
    }
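    // For example (illustrative), a 2-byte def is inserted into the low 16
    // bits of a fresh 32-bit register whose remaining bits are undef, so the
    // whole value can be fed to the 32-bit SHRX below.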

    // We shift this once if the shift is wider than the def and thus we can
    // shift *all* of the def'ed bytes out. Otherwise we need to do two shifts.
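    //
    // For example, an all-ones state gives a shift count of 63 (or 31 for the
    // 32-bit shift): a 4-byte def widened into a 64-bit register loses all of
    // its defined bits in one shift, while an 8-byte def still has its old top
    // bit left in bit 0 and needs the second shift below to clear it.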

    unsigned ShiftedReg = MRI->createVirtualRegister(&ShiftRC);
    auto Shift1I =
        BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), ShiftedReg)
            .addReg(ShiftInputReg)
            .addReg(StateReg);
    (void)Shift1I;
    ++NumInstsInserted;
    LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift1I->dump(); dbgs() << "\n");

    // The only way we have a bit left is if all 8 bytes were defined. Do an
    // extra shift to get the last bit in this case.
    if (DefRegBytes == ShiftRegBytes) {
      // We can just directly def the old def register as it's the same size.
      ShiftInputReg = ShiftedReg;
      auto Shift2I =
          BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), OldDefReg)
              .addReg(ShiftInputReg)
              .addReg(StateReg);
      (void)Shift2I;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift2I->dump();
                 dbgs() << "\n");
    } else {
      // When we have a different-size shift register we need to fix up the
      // register class. We can do that as we copy into the old def register.
      BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), OldDefReg)
          .addReg(ShiftedReg, 0, DefSubRegImm);
    }
  }

  if (FlagsReg)
    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);

  ++NumPostLoadRegsHardened;
}

void X86SpeculativeLoadHardeningPass::checkReturnInstr(
    MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();
  auto InsertPt = MI.getIterator();

  if (FenceCallAndRet) {
    // Simply forcibly block speculation of loads out of the function by using
    // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
    // should be secure, is simple from an ABI perspective, and the cost can be
    // minimized through inlining.
    //
    // FIXME: We should investigate ways to establish a strong data-dependency
    // on the return. However, poisoning the stack pointer is unlikely to work
    // because the return is *predicted* rather than relying on the load of the
    // return address to actually resolve.
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
    ++NumInstsInserted;
    ++NumLFENCEsInserted;
    return;
  }

  // Take our predicate state, shift it to the high 17 bits (so that we keep
  // pointers canonical) and merge it into RSP. This will allow the caller to
  // extract it when we return (speculatively).
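  //
  // Roughly (the exact sequence lives in mergePredStateIntoSP), this amounts
  // to:
  //   shlq $47, %state
  //   orq  %state, %rsp
  // A zero state leaves RSP untouched; an all-ones state sets its high 17
  // bits, which the caller can smear back into a full 0/-1 predicate value
  // with an arithmetic right shift.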
  mergePredStateIntoSP(MBB, InsertPt, Loc,
                       PredStateSSA.GetValueAtEndOfBlock(&MBB));
}

INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
                      "X86 speculative load hardener", false, false)
INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
                    "X86 speculative load hardener", false, false)

FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
  return new X86SpeculativeLoadHardeningPass();
}