1//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9/// \file
10///
11/// Provide a pass which mitigates speculative execution attacks which operate
12/// by speculating incorrectly past some predicate (a type check, bounds check,
13/// or other condition) to reach a load with invalid inputs and leak the data
14/// accessed by that load using a side channel out of the speculative domain.
15///
16/// For details on the attacks, see the first variant in both the Project Zero
17/// writeup and the Spectre paper:
18/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
19/// https://spectreattack.com/spectre.pdf
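///
/// In brief, the implementation below maintains a 64-bit "predicate state"
/// value that is all-zeros along correctly predicted paths and all-ones once a
/// conditional branch has been mispredicted. Conditional edges are
/// instrumented with CMOV instructions that poison this state under the
/// condition indicating misprediction, and the state is then used to harden
/// load addresses and/or loaded values so that misspeculated loads cannot leak
/// data. When hardening interprocedurally, the state travels in the high bits
/// of the stack pointer across calls and returns.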
20///
21//===----------------------------------------------------------------------===//
22
23#include "X86.h"
24#include "X86InstrBuilder.h"
25#include "X86InstrInfo.h"
26#include "X86Subtarget.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/Optional.h"
30#include "llvm/ADT/STLExtras.h"
31#include "llvm/ADT/ScopeExit.h"
32#include "llvm/ADT/SmallPtrSet.h"
33#include "llvm/ADT/SmallSet.h"
34#include "llvm/ADT/SmallVector.h"
35#include "llvm/ADT/SparseBitVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/CodeGen/MachineBasicBlock.h"
38#include "llvm/CodeGen/MachineConstantPool.h"
39#include "llvm/CodeGen/MachineFunction.h"
40#include "llvm/CodeGen/MachineFunctionPass.h"
41#include "llvm/CodeGen/MachineInstr.h"
42#include "llvm/CodeGen/MachineInstrBuilder.h"
43#include "llvm/CodeGen/MachineModuleInfo.h"
44#include "llvm/CodeGen/MachineOperand.h"
45#include "llvm/CodeGen/MachineRegisterInfo.h"
46#include "llvm/CodeGen/MachineSSAUpdater.h"
47#include "llvm/CodeGen/TargetInstrInfo.h"
48#include "llvm/CodeGen/TargetRegisterInfo.h"
49#include "llvm/CodeGen/TargetSchedule.h"
50#include "llvm/CodeGen/TargetSubtargetInfo.h"
51#include "llvm/IR/DebugLoc.h"
52#include "llvm/MC/MCSchedule.h"
53#include "llvm/Pass.h"
54#include "llvm/Support/CommandLine.h"
55#include "llvm/Support/Debug.h"
56#include "llvm/Support/raw_ostream.h"
57#include <algorithm>
58#include <cassert>
59#include <iterator>
60#include <utility>
61
62using namespace llvm;
63
64#define PASS_KEY "x86-speculative-load-hardening"
65#define DEBUG_TYPE PASS_KEY
66
67STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
68STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
69STATISTIC(NumAddrRegsHardened,
70          "Number of address mode used registers hardened");
71STATISTIC(NumPostLoadRegsHardened,
72 "Number of post-load register values hardened");
73STATISTIC(NumInstsInserted, "Number of instructions inserted");
74STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
75
76static cl::opt<bool> HardenEdgesWithLFENCE(
77 PASS_KEY "-lfence",
78 cl::desc(
79 "Use LFENCE along each conditional edge to harden against speculative "
80 "loads rather than conditional movs and poisoned pointers."),
81 cl::init(false), cl::Hidden);
82
83static cl::opt<bool> EnablePostLoadHardening(
84 PASS_KEY "-post-load",
85 cl::desc("Harden the value loaded *after* it is loaded by "
86 "flushing the loaded bits to 1. This is hard to do "
87 "in general but can be done easily for GPRs."),
88 cl::init(true), cl::Hidden);
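// (Conceptually, because the poison state is all-ones, OR-ing it into a loaded
// GPR value leaves the value unchanged on the architectural path and forces it
// to all-ones under misspeculation; hardenPostLoad below is expected to lower
// to roughly that form.)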
89
90static cl::opt<bool> FenceCallAndRet(
91 PASS_KEY "-fence-call-and-ret",
92 cl::desc("Use a full speculation fence to harden both call and ret edges "
93 "rather than a lighter weight mitigation."),
94 cl::init(false), cl::Hidden);
95
96static cl::opt<bool> HardenInterprocedurally(
97 PASS_KEY "-ip",
98 cl::desc("Harden interprocedurally by passing our state in and out of "
99 "functions in the high bits of the stack pointer."),
100 cl::init(true), cl::Hidden);
101
102static cl::opt<bool>
103 HardenLoads(PASS_KEY "-loads",
104                cl::desc("Sanitize loads from memory. When disabled, no "
105 "significant security is provided."),
106 cl::init(true), cl::Hidden);
107
108namespace llvm {
109
110void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
111
112} // end namespace llvm
113
114namespace {
115
116class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
117public:
118 X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
119 initializeX86SpeculativeLoadHardeningPassPass(
120 *PassRegistry::getPassRegistry());
121 }
122
123 StringRef getPassName() const override {
124 return "X86 speculative load hardening";
125 }
126 bool runOnMachineFunction(MachineFunction &MF) override;
127 void getAnalysisUsage(AnalysisUsage &AU) const override;
128
129 /// Pass identification, replacement for typeid.
130 static char ID;
131
132private:
133 /// The information about a block's conditional terminators needed to trace
134 /// our predicate state through the exiting edges.
135 struct BlockCondInfo {
136 MachineBasicBlock *MBB;
137
138 // We mostly have one conditional branch, and in extremely rare cases have
139 // two. Three and more are so rare as to be unimportant for compile time.
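    // For example, a block ending in:
    //   JNE %bb.1
    //   JA  %bb.2
    //   JMP %bb.3
    // is summarized with CondBrs = {JNE, JA} and UncondBr pointing at the
    // trailing JMP.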
140 SmallVector<MachineInstr *, 2> CondBrs;
141
142 MachineInstr *UncondBr;
143 };
144
145  /// Manages the predicate state traced through the program.
146 struct PredState {
147 unsigned InitialReg;
148 unsigned PoisonReg;
149
150 const TargetRegisterClass *RC;
151 MachineSSAUpdater SSA;
152
153 PredState(MachineFunction &MF, const TargetRegisterClass *RC)
154 : RC(RC), SSA(MF) {}
155 };
156
157  const X86Subtarget *Subtarget;
158 MachineRegisterInfo *MRI;
159 const X86InstrInfo *TII;
160 const TargetRegisterInfo *TRI;
161
162  Optional<PredState> PS;
163
164 void hardenEdgesWithLFENCE(MachineFunction &MF);
165
166 SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
167
168  SmallVector<MachineInstr *, 16>
169 tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
170
171  void hardenAllLoads(MachineFunction &MF);
172
173 unsigned saveEFLAGS(MachineBasicBlock &MBB,
174 MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
175 void restoreEFLAGS(MachineBasicBlock &MBB,
176 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
177 unsigned OFReg);
178
179 void mergePredStateIntoSP(MachineBasicBlock &MBB,
180 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
181 unsigned PredStateReg);
182 unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
183 MachineBasicBlock::iterator InsertPt,
184 DebugLoc Loc);
185
186 void
187 hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
188                 MachineOperand &IndexMO,
189                 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
190 MachineInstr *
191 sinkPostLoadHardenedInst(MachineInstr &MI,
192                           SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
193 bool canHardenRegister(unsigned Reg);
194  void hardenPostLoad(MachineInstr &MI);
195  void hardenReturnInstr(MachineInstr &MI);
196};
197
198} // end anonymous namespace
199
200char X86SpeculativeLoadHardeningPass::ID = 0;
201
202void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
203 AnalysisUsage &AU) const {
204 MachineFunctionPass::getAnalysisUsage(AU);
205}
206
207static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
208 MachineBasicBlock &Succ, int SuccCount,
209 MachineInstr *Br, MachineInstr *&UncondBr,
210 const X86InstrInfo &TII) {
211 assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
212
213 MachineFunction &MF = *MBB.getParent();
214
215 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
216
217 // We have to insert the new block immediately after the current one as we
218 // don't know what layout-successor relationships the successor has and we
219 // may not be able to (and generally don't want to) try to fix those up.
220 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
221
222 // Update the branch instruction if necessary.
223 if (Br) {
224 assert(Br->getOperand(0).getMBB() == &Succ &&
225 "Didn't start with the right target!");
226 Br->getOperand(0).setMBB(&NewMBB);
227
228 // If this successor was reached through a branch rather than fallthrough,
229 // we might have *broken* fallthrough and so need to inject a new
230 // unconditional branch.
231 if (!UncondBr) {
232 MachineBasicBlock &OldLayoutSucc =
233 *std::next(MachineFunction::iterator(&NewMBB));
234 assert(MBB.isSuccessor(&OldLayoutSucc) &&
235 "Without an unconditional branch, the old layout successor should "
236 "be an actual successor!");
237 auto BrBuilder =
238 BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
239 // Update the unconditional branch now that we've added one.
240 UncondBr = &*BrBuilder;
241 }
242
243 // Insert unconditional "jump Succ" instruction in the new block if
244 // necessary.
245 if (!NewMBB.isLayoutSuccessor(&Succ)) {
246 SmallVector<MachineOperand, 4> Cond;
247 TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
248 }
249 } else {
250 assert(!UncondBr &&
251 "Cannot have a branchless successor and an unconditional branch!");
252 assert(NewMBB.isLayoutSuccessor(&Succ) &&
253 "A non-branch successor must have been a layout successor before "
254 "and now is a layout successor of the new block.");
255 }
256
257 // If this is the only edge to the successor, we can just replace it in the
258 // CFG. Otherwise we need to add a new entry in the CFG for the new
259 // successor.
260 if (SuccCount == 1) {
261 MBB.replaceSuccessor(&Succ, &NewMBB);
262 } else {
263 MBB.splitSuccessor(&Succ, &NewMBB);
264 }
265
266 // Hook up the edge from the new basic block to the old successor in the CFG.
267 NewMBB.addSuccessor(&Succ);
268
269 // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
270 for (MachineInstr &MI : Succ) {
271 if (!MI.isPHI())
272 break;
273 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
274 OpIdx += 2) {
275 MachineOperand &OpV = MI.getOperand(OpIdx);
276 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
277 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
278 if (OpMBB.getMBB() != &MBB)
279 continue;
280
281       // If this is the last edge to the successor, just replace MBB in the PHI.
282 if (SuccCount == 1) {
283 OpMBB.setMBB(&NewMBB);
284 break;
285 }
286
287 // Otherwise, append a new pair of operands for the new incoming edge.
288 MI.addOperand(MF, OpV);
289 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
290 break;
291 }
292 }
293
294 // Inherit live-ins from the successor
295 for (auto &LI : Succ.liveins())
296 NewMBB.addLiveIn(LI);
297
298 LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
299 << Succ.getName() << "'.\n");
300 return NewMBB;
301}
302
303/// Remove duplicate PHI operands to leave the PHI in a canonical and
304/// predictable form.
305///
306/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
307/// isn't what you might expect. We may have multiple entries in PHI nodes for
308/// a single predecessor. This makes CFG-updating extremely complex, so here we
309/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
310/// one entry per predecessor, regardless of how many edges there are.
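///
/// For example, a PHI such as:
///   %x = PHI [ %a, %bb.1 ], [ %a, %bb.1 ], [ %b, %bb.2 ]
/// is rewritten to:
///   %x = PHI [ %a, %bb.1 ], [ %b, %bb.2 ]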
311static void canonicalizePHIOperands(MachineFunction &MF) {
312 SmallPtrSet<MachineBasicBlock *, 4> Preds;
313 SmallVector<int, 4> DupIndices;
314 for (auto &MBB : MF)
315 for (auto &MI : MBB) {
316 if (!MI.isPHI())
317 break;
318
319 // First we scan the operands of the PHI looking for duplicate entries
320       // for a particular predecessor. We retain the operand index of each duplicate
321 // entry found.
322 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
323 OpIdx += 2)
324 if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
325 DupIndices.push_back(OpIdx);
326
327 // Now walk the duplicate indices, removing both the block and value. Note
328       // that these are stored as a vector, making this element-wise removal
330 // potentially quadratic.
331 //
332 // FIXME: It is really frustrating that we have to use a quadratic
333 // removal algorithm here. There should be a better way, but the use-def
334 // updates required make that impossible using the public API.
335 //
336 // Note that we have to process these backwards so that we don't
337 // invalidate other indices with each removal.
338 while (!DupIndices.empty()) {
339 int OpIdx = DupIndices.pop_back_val();
340 // Remove both the block and value operand, again in reverse order to
341 // preserve indices.
342 MI.RemoveOperand(OpIdx + 1);
343 MI.RemoveOperand(OpIdx);
344 }
345
346 Preds.clear();
347 }
348}
349
350/// Helper to scan a function for loads vulnerable to misspeculation that we
351/// want to harden.
352///
353/// We use this to avoid making changes to functions where there is nothing we
354/// need to do to harden against misspeculation.
355static bool hasVulnerableLoad(MachineFunction &MF) {
356 for (MachineBasicBlock &MBB : MF) {
357 for (MachineInstr &MI : MBB) {
358 // Loads within this basic block after an LFENCE are not at risk of
359 // speculatively executing with invalid predicates from prior control
360 // flow. So break out of this block but continue scanning the function.
361 if (MI.getOpcode() == X86::LFENCE)
362 break;
363
364 // Looking for loads only.
365 if (!MI.mayLoad())
366 continue;
367
368 // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
369 if (MI.getOpcode() == X86::MFENCE)
370 continue;
371
372 // We found a load.
373 return true;
374 }
375 }
376
377 // No loads found.
378 return false;
379}
380
381bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
382 MachineFunction &MF) {
383 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
384 << " **********\n");
385
386 Subtarget = &MF.getSubtarget<X86Subtarget>();
387 MRI = &MF.getRegInfo();
388 TII = Subtarget->getInstrInfo();
389 TRI = Subtarget->getRegisterInfo();
390
391  // FIXME: Support for 32-bit.
392  PS.emplace(MF, &X86::GR64_NOSPRegClass);
393
394 if (MF.begin() == MF.end())
395 // Nothing to do for a degenerate empty function...
396 return false;
397
398 // We support an alternative hardening technique based on a debug flag.
399 if (HardenEdgesWithLFENCE) {
400 hardenEdgesWithLFENCE(MF);
401 return true;
402 }
403
404 // Create a dummy debug loc to use for all the generated code here.
405 DebugLoc Loc;
406
407 MachineBasicBlock &Entry = *MF.begin();
408 auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
409
410 // Do a quick scan to see if we have any checkable loads.
411  bool HasVulnerableLoad = hasVulnerableLoad(MF);
412
413 // See if we have any conditional branching blocks that we will need to trace
414 // predicate state through.
415 SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
416
417 // If we have no interesting conditions or loads, nothing to do here.
418  if (!HasVulnerableLoad && Infos.empty())
419    return true;
420
421  // The poison value is required to be an all-ones value for many aspects of
422 // this mitigation.
423 const int PoisonVal = -1;
424  PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
425  BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
426      .addImm(PoisonVal);
427 ++NumInstsInserted;
428
429 // If we have loads being hardened and we've asked for call and ret edges to
430 // get a full fence-based mitigation, inject that fence.
431  if (HasVulnerableLoad && FenceCallAndRet) {
432    // We need to insert an LFENCE at the start of the function to suspend any
433 // incoming misspeculation from the caller. This helps two-fold: the caller
434 // may not have been protected as this code has been, and this code gets to
435 // not take any specific action to protect across calls.
436 // FIXME: We could skip this for functions which unconditionally return
437 // a constant.
438 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
439 ++NumInstsInserted;
440 ++NumLFENCEsInserted;
441 }
442
443  // If we guarded the entry with an LFENCE and have no conditionals to protect
444 // in blocks, then we're done.
445 if (FenceCallAndRet && Infos.empty())
446    // We may have changed the function's code at this point to insert fences.
447 return true;
448
449  // Establish the initial predicate state in the entry block.
450 if (HardenInterprocedurally && !FenceCallAndRet) {
451 // Set up the predicate state by extracting it from the incoming stack
452 // pointer so we pick up any misspeculation in our caller.
453    PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
454  } else {
455 // Otherwise, just build the predicate state itself by zeroing a register
456 // as we don't need any initial state.
457    PS->InitialReg = MRI->createVirtualRegister(PS->RC);
458    unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
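    // Zeroing the 32-bit subregister and widening it with SUBREG_TO_REG below
    // produces an all-zero 64-bit predicate state: a 32-bit zeroing write
    // implicitly clears the upper 32 bits on x86-64, and MOV32r0's implicit
    // EFLAGS def is marked dead since nothing reads it.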
459 auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
460 PredStateSubReg);
461 ++NumInstsInserted;
462 MachineOperand *ZeroEFLAGSDefOp =
463 ZeroI->findRegisterDefOperand(X86::EFLAGS);
464 assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
465 "Must have an implicit def of EFLAGS!");
466 ZeroEFLAGSDefOp->setIsDead(true);
467 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
468          PS->InitialReg)
469        .addImm(0)
470 .addReg(PredStateSubReg)
471 .addImm(X86::sub_32bit);
472 }
473
474 // We're going to need to trace predicate state throughout the function's
475 // CFG. Prepare for this by setting up our initial state of PHIs with unique
476 // predecessor entries and all the initial predicate state.
477  canonicalizePHIOperands(MF);
478
479 // Track the updated values in an SSA updater to rewrite into SSA form at the
480 // end.
481  PS->SSA.Initialize(PS->InitialReg);
482 PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);
Chandler Carruth90358e12018-07-13 11:13:58 +0000483
484  // Trace through the CFG.
485  auto CMovs = tracePredStateThroughCFG(MF, Infos);
486
487 // We may also enter basic blocks in this function via exception handling
488 // control flow. Here, if we are hardening interprocedurally, we need to
489 // re-capture the predicate state from the throwing code. In the Itanium ABI,
490 // the throw will always look like a call to __cxa_throw and will have the
491 // predicate state in the stack pointer, so extract fresh predicate state from
492 // the stack pointer and make it available in SSA.
493 // FIXME: Handle non-itanium ABI EH models.
494 if (HardenInterprocedurally) {
495 for (MachineBasicBlock &MBB : MF) {
496 assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
497 assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
498 assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
499 if (!MBB.isEHPad())
500 continue;
501      PS->SSA.AddAvailableValue(
502          &MBB,
503 extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
504 }
505 }
506
507  // Now harden all of the loads in the function using the predicate state.
508 hardenAllLoads(MF);
509
510 // Now rewrite all the uses of the pred state using the SSA updater so that
511 // we track updates through the CFG.
512 for (MachineInstr *CMovI : CMovs)
513 for (MachineOperand &Op : CMovI->operands()) {
514      if (!Op.isReg() || Op.getReg() != PS->InitialReg)
515        continue;
516
517      PS->SSA.RewriteUse(Op);
518    }
519
520 // If we are hardening interprocedurally, find each returning block and
521 // protect the caller from being returned to through misspeculation.
522 if (HardenInterprocedurally)
523 for (MachineBasicBlock &MBB : MF) {
524 if (MBB.empty())
525 continue;
526
527 MachineInstr &MI = MBB.back();
528
529 // We only care about returns that are not also calls. For calls, that
530 // happen to also be returns (tail calls) we will have already handled
531 // them as calls.
532 if (!MI.isReturn() || MI.isCall())
Chandler Carruth90358e12018-07-13 11:13:58 +0000533 continue;
534
535      hardenReturnInstr(MI);
536    }
537
538 LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
539 dbgs() << "\n"; MF.verify(this));
540 return true;
541}
542
543/// Implements the naive hardening approach of putting an LFENCE after every
544/// potentially mis-predicted control flow construct.
545///
546/// We include this as an alternative mostly for the purpose of comparison. The
547/// performance impact of this is expected to be extremely severe and not
548/// practical for any real-world users.
549void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
550 MachineFunction &MF) {
551 // First, we scan the function looking for blocks that are reached along edges
552 // that we might want to harden.
553 SmallSetVector<MachineBasicBlock *, 8> Blocks;
554 for (MachineBasicBlock &MBB : MF) {
555 // If there are no or only one successor, nothing to do here.
556 if (MBB.succ_size() <= 1)
557 continue;
558
559 // Skip blocks unless their terminators start with a branch. Other
560 // terminators don't seem interesting for guarding against misspeculation.
561 auto TermIt = MBB.getFirstTerminator();
562 if (TermIt == MBB.end() || !TermIt->isBranch())
563 continue;
564
565    // Add all the non-EH-pad successors to the blocks we want to harden. We
566 // skip EH pads because there isn't really a condition of interest on
567 // entering.
568 for (MachineBasicBlock *SuccMBB : MBB.successors())
569 if (!SuccMBB->isEHPad())
570 Blocks.insert(SuccMBB);
571 }
572
573 for (MachineBasicBlock *MBB : Blocks) {
574 auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
575 BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
576 ++NumInstsInserted;
577 ++NumLFENCEsInserted;
578 }
579}
580
581SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
582X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
583 SmallVector<BlockCondInfo, 16> Infos;
584
585 // Walk the function and build up a summary for each block's conditions that
586 // we need to trace through.
587 for (MachineBasicBlock &MBB : MF) {
588 // If there are no or only one successor, nothing to do here.
589 if (MBB.succ_size() <= 1)
590 continue;
591
592 // We want to reliably handle any conditional branch terminators in the
593 // MBB, so we manually analyze the branch. We can handle all of the
594    // permutations here, including ones that analyzeBranch cannot.
595 //
596 // The approach is to walk backwards across the terminators, resetting at
597 // any unconditional non-indirect branch, and track all conditional edges
598 // to basic blocks as well as the fallthrough or unconditional successor
599 // edge. For each conditional edge, we track the target and the opposite
600 // condition code in order to inject a "no-op" cmov into that successor
601 // that will harden the predicate. For the fallthrough/unconditional
602 // edge, we inject a separate cmov for each conditional branch with
603 // matching condition codes. This effectively implements an "and" of the
604 // condition flags, even if there isn't a single condition flag that would
605 // directly implement that. We don't bother trying to optimize either of
606 // these cases because if such an optimization is possible, LLVM should
607 // have optimized the conditional *branches* in that way already to reduce
608 // instruction count. This late, we simply assume the minimal number of
609 // branch instructions is being emitted and use that to guide our cmov
610 // insertion.
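    //
    // For example, given a block ending in:
    //   JB  %bb.target
    //   (fallthrough to %bb.next)
    // the checking block on the %bb.target edge gets a CMOVAE selecting the
    // poison value (we should only reach %bb.target if B held), while the
    // %bb.next side gets a CMOVB (we should only fall through if B did not
    // hold). With several conditional branches to one fallthrough, one cmov
    // per condition effectively ANDs the conditions together.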
611
612 BlockCondInfo Info = {&MBB, {}, nullptr};
613
614 // Now walk backwards through the terminators and build up successors they
615 // reach and the conditions.
616 for (MachineInstr &MI : llvm::reverse(MBB)) {
617 // Once we've handled all the terminators, we're done.
618 if (!MI.isTerminator())
619 break;
620
621 // If we see a non-branch terminator, we can't handle anything so bail.
622 if (!MI.isBranch()) {
623 Info.CondBrs.clear();
624 break;
625 }
626
627 // If we see an unconditional branch, reset our state, clear any
628      // fallthrough, and set this as the "else" successor.
629 if (MI.getOpcode() == X86::JMP_1) {
630 Info.CondBrs.clear();
631 Info.UncondBr = &MI;
632 continue;
633 }
634
635 // If we get an invalid condition, we have an indirect branch or some
636 // other unanalyzable "fallthrough" case. We model this as a nullptr for
637 // the destination so we can still guard any conditional successors.
638 // Consider code sequences like:
639 // ```
640 // jCC L1
641 // jmpq *%rax
642 // ```
643 // We still want to harden the edge to `L1`.
644 if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
645 Info.CondBrs.clear();
646 Info.UncondBr = &MI;
647 continue;
648 }
649
650 // We have a vanilla conditional branch, add it to our list.
651 Info.CondBrs.push_back(&MI);
652 }
653 if (Info.CondBrs.empty()) {
654 ++NumBranchesUntraced;
655 LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
656 MBB.dump());
657 continue;
658 }
659
660 Infos.push_back(Info);
661 }
662
663 return Infos;
664}
665
666/// Trace the predicate state through the CFG, instrumenting each conditional
667/// branch such that misspeculation through an edge will poison the predicate
668/// state.
669///
670/// Returns the list of inserted CMov instructions so that they can have their
671/// uses of the predicate state rewritten into proper SSA form once it is
672/// complete.
673SmallVector<MachineInstr *, 16>
674X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
675 MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
676 // Collect the inserted cmov instructions so we can rewrite their uses of the
677 // predicate state into SSA form.
678 SmallVector<MachineInstr *, 16> CMovs;
679
680 // Now walk all of the basic blocks looking for ones that end in conditional
681 // jumps where we need to update this register along each edge.
682 for (const BlockCondInfo &Info : Infos) {
683 MachineBasicBlock &MBB = *Info.MBB;
684 const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
685 MachineInstr *UncondBr = Info.UncondBr;
686
687 LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
688 << "\n");
689 ++NumCondBranchesTraced;
690
691 // Compute the non-conditional successor as either the target of any
692 // unconditional branch or the layout successor.
693 MachineBasicBlock *UncondSucc =
694 UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
695 ? UncondBr->getOperand(0).getMBB()
696 : nullptr)
697 : &*std::next(MachineFunction::iterator(&MBB));
698
699 // Count how many edges there are to any given successor.
700 SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
701 if (UncondSucc)
702 ++SuccCounts[UncondSucc];
703 for (auto *CondBr : CondBrs)
704 ++SuccCounts[CondBr->getOperand(0).getMBB()];
705
706 // A lambda to insert cmov instructions into a block checking all of the
707 // condition codes in a sequence.
708 auto BuildCheckingBlockForSuccAndConds =
709 [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
710 MachineInstr *Br, MachineInstr *&UncondBr,
711 ArrayRef<X86::CondCode> Conds) {
712 // First, we split the edge to insert the checking block into a safe
713 // location.
714 auto &CheckingMBB =
715 (SuccCount == 1 && Succ.pred_size() == 1)
716 ? Succ
717 : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
718
719 bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
720 if (!LiveEFLAGS)
721 CheckingMBB.addLiveIn(X86::EFLAGS);
722
723 // Now insert the cmovs to implement the checks.
724 auto InsertPt = CheckingMBB.begin();
725 assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
726 "Should never have a PHI in the initial checking block as it "
727 "always has a single predecessor!");
728
729 // We will wire each cmov to each other, but need to start with the
730 // incoming pred state.
731 unsigned CurStateReg = PS->InitialReg;
732
733 for (X86::CondCode Cond : Conds) {
734 int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
735 auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
736
737 unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
738 // Note that we intentionally use an empty debug location so that
739 // this picks up the preceding location.
740 auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
741 TII->get(CMovOp), UpdatedStateReg)
742 .addReg(CurStateReg)
743 .addReg(PS->PoisonReg);
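          // The cmov computes UpdatedStateReg = Cond ? PoisonReg : CurStateReg,
          // so the predicate state becomes all-ones exactly when the flags
          // indicate this edge was misspeculated.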
744 // If this is the last cmov and the EFLAGS weren't originally
745 // live-in, mark them as killed.
746 if (!LiveEFLAGS && Cond == Conds.back())
747 CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
748
749 ++NumInstsInserted;
750 LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
751 dbgs() << "\n");
752
753 // The first one of the cmovs will be using the top level
754 // `PredStateReg` and need to get rewritten into SSA form.
755 if (CurStateReg == PS->InitialReg)
756 CMovs.push_back(&*CMovI);
757
758 // The next cmov should start from this one's def.
759 CurStateReg = UpdatedStateReg;
760 }
761
762 // And put the last one into the available values for SSA form of our
763 // predicate state.
764 PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
765 };
766
767 std::vector<X86::CondCode> UncondCodeSeq;
768 for (auto *CondBr : CondBrs) {
769 MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
770 int &SuccCount = SuccCounts[&Succ];
771
772 X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
773 X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
774 UncondCodeSeq.push_back(Cond);
775
776 BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
777 {InvCond});
778
779 // Decrement the successor count now that we've split one of the edges.
780 // We need to keep the count of edges to the successor accurate in order
781 // to know above when to *replace* the successor in the CFG vs. just
782 // adding the new successor.
783 --SuccCount;
784 }
785
786 // Since we may have split edges and changed the number of successors,
787 // normalize the probabilities. This avoids doing it each time we split an
788 // edge.
789 MBB.normalizeSuccProbs();
790
791 // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
792 // need to intersect the other condition codes. We can do this by just
793 // doing a cmov for each one.
794 if (!UncondSucc)
795 // If we have no fallthrough to protect (perhaps it is an indirect jump?)
796 // just skip this and continue.
797 continue;
798
799 assert(SuccCounts[UncondSucc] == 1 &&
800 "We should never have more than one edge to the unconditional "
801 "successor at this point because every other edge must have been "
802 "split above!");
803
804 // Sort and unique the codes to minimize them.
805 llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
806 UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
807 UncondCodeSeq.end());
808
809 // Build a checking version of the successor.
810 BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
811 UncondBr, UncondBr, UncondCodeSeq);
812 }
813
814 return CMovs;
815}
816
817/// Returns true if the instruction has no behavior (specified or otherwise)
818/// that is based on the value of any of its register operands.
819///
820/// A classical example of something that is inherently not data invariant is an
821/// indirect jump -- the destination is loaded into icache based on the bits set
822/// in the jump destination register.
823///
824/// FIXME: This should become part of our instruction tables.
825static bool isDataInvariant(MachineInstr &MI) {
826 switch (MI.getOpcode()) {
827 default:
828 // By default, assume that the instruction is not data invariant.
829 return false;
830
831  // Some target-independent operations that trivially lower to data-invariant
832  // instructions.
833  case TargetOpcode::COPY:
834  case TargetOpcode::INSERT_SUBREG:
835 case TargetOpcode::SUBREG_TO_REG:
836 return true;
837
838  // On x86 the integer multiplies are believed to be constant time w.r.t. their
839  // inputs. However, they set flags and are perhaps the most surprisingly
840  // constant-time operations, so we call them out here separately.
841 case X86::IMUL16rr:
842 case X86::IMUL16rri8:
843 case X86::IMUL16rri:
844 case X86::IMUL32rr:
845 case X86::IMUL32rri8:
846 case X86::IMUL32rri:
847 case X86::IMUL64rr:
848 case X86::IMUL64rri32:
849 case X86::IMUL64rri8:
850
851  // Bit scanning and counting instructions scan across bits and do other
852  // fairly complex operations like popcnt, yet are believed to be constant
853  // time on x86, which is somewhat surprising.
854  // However, these set flags.
855 case X86::BSF16rr:
856 case X86::BSF32rr:
857 case X86::BSF64rr:
858 case X86::BSR16rr:
859 case X86::BSR32rr:
860 case X86::BSR64rr:
861 case X86::LZCNT16rr:
862 case X86::LZCNT32rr:
863 case X86::LZCNT64rr:
864 case X86::POPCNT16rr:
865 case X86::POPCNT32rr:
866 case X86::POPCNT64rr:
867 case X86::TZCNT16rr:
868 case X86::TZCNT32rr:
869 case X86::TZCNT64rr:
870
871 // Bit manipulation instructions are effectively combinations of basic
872 // arithmetic ops, and should still execute in constant time. These also
873 // set flags.
874 case X86::BLCFILL32rr:
875 case X86::BLCFILL64rr:
876 case X86::BLCI32rr:
877 case X86::BLCI64rr:
878 case X86::BLCIC32rr:
879 case X86::BLCIC64rr:
880 case X86::BLCMSK32rr:
881 case X86::BLCMSK64rr:
882 case X86::BLCS32rr:
883 case X86::BLCS64rr:
884 case X86::BLSFILL32rr:
885 case X86::BLSFILL64rr:
886 case X86::BLSI32rr:
887 case X86::BLSI64rr:
888 case X86::BLSIC32rr:
889 case X86::BLSIC64rr:
890 case X86::BLSMSK32rr:
891 case X86::BLSMSK64rr:
892 case X86::BLSR32rr:
893 case X86::BLSR64rr:
894 case X86::TZMSK32rr:
895 case X86::TZMSK64rr:
896
897 // Bit extracting and clearing instructions should execute in constant time,
898 // and set flags.
899 case X86::BEXTR32rr:
900 case X86::BEXTR64rr:
901 case X86::BEXTRI32ri:
902 case X86::BEXTRI64ri:
903 case X86::BZHI32rr:
904 case X86::BZHI64rr:
905
906  // Shift and rotate.
907 case X86::ROL8r1: case X86::ROL16r1: case X86::ROL32r1: case X86::ROL64r1:
908 case X86::ROL8rCL: case X86::ROL16rCL: case X86::ROL32rCL: case X86::ROL64rCL:
909 case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
910 case X86::ROR8r1: case X86::ROR16r1: case X86::ROR32r1: case X86::ROR64r1:
911 case X86::ROR8rCL: case X86::ROR16rCL: case X86::ROR32rCL: case X86::ROR64rCL:
912 case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
913 case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1: case X86::SAR64r1:
914 case X86::SAR8rCL: case X86::SAR16rCL: case X86::SAR32rCL: case X86::SAR64rCL:
915 case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
916 case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1: case X86::SHL64r1:
917 case X86::SHL8rCL: case X86::SHL16rCL: case X86::SHL32rCL: case X86::SHL64rCL:
918 case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri:
919 case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1: case X86::SHR64r1:
920 case X86::SHR8rCL: case X86::SHR16rCL: case X86::SHR32rCL: case X86::SHR64rCL:
921 case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
922 case X86::SHLD16rrCL: case X86::SHLD32rrCL: case X86::SHLD64rrCL:
923 case X86::SHLD16rri8: case X86::SHLD32rri8: case X86::SHLD64rri8:
924 case X86::SHRD16rrCL: case X86::SHRD32rrCL: case X86::SHRD64rrCL:
925 case X86::SHRD16rri8: case X86::SHRD32rri8: case X86::SHRD64rri8:
926
927  // Basic arithmetic is constant time on the input but does set flags.
928 case X86::ADC8rr: case X86::ADC8ri:
929 case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8:
930 case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8:
931 case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32:
932 case X86::ADD8rr: case X86::ADD8ri:
933 case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8:
934 case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8:
935 case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32:
936 case X86::AND8rr: case X86::AND8ri:
937 case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8:
938 case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8:
939 case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32:
940 case X86::OR8rr: case X86::OR8ri:
941 case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8:
942 case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8:
943 case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32:
944 case X86::SBB8rr: case X86::SBB8ri:
945 case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8:
946 case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8:
947 case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32:
948 case X86::SUB8rr: case X86::SUB8ri:
949 case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8:
950 case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8:
951 case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32:
952 case X86::XOR8rr: case X86::XOR8ri:
953 case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8:
954 case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8:
955 case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32:
956 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
957 case X86::ADCX32rr: case X86::ADCX64rr:
958 case X86::ADOX32rr: case X86::ADOX64rr:
959 case X86::ANDN32rr: case X86::ANDN64rr:
960  // Unary arithmetic operations.
961  case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
962  case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
963 case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
964    // Check whether the EFLAGS implicit-def is dead. We assume that this will
965 // always find the implicit-def because this code should only be reached
966 // for instructions that do in fact implicitly def this.
967 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
968 // If we would clobber EFLAGS that are used, just bail for now.
969 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
970 MI.dump(); dbgs() << "\n");
971 return false;
972 }
973
974 // Otherwise, fallthrough to handle these the same as instructions that
975 // don't set EFLAGS.
976 LLVM_FALLTHROUGH;
977
978  // Unlike other arithmetic, NOT doesn't set EFLAGS.
979 case X86::NOT8r: case X86::NOT16r: case X86::NOT32r: case X86::NOT64r:
980
981 // Various move instructions used to zero or sign extend things. Note that we
982 // intentionally don't support the _NOREX variants as we can't handle that
983 // register constraint anyways.
984 case X86::MOVSX16rr8:
985 case X86::MOVSX32rr8: case X86::MOVSX32rr16:
986 case X86::MOVSX64rr8: case X86::MOVSX64rr16: case X86::MOVSX64rr32:
987 case X86::MOVZX16rr8:
988 case X86::MOVZX32rr8: case X86::MOVZX32rr16:
989 case X86::MOVZX64rr8: case X86::MOVZX64rr16:
990 case X86::MOV32rr:
991
992 // Arithmetic instructions that are both constant time and don't set flags.
993 case X86::RORX32ri:
994 case X86::RORX64ri:
995 case X86::SARX32rr:
996 case X86::SARX64rr:
997 case X86::SHLX32rr:
998 case X86::SHLX64rr:
999 case X86::SHRX32rr:
1000 case X86::SHRX64rr:
1001
1002 // LEA doesn't actually access memory, and its arithmetic is constant time.
1003 case X86::LEA16r:
1004 case X86::LEA32r:
1005 case X86::LEA64_32r:
1006 case X86::LEA64r:
1007    return true;
1008 }
1009}
1010
1011/// Returns true if the instruction has no behavior (specified or otherwise)
1012/// that is based on the value loaded from memory or the value of any
1013/// non-address register operands.
1014///
1015/// For example, the instruction's latency must not depend on the particular
1016/// bits set in any of the registers *or* on any of the bits loaded from
1017/// memory.
1018///
1019/// A classical example of something that is inherently not data invariant is an
1020/// indirect jump -- the destination is loaded into icache based on the bits set
1021/// in the jump destination register.
1022///
1023/// FIXME: This should become part of our instruction tables.
1024static bool isDataInvariantLoad(MachineInstr &MI) {
1025 switch (MI.getOpcode()) {
1026 default:
1027 // By default, assume that the load will immediately leak.
1028 return false;
1029
1030  // On x86 the integer multiplies are believed to be constant time w.r.t. the
1031  // loaded data. However, they set flags and are perhaps the most surprisingly
1032  // constant-time operations, so we call them out here separately.
1033  case X86::IMUL16rm:
1034 case X86::IMUL16rmi8:
1035 case X86::IMUL16rmi:
1036 case X86::IMUL32rm:
1037 case X86::IMUL32rmi8:
1038 case X86::IMUL32rmi:
1039 case X86::IMUL64rm:
1040 case X86::IMUL64rmi32:
1041 case X86::IMUL64rmi8:
1042
1043  // Bit scanning and counting instructions scan across bits and do other
1044  // fairly complex operations like popcnt, yet are believed to be constant
1045  // time on x86, which is somewhat surprising.
1046  // However, these set flags.
1047 case X86::BSF16rm:
1048 case X86::BSF32rm:
1049 case X86::BSF64rm:
1050 case X86::BSR16rm:
1051 case X86::BSR32rm:
1052 case X86::BSR64rm:
1053 case X86::LZCNT16rm:
1054 case X86::LZCNT32rm:
1055 case X86::LZCNT64rm:
1056 case X86::POPCNT16rm:
1057 case X86::POPCNT32rm:
1058 case X86::POPCNT64rm:
1059 case X86::TZCNT16rm:
1060 case X86::TZCNT32rm:
1061 case X86::TZCNT64rm:
1062
1063 // Bit manipulation instructions are effectively combinations of basic
1064 // arithmetic ops, and should still execute in constant time. These also
1065 // set flags.
1066  case X86::BLCFILL32rm:
1067 case X86::BLCFILL64rm:
1068 case X86::BLCI32rm:
1069 case X86::BLCI64rm:
1070 case X86::BLCIC32rm:
1071 case X86::BLCIC64rm:
1072 case X86::BLCMSK32rm:
1073 case X86::BLCMSK64rm:
1074 case X86::BLCS32rm:
1075 case X86::BLCS64rm:
1076 case X86::BLSFILL32rm:
1077 case X86::BLSFILL64rm:
1078 case X86::BLSI32rm:
1079 case X86::BLSI64rm:
1080 case X86::BLSIC32rm:
1081 case X86::BLSIC64rm:
1082 case X86::BLSMSK32rm:
1083 case X86::BLSMSK64rm:
1084 case X86::BLSR32rm:
1085 case X86::BLSR64rm:
1086  case X86::TZMSK32rm:
1087 case X86::TZMSK64rm:
1088
1089  // Bit extracting and clearing instructions should execute in constant time,
1090 // and set flags.
1091 case X86::BEXTR32rm:
1092 case X86::BEXTR64rm:
1093 case X86::BEXTRI32mi:
1094 case X86::BEXTRI64mi:
1095 case X86::BZHI32rm:
1096 case X86::BZHI64rm:
1097
1098 // Basic arithmetic is constant time on the input but does set flags.
1099  case X86::ADC8rm:
1100 case X86::ADC16rm:
1101 case X86::ADC32rm:
1102 case X86::ADC64rm:
1103 case X86::ADCX32rm:
1104 case X86::ADCX64rm:
1105 case X86::ADD8rm:
1106 case X86::ADD16rm:
1107 case X86::ADD32rm:
1108 case X86::ADD64rm:
1109 case X86::ADOX32rm:
1110 case X86::ADOX64rm:
1111 case X86::AND8rm:
1112 case X86::AND16rm:
1113 case X86::AND32rm:
1114 case X86::AND64rm:
1115 case X86::ANDN32rm:
1116 case X86::ANDN64rm:
1117  case X86::OR8rm:
1118 case X86::OR16rm:
1119 case X86::OR32rm:
1120 case X86::OR64rm:
1121 case X86::SBB8rm:
1122 case X86::SBB16rm:
1123 case X86::SBB32rm:
1124 case X86::SBB64rm:
1125 case X86::SUB8rm:
1126 case X86::SUB16rm:
1127 case X86::SUB32rm:
1128 case X86::SUB64rm:
1129 case X86::XOR8rm:
1130 case X86::XOR16rm:
1131 case X86::XOR32rm:
1132 case X86::XOR64rm:
1133    // Check whether the EFLAGS implicit-def is dead. We assume that this will
1134 // always find the implicit-def because this code should only be reached
1135 // for instructions that do in fact implicitly def this.
1136 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
1137 // If we would clobber EFLAGS that are used, just bail for now.
1138 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
1139 MI.dump(); dbgs() << "\n");
1140 return false;
1141 }
1142
1143 // Otherwise, fallthrough to handle these the same as instructions that
1144 // don't set EFLAGS.
1145 LLVM_FALLTHROUGH;
1146
1147  // Integer multiply w/o affecting flags is still believed to be constant
1148 // time on x86. Called out separately as this is among the most surprising
1149 // instructions to exhibit that behavior.
1150  case X86::MULX32rm:
1151 case X86::MULX64rm:
1152
1153  // Arithmetic instructions that are both constant time and don't set flags.
1154  case X86::RORX32mi:
1155 case X86::RORX64mi:
1156 case X86::SARX32rm:
1157 case X86::SARX64rm:
1158 case X86::SHLX32rm:
1159 case X86::SHLX64rm:
1160 case X86::SHRX32rm:
1161 case X86::SHRX64rm:
1162
1163  // Conversions are believed to be constant time and don't set flags.
1164  case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm:
1165 case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm:
1166 case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm:
1167 case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm:
1168 case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm:
1169 case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm:
1170 case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm:
1171 case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm:
1172 case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm:
1173 case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm:
1174 // AVX512 added unsigned integer conversions.
1175 case X86::VCVTTSD2USI64Zrm:
1176 case X86::VCVTTSD2USIZrm:
1177 case X86::VCVTTSS2USI64Zrm:
1178 case X86::VCVTTSS2USIZrm:
1179 case X86::VCVTUSI2SDZrm:
1180 case X86::VCVTUSI642SDZrm:
1181 case X86::VCVTUSI2SSZrm:
1182 case X86::VCVTUSI642SSZrm:
1183
1184  // Loads to register don't set flags.
1185  case X86::MOV8rm:
1186 case X86::MOV8rm_NOREX:
1187 case X86::MOV16rm:
1188 case X86::MOV32rm:
1189 case X86::MOV64rm:
1190 case X86::MOVSX16rm8:
1191 case X86::MOVSX32rm16:
1192 case X86::MOVSX32rm8:
1193 case X86::MOVSX32rm8_NOREX:
1194 case X86::MOVSX64rm16:
1195 case X86::MOVSX64rm32:
1196 case X86::MOVSX64rm8:
1197 case X86::MOVZX16rm8:
1198 case X86::MOVZX32rm16:
1199 case X86::MOVZX32rm8:
1200 case X86::MOVZX32rm8_NOREX:
1201 case X86::MOVZX64rm16:
1202 case X86::MOVZX64rm8:
1203 return true;
1204 }
1205}
1206
1207static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
1208 const TargetRegisterInfo &TRI) {
1209 // Check if EFLAGS are alive by seeing if there is a def of them or they
1210 // live-in, and then seeing if that def is in turn used.
1211 for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
1212 if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
1213 // If the def is dead, then EFLAGS is not live.
1214 if (DefOp->isDead())
1215 return false;
1216
1217 // Otherwise we've def'ed it, and it is live.
1218 return true;
1219 }
1220 // While at this instruction, also check if we use and kill EFLAGS
1221 // which means it isn't live.
1222 if (MI.killsRegister(X86::EFLAGS, &TRI))
1223 return false;
1224 }
1225
1226 // If we didn't find anything conclusive (neither definitely alive or
1227 // definitely dead) return whether it lives into the block.
1228 return MBB.isLiveIn(X86::EFLAGS);
1229}
1230
1231/// Harden all of the loads (including implicit loads) in the function.
1232///
1233/// This operates in two passes. First, we analyze the loads to determine which
1234/// strategy will be used to harden them: hardening the address or hardening the
1235/// loaded value when loaded into a register amenable to hardening. We have to
1236/// process these first because the two strategies may interact -- later
1237/// hardening may change what strategy we wish to use. We also will analyze
1238/// data dependencies between loads and avoid hardening those loads that are
1239/// data dependent on a load with a hardened address. We also skip hardening
1240/// loads already behind an LFENCE as that is sufficient to harden them against
1241/// misspeculation.
1242///
1243/// Second, we apply the hardening steps we determined necessary in the first
1244/// pass.
1245///
1246/// These two passes are applied to each basic block. We operate one block at
1247/// a time to simplify reasoning about reachability and sequencing.
1248void X86SpeculativeLoadHardeningPass::hardenAllLoads(MachineFunction &MF) {
Chandler Carruth90358e12018-07-13 11:13:58 +00001249 // If the actual checking of loads is disabled, skip doing anything here.
1250 if (!HardenLoads)
1251 return;
1252
1253 SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
1254 SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
1255
1256 SmallSet<unsigned, 16> HardenedAddrRegs;
1257
1258 SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
1259
1260 // Track the set of load-dependent registers through the basic block. Because
1261 // the values of these registers have an existing data dependency on a loaded
1262 // value which we would have checked, we can omit any checks on them.
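  // For example, if %val is produced by a load whose address we harden, and
  // %ptr is computed from %val and then used as the base of a second load,
  // that second load is already data-dependent on a checked load and needs no
  // additional hardening.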
1263 SparseBitVector<> LoadDepRegs;
1264
1265 for (MachineBasicBlock &MBB : MF) {
1266 // We harden the loads of a basic block in several passes:
1267 //
1268 // 1) Collect all the loads which can have their loaded value hardened
1269 // and all the loads that instead need their address hardened. During
1270 // this walk we propagate load dependence for address hardened loads and
1271 // also look for LFENCE to stop hardening wherever possible. When
1272 // deciding whether or not to harden the loaded value or not, we check
1273 // to see if any registers used in the address will have been hardened
1274 // at this point and if so, harden any remaining address registers as
1275 // that often successfully re-uses hardened addresses and minimizes
1276 // instructions. FIXME: We should consider an aggressive mode where we
1277 // continue to keep as many loads value hardened even when some address
1278 // register hardening would be free (due to reuse).
1279 for (MachineInstr &MI : MBB) {
1280 // We naively assume that all def'ed registers of an instruction have
1281 // a data dependency on all of their operands.
1282 // FIXME: Do a more careful analysis of x86 to build a conservative model
1283 // here.
1284 if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
1285 return Op.isReg() && LoadDepRegs.test(Op.getReg());
1286 }))
1287 for (MachineOperand &Def : MI.defs())
1288 if (Def.isReg())
1289 LoadDepRegs.set(Def.getReg());
1290
1291 // Both Intel and AMD are guiding that they will change the semantics of
1292 // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1293 // no more need to guard things in this block.
1294 if (MI.getOpcode() == X86::LFENCE)
1295 break;
1296
1297 // If this instruction cannot load, nothing to do.
1298 if (!MI.mayLoad())
1299 continue;
1300
1301 // Some instructions which "load" are trivially safe or unimportant.
1302 if (MI.getOpcode() == X86::MFENCE)
1303 continue;
1304
1305 // Extract the memory operand information about this instruction.
1306 // FIXME: This doesn't handle loading pseudo instructions which we often
1307 // could handle with similarly generic logic. We probably need to add an
1308 // MI-layer routine similar to the MC-layer one we use here which maps
1309 // pseudos much like this maps real instructions.
1310 const MCInstrDesc &Desc = MI.getDesc();
1311 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1312 if (MemRefBeginIdx < 0) {
1313 LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: ";
1314 MI.dump());
1315 continue;
1316 }
1317
1318 MemRefBeginIdx += X86II::getOperandBias(Desc);
1319
1320 MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1321 MachineOperand &IndexMO =
1322 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1323
1324 // If we have at least one (non-frame-index, non-RIP) register operand,
1325 // and neither operand is load-dependent, we need to check the load.
1326 unsigned BaseReg = 0, IndexReg = 0;
1327 if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1328 BaseMO.getReg() != X86::NoRegister)
1329 BaseReg = BaseMO.getReg();
1330 if (IndexMO.getReg() != X86::NoRegister)
1331 IndexReg = IndexMO.getReg();
1332
1333 if (!BaseReg && !IndexReg)
1334 // No register operands!
1335 continue;
1336
1337 // If any register operand is dependent, this load is dependent and we
1338 // needn't check it.
1339 // FIXME: Is this true in the case where we are hardening loads after
1340 // they complete? Unclear, need to investigate.
1341 if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
1342 (IndexReg && LoadDepRegs.test(IndexReg)))
1343 continue;
1344
1345      // If post-load hardening is enabled, this load is compatible with
1346      // post-load hardening, and we aren't already going to harden one of the
1347      // address registers, queue it up to be hardened post-load. Notably, even
1348 // once hardened this won't introduce a useful dependency that could prune
1349 // out subsequent loads.
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001350 if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
1351 MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
1352 canHardenRegister(MI.getOperand(0).getReg()) &&
Chandler Carruth90358e12018-07-13 11:13:58 +00001353 !HardenedAddrRegs.count(BaseReg) &&
1354 !HardenedAddrRegs.count(IndexReg)) {
1355 HardenPostLoad.insert(&MI);
1356 HardenedAddrRegs.insert(MI.getOperand(0).getReg());
1357 continue;
1358 }
1359
1360 // Record this instruction for address hardening and record its register
1361 // operands as being address-hardened.
1362 HardenLoadAddr.insert(&MI);
1363 if (BaseReg)
1364 HardenedAddrRegs.insert(BaseReg);
1365 if (IndexReg)
1366 HardenedAddrRegs.insert(IndexReg);
1367
1368 for (MachineOperand &Def : MI.defs())
1369 if (Def.isReg())
1370 LoadDepRegs.set(Def.getReg());
1371 }
1372
    // Now re-walk the instructions in the basic block, and apply whichever
    // hardening strategy we have elected. Note that we do this in a second
    // pass specifically so that we have the complete set of instructions for
    // which we will do post-load hardening and can defer it in certain
    // circumstances.
    //
    // FIXME: This could probably be made even more effective by doing it
    // across the entire function. Rather than just walking the flat list
    // backwards here, we could walk the function in PO and each block bottom
    // up, allowing us to in some cases sink hardening across basic blocks. As
    // long as the in-block predicate state is used at the eventual hardening
    // site, this remains safe.
    for (MachineInstr &MI : MBB) {
      // We cannot both require hardening the def of a load and its address.
      assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
             "Requested to harden both the address and def of a load!");

      // Check if this is a load whose address needs to be hardened.
      if (HardenLoadAddr.erase(&MI)) {
        const MCInstrDesc &Desc = MI.getDesc();
        int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
        assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");

        MemRefBeginIdx += X86II::getOperandBias(Desc);

        MachineOperand &BaseMO =
            MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
        MachineOperand &IndexMO =
            MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
        hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
        continue;
      }

      // Test if this instruction is one of our post-load instructions (and
      // remove it from the set if so).
      if (HardenPostLoad.erase(&MI)) {
        assert(!MI.isCall() && "Must not try to post-load harden a call!");

        // If this is a data-invariant load, we want to try to sink any
        // hardening as far as possible.
        if (isDataInvariantLoad(MI)) {
          // Sink the instruction we'll need to harden as far as we can down
          // the graph.
          MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);

          // If we managed to sink this instruction, update everything so we
          // harden that instruction when we reach it in the instruction
          // sequence.
          if (SunkMI != &MI) {
            // If in sinking there was no instruction needing to be hardened,
            // we're done.
            if (!SunkMI)
              continue;

            // Otherwise, add this to the set of defs we harden.
            HardenPostLoad.insert(SunkMI);
            continue;
          }
        }

        // The register def'ed by this instruction is trivially hardened so
        // map it to itself.
        AddrRegToHardenedReg[MI.getOperand(0).getReg()] =
            MI.getOperand(0).getReg();

        hardenPostLoad(MI);
        continue;
      }

      // After we finish processing the instruction and doing any hardening
      // necessary for it, we need to handle transferring the predicate state
      // into a call and recovering it after the call returns (if it returns).
      if (!MI.isCall())
        continue;

      // If we're not hardening interprocedurally, we can just skip calls.
      if (!HardenInterprocedurally)
        continue;

      auto InsertPt = MI.getIterator();
      DebugLoc Loc = MI.getDebugLoc();

      // First, we transfer the predicate state into the called function by
      // merging it into the stack pointer. This will kill the current def of
      // the state.
      unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
      mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);

      // If this call is also a return (because it is a tail call) we're done.
      if (MI.isReturn())
        continue;

      // Otherwise we need to step past the call and recover the predicate
      // state from SP after the return, and make this new state available.
      ++InsertPt;
      unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
      PS->SSA.AddAvailableValue(&MBB, NewStateReg);
    }

    HardenPostLoad.clear();
    HardenLoadAddr.clear();
    HardenedAddrRegs.clear();
    AddrRegToHardenedReg.clear();

    // Currently, we only track data-dependent loads within a basic block.
    // FIXME: We should see if this is necessary or if we could be more
    // aggressive here without opening up attack avenues.
    LoadDepRegs.clear();
  }
}

1484/// Save EFLAGS into the returned GPR. This can in turn be restored with
1485/// `restoreEFLAGS`.
1486///
1487/// Note that LLVM can only lower very simple patterns of saved and restored
1488/// EFLAGS registers. The restore should always be within the same basic block
1489/// as the save so that no PHI nodes are inserted.
1490unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1491 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1492 DebugLoc Loc) {
1493 // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1494 // what instruction selection does.
1495 unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
1496 // We directly copy the FLAGS register and rely on later lowering to clean
1497 // this up into the appropriate setCC instructions.
1498 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
1499 ++NumInstsInserted;
1500 return Reg;
1501}
1502
1503/// Restore EFLAGS from the provided GPR. This should be produced by
1504/// `saveEFLAGS`.
1505///
1506/// This must be done within the same basic block as the save in order to
1507/// reliably lower.
1508void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1509 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1510 unsigned Reg) {
1511 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
1512 ++NumInstsInserted;
1513}
1514
1515/// Takes the current predicate state (in a register) and merges it into the
1516/// stack pointer. The state is essentially a single bit, but we merge this in
1517/// a way that won't form non-canonical pointers and also will be preserved
1518/// across normal stack adjustments.
void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
    unsigned PredStateReg) {
  unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
  // FIXME: This hard codes a shift distance based on the number of bits
  // needed to stay canonical on 64-bit. We should compute this somehow and
  // support 32-bit as part of that.
  auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
                    .addReg(PredStateReg, RegState::Kill)
                    .addImm(47);
  ShiftI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;
  auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
                 .addReg(X86::RSP)
                 .addReg(TmpReg, RegState::Kill);
  OrI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;
}

1538/// Extracts the predicate state stored in the high bits of the stack pointer.
unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
    DebugLoc Loc) {
  unsigned PredStateReg = MRI->createVirtualRegister(PS->RC);
  unsigned TmpReg = MRI->createVirtualRegister(PS->RC);

  // We know that the stack pointer will have any preserved predicate state in
  // its high bit. We just want to smear this across the other bits. Turns out,
  // this is exactly what an arithmetic right shift does.
  BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
      .addReg(X86::RSP);
  auto ShiftI =
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
          .addReg(TmpReg, RegState::Kill)
          .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
  ShiftI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;

  return PredStateReg;
}

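/// Harden a load by poisoning the registers used to form its address.
///
/// The predicate state is all ones along misspeculated paths and all zeros
/// otherwise, so merging it into the base and index registers (with `or`, or
/// with `shrx` when EFLAGS must be preserved) destroys the attacker-controlled
/// address exactly when the load is reached via misspeculation. Registers that
/// have already been hardened are reused via \p AddrRegToHardenedReg.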
void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
    MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
    SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();

  // Check if EFLAGS are alive by seeing if there is a def of them or they are
  // live-in, and then seeing if that def is in turn used.
  bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);

  SmallVector<MachineOperand *, 2> HardenOpRegs;

  if (BaseMO.isFI()) {
    // A frame index is never a dynamically controllable load, so only
    // harden it if we're covering fixed address loads as well.
    LLVM_DEBUG(
        dbgs() << " Skipping hardening base of explicit stack frame load: ";
        MI.dump(); dbgs() << "\n");
  } else if (BaseMO.getReg() == X86::RIP ||
             BaseMO.getReg() == X86::NoRegister) {
    // For both RIP-relative addressed loads or absolute loads, we cannot
    // meaningfully harden them because the address being loaded has no
    // dynamic component.
    //
    // FIXME: When using a segment base (like TLS does) we end up with the
    // dynamic address being the base plus -1 because we can't mutate the
    // segment register here. This allows the signed 32-bit offset to point at
    // valid segment-relative addresses and load them successfully.
    LLVM_DEBUG(
        dbgs() << " Cannot harden base of "
               << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
               << " address in a load!");
  } else {
    assert(BaseMO.isReg() &&
           "Only allowed to have a frame index or register base.");
    HardenOpRegs.push_back(&BaseMO);
  }

  if (IndexMO.getReg() != X86::NoRegister &&
      (HardenOpRegs.empty() ||
       HardenOpRegs.front()->getReg() != IndexMO.getReg()))
    HardenOpRegs.push_back(&IndexMO);

  assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
         "Should have exactly one or two registers to harden!");
  assert((HardenOpRegs.size() == 1 ||
          HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
         "Should not have two of the same registers!");

  // Remove any registers that have already been checked.
  llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
    // See if this operand's register has already been checked.
    auto It = AddrRegToHardenedReg.find(Op->getReg());
    if (It == AddrRegToHardenedReg.end())
      // Not checked, so retain this one.
      return false;

    // Otherwise, we can directly update this operand and remove it.
    Op->setReg(It->second);
    return true;
  });
  // If there are none left, we're done.
  if (HardenOpRegs.empty())
    return;

  // Compute the current predicate state.
  unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);

  auto InsertPt = MI.getIterator();

  // If EFLAGS are live and we don't have access to instructions that avoid
  // clobbering EFLAGS we need to save and restore them. This in turn makes
  // the EFLAGS no longer live.
  unsigned FlagsReg = 0;
  if (EFLAGSLive && !Subtarget->hasBMI2()) {
    EFLAGSLive = false;
    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
  }

  for (MachineOperand *Op : HardenOpRegs) {
    unsigned OpReg = Op->getReg();
    auto *OpRC = MRI->getRegClass(OpReg);
    unsigned TmpReg = MRI->createVirtualRegister(OpRC);

    // If this is a vector register, we'll need somewhat custom logic to handle
    // hardening it.
    if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
                                 OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
      assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
      bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);

      // Move our state into a vector register.
      // FIXME: We could skip this at the cost of longer encodings with AVX-512
      // but that doesn't seem likely worth it.
      unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
      auto MovI =
          BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
              .addReg(StateReg);
      (void)MovI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");

      // Broadcast it across the vector register.
      unsigned VBStateReg = MRI->createVirtualRegister(OpRC);
      auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
                                TII->get(Is128Bit ? X86::VPBROADCASTQrr
                                                  : X86::VPBROADCASTQYrr),
                                VBStateReg)
                            .addReg(VStateReg);
      (void)BroadcastI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
                 dbgs() << "\n");

      // Merge our potential poison state into the value with a vector or.
      auto OrI =
          BuildMI(MBB, InsertPt, Loc,
                  TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
              .addReg(VBStateReg)
              .addReg(OpReg);
      (void)OrI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
    } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
               OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
               OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
      assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
      bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
      bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
      if (Is128Bit || Is256Bit)
        assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");

      // Broadcast our state into a vector register.
      unsigned VStateReg = MRI->createVirtualRegister(OpRC);
      unsigned BroadcastOp =
          Is128Bit ? X86::VPBROADCASTQrZ128r
                   : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
      auto BroadcastI =
          BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
              .addReg(StateReg);
      (void)BroadcastI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
                 dbgs() << "\n");

      // Merge our potential poison state into the value with a vector or.
      unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
                               : Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
      auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
                     .addReg(VStateReg)
                     .addReg(OpReg);
      (void)OrI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
    } else {
      // FIXME: Need to support GR32 here for 32-bit code.
      assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
             "Not a supported register class for address hardening!");

      if (!EFLAGSLive) {
        // Merge our potential poison state into the value with an or.
        auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
                       .addReg(StateReg)
                       .addReg(OpReg);
        OrI->addRegisterDead(X86::EFLAGS, TRI);
        ++NumInstsInserted;
        LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
      } else {
        // We need to avoid touching EFLAGS so shift out all but the least
        // significant bit using the instruction that doesn't update flags.
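        // (When the predicate state is all ones, the shift amount is masked
        // to 63, collapsing the address to 0 or 1 rather than an
        // attacker-controlled value.)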
        auto ShiftI =
            BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
                .addReg(OpReg)
                .addReg(StateReg);
        (void)ShiftI;
        ++NumInstsInserted;
        LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
                   dbgs() << "\n");
      }
    }

    // Record this register as checked and update the operand.
    assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
           "Should not have checked this register yet!");
    AddrRegToHardenedReg[Op->getReg()] = TmpReg;
    Op->setReg(TmpReg);
    ++NumAddrRegsHardened;
  }

  // And restore the flags if needed.
  if (FlagsReg)
    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
}

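/// Sink the hardening of a post-load hardened, data-invariant load down to
/// its single use when that use is itself data invariant and lives in the
/// same basic block, so the poisoning `or` is emitted as late as possible.
///
/// Returns the instruction whose def should ultimately be hardened, or null
/// if every use turned out to already be checked and no extra hardening is
/// needed.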
MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
    MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
  assert(isDataInvariantLoad(InitialMI) &&
         "Cannot get here with a non-invariant load!");

  // See if we can sink hardening the loaded value.
  auto SinkCheckToSingleUse =
      [&](MachineInstr &MI) -> Optional<MachineInstr *> {
    unsigned DefReg = MI.getOperand(0).getReg();

    // We need to find a single use to which we can sink the check. We can
    // primarily do this because many uses may already end up checked on their
    // own.
    MachineInstr *SingleUseMI = nullptr;
    for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
      // If we're already going to harden this use, it is data invariant and
      // within our block.
      if (HardenedInstrs.count(&UseMI)) {
        if (!isDataInvariantLoad(UseMI)) {
          // If we've already decided to harden a non-load, we must have sunk
          // some other post-load hardened instruction to it and it must itself
          // be data-invariant.
          assert(isDataInvariant(UseMI) &&
                 "Data variant instruction being hardened!");
          continue;
        }

        // Otherwise, this is a load and the load component can't be data
        // invariant so check how this register is being used.
        const MCInstrDesc &Desc = UseMI.getDesc();
        int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
        assert(MemRefBeginIdx >= 0 &&
               "Should always have mem references here!");
        MemRefBeginIdx += X86II::getOperandBias(Desc);

        MachineOperand &BaseMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
        MachineOperand &IndexMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
        if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
            (IndexMO.isReg() && IndexMO.getReg() == DefReg))
          // The load uses the register as part of its address making it not
          // invariant.
          return {};

        continue;
      }

      if (SingleUseMI)
        // We already have a single use, this would make two. Bail.
        return {};

      // If this single use isn't data invariant or isn't in this block, we
      // can't sink the hardening to it.
      if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
        return {};

      // If this instruction defines multiple registers bail as we won't harden
      // all of them.
      if (UseMI.getDesc().getNumDefs() > 1)
        return {};

      // If this register isn't a virtual register we can't walk uses of
      // sanely, just bail. Also check that its register class is one of the
      // ones we can harden.
      unsigned UseDefReg = UseMI.getOperand(0).getReg();
      if (!TRI->isVirtualRegister(UseDefReg) ||
          !canHardenRegister(UseDefReg))
        return {};

      SingleUseMI = &UseMI;
    }

    // If SingleUseMI is still null, there is no use that needs its own
    // checking. Otherwise, it is the single use that needs checking.
    return {SingleUseMI};
  };

  MachineInstr *MI = &InitialMI;
  while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
    // Update which MI we're checking now.
    MI = *SingleUse;
    if (!MI)
      break;
  }

  return MI;
}

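/// Returns true if the def of \p Reg can be hardened directly with a GPR
/// `or`: the register must be at most 64 bits wide and must live in a plain
/// GPR register class (not a vector class and not a NOREX-constrained class).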
bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
  auto *RC = MRI->getRegClass(Reg);
  int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
  if (RegBytes > 8)
    // We don't support post-load hardening of vectors.
    return false;

  // If this register class is explicitly constrained to a class that doesn't
  // require REX prefix, we may not be able to satisfy that constraint when
  // emitting the hardening instructions, so bail out here.
  // FIXME: This seems like a pretty lame hack. The way this comes up is when
  // we end up both with a NOREX and REX-only register as operands to the
  // hardening instructions. It would be better to fix that code to handle this
  // situation rather than hack around it in this way.
  const TargetRegisterClass *NOREXRegClasses[] = {
      &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
      &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
  if (RC == NOREXRegClasses[Log2_32(RegBytes)])
    return false;

  const TargetRegisterClass *GPRRegClasses[] = {
      &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
      &X86::GR64RegClass};
  return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
}

// We can harden non-leaking loads into registers without touching the address
// by just hiding all of the loaded bits. We use an `or` instruction to do
// this because having the poison value be all ones allows us to use the same
// value below. And the goal is just for the loaded bits to not be exposed to
// execution; coercing them all to one is sufficient.
void X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();

  // For all of these, the def'ed register operand is operand zero.
  auto &DefOp = MI.getOperand(0);
  unsigned OldDefReg = DefOp.getReg();
  assert(canHardenRegister(OldDefReg) &&
         "Cannot harden this instruction's defined register!");

  auto *DefRC = MRI->getRegClass(OldDefReg);
  int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;

  unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
  unsigned OrOpCode = OrOpCodes[Log2_32(DefRegBytes)];

  unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};

  auto GetStateRegInRC = [&](const TargetRegisterClass &RC) {
    unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);

    int Bytes = TRI->getRegSizeInBits(RC) / 8;
    // FIXME: Need to teach this about 32-bit mode.
    if (Bytes != 8) {
      unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
      unsigned NarrowStateReg = MRI->createVirtualRegister(&RC);
      BuildMI(MBB, MI.getIterator(), Loc, TII->get(TargetOpcode::COPY),
              NarrowStateReg)
          .addReg(StateReg, 0, SubRegImm);
      StateReg = NarrowStateReg;
    }
    return StateReg;
  };

  auto InsertPt = std::next(MI.getIterator());
  unsigned FlagsReg = 0;
  if (isEFLAGSLive(MBB, InsertPt, *TRI))
    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);

  unsigned StateReg = GetStateRegInRC(*DefRC);
  unsigned NewDefReg = MRI->createVirtualRegister(DefRC);
  DefOp.setReg(NewDefReg);
  auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg)
                 .addReg(StateReg)
                 .addReg(NewDefReg);
  OrI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;
  LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");

  if (FlagsReg)
    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);

  ++NumPostLoadRegsHardened;
}

/// Harden a return instruction.
///
/// Returns implicitly perform a load which we need to harden. Without
/// hardening this load, an attacker may speculatively write over the return
/// address to steer speculation of the return to an attacker controlled
/// address. This is called Spectre v1.1 or Bounds Check Bypass Store (BCBS)
/// and is described in this paper:
/// https://people.csail.mit.edu/vlk/spectre11.pdf
///
/// We can harden this by introducing an LFENCE that will delay any load of the
/// return address until prior instructions have retired (and thus are not
/// being speculated), or we can harden the address used by the implicit load:
/// the stack pointer.
///
/// If we are not using an LFENCE, hardening the stack pointer has an
/// additional benefit: it allows us to pass the predicate state accumulated in
/// this function back to the caller. In the absence of a BCBS attack on the
/// return, the caller will typically be resumed and speculatively executed due
/// to the Return Stack Buffer (RSB) prediction, which is very accurate and has
/// a high priority. It is possible that some code from the caller will be
/// executed speculatively even during a BCBS-attacked return until the
/// steering takes effect. Whenever this happens, the caller can recover the
/// (poisoned) predicate state from the stack pointer and continue to harden
/// loads.
void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();
  auto InsertPt = MI.getIterator();

  if (FenceCallAndRet) {
    // Simply forcibly block speculation of loads out of the function by using
    // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
    // should be secure, is simple from an ABI perspective, and the cost can be
    // minimized through inlining.
    //
    // FIXME: We should investigate ways to establish a strong data-dependency
    // on the return. However, poisoning the stack pointer is unlikely to work
    // because the return is *predicted* rather than relying on the load of the
    // return address to actually resolve.
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
    ++NumInstsInserted;
    ++NumLFENCEsInserted;
    return;
  }

  // Take our predicate state, shift it to the high 17 bits (so that we keep
  // pointers canonical) and merge it into RSP. This will allow the caller to
  // extract it when we return (speculatively).
  mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
}

INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
                      "X86 speculative load hardener", false, false)
INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
                    "X86 speculative load hardener", false, false)

FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
  return new X86SpeculativeLoadHardeningPass();
}