//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// Provide a pass which mitigates speculative execution attacks which operate
/// by speculating incorrectly past some predicate (a type check, bounds check,
/// or other condition) to reach a load with invalid inputs and leak the data
/// accessed by that load using a side channel out of the speculative domain.
///
/// For details on the attacks, see the first variant in both the Project Zero
/// writeup and the Spectre paper:
/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
/// https://spectreattack.com/spectre.pdf
///
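/// As an illustrative sketch only (names are invented, not taken from the
/// papers), the classic vulnerable pattern looks like:
/// ```
///   if (idx < array_len)                // predicted taken even when false
///     value = array[idx];               // speculative out-of-bounds load
///   sink = probe_table[value * 4096];   // leaks `value` through the cache
/// ```
/// This pass tracks a predicate state that becomes all-ones whenever a block
/// is reached through misspeculation, and uses that state to force such
/// speculatively loaded values (or the addresses they are loaded from) to a
/// harmless all-ones value.
///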
//===----------------------------------------------------------------------===//
22
23#include "X86.h"
24#include "X86InstrBuilder.h"
25#include "X86InstrInfo.h"
26#include "X86Subtarget.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
Chandler Carruth4b0028a2018-07-19 11:13:58 +000029#include "llvm/ADT/Optional.h"
Chandler Carruth90358e12018-07-13 11:13:58 +000030#include "llvm/ADT/STLExtras.h"
31#include "llvm/ADT/ScopeExit.h"
32#include "llvm/ADT/SmallPtrSet.h"
33#include "llvm/ADT/SmallSet.h"
34#include "llvm/ADT/SmallVector.h"
35#include "llvm/ADT/SparseBitVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/CodeGen/MachineBasicBlock.h"
38#include "llvm/CodeGen/MachineConstantPool.h"
39#include "llvm/CodeGen/MachineFunction.h"
40#include "llvm/CodeGen/MachineFunctionPass.h"
41#include "llvm/CodeGen/MachineInstr.h"
42#include "llvm/CodeGen/MachineInstrBuilder.h"
43#include "llvm/CodeGen/MachineModuleInfo.h"
44#include "llvm/CodeGen/MachineOperand.h"
45#include "llvm/CodeGen/MachineRegisterInfo.h"
46#include "llvm/CodeGen/MachineSSAUpdater.h"
47#include "llvm/CodeGen/TargetInstrInfo.h"
48#include "llvm/CodeGen/TargetRegisterInfo.h"
49#include "llvm/CodeGen/TargetSchedule.h"
50#include "llvm/CodeGen/TargetSubtargetInfo.h"
51#include "llvm/IR/DebugLoc.h"
52#include "llvm/MC/MCSchedule.h"
53#include "llvm/Pass.h"
54#include "llvm/Support/CommandLine.h"
55#include "llvm/Support/Debug.h"
56#include "llvm/Support/raw_ostream.h"
57#include <algorithm>
58#include <cassert>
59#include <iterator>
60#include <utility>
61
62using namespace llvm;
63
64#define PASS_KEY "x86-speculative-load-hardening"
65#define DEBUG_TYPE PASS_KEY
66
67STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
68STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
STATISTIC(NumAddrRegsHardened,
          "Number of address mode used registers hardened");
71STATISTIC(NumPostLoadRegsHardened,
72 "Number of post-load register values hardened");
73STATISTIC(NumInstsInserted, "Number of instructions inserted");
74STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
75
76static cl::opt<bool> HardenEdgesWithLFENCE(
77 PASS_KEY "-lfence",
78 cl::desc(
79 "Use LFENCE along each conditional edge to harden against speculative "
80 "loads rather than conditional movs and poisoned pointers."),
81 cl::init(false), cl::Hidden);
82
83static cl::opt<bool> EnablePostLoadHardening(
84 PASS_KEY "-post-load",
85 cl::desc("Harden the value loaded *after* it is loaded by "
86 "flushing the loaded bits to 1. This is hard to do "
87 "in general but can be done easily for GPRs."),
88 cl::init(true), cl::Hidden);
89
90static cl::opt<bool> FenceCallAndRet(
91 PASS_KEY "-fence-call-and-ret",
92 cl::desc("Use a full speculation fence to harden both call and ret edges "
93 "rather than a lighter weight mitigation."),
94 cl::init(false), cl::Hidden);
95
96static cl::opt<bool> HardenInterprocedurally(
97 PASS_KEY "-ip",
98 cl::desc("Harden interprocedurally by passing our state in and out of "
99 "functions in the high bits of the stack pointer."),
100 cl::init(true), cl::Hidden);
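
// For illustration only -- a sketch of the interprocedural scheme, not the
// exact emitted sequence (see mergePredStateIntoSP and extractPredStateFromSP
// below): on call and return edges the zero/all-ones predicate state is
// shifted up into the high bits of %rsp, which are unused for canonical
// addresses, roughly:
//
//   shlq $47, %rax        # %rax holds the predicate state (0 or -1)
//   orq  %rax, %rsp       # merge it into the stack pointer
//   callq callee
//
// and recovered on the other side by arithmetically right-shifting a copy of
// %rsp so the high bit smears back out to 0 or -1.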
101
102static cl::opt<bool>
103 HardenLoads(PASS_KEY "-loads",
                cl::desc("Sanitize loads from memory. When disabled, no "
                         "significant security is provided."),
106 cl::init(true), cl::Hidden);
107
108namespace llvm {
109
110void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
111
112} // end namespace llvm
113
114namespace {
115
116class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
117public:
118 X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
119 initializeX86SpeculativeLoadHardeningPassPass(
120 *PassRegistry::getPassRegistry());
121 }
122
123 StringRef getPassName() const override {
124 return "X86 speculative load hardening";
125 }
126 bool runOnMachineFunction(MachineFunction &MF) override;
127 void getAnalysisUsage(AnalysisUsage &AU) const override;
128
129 /// Pass identification, replacement for typeid.
130 static char ID;
131
132private:
133 /// The information about a block's conditional terminators needed to trace
134 /// our predicate state through the exiting edges.
135 struct BlockCondInfo {
136 MachineBasicBlock *MBB;
137
138 // We mostly have one conditional branch, and in extremely rare cases have
139 // two. Three and more are so rare as to be unimportant for compile time.
140 SmallVector<MachineInstr *, 2> CondBrs;
141
142 MachineInstr *UncondBr;
143 };
144
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000145 /// Manages the predicate state traced through the program.
146 struct PredState {
147 unsigned InitialReg;
148 unsigned PoisonReg;
149
150 const TargetRegisterClass *RC;
151 MachineSSAUpdater SSA;
152
153 PredState(MachineFunction &MF, const TargetRegisterClass *RC)
154 : RC(RC), SSA(MF) {}
155 };
156
Chandler Carruth90358e12018-07-13 11:13:58 +0000157 const X86Subtarget *Subtarget;
158 MachineRegisterInfo *MRI;
159 const X86InstrInfo *TII;
160 const TargetRegisterInfo *TRI;
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000161
162 Optional<PredState> PS;
Chandler Carruth90358e12018-07-13 11:13:58 +0000163
164 void hardenEdgesWithLFENCE(MachineFunction &MF);
165
166 SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
167
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000168 SmallVector<MachineInstr *, 16>
169 tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
170
Chandler Carruth0477b402018-07-23 04:01:34 +0000171 void hardenAllLoads(MachineFunction &MF);
Chandler Carruth90358e12018-07-13 11:13:58 +0000172
173 unsigned saveEFLAGS(MachineBasicBlock &MBB,
174 MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
175 void restoreEFLAGS(MachineBasicBlock &MBB,
176 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
177 unsigned OFReg);
178
179 void mergePredStateIntoSP(MachineBasicBlock &MBB,
180 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
181 unsigned PredStateReg);
182 unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
183 MachineBasicBlock::iterator InsertPt,
184 DebugLoc Loc);
185
186 void
187 hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000188 MachineOperand &IndexMO,
Chandler Carruth90358e12018-07-13 11:13:58 +0000189 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
190 MachineInstr *
191 sinkPostLoadHardenedInst(MachineInstr &MI,
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000192 SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
193 bool canHardenRegister(unsigned Reg);
Chandler Carruth54529142018-07-24 12:44:00 +0000194 unsigned hardenValueInRegister(unsigned Reg, MachineBasicBlock &MBB,
195 MachineBasicBlock::iterator InsertPt,
196 DebugLoc Loc);
197 unsigned hardenPostLoad(MachineInstr &MI);
Chandler Carrutha3a03ac2018-07-19 23:46:24 +0000198 void hardenReturnInstr(MachineInstr &MI);
Chandler Carruth90358e12018-07-13 11:13:58 +0000199};
200
201} // end anonymous namespace
202
203char X86SpeculativeLoadHardeningPass::ID = 0;
204
205void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
206 AnalysisUsage &AU) const {
207 MachineFunctionPass::getAnalysisUsage(AU);
208}
209
210static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
211 MachineBasicBlock &Succ, int SuccCount,
212 MachineInstr *Br, MachineInstr *&UncondBr,
213 const X86InstrInfo &TII) {
214 assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
215
216 MachineFunction &MF = *MBB.getParent();
217
218 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
219
220 // We have to insert the new block immediately after the current one as we
221 // don't know what layout-successor relationships the successor has and we
222 // may not be able to (and generally don't want to) try to fix those up.
223 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
224
225 // Update the branch instruction if necessary.
226 if (Br) {
227 assert(Br->getOperand(0).getMBB() == &Succ &&
228 "Didn't start with the right target!");
229 Br->getOperand(0).setMBB(&NewMBB);
230
231 // If this successor was reached through a branch rather than fallthrough,
232 // we might have *broken* fallthrough and so need to inject a new
233 // unconditional branch.
234 if (!UncondBr) {
235 MachineBasicBlock &OldLayoutSucc =
236 *std::next(MachineFunction::iterator(&NewMBB));
237 assert(MBB.isSuccessor(&OldLayoutSucc) &&
238 "Without an unconditional branch, the old layout successor should "
239 "be an actual successor!");
240 auto BrBuilder =
241 BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
242 // Update the unconditional branch now that we've added one.
243 UncondBr = &*BrBuilder;
244 }
245
246 // Insert unconditional "jump Succ" instruction in the new block if
247 // necessary.
248 if (!NewMBB.isLayoutSuccessor(&Succ)) {
249 SmallVector<MachineOperand, 4> Cond;
250 TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
251 }
252 } else {
253 assert(!UncondBr &&
254 "Cannot have a branchless successor and an unconditional branch!");
255 assert(NewMBB.isLayoutSuccessor(&Succ) &&
256 "A non-branch successor must have been a layout successor before "
257 "and now is a layout successor of the new block.");
258 }
259
260 // If this is the only edge to the successor, we can just replace it in the
261 // CFG. Otherwise we need to add a new entry in the CFG for the new
262 // successor.
263 if (SuccCount == 1) {
264 MBB.replaceSuccessor(&Succ, &NewMBB);
265 } else {
266 MBB.splitSuccessor(&Succ, &NewMBB);
267 }
268
269 // Hook up the edge from the new basic block to the old successor in the CFG.
270 NewMBB.addSuccessor(&Succ);
271
272 // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
273 for (MachineInstr &MI : Succ) {
274 if (!MI.isPHI())
275 break;
276 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
277 OpIdx += 2) {
278 MachineOperand &OpV = MI.getOperand(OpIdx);
279 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
280 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
281 if (OpMBB.getMBB() != &MBB)
282 continue;
283
      // If this is the last edge to the successor, just replace MBB in the PHI
285 if (SuccCount == 1) {
286 OpMBB.setMBB(&NewMBB);
287 break;
288 }
289
290 // Otherwise, append a new pair of operands for the new incoming edge.
291 MI.addOperand(MF, OpV);
292 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
293 break;
294 }
295 }
296
297 // Inherit live-ins from the successor
298 for (auto &LI : Succ.liveins())
299 NewMBB.addLiveIn(LI);
300
301 LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
302 << Succ.getName() << "'.\n");
303 return NewMBB;
304}
305
/// Remove duplicate PHI operands to leave the PHI in a canonical and
/// predictable form.
///
/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
/// isn't what you might expect. We may have multiple entries in PHI nodes for
/// a single predecessor. This makes CFG-updating extremely complex, so here we
/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
/// one entry per predecessor, regardless of how many edges there are.
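///
/// For example (an MIR-flavored sketch; registers and block numbers are made
/// up), a PHI carrying a duplicated predecessor such as
/// ```
///   %v:gr64 = PHI %a:gr64, %bb.1, %b:gr64, %bb.2, %b:gr64, %bb.2
/// ```
/// is rewritten to have exactly one entry for %bb.2:
/// ```
///   %v:gr64 = PHI %a:gr64, %bb.1, %b:gr64, %bb.2
/// ```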
314static void canonicalizePHIOperands(MachineFunction &MF) {
315 SmallPtrSet<MachineBasicBlock *, 4> Preds;
316 SmallVector<int, 4> DupIndices;
317 for (auto &MBB : MF)
318 for (auto &MI : MBB) {
319 if (!MI.isPHI())
320 break;
321
      // First we scan the operands of the PHI looking for duplicate entries
      // for a particular predecessor. We retain the operand index of each
      // duplicate entry found.
325 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
326 OpIdx += 2)
327 if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
328 DupIndices.push_back(OpIdx);
329
      // Now walk the duplicate indices, removing both the block and value.
      // Note that these are stored as a vector, making this element-wise
      // removal potentially quadratic.
334 //
335 // FIXME: It is really frustrating that we have to use a quadratic
336 // removal algorithm here. There should be a better way, but the use-def
337 // updates required make that impossible using the public API.
338 //
339 // Note that we have to process these backwards so that we don't
340 // invalidate other indices with each removal.
341 while (!DupIndices.empty()) {
342 int OpIdx = DupIndices.pop_back_val();
343 // Remove both the block and value operand, again in reverse order to
344 // preserve indices.
345 MI.RemoveOperand(OpIdx + 1);
346 MI.RemoveOperand(OpIdx);
347 }
348
349 Preds.clear();
350 }
351}
352
Chandler Carruth3620b992018-07-16 10:46:16 +0000353/// Helper to scan a function for loads vulnerable to misspeculation that we
354/// want to harden.
355///
356/// We use this to avoid making changes to functions where there is nothing we
357/// need to do to harden against misspeculation.
358static bool hasVulnerableLoad(MachineFunction &MF) {
359 for (MachineBasicBlock &MBB : MF) {
360 for (MachineInstr &MI : MBB) {
361 // Loads within this basic block after an LFENCE are not at risk of
362 // speculatively executing with invalid predicates from prior control
363 // flow. So break out of this block but continue scanning the function.
364 if (MI.getOpcode() == X86::LFENCE)
365 break;
366
367 // Looking for loads only.
368 if (!MI.mayLoad())
369 continue;
370
371 // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
372 if (MI.getOpcode() == X86::MFENCE)
373 continue;
374
375 // We found a load.
376 return true;
377 }
378 }
379
380 // No loads found.
381 return false;
382}
383
Chandler Carruth90358e12018-07-13 11:13:58 +0000384bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
385 MachineFunction &MF) {
386 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
387 << " **********\n");
388
389 Subtarget = &MF.getSubtarget<X86Subtarget>();
390 MRI = &MF.getRegInfo();
391 TII = Subtarget->getInstrInfo();
392 TRI = Subtarget->getRegisterInfo();
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000393
Chandler Carruth90358e12018-07-13 11:13:58 +0000394 // FIXME: Support for 32-bit.
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000395 PS.emplace(MF, &X86::GR64_NOSPRegClass);
Chandler Carruth90358e12018-07-13 11:13:58 +0000396
397 if (MF.begin() == MF.end())
398 // Nothing to do for a degenerate empty function...
399 return false;
400
401 // We support an alternative hardening technique based on a debug flag.
402 if (HardenEdgesWithLFENCE) {
403 hardenEdgesWithLFENCE(MF);
404 return true;
405 }
406
407 // Create a dummy debug loc to use for all the generated code here.
408 DebugLoc Loc;
409
410 MachineBasicBlock &Entry = *MF.begin();
411 auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
412
413 // Do a quick scan to see if we have any checkable loads.
Chandler Carruth3620b992018-07-16 10:46:16 +0000414 bool HasVulnerableLoad = hasVulnerableLoad(MF);
Chandler Carruth90358e12018-07-13 11:13:58 +0000415
416 // See if we have any conditional branching blocks that we will need to trace
417 // predicate state through.
418 SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
419
420 // If we have no interesting conditions or loads, nothing to do here.
Chandler Carruth3620b992018-07-16 10:46:16 +0000421 if (!HasVulnerableLoad && Infos.empty())
Chandler Carruth90358e12018-07-13 11:13:58 +0000422 return true;
423
Chandler Carruth90358e12018-07-13 11:13:58 +0000424 // The poison value is required to be an all-ones value for many aspects of
425 // this mitigation.
426 const int PoisonVal = -1;
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000427 PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
428 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
Chandler Carruth90358e12018-07-13 11:13:58 +0000429 .addImm(PoisonVal);
430 ++NumInstsInserted;
431
432 // If we have loads being hardened and we've asked for call and ret edges to
433 // get a full fence-based mitigation, inject that fence.
Chandler Carruth3620b992018-07-16 10:46:16 +0000434 if (HasVulnerableLoad && FenceCallAndRet) {
Chandler Carruth90358e12018-07-13 11:13:58 +0000435 // We need to insert an LFENCE at the start of the function to suspend any
436 // incoming misspeculation from the caller. This helps two-fold: the caller
437 // may not have been protected as this code has been, and this code gets to
438 // not take any specific action to protect across calls.
439 // FIXME: We could skip this for functions which unconditionally return
440 // a constant.
441 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
442 ++NumInstsInserted;
443 ++NumLFENCEsInserted;
444 }
445
Chandler Carruthfb503ac2018-07-14 09:32:37 +0000446 // If we guarded the entry with an LFENCE and have no conditionals to protect
447 // in blocks, then we're done.
448 if (FenceCallAndRet && Infos.empty())
Chandler Carruth90358e12018-07-13 11:13:58 +0000449 // We may have changed the function's code at this point to insert fences.
450 return true;
451
  // Set up the initial predicate state in the entry block, either by
  // extracting it from the incoming stack pointer (when hardening
  // interprocedurally) or by zeroing a fresh register.
453 if (HardenInterprocedurally && !FenceCallAndRet) {
454 // Set up the predicate state by extracting it from the incoming stack
455 // pointer so we pick up any misspeculation in our caller.
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000456 PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
Chandler Carruth90358e12018-07-13 11:13:58 +0000457 } else {
458 // Otherwise, just build the predicate state itself by zeroing a register
459 // as we don't need any initial state.
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000460 PS->InitialReg = MRI->createVirtualRegister(PS->RC);
Chandler Carruth90358e12018-07-13 11:13:58 +0000461 unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
462 auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
463 PredStateSubReg);
464 ++NumInstsInserted;
465 MachineOperand *ZeroEFLAGSDefOp =
466 ZeroI->findRegisterDefOperand(X86::EFLAGS);
467 assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
468 "Must have an implicit def of EFLAGS!");
469 ZeroEFLAGSDefOp->setIsDead(true);
470 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000471 PS->InitialReg)
Chandler Carruth90358e12018-07-13 11:13:58 +0000472 .addImm(0)
473 .addReg(PredStateSubReg)
474 .addImm(X86::sub_32bit);
475 }
476
477 // We're going to need to trace predicate state throughout the function's
478 // CFG. Prepare for this by setting up our initial state of PHIs with unique
479 // predecessor entries and all the initial predicate state.
Chandler Carruth3ffcc032018-07-15 23:46:36 +0000480 canonicalizePHIOperands(MF);
Chandler Carruth90358e12018-07-13 11:13:58 +0000481
482 // Track the updated values in an SSA updater to rewrite into SSA form at the
483 // end.
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000484 PS->SSA.Initialize(PS->InitialReg);
485 PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);
Chandler Carruth90358e12018-07-13 11:13:58 +0000486
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000487 // Trace through the CFG.
488 auto CMovs = tracePredStateThroughCFG(MF, Infos);
Chandler Carruth90358e12018-07-13 11:13:58 +0000489
490 // We may also enter basic blocks in this function via exception handling
491 // control flow. Here, if we are hardening interprocedurally, we need to
492 // re-capture the predicate state from the throwing code. In the Itanium ABI,
493 // the throw will always look like a call to __cxa_throw and will have the
494 // predicate state in the stack pointer, so extract fresh predicate state from
495 // the stack pointer and make it available in SSA.
496 // FIXME: Handle non-itanium ABI EH models.
497 if (HardenInterprocedurally) {
498 for (MachineBasicBlock &MBB : MF) {
499 assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
500 assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
501 assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
502 if (!MBB.isEHPad())
503 continue;
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000504 PS->SSA.AddAvailableValue(
Chandler Carruth90358e12018-07-13 11:13:58 +0000505 &MBB,
506 extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
507 }
508 }
509
Chandler Carruth0477b402018-07-23 04:01:34 +0000510 // Now harden all of the loads in the function using the predicate state.
511 hardenAllLoads(MF);
Chandler Carruth90358e12018-07-13 11:13:58 +0000512
513 // Now rewrite all the uses of the pred state using the SSA updater so that
514 // we track updates through the CFG.
515 for (MachineInstr *CMovI : CMovs)
516 for (MachineOperand &Op : CMovI->operands()) {
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000517 if (!Op.isReg() || Op.getReg() != PS->InitialReg)
Chandler Carruth90358e12018-07-13 11:13:58 +0000518 continue;
519
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000520 PS->SSA.RewriteUse(Op);
Chandler Carruth90358e12018-07-13 11:13:58 +0000521 }
522
523 // If we are hardening interprocedurally, find each returning block and
524 // protect the caller from being returned to through misspeculation.
525 if (HardenInterprocedurally)
526 for (MachineBasicBlock &MBB : MF) {
527 if (MBB.empty())
528 continue;
529
530 MachineInstr &MI = MBB.back();
Chandler Carruth1d926fb2018-07-23 07:56:15 +0000531
532 // We only care about returns that are not also calls. For calls, that
533 // happen to also be returns (tail calls) we will have already handled
534 // them as calls.
535 if (!MI.isReturn() || MI.isCall())
Chandler Carruth90358e12018-07-13 11:13:58 +0000536 continue;
537
Chandler Carrutha3a03ac2018-07-19 23:46:24 +0000538 hardenReturnInstr(MI);
Chandler Carruth90358e12018-07-13 11:13:58 +0000539 }
540
541 LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
542 dbgs() << "\n"; MF.verify(this));
543 return true;
544}
545
/// Implements the naive hardening approach of putting an LFENCE after every
/// potentially mis-predicted control flow construct.
///
/// We include this as an alternative mostly for the purpose of comparison. The
/// performance impact of this is expected to be extremely severe and not
/// practical for any real-world users.
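///
/// As a sketch of the transformation (assembly and labels are illustrative
/// only), for a conditional branch such as:
/// ```
///   cmpq %rsi, %rdi
///   jae  .Lout_of_bounds
/// .Lin_bounds:
///   movq (%rdx), %rax
/// ```
/// each non-EH-pad successor simply gains an LFENCE at its start:
/// ```
/// .Lin_bounds:
///   lfence
///   movq (%rdx), %rax
/// ```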
552void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
553 MachineFunction &MF) {
554 // First, we scan the function looking for blocks that are reached along edges
555 // that we might want to harden.
556 SmallSetVector<MachineBasicBlock *, 8> Blocks;
557 for (MachineBasicBlock &MBB : MF) {
558 // If there are no or only one successor, nothing to do here.
559 if (MBB.succ_size() <= 1)
560 continue;
561
562 // Skip blocks unless their terminators start with a branch. Other
563 // terminators don't seem interesting for guarding against misspeculation.
564 auto TermIt = MBB.getFirstTerminator();
565 if (TermIt == MBB.end() || !TermIt->isBranch())
566 continue;
567
    // Add all the non-EH-pad successors to the blocks we want to harden. We
569 // skip EH pads because there isn't really a condition of interest on
570 // entering.
571 for (MachineBasicBlock *SuccMBB : MBB.successors())
572 if (!SuccMBB->isEHPad())
573 Blocks.insert(SuccMBB);
574 }
575
576 for (MachineBasicBlock *MBB : Blocks) {
577 auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
578 BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
579 ++NumInstsInserted;
580 ++NumLFENCEsInserted;
581 }
582}
583
584SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
585X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
586 SmallVector<BlockCondInfo, 16> Infos;
587
588 // Walk the function and build up a summary for each block's conditions that
589 // we need to trace through.
590 for (MachineBasicBlock &MBB : MF) {
591 // If there are no or only one successor, nothing to do here.
592 if (MBB.succ_size() <= 1)
593 continue;
594
595 // We want to reliably handle any conditional branch terminators in the
596 // MBB, so we manually analyze the branch. We can handle all of the
597 // permutations here, including ones that analyze branch cannot.
598 //
599 // The approach is to walk backwards across the terminators, resetting at
600 // any unconditional non-indirect branch, and track all conditional edges
601 // to basic blocks as well as the fallthrough or unconditional successor
602 // edge. For each conditional edge, we track the target and the opposite
603 // condition code in order to inject a "no-op" cmov into that successor
604 // that will harden the predicate. For the fallthrough/unconditional
605 // edge, we inject a separate cmov for each conditional branch with
606 // matching condition codes. This effectively implements an "and" of the
607 // condition flags, even if there isn't a single condition flag that would
608 // directly implement that. We don't bother trying to optimize either of
609 // these cases because if such an optimization is possible, LLVM should
610 // have optimized the conditional *branches* in that way already to reduce
611 // instruction count. This late, we simply assume the minimal number of
612 // branch instructions is being emitted and use that to guide our cmov
613 // insertion.
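    //
    // For example (a sketch; labels are invented), a block ending in:
    // ```
    //   jne .Ltarget
    //   jmp .Lfallthrough
    // ```
    // records the `jne` in CondBrs and the `jmp` as UncondBr. Later we place a
    // `cmove` of the poison value on the edge to `.Ltarget` and a `cmovne` on
    // the edge to `.Lfallthrough`, so that whichever way the branch actually
    // resolved, an edge reached by misspeculation sees its predicate state
    // become all-ones.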
614
615 BlockCondInfo Info = {&MBB, {}, nullptr};
616
617 // Now walk backwards through the terminators and build up successors they
618 // reach and the conditions.
619 for (MachineInstr &MI : llvm::reverse(MBB)) {
620 // Once we've handled all the terminators, we're done.
621 if (!MI.isTerminator())
622 break;
623
624 // If we see a non-branch terminator, we can't handle anything so bail.
625 if (!MI.isBranch()) {
626 Info.CondBrs.clear();
627 break;
628 }
629
630 // If we see an unconditional branch, reset our state, clear any
631 // fallthrough, and set this is the "else" successor.
632 if (MI.getOpcode() == X86::JMP_1) {
633 Info.CondBrs.clear();
634 Info.UncondBr = &MI;
635 continue;
636 }
637
638 // If we get an invalid condition, we have an indirect branch or some
639 // other unanalyzable "fallthrough" case. We model this as a nullptr for
640 // the destination so we can still guard any conditional successors.
641 // Consider code sequences like:
642 // ```
643 // jCC L1
644 // jmpq *%rax
645 // ```
646 // We still want to harden the edge to `L1`.
647 if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
648 Info.CondBrs.clear();
649 Info.UncondBr = &MI;
650 continue;
651 }
652
653 // We have a vanilla conditional branch, add it to our list.
654 Info.CondBrs.push_back(&MI);
655 }
656 if (Info.CondBrs.empty()) {
657 ++NumBranchesUntraced;
658 LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
659 MBB.dump());
660 continue;
661 }
662
663 Infos.push_back(Info);
664 }
665
666 return Infos;
667}
668
/// Trace the predicate state through the CFG, instrumenting each conditional
/// branch such that misspeculation through an edge will poison the predicate
/// state.
///
/// Returns the list of inserted CMov instructions so that they can have their
/// uses of the predicate state rewritten into proper SSA form once it is
/// complete.
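///
/// As an illustrative sketch (registers and labels are invented; the real code
/// builds CMOV machine instructions on virtual registers), hardening the
/// fallthrough edge of:
/// ```
///   testq %rcx, %rcx
///   je    .Lelse
/// .Lthen:                        # fallthrough successor
/// ```
/// inserts a conditional move that poisons the predicate state exactly when
/// the flags say we should *not* be here:
/// ```
/// .Lthen:
///   cmove %r_poison, %r_state    # ZF set => we misspeculated into .Lthen
/// ```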
676SmallVector<MachineInstr *, 16>
677X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
678 MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
679 // Collect the inserted cmov instructions so we can rewrite their uses of the
680 // predicate state into SSA form.
681 SmallVector<MachineInstr *, 16> CMovs;
682
683 // Now walk all of the basic blocks looking for ones that end in conditional
684 // jumps where we need to update this register along each edge.
685 for (const BlockCondInfo &Info : Infos) {
686 MachineBasicBlock &MBB = *Info.MBB;
687 const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
688 MachineInstr *UncondBr = Info.UncondBr;
689
690 LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
691 << "\n");
692 ++NumCondBranchesTraced;
693
694 // Compute the non-conditional successor as either the target of any
695 // unconditional branch or the layout successor.
696 MachineBasicBlock *UncondSucc =
697 UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
698 ? UncondBr->getOperand(0).getMBB()
699 : nullptr)
700 : &*std::next(MachineFunction::iterator(&MBB));
701
702 // Count how many edges there are to any given successor.
703 SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
704 if (UncondSucc)
705 ++SuccCounts[UncondSucc];
706 for (auto *CondBr : CondBrs)
707 ++SuccCounts[CondBr->getOperand(0).getMBB()];
708
709 // A lambda to insert cmov instructions into a block checking all of the
710 // condition codes in a sequence.
711 auto BuildCheckingBlockForSuccAndConds =
712 [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
713 MachineInstr *Br, MachineInstr *&UncondBr,
714 ArrayRef<X86::CondCode> Conds) {
715 // First, we split the edge to insert the checking block into a safe
716 // location.
717 auto &CheckingMBB =
718 (SuccCount == 1 && Succ.pred_size() == 1)
719 ? Succ
720 : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
721
722 bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
723 if (!LiveEFLAGS)
724 CheckingMBB.addLiveIn(X86::EFLAGS);
725
726 // Now insert the cmovs to implement the checks.
727 auto InsertPt = CheckingMBB.begin();
728 assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
729 "Should never have a PHI in the initial checking block as it "
730 "always has a single predecessor!");
731
732 // We will wire each cmov to each other, but need to start with the
733 // incoming pred state.
734 unsigned CurStateReg = PS->InitialReg;
735
736 for (X86::CondCode Cond : Conds) {
737 int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
738 auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
739
740 unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
741 // Note that we intentionally use an empty debug location so that
742 // this picks up the preceding location.
743 auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
744 TII->get(CMovOp), UpdatedStateReg)
745 .addReg(CurStateReg)
746 .addReg(PS->PoisonReg);
747 // If this is the last cmov and the EFLAGS weren't originally
748 // live-in, mark them as killed.
749 if (!LiveEFLAGS && Cond == Conds.back())
750 CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
751
752 ++NumInstsInserted;
753 LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
754 dbgs() << "\n");
755
756 // The first one of the cmovs will be using the top level
757 // `PredStateReg` and need to get rewritten into SSA form.
758 if (CurStateReg == PS->InitialReg)
759 CMovs.push_back(&*CMovI);
760
761 // The next cmov should start from this one's def.
762 CurStateReg = UpdatedStateReg;
763 }
764
765 // And put the last one into the available values for SSA form of our
766 // predicate state.
767 PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
768 };
769
770 std::vector<X86::CondCode> UncondCodeSeq;
771 for (auto *CondBr : CondBrs) {
772 MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
773 int &SuccCount = SuccCounts[&Succ];
774
775 X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
776 X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
777 UncondCodeSeq.push_back(Cond);
778
779 BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
780 {InvCond});
781
782 // Decrement the successor count now that we've split one of the edges.
783 // We need to keep the count of edges to the successor accurate in order
784 // to know above when to *replace* the successor in the CFG vs. just
785 // adding the new successor.
786 --SuccCount;
787 }
788
789 // Since we may have split edges and changed the number of successors,
790 // normalize the probabilities. This avoids doing it each time we split an
791 // edge.
792 MBB.normalizeSuccProbs();
793
794 // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
795 // need to intersect the other condition codes. We can do this by just
796 // doing a cmov for each one.
797 if (!UncondSucc)
798 // If we have no fallthrough to protect (perhaps it is an indirect jump?)
799 // just skip this and continue.
800 continue;
801
802 assert(SuccCounts[UncondSucc] == 1 &&
803 "We should never have more than one edge to the unconditional "
804 "successor at this point because every other edge must have been "
805 "split above!");
806
807 // Sort and unique the codes to minimize them.
808 llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
809 UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
810 UncondCodeSeq.end());
811
812 // Build a checking version of the successor.
813 BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
814 UncondBr, UncondBr, UncondCodeSeq);
815 }
816
817 return CMovs;
818}
819
/// Returns true if the instruction has no behavior (specified or otherwise)
/// that is based on the value of any of its register operands.
///
/// A classical example of something that is inherently not data invariant is
/// an indirect jump -- the destination is loaded into icache based on the bits
/// set in the jump destination register.
///
/// FIXME: This should become part of our instruction tables.
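///
/// For example, `addq %rsi, %rdi` takes the same time regardless of which bits
/// are set in its operands, so it is data invariant, whereas an integer divide
/// (whose latency is typically data dependent) or an indirect `jmpq *%rax`
/// (which steers the front end based on the register value) is not.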
828static bool isDataInvariant(MachineInstr &MI) {
829 switch (MI.getOpcode()) {
830 default:
831 // By default, assume that the instruction is not data invariant.
832 return false;
833
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000834 // Some target-independent operations that trivially lower to data-invariant
835 // instructions.
Chandler Carruth90358e12018-07-13 11:13:58 +0000836 case TargetOpcode::COPY:
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000837 case TargetOpcode::INSERT_SUBREG:
838 case TargetOpcode::SUBREG_TO_REG:
839 return true;
840
  // On x86 it is believed that imul is constant time w.r.t. its inputs.
  // However, these instructions set flags and are perhaps the most
  // surprisingly constant time operations, so we call them out here
  // separately.
844 case X86::IMUL16rr:
845 case X86::IMUL16rri8:
846 case X86::IMUL16rri:
847 case X86::IMUL32rr:
848 case X86::IMUL32rri8:
849 case X86::IMUL32rri:
850 case X86::IMUL64rr:
851 case X86::IMUL64rri32:
852 case X86::IMUL64rri8:
853
854 // Bit scanning and counting instructions that are somewhat surprisingly
855 // constant time as they scan across bits and do other fairly complex
856 // operations like popcnt, but are believed to be constant time on x86.
857 // However, these set flags.
858 case X86::BSF16rr:
859 case X86::BSF32rr:
860 case X86::BSF64rr:
861 case X86::BSR16rr:
862 case X86::BSR32rr:
863 case X86::BSR64rr:
864 case X86::LZCNT16rr:
865 case X86::LZCNT32rr:
866 case X86::LZCNT64rr:
867 case X86::POPCNT16rr:
868 case X86::POPCNT32rr:
869 case X86::POPCNT64rr:
870 case X86::TZCNT16rr:
871 case X86::TZCNT32rr:
872 case X86::TZCNT64rr:
873
874 // Bit manipulation instructions are effectively combinations of basic
875 // arithmetic ops, and should still execute in constant time. These also
876 // set flags.
877 case X86::BLCFILL32rr:
878 case X86::BLCFILL64rr:
879 case X86::BLCI32rr:
880 case X86::BLCI64rr:
881 case X86::BLCIC32rr:
882 case X86::BLCIC64rr:
883 case X86::BLCMSK32rr:
884 case X86::BLCMSK64rr:
885 case X86::BLCS32rr:
886 case X86::BLCS64rr:
887 case X86::BLSFILL32rr:
888 case X86::BLSFILL64rr:
889 case X86::BLSI32rr:
890 case X86::BLSI64rr:
891 case X86::BLSIC32rr:
892 case X86::BLSIC64rr:
893 case X86::BLSMSK32rr:
894 case X86::BLSMSK64rr:
895 case X86::BLSR32rr:
896 case X86::BLSR64rr:
897 case X86::TZMSK32rr:
898 case X86::TZMSK64rr:
899
900 // Bit extracting and clearing instructions should execute in constant time,
901 // and set flags.
902 case X86::BEXTR32rr:
903 case X86::BEXTR64rr:
904 case X86::BEXTRI32ri:
905 case X86::BEXTRI64ri:
906 case X86::BZHI32rr:
907 case X86::BZHI64rr:
908
Chandler Carruthc0cb5732018-07-17 18:07:59 +0000909 // Shift and rotate.
910 case X86::ROL8r1: case X86::ROL16r1: case X86::ROL32r1: case X86::ROL64r1:
911 case X86::ROL8rCL: case X86::ROL16rCL: case X86::ROL32rCL: case X86::ROL64rCL:
912 case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
913 case X86::ROR8r1: case X86::ROR16r1: case X86::ROR32r1: case X86::ROR64r1:
914 case X86::ROR8rCL: case X86::ROR16rCL: case X86::ROR32rCL: case X86::ROR64rCL:
915 case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
916 case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1: case X86::SAR64r1:
917 case X86::SAR8rCL: case X86::SAR16rCL: case X86::SAR32rCL: case X86::SAR64rCL:
918 case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
919 case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1: case X86::SHL64r1:
920 case X86::SHL8rCL: case X86::SHL16rCL: case X86::SHL32rCL: case X86::SHL64rCL:
921 case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri:
922 case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1: case X86::SHR64r1:
923 case X86::SHR8rCL: case X86::SHR16rCL: case X86::SHR32rCL: case X86::SHR64rCL:
924 case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
925 case X86::SHLD16rrCL: case X86::SHLD32rrCL: case X86::SHLD64rrCL:
926 case X86::SHLD16rri8: case X86::SHLD32rri8: case X86::SHLD64rri8:
927 case X86::SHRD16rrCL: case X86::SHRD32rrCL: case X86::SHRD64rrCL:
928 case X86::SHRD16rri8: case X86::SHRD32rri8: case X86::SHRD64rri8:
929
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000930 // Basic arithmetic is constant time on the input but does set flags.
931 case X86::ADC8rr: case X86::ADC8ri:
932 case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8:
933 case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8:
934 case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32:
935 case X86::ADD8rr: case X86::ADD8ri:
936 case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8:
937 case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8:
938 case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32:
939 case X86::AND8rr: case X86::AND8ri:
940 case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8:
941 case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8:
942 case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32:
943 case X86::OR8rr: case X86::OR8ri:
944 case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8:
945 case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8:
946 case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32:
947 case X86::SBB8rr: case X86::SBB8ri:
948 case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8:
949 case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8:
950 case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32:
951 case X86::SUB8rr: case X86::SUB8ri:
952 case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8:
953 case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8:
954 case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32:
955 case X86::XOR8rr: case X86::XOR8ri:
956 case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8:
957 case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8:
958 case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32:
959 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
960 case X86::ADCX32rr: case X86::ADCX64rr:
961 case X86::ADOX32rr: case X86::ADOX64rr:
962 case X86::ANDN32rr: case X86::ANDN64rr:
Chandler Carruthc0cb5732018-07-17 18:07:59 +0000963 // Unary arithmetic operations.
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000964 case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
Chandler Carruthc0cb5732018-07-17 18:07:59 +0000965 case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
966 case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000967 // Check whether the EFLAGS implicit-def is dead. We assume that this will
968 // always find the implicit-def because this code should only be reached
969 // for instructions that do in fact implicitly def this.
970 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
971 // If we would clobber EFLAGS that are used, just bail for now.
972 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
973 MI.dump(); dbgs() << "\n");
974 return false;
975 }
976
977 // Otherwise, fallthrough to handle these the same as instructions that
978 // don't set EFLAGS.
979 LLVM_FALLTHROUGH;
980
Chandler Carruthc0cb5732018-07-17 18:07:59 +0000981 // Unlike other arithmetic, NOT doesn't set EFLAGS.
982 case X86::NOT8r: case X86::NOT16r: case X86::NOT32r: case X86::NOT64r:
983
984 // Various move instructions used to zero or sign extend things. Note that we
985 // intentionally don't support the _NOREX variants as we can't handle that
986 // register constraint anyways.
987 case X86::MOVSX16rr8:
988 case X86::MOVSX32rr8: case X86::MOVSX32rr16:
989 case X86::MOVSX64rr8: case X86::MOVSX64rr16: case X86::MOVSX64rr32:
990 case X86::MOVZX16rr8:
991 case X86::MOVZX32rr8: case X86::MOVZX32rr16:
992 case X86::MOVZX64rr8: case X86::MOVZX64rr16:
993 case X86::MOV32rr:
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000994
995 // Arithmetic instructions that are both constant time and don't set flags.
996 case X86::RORX32ri:
997 case X86::RORX64ri:
998 case X86::SARX32rr:
999 case X86::SARX64rr:
1000 case X86::SHLX32rr:
1001 case X86::SHLX64rr:
1002 case X86::SHRX32rr:
1003 case X86::SHRX64rr:
1004
1005 // LEA doesn't actually access memory, and its arithmetic is constant time.
1006 case X86::LEA16r:
1007 case X86::LEA32r:
1008 case X86::LEA64_32r:
1009 case X86::LEA64r:
Chandler Carruth90358e12018-07-13 11:13:58 +00001010 return true;
1011 }
1012}
1013
1014/// Returns true if the instruction has no behavior (specified or otherwise)
1015/// that is based on the value loaded from memory or the value of any
1016/// non-address register operands.
1017///
/// For example, the latency of the instruction must not depend on the
/// particular bits set in any of its register operands *or* on any of the
/// bits loaded from memory.
1021///
1022/// A classical example of something that is inherently not data invariant is an
1023/// indirect jump -- the destination is loaded into icache based on the bits set
1024/// in the jump destination register.
1025///
1026/// FIXME: This should become part of our instruction tables.
1027static bool isDataInvariantLoad(MachineInstr &MI) {
1028 switch (MI.getOpcode()) {
1029 default:
1030 // By default, assume that the load will immediately leak.
1031 return false;
1032
  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
  // However, these instructions set flags and are perhaps the most
  // surprisingly constant time operations, so we call them out here
  // separately.
Chandler Carruth90358e12018-07-13 11:13:58 +00001036 case X86::IMUL16rm:
1037 case X86::IMUL16rmi8:
1038 case X86::IMUL16rmi:
1039 case X86::IMUL32rm:
1040 case X86::IMUL32rmi8:
1041 case X86::IMUL32rmi:
1042 case X86::IMUL64rm:
1043 case X86::IMUL64rmi32:
1044 case X86::IMUL64rmi8:
1045
Craig Topper445abf72018-07-13 22:41:46 +00001046 // Bit scanning and counting instructions that are somewhat surprisingly
1047 // constant time as they scan across bits and do other fairly complex
1048 // operations like popcnt, but are believed to be constant time on x86.
1049 // However, these set flags.
1050 case X86::BSF16rm:
1051 case X86::BSF32rm:
1052 case X86::BSF64rm:
1053 case X86::BSR16rm:
1054 case X86::BSR32rm:
1055 case X86::BSR64rm:
1056 case X86::LZCNT16rm:
1057 case X86::LZCNT32rm:
1058 case X86::LZCNT64rm:
1059 case X86::POPCNT16rm:
1060 case X86::POPCNT32rm:
1061 case X86::POPCNT64rm:
1062 case X86::TZCNT16rm:
1063 case X86::TZCNT32rm:
1064 case X86::TZCNT64rm:
1065
1066 // Bit manipulation instructions are effectively combinations of basic
1067 // arithmetic ops, and should still execute in constant time. These also
1068 // set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +00001069 case X86::BLCFILL32rm:
1070 case X86::BLCFILL64rm:
1071 case X86::BLCI32rm:
1072 case X86::BLCI64rm:
1073 case X86::BLCIC32rm:
1074 case X86::BLCIC64rm:
1075 case X86::BLCMSK32rm:
1076 case X86::BLCMSK64rm:
1077 case X86::BLCS32rm:
1078 case X86::BLCS64rm:
1079 case X86::BLSFILL32rm:
1080 case X86::BLSFILL64rm:
1081 case X86::BLSI32rm:
1082 case X86::BLSI64rm:
1083 case X86::BLSIC32rm:
1084 case X86::BLSIC64rm:
1085 case X86::BLSMSK32rm:
1086 case X86::BLSMSK64rm:
1087 case X86::BLSR32rm:
1088 case X86::BLSR64rm:
Chandler Carruth90358e12018-07-13 11:13:58 +00001089 case X86::TZMSK32rm:
1090 case X86::TZMSK64rm:
1091
Craig Topper445abf72018-07-13 22:41:46 +00001092 // Bit extracting and clearing instructions should execute in constant time,
1093 // and set flags.
1094 case X86::BEXTR32rm:
1095 case X86::BEXTR64rm:
1096 case X86::BEXTRI32mi:
1097 case X86::BEXTRI64mi:
1098 case X86::BZHI32rm:
1099 case X86::BZHI64rm:
1100
1101 // Basic arithmetic is constant time on the input but does set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +00001102 case X86::ADC8rm:
1103 case X86::ADC16rm:
1104 case X86::ADC32rm:
1105 case X86::ADC64rm:
1106 case X86::ADCX32rm:
1107 case X86::ADCX64rm:
1108 case X86::ADD8rm:
1109 case X86::ADD16rm:
1110 case X86::ADD32rm:
1111 case X86::ADD64rm:
1112 case X86::ADOX32rm:
1113 case X86::ADOX64rm:
1114 case X86::AND8rm:
1115 case X86::AND16rm:
1116 case X86::AND32rm:
1117 case X86::AND64rm:
1118 case X86::ANDN32rm:
1119 case X86::ANDN64rm:
Chandler Carruth90358e12018-07-13 11:13:58 +00001120 case X86::OR8rm:
1121 case X86::OR16rm:
1122 case X86::OR32rm:
1123 case X86::OR64rm:
1124 case X86::SBB8rm:
1125 case X86::SBB16rm:
1126 case X86::SBB32rm:
1127 case X86::SBB64rm:
1128 case X86::SUB8rm:
1129 case X86::SUB16rm:
1130 case X86::SUB32rm:
1131 case X86::SUB64rm:
1132 case X86::XOR8rm:
1133 case X86::XOR16rm:
1134 case X86::XOR32rm:
1135 case X86::XOR64rm:
Chandler Carruth90358e12018-07-13 11:13:58 +00001136 // Check whether the EFLAGS implicit-def is dead. We assume that this will
1137 // always find the implicit-def because this code should only be reached
1138 // for instructions that do in fact implicitly def this.
1139 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
1140 // If we would clobber EFLAGS that are used, just bail for now.
1141 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
1142 MI.dump(); dbgs() << "\n");
1143 return false;
1144 }
1145
1146 // Otherwise, fallthrough to handle these the same as instructions that
1147 // don't set EFLAGS.
1148 LLVM_FALLTHROUGH;
1149
Craig Topper445abf72018-07-13 22:41:46 +00001150 // Integer multiply w/o affecting flags is still believed to be constant
1151 // time on x86. Called out separately as this is among the most surprising
1152 // instructions to exhibit that behavior.
Chandler Carruth90358e12018-07-13 11:13:58 +00001153 case X86::MULX32rm:
1154 case X86::MULX64rm:
1155
Craig Topper445abf72018-07-13 22:41:46 +00001156 // Arithmetic instructions that are both constant time and don't set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +00001157 case X86::RORX32mi:
1158 case X86::RORX64mi:
1159 case X86::SARX32rm:
1160 case X86::SARX64rm:
1161 case X86::SHLX32rm:
1162 case X86::SHLX64rm:
1163 case X86::SHRX32rm:
1164 case X86::SHRX64rm:
1165
Craig Topper445abf72018-07-13 22:41:46 +00001166 // Conversions are believed to be constant time and don't set flags.
Craig Topper41fa8582018-07-13 22:41:50 +00001167 case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm:
1168 case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm:
1169 case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm:
1170 case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm:
1171 case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm:
1172 case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm:
1173 case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm:
1174 case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm:
1175 case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm:
1176 case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm:
1177 // AVX512 added unsigned integer conversions.
1178 case X86::VCVTTSD2USI64Zrm:
1179 case X86::VCVTTSD2USIZrm:
1180 case X86::VCVTTSS2USI64Zrm:
1181 case X86::VCVTTSS2USIZrm:
1182 case X86::VCVTUSI2SDZrm:
1183 case X86::VCVTUSI642SDZrm:
1184 case X86::VCVTUSI2SSZrm:
1185 case X86::VCVTUSI642SSZrm:
Chandler Carruth90358e12018-07-13 11:13:58 +00001186
Craig Topper445abf72018-07-13 22:41:46 +00001187 // Loads to register don't set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +00001188 case X86::MOV8rm:
1189 case X86::MOV8rm_NOREX:
1190 case X86::MOV16rm:
1191 case X86::MOV32rm:
1192 case X86::MOV64rm:
1193 case X86::MOVSX16rm8:
1194 case X86::MOVSX32rm16:
1195 case X86::MOVSX32rm8:
1196 case X86::MOVSX32rm8_NOREX:
1197 case X86::MOVSX64rm16:
1198 case X86::MOVSX64rm32:
1199 case X86::MOVSX64rm8:
1200 case X86::MOVZX16rm8:
1201 case X86::MOVZX32rm16:
1202 case X86::MOVZX32rm8:
1203 case X86::MOVZX32rm8_NOREX:
1204 case X86::MOVZX64rm16:
1205 case X86::MOVZX64rm8:
1206 return true;
1207 }
1208}
1209
1210static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
1211 const TargetRegisterInfo &TRI) {
1212 // Check if EFLAGS are alive by seeing if there is a def of them or they
1213 // live-in, and then seeing if that def is in turn used.
1214 for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
1215 if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
1216 // If the def is dead, then EFLAGS is not live.
1217 if (DefOp->isDead())
1218 return false;
1219
1220 // Otherwise we've def'ed it, and it is live.
1221 return true;
1222 }
1223 // While at this instruction, also check if we use and kill EFLAGS
1224 // which means it isn't live.
1225 if (MI.killsRegister(X86::EFLAGS, &TRI))
1226 return false;
1227 }
1228
1229 // If we didn't find anything conclusive (neither definitely alive or
1230 // definitely dead) return whether it lives into the block.
1231 return MBB.isLiveIn(X86::EFLAGS);
1232}
1233
/// Harden all of the loads (including implicit loads) in the function.
///
/// This operates in two passes. First, we analyze the loads to determine which
/// strategy will be used to harden them: hardening the address or hardening
/// the loaded value when it is loaded into a register amenable to hardening.
/// We have to process these first because the two strategies may interact --
/// later hardening may change what strategy we wish to use. We also analyze
/// data dependencies between loads and avoid hardening those loads that are
/// data dependent on a load with a hardened address. We also skip hardening
/// loads already behind an LFENCE as that is sufficient to harden them against
/// misspeculation.
///
/// Second, we apply the hardening steps we determined necessary in the first
/// pass.
///
/// These two passes are applied to each basic block. We operate on one block
/// at a time to simplify reasoning about reachability and sequencing.
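///
/// A rough sketch of the two strategies (assembly and register names are
/// illustrative only; %r_state stands for the zero/all-ones predicate state):
/// ```
///   # Address hardening: poison the pointer before the load so that under
///   # misspeculation the load uses an all-ones address rather than an
///   # attacker-controlled one.
///   orq  %r_state, %rdi
///   movq (%rdi), %rax
///
///   # Post-load (value) hardening: load first, then flush the loaded bits
///   # to all-ones under misspeculation.
///   movq (%rdi), %rax
///   orq  %r_state, %rax
/// ```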
1251void X86SpeculativeLoadHardeningPass::hardenAllLoads(MachineFunction &MF) {
Chandler Carruth90358e12018-07-13 11:13:58 +00001252 // If the actual checking of loads is disabled, skip doing anything here.
1253 if (!HardenLoads)
1254 return;
1255
1256 SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
1257 SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
1258
1259 SmallSet<unsigned, 16> HardenedAddrRegs;
1260
1261 SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
1262
1263 // Track the set of load-dependent registers through the basic block. Because
1264 // the values of these registers have an existing data dependency on a loaded
1265 // value which we would have checked, we can omit any checks on them.
1266 SparseBitVector<> LoadDepRegs;
1267
1268 for (MachineBasicBlock &MBB : MF) {
1269 // We harden the loads of a basic block in several passes:
1270 //
1271 // 1) Collect all the loads which can have their loaded value hardened
1272 // and all the loads that instead need their address hardened. During
1273 // this walk we propagate load dependence for address hardened loads and
1274 // also look for LFENCE to stop hardening wherever possible. When
1275 // deciding whether or not to harden the loaded value or not, we check
1276 // to see if any registers used in the address will have been hardened
1277 // at this point and if so, harden any remaining address registers as
1278 // that often successfully re-uses hardened addresses and minimizes
1279 // instructions. FIXME: We should consider an aggressive mode where we
1280 // continue to keep as many loads value hardened even when some address
1281 // register hardening would be free (due to reuse).
1282 for (MachineInstr &MI : MBB) {
1283 // We naively assume that all def'ed registers of an instruction have
1284 // a data dependency on all of their operands.
1285 // FIXME: Do a more careful analysis of x86 to build a conservative model
1286 // here.
1287 if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
1288 return Op.isReg() && LoadDepRegs.test(Op.getReg());
1289 }))
1290 for (MachineOperand &Def : MI.defs())
1291 if (Def.isReg())
1292 LoadDepRegs.set(Def.getReg());
1293
1294 // Both Intel and AMD are guiding that they will change the semantics of
1295 // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1296 // no more need to guard things in this block.
1297 if (MI.getOpcode() == X86::LFENCE)
1298 break;
1299
1300 // If this instruction cannot load, nothing to do.
1301 if (!MI.mayLoad())
1302 continue;
1303
1304 // Some instructions which "load" are trivially safe or unimportant.
1305 if (MI.getOpcode() == X86::MFENCE)
1306 continue;
1307
1308 // Extract the memory operand information about this instruction.
1309 // FIXME: This doesn't handle loading pseudo instructions which we often
1310 // could handle with similarly generic logic. We probably need to add an
1311 // MI-layer routine similar to the MC-layer one we use here which maps
1312 // pseudos much like this maps real instructions.
1313 const MCInstrDesc &Desc = MI.getDesc();
1314 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1315 if (MemRefBeginIdx < 0) {
1316 LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: ";
1317 MI.dump());
1318 continue;
1319 }
1320
1321 MemRefBeginIdx += X86II::getOperandBias(Desc);
1322
1323 MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1324 MachineOperand &IndexMO =
1325 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1326
1327 // If we have at least one (non-frame-index, non-RIP) register operand,
1328 // and neither operand is load-dependent, we need to check the load.
1329 unsigned BaseReg = 0, IndexReg = 0;
1330 if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1331 BaseMO.getReg() != X86::NoRegister)
1332 BaseReg = BaseMO.getReg();
1333 if (IndexMO.getReg() != X86::NoRegister)
1334 IndexReg = IndexMO.getReg();
1335
1336 if (!BaseReg && !IndexReg)
1337 // No register operands!
1338 continue;
1339
1340 // If any register operand is dependent, this load is dependent and we
1341 // needn't check it.
1342 // FIXME: Is this true in the case where we are hardening loads after
1343 // they complete? Unclear, need to investigate.
1344 if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
1345 (IndexReg && LoadDepRegs.test(IndexReg)))
1346 continue;
1347
Chandler Carruthe66a6f42018-07-16 11:38:48 +00001348 // If post-load hardening is enabled, this load is compatible with
1349 // post-load hardening, and we aren't already going to harden one of the
Chandler Carruth90358e12018-07-13 11:13:58 +00001350 // address registers, queue it up to be hardened post-load. Notably, even
1351 // once hardened this won't introduce a useful dependency that could prune
1352 // out subsequent loads.
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001353 if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
1354 MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
1355 canHardenRegister(MI.getOperand(0).getReg()) &&
Chandler Carruth90358e12018-07-13 11:13:58 +00001356 !HardenedAddrRegs.count(BaseReg) &&
1357 !HardenedAddrRegs.count(IndexReg)) {
1358 HardenPostLoad.insert(&MI);
1359 HardenedAddrRegs.insert(MI.getOperand(0).getReg());
1360 continue;
1361 }
1362
1363 // Record this instruction for address hardening and record its register
1364 // operands as being address-hardened.
1365 HardenLoadAddr.insert(&MI);
1366 if (BaseReg)
1367 HardenedAddrRegs.insert(BaseReg);
1368 if (IndexReg)
1369 HardenedAddrRegs.insert(IndexReg);
1370
1371 for (MachineOperand &Def : MI.defs())
1372 if (Def.isReg())
1373 LoadDepRegs.set(Def.getReg());
1374 }
1375
1376 // Now re-walk the instructions in the basic block, and apply whichever
1377 // hardening strategy we have elected. Note that we do this in a second
1378 // pass specifically so that we have the complete set of instructions for
1379 // which we will do post-load hardening and can defer it in certain
1380 // circumstances.
1381 //
1382 // FIXME: This could probably be made even more effective by doing it
1383 // across the entire function. Rather than just walking the flat list
1384 // backwards here, we could walk the function in PO and each block bottom
1385     // up, allowing us in some cases to sink hardening across basic blocks. As
1386 // long as the in-block predicate state is used at the eventual hardening
1387 // site, this remains safe.
1388 for (MachineInstr &MI : MBB) {
1389 // We cannot both require hardening the def of a load and its address.
1390 assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
1391 "Requested to harden both the address and def of a load!");
1392
1393 // Check if this is a load whose address needs to be hardened.
1394 if (HardenLoadAddr.erase(&MI)) {
1395 const MCInstrDesc &Desc = MI.getDesc();
1396 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1397 assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
1398
1399 MemRefBeginIdx += X86II::getOperandBias(Desc);
1400
1401 MachineOperand &BaseMO =
1402 MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1403 MachineOperand &IndexMO =
1404 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001405 hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
Chandler Carruth90358e12018-07-13 11:13:58 +00001406 continue;
1407 }
1408
1409 // Test if this instruction is one of our post load instructions (and
1410 // remove it from the set if so).
1411 if (HardenPostLoad.erase(&MI)) {
1412 assert(!MI.isCall() && "Must not try to post-load harden a call!");
1413
1414         // If this is a data-invariant load, we want to try to sink any
1415 // hardening as far as possible.
1416 if (isDataInvariantLoad(MI)) {
1417 // Sink the instruction we'll need to harden as far as we can down the
1418 // graph.
1419 MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
1420
1421 // If we managed to sink this instruction, update everything so we
1422 // harden that instruction when we reach it in the instruction
1423 // sequence.
1424 if (SunkMI != &MI) {
1425 // If in sinking there was no instruction needing to be hardened,
1426 // we're done.
1427 if (!SunkMI)
1428 continue;
1429
1430 // Otherwise, add this to the set of defs we harden.
1431 HardenPostLoad.insert(SunkMI);
1432 continue;
1433 }
1434 }
1435
Chandler Carruth54529142018-07-24 12:44:00 +00001436 unsigned HardenedReg = hardenPostLoad(MI);
Chandler Carruth90358e12018-07-13 11:13:58 +00001437
Chandler Carruth54529142018-07-24 12:44:00 +00001438 // Mark the resulting hardened register as such so we don't re-harden.
1439 AddrRegToHardenedReg[HardenedReg] = HardenedReg;
1440
Chandler Carruth90358e12018-07-13 11:13:58 +00001441 continue;
1442 }
1443
1444 // After we finish processing the instruction and doing any hardening
1445 // necessary for it, we need to handle transferring the predicate state
1446 // into a call and recovering it after the call returns (if it returns).
1447 if (!MI.isCall())
1448 continue;
1449
1450 // If we're not hardening interprocedurally, we can just skip calls.
1451 if (!HardenInterprocedurally)
1452 continue;
1453
1454 auto InsertPt = MI.getIterator();
1455 DebugLoc Loc = MI.getDebugLoc();
1456
1457 // First, we transfer the predicate state into the called function by
1458 // merging it into the stack pointer. This will kill the current def of
1459 // the state.
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001460 unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
Chandler Carruth90358e12018-07-13 11:13:58 +00001461 mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
1462
1463 // If this call is also a return (because it is a tail call) we're done.
1464 if (MI.isReturn())
1465 continue;
1466
1467 // Otherwise we need to step past the call and recover the predicate
1468 // state from SP after the return, and make this new state available.
1469 ++InsertPt;
1470 unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001471 PS->SSA.AddAvailableValue(&MBB, NewStateReg);
Chandler Carruth90358e12018-07-13 11:13:58 +00001472 }
1473
1474 HardenPostLoad.clear();
1475 HardenLoadAddr.clear();
1476 HardenedAddrRegs.clear();
1477 AddrRegToHardenedReg.clear();
1478
1479 // Currently, we only track data-dependent loads within a basic block.
1480 // FIXME: We should see if this is necessary or if we could be more
1481 // aggressive here without opening up attack avenues.
1482 LoadDepRegs.clear();
1483 }
1484}
1485
1486/// Save EFLAGS into the returned GPR. This can in turn be restored with
1487/// `restoreEFLAGS`.
1488///
1489/// Note that LLVM can only lower very simple patterns of saved and restored
1490/// EFLAGS registers. The restore should always be within the same basic block
1491/// as the save so that no PHI nodes are inserted.
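///
/// A typical pairing within a single block looks roughly like this (an
/// illustrative sketch, not a verbatim excerpt of this pass):
///   unsigned FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
///   // ... emit EFLAGS-clobbering hardening instructions ...
///   restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);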
1492unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1493 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1494 DebugLoc Loc) {
1495 // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1496 // what instruction selection does.
1497 unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
1498 // We directly copy the FLAGS register and rely on later lowering to clean
1499 // this up into the appropriate setCC instructions.
1500 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
1501 ++NumInstsInserted;
1502 return Reg;
1503}
1504
1505/// Restore EFLAGS from the provided GPR. This should be produced by
1506/// `saveEFLAGS`.
1507///
1508/// This must be done within the same basic block as the save in order to
1509/// reliably lower.
1510void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1511 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1512 unsigned Reg) {
1513 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
1514 ++NumInstsInserted;
1515}
1516
1517/// Takes the current predicate state (in a register) and merges it into the
1518/// stack pointer. The state is essentially a single bit, but we merge this in
1519/// a way that won't form non-canonical pointers and also will be preserved
1520/// across normal stack adjustments.
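///
/// Roughly, the emitted sequence looks like this (an illustrative sketch):
///   shlq $47, %state    # move the all-ones/zero state into the high 17 bits
///   orq  %state, %rsp   # a zero state leaves RSP unchanged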
1521void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
1522 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1523 unsigned PredStateReg) {
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001524 unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
Chandler Carruth90358e12018-07-13 11:13:58 +00001525 // FIXME: This hard codes a shift distance based on the number of bits needed
1526 // to stay canonical on 64-bit. We should compute this somehow and support
1527 // 32-bit as part of that.
1528 auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
1529 .addReg(PredStateReg, RegState::Kill)
1530 .addImm(47);
1531 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1532 ++NumInstsInserted;
1533 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
1534 .addReg(X86::RSP)
1535 .addReg(TmpReg, RegState::Kill);
1536 OrI->addRegisterDead(X86::EFLAGS, TRI);
1537 ++NumInstsInserted;
1538}
1539
1540/// Extracts the predicate state stored in the high bits of the stack pointer.
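///
/// Roughly, the emitted sequence looks like this (an illustrative sketch):
///   movq %rsp, %tmp
///   sarq $63, %tmp      # smear the high bit across all bits of the state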
1541unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
1542 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1543 DebugLoc Loc) {
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001544 unsigned PredStateReg = MRI->createVirtualRegister(PS->RC);
1545 unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
Chandler Carruth90358e12018-07-13 11:13:58 +00001546
1547 // We know that the stack pointer will have any preserved predicate state in
1548 // its high bit. We just want to smear this across the other bits. Turns out,
1549 // this is exactly what an arithmetic right shift does.
1550 BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
1551 .addReg(X86::RSP);
1552 auto ShiftI =
1553 BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
1554 .addReg(TmpReg, RegState::Kill)
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001555 .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
Chandler Carruth90358e12018-07-13 11:13:58 +00001556 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1557 ++NumInstsInserted;
1558
1559 return PredStateReg;
1560}
1561
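/// Harden the base and index registers feeding a load's address by merging the
/// predicate state into them, so that under misspeculation (an all-ones state)
/// the attacker-influenced address is destroyed before the load executes. Any
/// register already recorded in `AddrRegToHardenedReg` is simply rewritten to
/// its hardened copy.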
1562void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
1563 MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
Chandler Carruth90358e12018-07-13 11:13:58 +00001564 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
1565 MachineBasicBlock &MBB = *MI.getParent();
1566 DebugLoc Loc = MI.getDebugLoc();
1567
1568   // Check if EFLAGS are alive by seeing if there is a def of them or they are
1569   // live-in, and then seeing if that def is in turn used.
1570 bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
1571
1572 SmallVector<MachineOperand *, 2> HardenOpRegs;
1573
1574 if (BaseMO.isFI()) {
1575 // A frame index is never a dynamically controllable load, so only
1576 // harden it if we're covering fixed address loads as well.
1577 LLVM_DEBUG(
1578 dbgs() << " Skipping hardening base of explicit stack frame load: ";
1579 MI.dump(); dbgs() << "\n");
1580 } else if (BaseMO.getReg() == X86::RIP ||
1581 BaseMO.getReg() == X86::NoRegister) {
1582 // For both RIP-relative addressed loads or absolute loads, we cannot
1583 // meaningfully harden them because the address being loaded has no
1584 // dynamic component.
1585 //
1586 // FIXME: When using a segment base (like TLS does) we end up with the
1587 // dynamic address being the base plus -1 because we can't mutate the
1588 // segment register here. This allows the signed 32-bit offset to point at
1589 // valid segment-relative addresses and load them successfully.
1590 LLVM_DEBUG(
1591 dbgs() << " Cannot harden base of "
1592 << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
1593 << " address in a load!");
1594 } else {
1595 assert(BaseMO.isReg() &&
1596 "Only allowed to have a frame index or register base.");
1597 HardenOpRegs.push_back(&BaseMO);
1598 }
1599
1600 if (IndexMO.getReg() != X86::NoRegister &&
1601 (HardenOpRegs.empty() ||
1602 HardenOpRegs.front()->getReg() != IndexMO.getReg()))
1603 HardenOpRegs.push_back(&IndexMO);
1604
1605 assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
1606 "Should have exactly one or two registers to harden!");
1607 assert((HardenOpRegs.size() == 1 ||
1608 HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
1609 "Should not have two of the same registers!");
1610
1611   // Remove any registers that have already been checked.
1612 llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
1613 // See if this operand's register has already been checked.
1614 auto It = AddrRegToHardenedReg.find(Op->getReg());
1615 if (It == AddrRegToHardenedReg.end())
1616 // Not checked, so retain this one.
1617 return false;
1618
1619 // Otherwise, we can directly update this operand and remove it.
1620 Op->setReg(It->second);
1621 return true;
1622 });
1623 // If there are none left, we're done.
1624 if (HardenOpRegs.empty())
1625 return;
1626
1627 // Compute the current predicate state.
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001628 unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
Chandler Carruth90358e12018-07-13 11:13:58 +00001629
1630 auto InsertPt = MI.getIterator();
1631
1632 // If EFLAGS are live and we don't have access to instructions that avoid
1633 // clobbering EFLAGS we need to save and restore them. This in turn makes
1634 // the EFLAGS no longer live.
1635 unsigned FlagsReg = 0;
1636 if (EFLAGSLive && !Subtarget->hasBMI2()) {
1637 EFLAGSLive = false;
1638 FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1639 }
1640
1641 for (MachineOperand *Op : HardenOpRegs) {
Chandler Carruth90358e12018-07-13 11:13:58 +00001642 unsigned OpReg = Op->getReg();
Chandler Carruthcdf0add2018-07-16 04:17:51 +00001643 auto *OpRC = MRI->getRegClass(OpReg);
Chandler Carruth90358e12018-07-13 11:13:58 +00001644 unsigned TmpReg = MRI->createVirtualRegister(OpRC);
1645
Chandler Carruthcdf0add2018-07-16 04:17:51 +00001646 // If this is a vector register, we'll need somewhat custom logic to handle
1647 // hardening it.
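    // On the AVX2 (no VLX) path below, the emitted sequence is roughly
    // (illustrative register names):
    //   vmovq        %state, %xmm_s
    //   vpbroadcastq %xmm_s, %ymm_s     # or the 128-bit form
    //   vpor         %ymm_s, %op, %tmp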
1648 if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
1649 OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
1650 assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
1651 bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);
1652
1653 // Move our state into a vector register.
1654 // FIXME: We could skip this at the cost of longer encodings with AVX-512
1655       // but that doesn't seem likely to be worth it.
1656 unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
1657 auto MovI =
1658 BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
1659 .addReg(StateReg);
1660 (void)MovI;
1661 ++NumInstsInserted;
1662 LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
1663
1664 // Broadcast it across the vector register.
1665 unsigned VBStateReg = MRI->createVirtualRegister(OpRC);
1666 auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
1667 TII->get(Is128Bit ? X86::VPBROADCASTQrr
1668 : X86::VPBROADCASTQYrr),
1669 VBStateReg)
1670 .addReg(VStateReg);
1671 (void)BroadcastI;
1672 ++NumInstsInserted;
1673 LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
1674 dbgs() << "\n");
1675
1676 // Merge our potential poison state into the value with a vector or.
1677 auto OrI =
1678 BuildMI(MBB, InsertPt, Loc,
1679 TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
1680 .addReg(VBStateReg)
1681 .addReg(OpReg);
1682 (void)OrI;
1683 ++NumInstsInserted;
1684 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1685 } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
1686 OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
1687 OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
1688 assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
1689 bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
1690 bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
1691 if (Is128Bit || Is256Bit)
1692 assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
1693
1694 // Broadcast our state into a vector register.
1695 unsigned VStateReg = MRI->createVirtualRegister(OpRC);
1696 unsigned BroadcastOp =
1697 Is128Bit ? X86::VPBROADCASTQrZ128r
1698 : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
1699 auto BroadcastI =
1700 BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
1701 .addReg(StateReg);
1702 (void)BroadcastI;
1703 ++NumInstsInserted;
1704 LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
1705 dbgs() << "\n");
1706
1707 // Merge our potential poison state into the value with a vector or.
1708 unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
1709 : Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
1710 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
1711 .addReg(VStateReg)
Chandler Carruth90358e12018-07-13 11:13:58 +00001712 .addReg(OpReg);
Chandler Carruthbc46bca2018-07-16 04:42:27 +00001713 (void)OrI;
Chandler Carruth90358e12018-07-13 11:13:58 +00001714 ++NumInstsInserted;
1715 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1716 } else {
Chandler Carruthcdf0add2018-07-16 04:17:51 +00001717 // FIXME: Need to support GR32 here for 32-bit code.
1718 assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
1719 "Not a supported register class for address hardening!");
1720
1721 if (!EFLAGSLive) {
1722 // Merge our potential poison state into the value with an or.
1723 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
1724 .addReg(StateReg)
1725 .addReg(OpReg);
1726 OrI->addRegisterDead(X86::EFLAGS, TRI);
1727 ++NumInstsInserted;
1728 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1729 } else {
1730         // We need to avoid touching EFLAGS, so use SHRX (no flags update) to
1731         // shift by the state; an all-ones poison shifts out all but the high bit.
1732 auto ShiftI =
1733 BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
1734 .addReg(OpReg)
1735 .addReg(StateReg);
1736 (void)ShiftI;
1737 ++NumInstsInserted;
1738 LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
1739 dbgs() << "\n");
1740 }
Chandler Carruth90358e12018-07-13 11:13:58 +00001741 }
1742
1743 // Record this register as checked and update the operand.
1744 assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
1745 "Should not have checked this register yet!");
1746 AddrRegToHardenedReg[Op->getReg()] = TmpReg;
1747 Op->setReg(TmpReg);
1748 ++NumAddrRegsHardened;
1749 }
1750
1751 // And restore the flags if needed.
1752 if (FlagsReg)
1753 restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
1754}
1755
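/// Starting from a data-invariant load, follow single-use chains of
/// data-invariant instructions within the block to find the latest point at
/// which hardening is actually needed. Returns the instruction to harden
/// instead (possibly the original), or null if no hardening turns out to be
/// needed at all.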
1756MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001757 MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
Chandler Carruth90358e12018-07-13 11:13:58 +00001758 assert(isDataInvariantLoad(InitialMI) &&
1759 "Cannot get here with a non-invariant load!");
1760
1761 // See if we can sink hardening the loaded value.
1762 auto SinkCheckToSingleUse =
1763 [&](MachineInstr &MI) -> Optional<MachineInstr *> {
1764 unsigned DefReg = MI.getOperand(0).getReg();
1765
1766     // We need to find a single use to which we can sink the check. We can
1767 // primarily do this because many uses may already end up checked on their
1768 // own.
1769 MachineInstr *SingleUseMI = nullptr;
1770 for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
1771 // If we're already going to harden this use, it is data invariant and
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001772 // within our block.
1773 if (HardenedInstrs.count(&UseMI)) {
1774 if (!isDataInvariantLoad(UseMI)) {
1775 // If we've already decided to harden a non-load, we must have sunk
1776 // some other post-load hardened instruction to it and it must itself
1777 // be data-invariant.
1778 assert(isDataInvariant(UseMI) &&
1779 "Data variant instruction being hardened!");
1780 continue;
1781 }
1782
1783 // Otherwise, this is a load and the load component can't be data
1784 // invariant so check how this register is being used.
Chandler Carruth90358e12018-07-13 11:13:58 +00001785 const MCInstrDesc &Desc = UseMI.getDesc();
1786 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1787 assert(MemRefBeginIdx >= 0 &&
1788 "Should always have mem references here!");
1789 MemRefBeginIdx += X86II::getOperandBias(Desc);
1790
1791 MachineOperand &BaseMO =
1792 UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1793 MachineOperand &IndexMO =
1794 UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1795 if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
1796 (IndexMO.isReg() && IndexMO.getReg() == DefReg))
1797 // The load uses the register as part of its address making it not
1798 // invariant.
1799 return {};
1800
1801 continue;
1802 }
1803
1804 if (SingleUseMI)
1805 // We already have a single use, this would make two. Bail.
1806 return {};
1807
1808 // If this single use isn't data invariant, isn't in this block, or has
1809 // interfering EFLAGS, we can't sink the hardening to it.
1810 if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
1811 return {};
1812
1813 // If this instruction defines multiple registers bail as we won't harden
1814 // all of them.
1815 if (UseMI.getDesc().getNumDefs() > 1)
1816 return {};
1817
1818     // If this register isn't a virtual register we can't sanely walk its uses,
1819 // just bail. Also check that its register class is one of the ones we
1820 // can harden.
1821 unsigned UseDefReg = UseMI.getOperand(0).getReg();
1822 if (!TRI->isVirtualRegister(UseDefReg) ||
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001823 !canHardenRegister(UseDefReg))
Chandler Carruth90358e12018-07-13 11:13:58 +00001824 return {};
1825
1826 SingleUseMI = &UseMI;
1827 }
1828
1829 // If SingleUseMI is still null, there is no use that needs its own
1830 // checking. Otherwise, it is the single use that needs checking.
1831 return {SingleUseMI};
1832 };
1833
1834 MachineInstr *MI = &InitialMI;
1835 while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
1836 // Update which MI we're checking now.
1837 MI = *SingleUse;
1838 if (!MI)
1839 break;
1840 }
1841
1842 return MI;
1843}
1844
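/// Returns true if `Reg` can be hardened by the generic GPR logic: at most
/// 64 bits wide, in (a subclass of) one of the GR8/GR16/GR32/GR64 classes, and
/// not constrained to the corresponding NOREX-only class.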
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001845bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
1846 auto *RC = MRI->getRegClass(Reg);
1847 int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
1848 if (RegBytes > 8)
1849 // We don't support post-load hardening of vectors.
Chandler Carruthe66a6f42018-07-16 11:38:48 +00001850 return false;
1851
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001852 // If this register class is explicitly constrained to a class that doesn't
1853   // require a REX prefix, we may not be able to satisfy that constraint when
1854 // emitting the hardening instructions, so bail out here.
1855 // FIXME: This seems like a pretty lame hack. The way this comes up is when we
1856 // end up both with a NOREX and REX-only register as operands to the hardening
1857 // instructions. It would be better to fix that code to handle this situation
1858 // rather than hack around it in this way.
1859 const TargetRegisterClass *NOREXRegClasses[] = {
1860 &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
1861 &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
1862 if (RC == NOREXRegClasses[Log2_32(RegBytes)])
Chandler Carruthe66a6f42018-07-16 11:38:48 +00001863 return false;
1864
1865 const TargetRegisterClass *GPRRegClasses[] = {
1866 &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
1867 &X86::GR64RegClass};
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001868 return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
Chandler Carruthe66a6f42018-07-16 11:38:48 +00001869}
1870
Chandler Carruth54529142018-07-24 12:44:00 +00001871/// Harden a value in a register.
Chandler Carruth376113d2018-07-24 12:19:01 +00001872///
Chandler Carruth54529142018-07-24 12:44:00 +00001873/// This is the low-level logic to fully harden a value sitting in a register
1874/// against leaking during speculative execution.
1875///
1876/// Unlike hardening an address that is used by a load, this routine is required
1877/// to hide *all* incoming bits in the register.
1878///
1879/// `Reg` must be a virtual register. Currently, it is required to be a GPR no
1880/// larger than the predicate state register. FIXME: We should support vector
1881/// registers here by broadcasting the predicate state.
1882///
1883/// The new, hardened virtual register is returned. It will have the same
1884/// register class as `Reg`.
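///
/// For a 32-bit `Reg`, for example, the emitted pattern is roughly
/// (an illustrative sketch):
///   %narrow   = COPY %state.sub_32bit
///   %hardened = OR32rr %narrow, %reg
/// with EFLAGS saved and restored around the `or` if they are live.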
1885unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
1886 unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1887 DebugLoc Loc) {
1888 assert(canHardenRegister(Reg) && "Cannot harden this register!");
1889 assert(TRI->isVirtualRegister(Reg) && "Cannot harden a physical register!");
Chandler Carruth90358e12018-07-13 11:13:58 +00001890
Chandler Carruth54529142018-07-24 12:44:00 +00001891 auto *RC = MRI->getRegClass(Reg);
1892 int Bytes = TRI->getRegSizeInBits(*RC) / 8;
Chandler Carruth90358e12018-07-13 11:13:58 +00001893
Chandler Carruth66fbbbc2018-07-24 00:35:36 +00001894 unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
Chandler Carruth90358e12018-07-13 11:13:58 +00001895
Chandler Carruth66fbbbc2018-07-24 00:35:36 +00001896 // FIXME: Need to teach this about 32-bit mode.
Chandler Carruth54529142018-07-24 12:44:00 +00001897 if (Bytes != 8) {
Chandler Carruth66fbbbc2018-07-24 00:35:36 +00001898 unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
Chandler Carruth54529142018-07-24 12:44:00 +00001899 unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
1900 unsigned NarrowStateReg = MRI->createVirtualRegister(RC);
1901 BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
Chandler Carruth66fbbbc2018-07-24 00:35:36 +00001902 .addReg(StateReg, 0, SubRegImm);
1903 StateReg = NarrowStateReg;
1904 }
Chandler Carruth90358e12018-07-13 11:13:58 +00001905
Chandler Carruth90358e12018-07-13 11:13:58 +00001906 unsigned FlagsReg = 0;
Chandler Carruthb46c22d2018-07-24 00:21:59 +00001907 if (isEFLAGSLive(MBB, InsertPt, *TRI))
Chandler Carruth90358e12018-07-13 11:13:58 +00001908 FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
Chandler Carruth90358e12018-07-13 11:13:58 +00001909
Chandler Carruth54529142018-07-24 12:44:00 +00001910 unsigned NewReg = MRI->createVirtualRegister(RC);
Chandler Carruth66fbbbc2018-07-24 00:35:36 +00001911 unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
Chandler Carruth54529142018-07-24 12:44:00 +00001912 unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
1913 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
Chandler Carruthb46c22d2018-07-24 00:21:59 +00001914 .addReg(StateReg)
Chandler Carruth54529142018-07-24 12:44:00 +00001915 .addReg(Reg);
Chandler Carruthb46c22d2018-07-24 00:21:59 +00001916 OrI->addRegisterDead(X86::EFLAGS, TRI);
1917 ++NumInstsInserted;
1918 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
Chandler Carruth90358e12018-07-13 11:13:58 +00001919
1920 if (FlagsReg)
1921 restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
1922
Chandler Carruth54529142018-07-24 12:44:00 +00001923 return NewReg;
1924}
1925
1926/// Harden a load by hardening the loaded value in the defined register.
1927///
1928/// We can harden a non-leaking load into a register without touching the
1929/// address by just hiding all of the loaded bits during misspeculation. We use
1930/// an `or` instruction to do this because we set up our poison value as all
1931 /// ones. The goal is just to keep the loaded bits from being exposed to
1932 /// speculative execution, and coercing them all to ones is sufficient.
1933///
1934/// Returns the newly hardened register.
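///
/// Conceptually the rewrite looks like this (an illustrative sketch):
///   %unhardened = MOV64rm ...                  ; original load, fresh def reg
///   %hardened   = OR64rr %state, %unhardened   ; inserted hardening
///   ; all former uses of the load's def are rewritten to read %hardened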
1935unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
1936 MachineBasicBlock &MBB = *MI.getParent();
1937 DebugLoc Loc = MI.getDebugLoc();
1938
1939 auto &DefOp = MI.getOperand(0);
1940 unsigned OldDefReg = DefOp.getReg();
1941 auto *DefRC = MRI->getRegClass(OldDefReg);
1942
1943 // Because we want to completely replace the uses of this def'ed value with
1944 // the hardened value, create a dedicated new register that will only be used
1945 // to communicate the unhardened value to the hardening.
1946 unsigned UnhardenedReg = MRI->createVirtualRegister(DefRC);
1947 DefOp.setReg(UnhardenedReg);
1948
1949 // Now harden this register's value, getting a hardened reg that is safe to
1950 // use. Note that we insert the instructions to compute this *after* the
1951 // defining instruction, not before it.
1952 unsigned HardenedReg = hardenValueInRegister(
1953 UnhardenedReg, MBB, std::next(MI.getIterator()), Loc);
1954
1955 // Finally, replace the old register (which now only has the uses of the
1956 // original def) with the hardened register.
1957 MRI->replaceRegWith(/*FromReg*/ OldDefReg, /*ToReg*/ HardenedReg);
1958
Chandler Carruth90358e12018-07-13 11:13:58 +00001959 ++NumPostLoadRegsHardened;
Chandler Carruth54529142018-07-24 12:44:00 +00001960 return HardenedReg;
Chandler Carruth90358e12018-07-13 11:13:58 +00001961}
1962
Chandler Carrutha3a03ac2018-07-19 23:46:24 +00001963/// Harden a return instruction.
1964///
1965/// Returns implicitly perform a load which we need to harden. Without hardening
1966 /// this load, an attacker may speculatively write over the return address to
1967 /// steer speculation of the return to an attacker-controlled address. This is
1968/// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in
1969/// this paper:
1970/// https://people.csail.mit.edu/vlk/spectre11.pdf
1971///
1972/// We can harden this by introducing an LFENCE that will delay any load of the
1973/// return address until prior instructions have retired (and thus are not being
1974/// speculated), or we can harden the address used by the implicit load: the
1975/// stack pointer.
1976///
1977/// If we are not using an LFENCE, hardening the stack pointer has an additional
1978/// benefit: it allows us to pass the predicate state accumulated in this
1979/// function back to the caller. In the absence of a BCBS attack on the return,
1980/// the caller will typically be resumed and speculatively executed due to the
1981/// Return Stack Buffer (RSB) prediction which is very accurate and has a high
1982/// priority. It is possible that some code from the caller will be executed
1983/// speculatively even during a BCBS-attacked return until the steering takes
1984/// effect. Whenever this happens, the caller can recover the (poisoned)
1985/// predicate state from the stack pointer and continue to harden loads.
1986void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
Chandler Carruth90358e12018-07-13 11:13:58 +00001987 MachineBasicBlock &MBB = *MI.getParent();
1988 DebugLoc Loc = MI.getDebugLoc();
1989 auto InsertPt = MI.getIterator();
1990
1991 if (FenceCallAndRet) {
1992 // Simply forcibly block speculation of loads out of the function by using
1993 // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
1994 // should be secure, is simple from an ABI perspective, and the cost can be
1995 // minimized through inlining.
1996 //
1997 // FIXME: We should investigate ways to establish a strong data-dependency
1998 // on the return. However, poisoning the stack pointer is unlikely to work
1999 // because the return is *predicted* rather than relying on the load of the
2000 // return address to actually resolve.
2001 BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
2002 ++NumInstsInserted;
2003 ++NumLFENCEsInserted;
2004 return;
2005 }
2006
2007 // Take our predicate state, shift it to the high 17 bits (so that we keep
2008 // pointers canonical) and merge it into RSP. This will allow the caller to
2009 // extract it when we return (speculatively).
Chandler Carruth4b0028a2018-07-19 11:13:58 +00002010 mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
Chandler Carruth90358e12018-07-13 11:13:58 +00002011}
2012
2013INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
2014 "X86 speculative load hardener", false, false)
2015INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
2016 "X86 speculative load hardener", false, false)
2017
2018FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
2019 return new X86SpeculativeLoadHardeningPass();
2020}