1//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9/// \file
10///
11/// Provide a pass which mitigates speculative execution attacks which operate
12/// by speculating incorrectly past some predicate (a type check, bounds check,
13/// or other condition) to reach a load with invalid inputs and leak the data
14/// accessed by that load using a side channel out of the speculative domain.
15///
16/// For details on the attacks, see the first variant in both the Project Zero
17/// writeup and the Spectre paper:
18/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
19/// https://spectreattack.com/spectre.pdf
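///
/// In brief, the implementation below maintains a 64-bit "predicate state"
/// value that is all-zeros along correctly predicted paths and all-ones once a
/// conditional branch has been mispredicted. Conditional edges are
/// instrumented with CMOV instructions that poison this state under the
/// condition indicating misprediction, and the state is then used to harden
/// load addresses and/or loaded values so that misspeculated loads cannot leak
/// data. When hardening interprocedurally, the state travels in the high bits
/// of the stack pointer across calls and returns.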
20///
21//===----------------------------------------------------------------------===//
22
23#include "X86.h"
24#include "X86InstrBuilder.h"
25#include "X86InstrInfo.h"
26#include "X86Subtarget.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/Optional.h"
30#include "llvm/ADT/STLExtras.h"
31#include "llvm/ADT/ScopeExit.h"
32#include "llvm/ADT/SmallPtrSet.h"
33#include "llvm/ADT/SmallSet.h"
34#include "llvm/ADT/SmallVector.h"
35#include "llvm/ADT/SparseBitVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/CodeGen/MachineBasicBlock.h"
38#include "llvm/CodeGen/MachineConstantPool.h"
39#include "llvm/CodeGen/MachineFunction.h"
40#include "llvm/CodeGen/MachineFunctionPass.h"
41#include "llvm/CodeGen/MachineInstr.h"
42#include "llvm/CodeGen/MachineInstrBuilder.h"
43#include "llvm/CodeGen/MachineModuleInfo.h"
44#include "llvm/CodeGen/MachineOperand.h"
45#include "llvm/CodeGen/MachineRegisterInfo.h"
46#include "llvm/CodeGen/MachineSSAUpdater.h"
47#include "llvm/CodeGen/TargetInstrInfo.h"
48#include "llvm/CodeGen/TargetRegisterInfo.h"
49#include "llvm/CodeGen/TargetSchedule.h"
50#include "llvm/CodeGen/TargetSubtargetInfo.h"
51#include "llvm/IR/DebugLoc.h"
52#include "llvm/MC/MCSchedule.h"
53#include "llvm/Pass.h"
54#include "llvm/Support/CommandLine.h"
55#include "llvm/Support/Debug.h"
56#include "llvm/Support/raw_ostream.h"
57#include <algorithm>
58#include <cassert>
59#include <iterator>
60#include <utility>
61
62using namespace llvm;
63
64#define PASS_KEY "x86-speculative-load-hardening"
65#define DEBUG_TYPE PASS_KEY
66
67STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
68STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
69STATISTIC(NumAddrRegsHardened,
70          "Number of address mode used registers hardened");
71STATISTIC(NumPostLoadRegsHardened,
72 "Number of post-load register values hardened");
73STATISTIC(NumInstsInserted, "Number of instructions inserted");
74STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
75
76static cl::opt<bool> HardenEdgesWithLFENCE(
77 PASS_KEY "-lfence",
78 cl::desc(
79 "Use LFENCE along each conditional edge to harden against speculative "
80 "loads rather than conditional movs and poisoned pointers."),
81 cl::init(false), cl::Hidden);
82
83static cl::opt<bool> EnablePostLoadHardening(
84 PASS_KEY "-post-load",
85 cl::desc("Harden the value loaded *after* it is loaded by "
86 "flushing the loaded bits to 1. This is hard to do "
87 "in general but can be done easily for GPRs."),
88 cl::init(true), cl::Hidden);
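// (Conceptually, because the poison state is all-ones, OR-ing it into a loaded
// GPR value leaves the value unchanged on the architectural path and forces it
// to all-ones under misspeculation; hardenPostLoad below is expected to lower
// to roughly that form.)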
89
90static cl::opt<bool> FenceCallAndRet(
91 PASS_KEY "-fence-call-and-ret",
92 cl::desc("Use a full speculation fence to harden both call and ret edges "
93 "rather than a lighter weight mitigation."),
94 cl::init(false), cl::Hidden);
95
96static cl::opt<bool> HardenInterprocedurally(
97 PASS_KEY "-ip",
98 cl::desc("Harden interprocedurally by passing our state in and out of "
99 "functions in the high bits of the stack pointer."),
100 cl::init(true), cl::Hidden);
101
102static cl::opt<bool>
103 HardenLoads(PASS_KEY "-loads",
104                cl::desc("Sanitize loads from memory. When disabled, no "
105 "significant security is provided."),
106 cl::init(true), cl::Hidden);
107
108namespace llvm {
109
110void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
111
112} // end namespace llvm
113
114namespace {
115
116class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
117public:
118 X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
119 initializeX86SpeculativeLoadHardeningPassPass(
120 *PassRegistry::getPassRegistry());
121 }
122
123 StringRef getPassName() const override {
124 return "X86 speculative load hardening";
125 }
126 bool runOnMachineFunction(MachineFunction &MF) override;
127 void getAnalysisUsage(AnalysisUsage &AU) const override;
128
129 /// Pass identification, replacement for typeid.
130 static char ID;
131
132private:
133 /// The information about a block's conditional terminators needed to trace
134 /// our predicate state through the exiting edges.
135 struct BlockCondInfo {
136 MachineBasicBlock *MBB;
137
138 // We mostly have one conditional branch, and in extremely rare cases have
139 // two. Three and more are so rare as to be unimportant for compile time.
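    // For example, a block ending in:
    //   JNE %bb.1
    //   JA  %bb.2
    //   JMP %bb.3
    // is summarized with CondBrs = {JNE, JA} and UncondBr pointing at the
    // trailing JMP.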
140 SmallVector<MachineInstr *, 2> CondBrs;
141
142 MachineInstr *UncondBr;
143 };
144
145  /// Manages the predicate state traced through the program.
146 struct PredState {
147 unsigned InitialReg;
148 unsigned PoisonReg;
149
150 const TargetRegisterClass *RC;
151 MachineSSAUpdater SSA;
152
153 PredState(MachineFunction &MF, const TargetRegisterClass *RC)
154 : RC(RC), SSA(MF) {}
155 };
156
157  const X86Subtarget *Subtarget;
158 MachineRegisterInfo *MRI;
159 const X86InstrInfo *TII;
160 const TargetRegisterInfo *TRI;
161
162  Optional<PredState> PS;
163
164 void hardenEdgesWithLFENCE(MachineFunction &MF);
165
166 SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
167
168  SmallVector<MachineInstr *, 16>
169 tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
170
171  void hardenAllLoads(MachineFunction &MF);
172
173 unsigned saveEFLAGS(MachineBasicBlock &MBB,
174 MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
175 void restoreEFLAGS(MachineBasicBlock &MBB,
176 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
177 unsigned OFReg);
178
179 void mergePredStateIntoSP(MachineBasicBlock &MBB,
180 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
181 unsigned PredStateReg);
182 unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
183 MachineBasicBlock::iterator InsertPt,
184 DebugLoc Loc);
185
186 void
187 hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
188                 MachineOperand &IndexMO,
189                 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
190 MachineInstr *
191 sinkPostLoadHardenedInst(MachineInstr &MI,
192                           SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
193 bool canHardenRegister(unsigned Reg);
194  void hardenPostLoad(MachineInstr &MI);
195  void hardenReturnInstr(MachineInstr &MI);
196};
197
198} // end anonymous namespace
199
200char X86SpeculativeLoadHardeningPass::ID = 0;
201
202void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
203 AnalysisUsage &AU) const {
204 MachineFunctionPass::getAnalysisUsage(AU);
205}
206
207static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
208 MachineBasicBlock &Succ, int SuccCount,
209 MachineInstr *Br, MachineInstr *&UncondBr,
210 const X86InstrInfo &TII) {
211 assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
212
213 MachineFunction &MF = *MBB.getParent();
214
215 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
216
217 // We have to insert the new block immediately after the current one as we
218 // don't know what layout-successor relationships the successor has and we
219 // may not be able to (and generally don't want to) try to fix those up.
220 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
221
222 // Update the branch instruction if necessary.
223 if (Br) {
224 assert(Br->getOperand(0).getMBB() == &Succ &&
225 "Didn't start with the right target!");
226 Br->getOperand(0).setMBB(&NewMBB);
227
228 // If this successor was reached through a branch rather than fallthrough,
229 // we might have *broken* fallthrough and so need to inject a new
230 // unconditional branch.
231 if (!UncondBr) {
232 MachineBasicBlock &OldLayoutSucc =
233 *std::next(MachineFunction::iterator(&NewMBB));
234 assert(MBB.isSuccessor(&OldLayoutSucc) &&
235 "Without an unconditional branch, the old layout successor should "
236 "be an actual successor!");
237 auto BrBuilder =
238 BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
239 // Update the unconditional branch now that we've added one.
240 UncondBr = &*BrBuilder;
241 }
242
243 // Insert unconditional "jump Succ" instruction in the new block if
244 // necessary.
245 if (!NewMBB.isLayoutSuccessor(&Succ)) {
246 SmallVector<MachineOperand, 4> Cond;
247 TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
248 }
249 } else {
250 assert(!UncondBr &&
251 "Cannot have a branchless successor and an unconditional branch!");
252 assert(NewMBB.isLayoutSuccessor(&Succ) &&
253 "A non-branch successor must have been a layout successor before "
254 "and now is a layout successor of the new block.");
255 }
256
257 // If this is the only edge to the successor, we can just replace it in the
258 // CFG. Otherwise we need to add a new entry in the CFG for the new
259 // successor.
260 if (SuccCount == 1) {
261 MBB.replaceSuccessor(&Succ, &NewMBB);
262 } else {
263 MBB.splitSuccessor(&Succ, &NewMBB);
264 }
265
266 // Hook up the edge from the new basic block to the old successor in the CFG.
267 NewMBB.addSuccessor(&Succ);
268
269 // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
270 for (MachineInstr &MI : Succ) {
271 if (!MI.isPHI())
272 break;
273 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
274 OpIdx += 2) {
275 MachineOperand &OpV = MI.getOperand(OpIdx);
276 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
277 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
278 if (OpMBB.getMBB() != &MBB)
279 continue;
280
281       // If this is the last edge to the successor, just replace MBB in the PHI.
282 if (SuccCount == 1) {
283 OpMBB.setMBB(&NewMBB);
284 break;
285 }
286
287 // Otherwise, append a new pair of operands for the new incoming edge.
288 MI.addOperand(MF, OpV);
289 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
290 break;
291 }
292 }
293
294 // Inherit live-ins from the successor
295 for (auto &LI : Succ.liveins())
296 NewMBB.addLiveIn(LI);
297
298 LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
299 << Succ.getName() << "'.\n");
300 return NewMBB;
301}
302
303/// Remove duplicate PHI operands to leave the PHI in a canonical and
304/// predictable form.
305///
306/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
307/// isn't what you might expect. We may have multiple entries in PHI nodes for
308/// a single predecessor. This makes CFG-updating extremely complex, so here we
309/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
310/// one entry per predecessor, regardless of how many edges there are.
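///
/// For example, a PHI such as:
///   %x = PHI [ %a, %bb.1 ], [ %a, %bb.1 ], [ %b, %bb.2 ]
/// is rewritten to:
///   %x = PHI [ %a, %bb.1 ], [ %b, %bb.2 ]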
311static void canonicalizePHIOperands(MachineFunction &MF) {
312 SmallPtrSet<MachineBasicBlock *, 4> Preds;
313 SmallVector<int, 4> DupIndices;
314 for (auto &MBB : MF)
315 for (auto &MI : MBB) {
316 if (!MI.isPHI())
317 break;
318
319 // First we scan the operands of the PHI looking for duplicate entries
320       // for a particular predecessor. We retain the operand index of each duplicate
321 // entry found.
322 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
323 OpIdx += 2)
324 if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
325 DupIndices.push_back(OpIdx);
326
327 // Now walk the duplicate indices, removing both the block and value. Note
328       // that these are stored as a vector, making this element-wise removal
330 // potentially quadratic.
331 //
332 // FIXME: It is really frustrating that we have to use a quadratic
333 // removal algorithm here. There should be a better way, but the use-def
334 // updates required make that impossible using the public API.
335 //
336 // Note that we have to process these backwards so that we don't
337 // invalidate other indices with each removal.
338 while (!DupIndices.empty()) {
339 int OpIdx = DupIndices.pop_back_val();
340 // Remove both the block and value operand, again in reverse order to
341 // preserve indices.
342 MI.RemoveOperand(OpIdx + 1);
343 MI.RemoveOperand(OpIdx);
344 }
345
346 Preds.clear();
347 }
348}
349
350/// Helper to scan a function for loads vulnerable to misspeculation that we
351/// want to harden.
352///
353/// We use this to avoid making changes to functions where there is nothing we
354/// need to do to harden against misspeculation.
355static bool hasVulnerableLoad(MachineFunction &MF) {
356 for (MachineBasicBlock &MBB : MF) {
357 for (MachineInstr &MI : MBB) {
358 // Loads within this basic block after an LFENCE are not at risk of
359 // speculatively executing with invalid predicates from prior control
360 // flow. So break out of this block but continue scanning the function.
361 if (MI.getOpcode() == X86::LFENCE)
362 break;
363
364 // Looking for loads only.
365 if (!MI.mayLoad())
366 continue;
367
368 // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
369 if (MI.getOpcode() == X86::MFENCE)
370 continue;
371
372 // We found a load.
373 return true;
374 }
375 }
376
377 // No loads found.
378 return false;
379}
380
381bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
382 MachineFunction &MF) {
383 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
384 << " **********\n");
385
386 Subtarget = &MF.getSubtarget<X86Subtarget>();
387 MRI = &MF.getRegInfo();
388 TII = Subtarget->getInstrInfo();
389 TRI = Subtarget->getRegisterInfo();
390
391  // FIXME: Support for 32-bit.
392  PS.emplace(MF, &X86::GR64_NOSPRegClass);
393
394 if (MF.begin() == MF.end())
395 // Nothing to do for a degenerate empty function...
396 return false;
397
398 // We support an alternative hardening technique based on a debug flag.
399 if (HardenEdgesWithLFENCE) {
400 hardenEdgesWithLFENCE(MF);
401 return true;
402 }
403
404 // Create a dummy debug loc to use for all the generated code here.
405 DebugLoc Loc;
406
407 MachineBasicBlock &Entry = *MF.begin();
408 auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
409
410 // Do a quick scan to see if we have any checkable loads.
411  bool HasVulnerableLoad = hasVulnerableLoad(MF);
412
413 // See if we have any conditional branching blocks that we will need to trace
414 // predicate state through.
415 SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
416
417 // If we have no interesting conditions or loads, nothing to do here.
418  if (!HasVulnerableLoad && Infos.empty())
419    return true;
420
421  // The poison value is required to be an all-ones value for many aspects of
422 // this mitigation.
423 const int PoisonVal = -1;
424  PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
425  BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
426      .addImm(PoisonVal);
427 ++NumInstsInserted;
428
429 // If we have loads being hardened and we've asked for call and ret edges to
430 // get a full fence-based mitigation, inject that fence.
431  if (HasVulnerableLoad && FenceCallAndRet) {
432    // We need to insert an LFENCE at the start of the function to suspend any
433 // incoming misspeculation from the caller. This helps two-fold: the caller
434 // may not have been protected as this code has been, and this code gets to
435 // not take any specific action to protect across calls.
436 // FIXME: We could skip this for functions which unconditionally return
437 // a constant.
438 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
439 ++NumInstsInserted;
440 ++NumLFENCEsInserted;
441 }
442
443  // If we guarded the entry with an LFENCE and have no conditionals to protect
444 // in blocks, then we're done.
445 if (FenceCallAndRet && Infos.empty())
446    // We may have changed the function's code at this point to insert fences.
447 return true;
448
449  // Establish the initial predicate state in the entry block.
450 if (HardenInterprocedurally && !FenceCallAndRet) {
451 // Set up the predicate state by extracting it from the incoming stack
452 // pointer so we pick up any misspeculation in our caller.
453    PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
454  } else {
455 // Otherwise, just build the predicate state itself by zeroing a register
456 // as we don't need any initial state.
457    PS->InitialReg = MRI->createVirtualRegister(PS->RC);
458    unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
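    // Zeroing the 32-bit subregister and widening it with SUBREG_TO_REG below
    // produces an all-zero 64-bit predicate state: a 32-bit zeroing write
    // implicitly clears the upper 32 bits on x86-64, and MOV32r0's implicit
    // EFLAGS def is marked dead since nothing reads it.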
459 auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
460 PredStateSubReg);
461 ++NumInstsInserted;
462 MachineOperand *ZeroEFLAGSDefOp =
463 ZeroI->findRegisterDefOperand(X86::EFLAGS);
464 assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
465 "Must have an implicit def of EFLAGS!");
466 ZeroEFLAGSDefOp->setIsDead(true);
467 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
468          PS->InitialReg)
469        .addImm(0)
470 .addReg(PredStateSubReg)
471 .addImm(X86::sub_32bit);
472 }
473
474 // We're going to need to trace predicate state throughout the function's
475 // CFG. Prepare for this by setting up our initial state of PHIs with unique
476 // predecessor entries and all the initial predicate state.
477  canonicalizePHIOperands(MF);
478
479 // Track the updated values in an SSA updater to rewrite into SSA form at the
480 // end.
481  PS->SSA.Initialize(PS->InitialReg);
482 PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);
Chandler Carruth90358e12018-07-13 11:13:58 +0000483
484  // Trace through the CFG.
485  auto CMovs = tracePredStateThroughCFG(MF, Infos);
486
487 // We may also enter basic blocks in this function via exception handling
488 // control flow. Here, if we are hardening interprocedurally, we need to
489 // re-capture the predicate state from the throwing code. In the Itanium ABI,
490 // the throw will always look like a call to __cxa_throw and will have the
491 // predicate state in the stack pointer, so extract fresh predicate state from
492 // the stack pointer and make it available in SSA.
493 // FIXME: Handle non-itanium ABI EH models.
494 if (HardenInterprocedurally) {
495 for (MachineBasicBlock &MBB : MF) {
496 assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
497 assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
498 assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
499 if (!MBB.isEHPad())
500 continue;
501      PS->SSA.AddAvailableValue(
502          &MBB,
503 extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
504 }
505 }
506
507  // Now harden all of the loads in the function using the predicate state.
508 hardenAllLoads(MF);
509
510 // Now rewrite all the uses of the pred state using the SSA updater so that
511 // we track updates through the CFG.
512 for (MachineInstr *CMovI : CMovs)
513 for (MachineOperand &Op : CMovI->operands()) {
514      if (!Op.isReg() || Op.getReg() != PS->InitialReg)
515        continue;
516
517      PS->SSA.RewriteUse(Op);
518    }
519
520 // If we are hardening interprocedurally, find each returning block and
521 // protect the caller from being returned to through misspeculation.
522 if (HardenInterprocedurally)
523 for (MachineBasicBlock &MBB : MF) {
524 if (MBB.empty())
525 continue;
526
527 MachineInstr &MI = MBB.back();
528
529 // We only care about returns that are not also calls. For calls, that
530 // happen to also be returns (tail calls) we will have already handled
531 // them as calls.
532 if (!MI.isReturn() || MI.isCall())
Chandler Carruth90358e12018-07-13 11:13:58 +0000533 continue;
534
535      hardenReturnInstr(MI);
536    }
537
538 LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
539 dbgs() << "\n"; MF.verify(this));
540 return true;
541}
542
543/// Implements the naive hardening approach of putting an LFENCE after every
544/// potentially mis-predicted control flow construct.
545///
546/// We include this as an alternative mostly for the purpose of comparison. The
547/// performance impact of this is expected to be extremely severe and not
548/// practical for any real-world users.
549void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
550 MachineFunction &MF) {
551 // First, we scan the function looking for blocks that are reached along edges
552 // that we might want to harden.
553 SmallSetVector<MachineBasicBlock *, 8> Blocks;
554 for (MachineBasicBlock &MBB : MF) {
555 // If there are no or only one successor, nothing to do here.
556 if (MBB.succ_size() <= 1)
557 continue;
558
559 // Skip blocks unless their terminators start with a branch. Other
560 // terminators don't seem interesting for guarding against misspeculation.
561 auto TermIt = MBB.getFirstTerminator();
562 if (TermIt == MBB.end() || !TermIt->isBranch())
563 continue;
564
565    // Add all the non-EH-pad successors to the blocks we want to harden. We
566 // skip EH pads because there isn't really a condition of interest on
567 // entering.
568 for (MachineBasicBlock *SuccMBB : MBB.successors())
569 if (!SuccMBB->isEHPad())
570 Blocks.insert(SuccMBB);
571 }
572
573 for (MachineBasicBlock *MBB : Blocks) {
574 auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
575 BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
576 ++NumInstsInserted;
577 ++NumLFENCEsInserted;
578 }
579}
580
581SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
582X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
583 SmallVector<BlockCondInfo, 16> Infos;
584
585 // Walk the function and build up a summary for each block's conditions that
586 // we need to trace through.
587 for (MachineBasicBlock &MBB : MF) {
588 // If there are no or only one successor, nothing to do here.
589 if (MBB.succ_size() <= 1)
590 continue;
591
592 // We want to reliably handle any conditional branch terminators in the
593 // MBB, so we manually analyze the branch. We can handle all of the
594    // permutations here, including ones that analyzeBranch cannot.
595 //
596 // The approach is to walk backwards across the terminators, resetting at
597 // any unconditional non-indirect branch, and track all conditional edges
598 // to basic blocks as well as the fallthrough or unconditional successor
599 // edge. For each conditional edge, we track the target and the opposite
600 // condition code in order to inject a "no-op" cmov into that successor
601 // that will harden the predicate. For the fallthrough/unconditional
602 // edge, we inject a separate cmov for each conditional branch with
603 // matching condition codes. This effectively implements an "and" of the
604 // condition flags, even if there isn't a single condition flag that would
605 // directly implement that. We don't bother trying to optimize either of
606 // these cases because if such an optimization is possible, LLVM should
607 // have optimized the conditional *branches* in that way already to reduce
608 // instruction count. This late, we simply assume the minimal number of
609 // branch instructions is being emitted and use that to guide our cmov
610 // insertion.
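    //
    // For example, given a block ending in:
    //   JB  %bb.target
    //   (fallthrough to %bb.next)
    // the checking block on the %bb.target edge gets a CMOVAE selecting the
    // poison value (we should only reach %bb.target if B held), while the
    // %bb.next side gets a CMOVB (we should only fall through if B did not
    // hold). With several conditional branches to one fallthrough, one cmov
    // per condition effectively ANDs the conditions together.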
611
612 BlockCondInfo Info = {&MBB, {}, nullptr};
613
614 // Now walk backwards through the terminators and build up successors they
615 // reach and the conditions.
616 for (MachineInstr &MI : llvm::reverse(MBB)) {
617 // Once we've handled all the terminators, we're done.
618 if (!MI.isTerminator())
619 break;
620
621 // If we see a non-branch terminator, we can't handle anything so bail.
622 if (!MI.isBranch()) {
623 Info.CondBrs.clear();
624 break;
625 }
626
627 // If we see an unconditional branch, reset our state, clear any
628      // fallthrough, and set this as the "else" successor.
629 if (MI.getOpcode() == X86::JMP_1) {
630 Info.CondBrs.clear();
631 Info.UncondBr = &MI;
632 continue;
633 }
634
635 // If we get an invalid condition, we have an indirect branch or some
636 // other unanalyzable "fallthrough" case. We model this as a nullptr for
637 // the destination so we can still guard any conditional successors.
638 // Consider code sequences like:
639 // ```
640 // jCC L1
641 // jmpq *%rax
642 // ```
643 // We still want to harden the edge to `L1`.
644 if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
645 Info.CondBrs.clear();
646 Info.UncondBr = &MI;
647 continue;
648 }
649
650 // We have a vanilla conditional branch, add it to our list.
651 Info.CondBrs.push_back(&MI);
652 }
653 if (Info.CondBrs.empty()) {
654 ++NumBranchesUntraced;
655 LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
656 MBB.dump());
657 continue;
658 }
659
660 Infos.push_back(Info);
661 }
662
663 return Infos;
664}
665
666/// Trace the predicate state through the CFG, instrumenting each conditional
667/// branch such that misspeculation through an edge will poison the predicate
668/// state.
669///
670/// Returns the list of inserted CMov instructions so that they can have their
671/// uses of the predicate state rewritten into proper SSA form once it is
672/// complete.
673SmallVector<MachineInstr *, 16>
674X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
675 MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
676 // Collect the inserted cmov instructions so we can rewrite their uses of the
677 // predicate state into SSA form.
678 SmallVector<MachineInstr *, 16> CMovs;
679
680 // Now walk all of the basic blocks looking for ones that end in conditional
681 // jumps where we need to update this register along each edge.
682 for (const BlockCondInfo &Info : Infos) {
683 MachineBasicBlock &MBB = *Info.MBB;
684 const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
685 MachineInstr *UncondBr = Info.UncondBr;
686
687 LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
688 << "\n");
689 ++NumCondBranchesTraced;
690
691 // Compute the non-conditional successor as either the target of any
692 // unconditional branch or the layout successor.
693 MachineBasicBlock *UncondSucc =
694 UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
695 ? UncondBr->getOperand(0).getMBB()
696 : nullptr)
697 : &*std::next(MachineFunction::iterator(&MBB));
698
699 // Count how many edges there are to any given successor.
700 SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
701 if (UncondSucc)
702 ++SuccCounts[UncondSucc];
703 for (auto *CondBr : CondBrs)
704 ++SuccCounts[CondBr->getOperand(0).getMBB()];
705
706 // A lambda to insert cmov instructions into a block checking all of the
707 // condition codes in a sequence.
708 auto BuildCheckingBlockForSuccAndConds =
709 [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
710 MachineInstr *Br, MachineInstr *&UncondBr,
711 ArrayRef<X86::CondCode> Conds) {
712 // First, we split the edge to insert the checking block into a safe
713 // location.
714 auto &CheckingMBB =
715 (SuccCount == 1 && Succ.pred_size() == 1)
716 ? Succ
717 : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
718
719 bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
720 if (!LiveEFLAGS)
721 CheckingMBB.addLiveIn(X86::EFLAGS);
722
723 // Now insert the cmovs to implement the checks.
724 auto InsertPt = CheckingMBB.begin();
725 assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
726 "Should never have a PHI in the initial checking block as it "
727 "always has a single predecessor!");
728
729 // We will wire each cmov to each other, but need to start with the
730 // incoming pred state.
731 unsigned CurStateReg = PS->InitialReg;
732
733 for (X86::CondCode Cond : Conds) {
734 int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
735 auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
736
737 unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
738 // Note that we intentionally use an empty debug location so that
739 // this picks up the preceding location.
740 auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
741 TII->get(CMovOp), UpdatedStateReg)
742 .addReg(CurStateReg)
743 .addReg(PS->PoisonReg);
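          // The cmov computes UpdatedStateReg = Cond ? PoisonReg : CurStateReg,
          // so the predicate state becomes all-ones exactly when the flags
          // indicate this edge was misspeculated.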
744 // If this is the last cmov and the EFLAGS weren't originally
745 // live-in, mark them as killed.
746 if (!LiveEFLAGS && Cond == Conds.back())
747 CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
748
749 ++NumInstsInserted;
750 LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
751 dbgs() << "\n");
752
753 // The first one of the cmovs will be using the top level
754 // `PredStateReg` and need to get rewritten into SSA form.
755 if (CurStateReg == PS->InitialReg)
756 CMovs.push_back(&*CMovI);
757
758 // The next cmov should start from this one's def.
759 CurStateReg = UpdatedStateReg;
760 }
761
762 // And put the last one into the available values for SSA form of our
763 // predicate state.
764 PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
765 };
766
767 std::vector<X86::CondCode> UncondCodeSeq;
768 for (auto *CondBr : CondBrs) {
769 MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
770 int &SuccCount = SuccCounts[&Succ];
771
772 X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
773 X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
774 UncondCodeSeq.push_back(Cond);
775
776 BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
777 {InvCond});
778
779 // Decrement the successor count now that we've split one of the edges.
780 // We need to keep the count of edges to the successor accurate in order
781 // to know above when to *replace* the successor in the CFG vs. just
782 // adding the new successor.
783 --SuccCount;
784 }
785
786 // Since we may have split edges and changed the number of successors,
787 // normalize the probabilities. This avoids doing it each time we split an
788 // edge.
789 MBB.normalizeSuccProbs();
790
791 // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
792 // need to intersect the other condition codes. We can do this by just
793 // doing a cmov for each one.
794 if (!UncondSucc)
795 // If we have no fallthrough to protect (perhaps it is an indirect jump?)
796 // just skip this and continue.
797 continue;
798
799 assert(SuccCounts[UncondSucc] == 1 &&
800 "We should never have more than one edge to the unconditional "
801 "successor at this point because every other edge must have been "
802 "split above!");
803
804 // Sort and unique the codes to minimize them.
805 llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
806 UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
807 UncondCodeSeq.end());
808
809 // Build a checking version of the successor.
810 BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
811 UncondBr, UncondBr, UncondCodeSeq);
812 }
813
814 return CMovs;
815}
816
817/// Returns true if the instruction has no behavior (specified or otherwise)
818/// that is based on the value of any of its register operands.
819///
820/// A classical example of something that is inherently not data invariant is an
821/// indirect jump -- the destination is loaded into icache based on the bits set
822/// in the jump destination register.
823///
824/// FIXME: This should become part of our instruction tables.
825static bool isDataInvariant(MachineInstr &MI) {
826 switch (MI.getOpcode()) {
827 default:
828 // By default, assume that the instruction is not data invariant.
829 return false;
830
831  // Some target-independent operations that trivially lower to data-invariant
832  // instructions.
833  case TargetOpcode::COPY:
834  case TargetOpcode::INSERT_SUBREG:
835 case TargetOpcode::SUBREG_TO_REG:
836 return true;
837
838  // On x86 the integer multiplies are believed to be constant time w.r.t. their
839  // inputs. However, they set flags and are perhaps the most surprisingly
840  // constant-time operations, so we call them out here separately.
841 case X86::IMUL16rr:
842 case X86::IMUL16rri8:
843 case X86::IMUL16rri:
844 case X86::IMUL32rr:
845 case X86::IMUL32rri8:
846 case X86::IMUL32rri:
847 case X86::IMUL64rr:
848 case X86::IMUL64rri32:
849 case X86::IMUL64rri8:
850
851  // Bit scanning and counting instructions scan across bits and do other
852  // fairly complex operations like popcnt, yet are believed to be constant
853  // time on x86, which is somewhat surprising.
854  // However, these set flags.
855 case X86::BSF16rr:
856 case X86::BSF32rr:
857 case X86::BSF64rr:
858 case X86::BSR16rr:
859 case X86::BSR32rr:
860 case X86::BSR64rr:
861 case X86::LZCNT16rr:
862 case X86::LZCNT32rr:
863 case X86::LZCNT64rr:
864 case X86::POPCNT16rr:
865 case X86::POPCNT32rr:
866 case X86::POPCNT64rr:
867 case X86::TZCNT16rr:
868 case X86::TZCNT32rr:
869 case X86::TZCNT64rr:
870
871 // Bit manipulation instructions are effectively combinations of basic
872 // arithmetic ops, and should still execute in constant time. These also
873 // set flags.
874 case X86::BLCFILL32rr:
875 case X86::BLCFILL64rr:
876 case X86::BLCI32rr:
877 case X86::BLCI64rr:
878 case X86::BLCIC32rr:
879 case X86::BLCIC64rr:
880 case X86::BLCMSK32rr:
881 case X86::BLCMSK64rr:
882 case X86::BLCS32rr:
883 case X86::BLCS64rr:
884 case X86::BLSFILL32rr:
885 case X86::BLSFILL64rr:
886 case X86::BLSI32rr:
887 case X86::BLSI64rr:
888 case X86::BLSIC32rr:
889 case X86::BLSIC64rr:
890 case X86::BLSMSK32rr:
891 case X86::BLSMSK64rr:
892 case X86::BLSR32rr:
893 case X86::BLSR64rr:
894 case X86::TZMSK32rr:
895 case X86::TZMSK64rr:
896
897 // Bit extracting and clearing instructions should execute in constant time,
898 // and set flags.
899 case X86::BEXTR32rr:
900 case X86::BEXTR64rr:
901 case X86::BEXTRI32ri:
902 case X86::BEXTRI64ri:
903 case X86::BZHI32rr:
904 case X86::BZHI64rr:
905
906  // Shift and rotate.
907 case X86::ROL8r1: case X86::ROL16r1: case X86::ROL32r1: case X86::ROL64r1:
908 case X86::ROL8rCL: case X86::ROL16rCL: case X86::ROL32rCL: case X86::ROL64rCL:
909 case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
910 case X86::ROR8r1: case X86::ROR16r1: case X86::ROR32r1: case X86::ROR64r1:
911 case X86::ROR8rCL: case X86::ROR16rCL: case X86::ROR32rCL: case X86::ROR64rCL:
912 case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
913 case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1: case X86::SAR64r1:
914 case X86::SAR8rCL: case X86::SAR16rCL: case X86::SAR32rCL: case X86::SAR64rCL:
915 case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
916 case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1: case X86::SHL64r1:
917 case X86::SHL8rCL: case X86::SHL16rCL: case X86::SHL32rCL: case X86::SHL64rCL:
918 case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri:
919 case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1: case X86::SHR64r1:
920 case X86::SHR8rCL: case X86::SHR16rCL: case X86::SHR32rCL: case X86::SHR64rCL:
921 case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
922 case X86::SHLD16rrCL: case X86::SHLD32rrCL: case X86::SHLD64rrCL:
923 case X86::SHLD16rri8: case X86::SHLD32rri8: case X86::SHLD64rri8:
924 case X86::SHRD16rrCL: case X86::SHRD32rrCL: case X86::SHRD64rrCL:
925 case X86::SHRD16rri8: case X86::SHRD32rri8: case X86::SHRD64rri8:
926
927  // Basic arithmetic is constant time on the input but does set flags.
928 case X86::ADC8rr: case X86::ADC8ri:
929 case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8:
930 case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8:
931 case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32:
932 case X86::ADD8rr: case X86::ADD8ri:
933 case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8:
934 case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8:
935 case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32:
936 case X86::AND8rr: case X86::AND8ri:
937 case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8:
938 case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8:
939 case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32:
940 case X86::OR8rr: case X86::OR8ri:
941 case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8:
942 case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8:
943 case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32:
944 case X86::SBB8rr: case X86::SBB8ri:
945 case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8:
946 case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8:
947 case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32:
948 case X86::SUB8rr: case X86::SUB8ri:
949 case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8:
950 case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8:
951 case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32:
952 case X86::XOR8rr: case X86::XOR8ri:
953 case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8:
954 case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8:
955 case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32:
956 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
957 case X86::ADCX32rr: case X86::ADCX64rr:
958 case X86::ADOX32rr: case X86::ADOX64rr:
959 case X86::ANDN32rr: case X86::ANDN64rr:
960  // Unary arithmetic operations.
961  case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
962  case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
963 case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
964    // Check whether the EFLAGS implicit-def is dead. We assume that this will
965 // always find the implicit-def because this code should only be reached
966 // for instructions that do in fact implicitly def this.
967 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
968 // If we would clobber EFLAGS that are used, just bail for now.
969 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
970 MI.dump(); dbgs() << "\n");
971 return false;
972 }
973
974 // Otherwise, fallthrough to handle these the same as instructions that
975 // don't set EFLAGS.
976 LLVM_FALLTHROUGH;
977
978  // Unlike other arithmetic, NOT doesn't set EFLAGS.
979 case X86::NOT8r: case X86::NOT16r: case X86::NOT32r: case X86::NOT64r:
980
981 // Various move instructions used to zero or sign extend things. Note that we
982 // intentionally don't support the _NOREX variants as we can't handle that
983 // register constraint anyways.
984 case X86::MOVSX16rr8:
985 case X86::MOVSX32rr8: case X86::MOVSX32rr16:
986 case X86::MOVSX64rr8: case X86::MOVSX64rr16: case X86::MOVSX64rr32:
987 case X86::MOVZX16rr8:
988 case X86::MOVZX32rr8: case X86::MOVZX32rr16:
989 case X86::MOVZX64rr8: case X86::MOVZX64rr16:
990 case X86::MOV32rr:
991
992 // Arithmetic instructions that are both constant time and don't set flags.
993 case X86::RORX32ri:
994 case X86::RORX64ri:
995 case X86::SARX32rr:
996 case X86::SARX64rr:
997 case X86::SHLX32rr:
998 case X86::SHLX64rr:
999 case X86::SHRX32rr:
1000 case X86::SHRX64rr:
1001
1002 // LEA doesn't actually access memory, and its arithmetic is constant time.
1003 case X86::LEA16r:
1004 case X86::LEA32r:
1005 case X86::LEA64_32r:
1006 case X86::LEA64r:
1007    return true;
1008 }
1009}
1010
1011/// Returns true if the instruction has no behavior (specified or otherwise)
1012/// that is based on the value loaded from memory or the value of any
1013/// non-address register operands.
1014///
1015/// For example, the instruction's latency must not depend on the particular
1016/// bits set in any of the registers *or* on any of the bits loaded from
1017/// memory.
1018///
1019/// A classical example of something that is inherently not data invariant is an
1020/// indirect jump -- the destination is loaded into icache based on the bits set
1021/// in the jump destination register.
1022///
1023/// FIXME: This should become part of our instruction tables.
1024static bool isDataInvariantLoad(MachineInstr &MI) {
1025 switch (MI.getOpcode()) {
1026 default:
1027 // By default, assume that the load will immediately leak.
1028 return false;
1029
1030  // On x86 the integer multiplies are believed to be constant time w.r.t. the
1031  // loaded data. However, they set flags and are perhaps the most surprisingly
1032  // constant-time operations, so we call them out here separately.
1033  case X86::IMUL16rm:
1034 case X86::IMUL16rmi8:
1035 case X86::IMUL16rmi:
1036 case X86::IMUL32rm:
1037 case X86::IMUL32rmi8:
1038 case X86::IMUL32rmi:
1039 case X86::IMUL64rm:
1040 case X86::IMUL64rmi32:
1041 case X86::IMUL64rmi8:
1042
1043  // Bit scanning and counting instructions scan across bits and do other
1044  // fairly complex operations like popcnt, yet are believed to be constant
1045  // time on x86, which is somewhat surprising.
1046  // However, these set flags.
1047 case X86::BSF16rm:
1048 case X86::BSF32rm:
1049 case X86::BSF64rm:
1050 case X86::BSR16rm:
1051 case X86::BSR32rm:
1052 case X86::BSR64rm:
1053 case X86::LZCNT16rm:
1054 case X86::LZCNT32rm:
1055 case X86::LZCNT64rm:
1056 case X86::POPCNT16rm:
1057 case X86::POPCNT32rm:
1058 case X86::POPCNT64rm:
1059 case X86::TZCNT16rm:
1060 case X86::TZCNT32rm:
1061 case X86::TZCNT64rm:
1062
1063 // Bit manipulation instructions are effectively combinations of basic
1064 // arithmetic ops, and should still execute in constant time. These also
1065 // set flags.
1066  case X86::BLCFILL32rm:
1067 case X86::BLCFILL64rm:
1068 case X86::BLCI32rm:
1069 case X86::BLCI64rm:
1070 case X86::BLCIC32rm:
1071 case X86::BLCIC64rm:
1072 case X86::BLCMSK32rm:
1073 case X86::BLCMSK64rm:
1074 case X86::BLCS32rm:
1075 case X86::BLCS64rm:
1076 case X86::BLSFILL32rm:
1077 case X86::BLSFILL64rm:
1078 case X86::BLSI32rm:
1079 case X86::BLSI64rm:
1080 case X86::BLSIC32rm:
1081 case X86::BLSIC64rm:
1082 case X86::BLSMSK32rm:
1083 case X86::BLSMSK64rm:
1084 case X86::BLSR32rm:
1085 case X86::BLSR64rm:
1086  case X86::TZMSK32rm:
1087 case X86::TZMSK64rm:
1088
1089  // Bit extracting and clearing instructions should execute in constant time,
1090 // and set flags.
1091 case X86::BEXTR32rm:
1092 case X86::BEXTR64rm:
1093 case X86::BEXTRI32mi:
1094 case X86::BEXTRI64mi:
1095 case X86::BZHI32rm:
1096 case X86::BZHI64rm:
1097
1098 // Basic arithmetic is constant time on the input but does set flags.
1099  case X86::ADC8rm:
1100 case X86::ADC16rm:
1101 case X86::ADC32rm:
1102 case X86::ADC64rm:
1103 case X86::ADCX32rm:
1104 case X86::ADCX64rm:
1105 case X86::ADD8rm:
1106 case X86::ADD16rm:
1107 case X86::ADD32rm:
1108 case X86::ADD64rm:
1109 case X86::ADOX32rm:
1110 case X86::ADOX64rm:
1111 case X86::AND8rm:
1112 case X86::AND16rm:
1113 case X86::AND32rm:
1114 case X86::AND64rm:
1115 case X86::ANDN32rm:
1116 case X86::ANDN64rm:
1117  case X86::OR8rm:
1118 case X86::OR16rm:
1119 case X86::OR32rm:
1120 case X86::OR64rm:
1121 case X86::SBB8rm:
1122 case X86::SBB16rm:
1123 case X86::SBB32rm:
1124 case X86::SBB64rm:
1125 case X86::SUB8rm:
1126 case X86::SUB16rm:
1127 case X86::SUB32rm:
1128 case X86::SUB64rm:
1129 case X86::XOR8rm:
1130 case X86::XOR16rm:
1131 case X86::XOR32rm:
1132 case X86::XOR64rm:
1133    // Check whether the EFLAGS implicit-def is dead. We assume that this will
1134 // always find the implicit-def because this code should only be reached
1135 // for instructions that do in fact implicitly def this.
1136 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
1137 // If we would clobber EFLAGS that are used, just bail for now.
1138 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
1139 MI.dump(); dbgs() << "\n");
1140 return false;
1141 }
1142
1143 // Otherwise, fallthrough to handle these the same as instructions that
1144 // don't set EFLAGS.
1145 LLVM_FALLTHROUGH;
1146
1147  // Integer multiply w/o affecting flags is still believed to be constant
1148 // time on x86. Called out separately as this is among the most surprising
1149 // instructions to exhibit that behavior.
1150  case X86::MULX32rm:
1151 case X86::MULX64rm:
1152
1153  // Arithmetic instructions that are both constant time and don't set flags.
1154  case X86::RORX32mi:
1155 case X86::RORX64mi:
1156 case X86::SARX32rm:
1157 case X86::SARX64rm:
1158 case X86::SHLX32rm:
1159 case X86::SHLX64rm:
1160 case X86::SHRX32rm:
1161 case X86::SHRX64rm:
1162
1163  // Conversions are believed to be constant time and don't set flags.
1164  case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm:
1165 case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm:
1166 case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm:
1167 case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm:
1168 case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm:
1169 case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm:
1170 case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm:
1171 case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm:
1172 case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm:
1173 case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm:
1174 // AVX512 added unsigned integer conversions.
1175 case X86::VCVTTSD2USI64Zrm:
1176 case X86::VCVTTSD2USIZrm:
1177 case X86::VCVTTSS2USI64Zrm:
1178 case X86::VCVTTSS2USIZrm:
1179 case X86::VCVTUSI2SDZrm:
1180 case X86::VCVTUSI642SDZrm:
1181 case X86::VCVTUSI2SSZrm:
1182 case X86::VCVTUSI642SSZrm:
1183
1184  // Loads to register don't set flags.
1185  case X86::MOV8rm:
1186 case X86::MOV8rm_NOREX:
1187 case X86::MOV16rm:
1188 case X86::MOV32rm:
1189 case X86::MOV64rm:
1190 case X86::MOVSX16rm8:
1191 case X86::MOVSX32rm16:
1192 case X86::MOVSX32rm8:
1193 case X86::MOVSX32rm8_NOREX:
1194 case X86::MOVSX64rm16:
1195 case X86::MOVSX64rm32:
1196 case X86::MOVSX64rm8:
1197 case X86::MOVZX16rm8:
1198 case X86::MOVZX32rm16:
1199 case X86::MOVZX32rm8:
1200 case X86::MOVZX32rm8_NOREX:
1201 case X86::MOVZX64rm16:
1202 case X86::MOVZX64rm8:
1203 return true;
1204 }
1205}
1206
1207static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
1208 const TargetRegisterInfo &TRI) {
1209 // Check if EFLAGS are alive by seeing if there is a def of them or they
1210 // live-in, and then seeing if that def is in turn used.
1211 for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
1212 if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
1213 // If the def is dead, then EFLAGS is not live.
1214 if (DefOp->isDead())
1215 return false;
1216
1217 // Otherwise we've def'ed it, and it is live.
1218 return true;
1219 }
1220 // While at this instruction, also check if we use and kill EFLAGS
1221 // which means it isn't live.
1222 if (MI.killsRegister(X86::EFLAGS, &TRI))
1223 return false;
1224 }
1225
1226 // If we didn't find anything conclusive (neither definitely alive or
1227 // definitely dead) return whether it lives into the block.
1228 return MBB.isLiveIn(X86::EFLAGS);
1229}
1230
1231/// Harden all of the loads (including implicit loads) in the function.
1232///
1233/// This operates in two passes. First, we analyze the loads to determine which
1234/// strategy will be used to harden them: hardening the address or hardening the
1235/// loaded value when loaded into a register amenable to hardening. We have to
1236/// process these first because the two strategies may interact -- later
1237/// hardening may change what strategy we wish to use. We also will analyze
1238/// data dependencies between loads and avoid hardening those loads that are
1239/// data dependent on a load with a hardened address. We also skip hardening
1240/// loads already behind an LFENCE as that is sufficient to harden them against
1241/// misspeculation.
1242///
1243/// Second, we apply the hardening steps we determined necessary in the first
1244/// pass.
1245///
1246/// These two passes are applied to each basic block. We operate one block at
1247/// a time to simplify reasoning about reachability and sequencing.
1248void X86SpeculativeLoadHardeningPass::hardenAllLoads(MachineFunction &MF) {
Chandler Carruth90358e12018-07-13 11:13:58 +00001249 // If the actual checking of loads is disabled, skip doing anything here.
1250 if (!HardenLoads)
1251 return;
1252
1253 SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
1254 SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
1255
1256 SmallSet<unsigned, 16> HardenedAddrRegs;
1257
1258 SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
1259
1260 // Track the set of load-dependent registers through the basic block. Because
1261 // the values of these registers have an existing data dependency on a loaded
1262 // value which we would have checked, we can omit any checks on them.
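  // For example, if %val is produced by a load whose address we harden, and
  // %ptr is computed from %val and then used as the base of a second load,
  // that second load is already data-dependent on a checked load and needs no
  // additional hardening.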
1263 SparseBitVector<> LoadDepRegs;
1264
1265 for (MachineBasicBlock &MBB : MF) {
1266 // We harden the loads of a basic block in several passes:
1267 //
1268 // 1) Collect all the loads which can have their loaded value hardened
1269 // and all the loads that instead need their address hardened. During
1270 // this walk we propagate load dependence for address hardened loads and
1271 // also look for LFENCE to stop hardening wherever possible. When
1272 // deciding whether or not to harden the loaded value or not, we check
1273 // to see if any registers used in the address will have been hardened
1274 // at this point and if so, harden any remaining address registers as
1275 // that often successfully re-uses hardened addresses and minimizes
1276 // instructions. FIXME: We should consider an aggressive mode where we
1277 // continue to keep as many loads value hardened even when some address
1278 // register hardening would be free (due to reuse).
1279 for (MachineInstr &MI : MBB) {
1280 // We naively assume that all def'ed registers of an instruction have
1281 // a data dependency on all of their operands.
1282 // FIXME: Do a more careful analysis of x86 to build a conservative model
1283 // here.
1284 if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
1285 return Op.isReg() && LoadDepRegs.test(Op.getReg());
1286 }))
1287 for (MachineOperand &Def : MI.defs())
1288 if (Def.isReg())
1289 LoadDepRegs.set(Def.getReg());
1290
1291 // Both Intel and AMD are guiding that they will change the semantics of
1292 // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1293 // no more need to guard things in this block.
1294 if (MI.getOpcode() == X86::LFENCE)
1295 break;
1296
1297 // If this instruction cannot load, nothing to do.
1298 if (!MI.mayLoad())
1299 continue;
1300
1301 // Some instructions which "load" are trivially safe or unimportant.
1302 if (MI.getOpcode() == X86::MFENCE)
1303 continue;
1304
1305 // Extract the memory operand information about this instruction.
1306 // FIXME: This doesn't handle loading pseudo instructions which we often
1307 // could handle with similarly generic logic. We probably need to add an
1308 // MI-layer routine similar to the MC-layer one we use here which maps
1309 // pseudos much like this maps real instructions.
1310 const MCInstrDesc &Desc = MI.getDesc();
1311 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1312 if (MemRefBeginIdx < 0) {
1313 LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: ";
1314 MI.dump());
1315 continue;
1316 }
1317
1318 MemRefBeginIdx += X86II::getOperandBias(Desc);
1319
1320 MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1321 MachineOperand &IndexMO =
1322 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1323
1324 // If we have at least one (non-frame-index, non-RIP) register operand,
1325 // and neither operand is load-dependent, we need to check the load.
1326 unsigned BaseReg = 0, IndexReg = 0;
1327 if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1328 BaseMO.getReg() != X86::NoRegister)
1329 BaseReg = BaseMO.getReg();
1330 if (IndexMO.getReg() != X86::NoRegister)
1331 IndexReg = IndexMO.getReg();
1332
1333 if (!BaseReg && !IndexReg)
1334 // No register operands!
1335 continue;
1336
1337 // If any register operand is dependent, this load is dependent and we
1338 // needn't check it.
1339 // FIXME: Is this true in the case where we are hardening loads after
1340 // they complete? Unclear, need to investigate.
1341 if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
1342 (IndexReg && LoadDepRegs.test(IndexReg)))
1343 continue;
1344
1345      // If post-load hardening is enabled, this load is compatible with
1346      // post-load hardening, and we aren't already going to harden one of the
1347      // address registers, queue it up to be hardened post-load. Notably, even
1348 // once hardened this won't introduce a useful dependency that could prune
1349 // out subsequent loads.
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001350 if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
1351 MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
1352 canHardenRegister(MI.getOperand(0).getReg()) &&
Chandler Carruth90358e12018-07-13 11:13:58 +00001353 !HardenedAddrRegs.count(BaseReg) &&
1354 !HardenedAddrRegs.count(IndexReg)) {
1355 HardenPostLoad.insert(&MI);
1356 HardenedAddrRegs.insert(MI.getOperand(0).getReg());
1357 continue;
1358 }
1359
1360 // Record this instruction for address hardening and record its register
1361 // operands as being address-hardened.
1362 HardenLoadAddr.insert(&MI);
1363 if (BaseReg)
1364 HardenedAddrRegs.insert(BaseReg);
1365 if (IndexReg)
1366 HardenedAddrRegs.insert(IndexReg);
1367
1368 for (MachineOperand &Def : MI.defs())
1369 if (Def.isReg())
1370 LoadDepRegs.set(Def.getReg());
1371 }
1372
    // Now re-walk the instructions in the basic block, and apply whichever
    // hardening strategy we have elected. Note that we do this in a second
    // pass specifically so that we have the complete set of instructions for
    // which we will do post-load hardening and can defer it in certain
    // circumstances.
    //
    // FIXME: This could probably be made even more effective by doing it
    // across the entire function. Rather than just walking the flat list
    // backwards here, we could walk the function in PO and each block bottom
    // up, allowing us to in some cases sink hardening across basic blocks. As
    // long as the in-block predicate state is used at the eventual hardening
    // site, this remains safe.
    for (MachineInstr &MI : MBB) {
      // We cannot both require hardening the def of a load and its address.
      assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
             "Requested to harden both the address and def of a load!");

      // Check if this is a load whose address needs to be hardened.
      if (HardenLoadAddr.erase(&MI)) {
        const MCInstrDesc &Desc = MI.getDesc();
        int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
        assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");

        MemRefBeginIdx += X86II::getOperandBias(Desc);

        MachineOperand &BaseMO =
            MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
        MachineOperand &IndexMO =
            MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
        hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
        continue;
      }

      // Test if this instruction is one of our post-load instructions (and
      // remove it from the set if so).
      if (HardenPostLoad.erase(&MI)) {
        assert(!MI.isCall() && "Must not try to post-load harden a call!");

        // If this is a data-invariant load, we want to try to sink any
        // hardening as far as possible.
        if (isDataInvariantLoad(MI)) {
          // Sink the instruction we'll need to harden as far as we can down
          // the graph.
          MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);

          // If we managed to sink this instruction, update everything so we
          // harden that instruction when we reach it in the instruction
          // sequence.
          if (SunkMI != &MI) {
            // If in sinking there was no instruction needing to be hardened,
            // we're done.
            if (!SunkMI)
              continue;

            // Otherwise, add this to the set of defs we harden.
            HardenPostLoad.insert(SunkMI);
            continue;
          }
        }

        // The register def'ed by this instruction is trivially hardened so
        // map it to itself.
        AddrRegToHardenedReg[MI.getOperand(0).getReg()] =
            MI.getOperand(0).getReg();

        hardenPostLoad(MI);
        continue;
      }

      // After we finish processing the instruction and doing any hardening
      // necessary for it, we need to handle transferring the predicate state
      // into a call and recovering it after the call returns (if it returns).
      if (!MI.isCall())
        continue;

      // If we're not hardening interprocedurally, we can just skip calls.
      if (!HardenInterprocedurally)
        continue;

      auto InsertPt = MI.getIterator();
      DebugLoc Loc = MI.getDebugLoc();

      // First, we transfer the predicate state into the called function by
      // merging it into the stack pointer. This will kill the current def of
      // the state.
      unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
      mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);

      // If this call is also a return (because it is a tail call) we're done.
      if (MI.isReturn())
        continue;

      // Otherwise we need to step past the call and recover the predicate
      // state from SP after the return, and make this new state available.
      ++InsertPt;
      unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
      PS->SSA.AddAvailableValue(&MBB, NewStateReg);
    }

    HardenPostLoad.clear();
    HardenLoadAddr.clear();
    HardenedAddrRegs.clear();
    AddrRegToHardenedReg.clear();

    // Currently, we only track data-dependent loads within a basic block.
    // FIXME: We should see if this is necessary or if we could be more
    // aggressive here without opening up attack avenues.
    LoadDepRegs.clear();
  }
}

1484/// Save EFLAGS into the returned GPR. This can in turn be restored with
1485/// `restoreEFLAGS`.
1486///
1487/// Note that LLVM can only lower very simple patterns of saved and restored
1488/// EFLAGS registers. The restore should always be within the same basic block
1489/// as the save so that no PHI nodes are inserted.
1490unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1491 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1492 DebugLoc Loc) {
1493 // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1494 // what instruction selection does.
1495 unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
1496 // We directly copy the FLAGS register and rely on later lowering to clean
1497 // this up into the appropriate setCC instructions.
1498 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
1499 ++NumInstsInserted;
1500 return Reg;
1501}
1502
1503/// Restore EFLAGS from the provided GPR. This should be produced by
1504/// `saveEFLAGS`.
1505///
1506/// This must be done within the same basic block as the save in order to
1507/// reliably lower.
1508void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1509 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1510 unsigned Reg) {
1511 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
1512 ++NumInstsInserted;
1513}
1514
1515/// Takes the current predicate state (in a register) and merges it into the
1516/// stack pointer. The state is essentially a single bit, but we merge this in
1517/// a way that won't form non-canonical pointers and also will be preserved
1518/// across normal stack adjustments.
void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
    unsigned PredStateReg) {
  unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
  // FIXME: This hard codes a shift distance based on the number of bits
  // needed to stay canonical on 64-bit. We should compute this somehow and
  // support 32-bit as part of that.
  auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
                    .addReg(PredStateReg, RegState::Kill)
                    .addImm(47);
  ShiftI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;
  auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
                 .addReg(X86::RSP)
                 .addReg(TmpReg, RegState::Kill);
  OrI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;
}

1538/// Extracts the predicate state stored in the high bits of the stack pointer.
unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
    DebugLoc Loc) {
  unsigned PredStateReg = MRI->createVirtualRegister(PS->RC);
  unsigned TmpReg = MRI->createVirtualRegister(PS->RC);

  // We know that the stack pointer will have any preserved predicate state in
  // its high bit. We just want to smear this across the other bits. Turns out,
  // this is exactly what an arithmetic right shift does.
  BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
      .addReg(X86::RSP);
  auto ShiftI =
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
          .addReg(TmpReg, RegState::Kill)
          .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
  ShiftI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;

  return PredStateReg;
}

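/// Harden a load by poisoning the registers used to form its address.
///
/// The predicate state is all ones along misspeculated paths and all zeros
/// otherwise, so merging it into the base and index registers (with `or`, or
/// with `shrx` when EFLAGS must be preserved) destroys the attacker-controlled
/// address exactly when the load is reached via misspeculation. Registers that
/// have already been hardened are reused via \p AddrRegToHardenedReg.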
void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
    MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
    SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();

  // Check if EFLAGS are alive by seeing if there is a def of them or they are
  // live-in, and then seeing if that def is in turn used.
  bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);

  SmallVector<MachineOperand *, 2> HardenOpRegs;

  if (BaseMO.isFI()) {
    // A frame index is never a dynamically controllable load, so only
    // harden it if we're covering fixed address loads as well.
    LLVM_DEBUG(
        dbgs() << " Skipping hardening base of explicit stack frame load: ";
        MI.dump(); dbgs() << "\n");
  } else if (BaseMO.getReg() == X86::RIP ||
             BaseMO.getReg() == X86::NoRegister) {
    // For both RIP-relative addressed loads or absolute loads, we cannot
    // meaningfully harden them because the address being loaded has no
    // dynamic component.
    //
    // FIXME: When using a segment base (like TLS does) we end up with the
    // dynamic address being the base plus -1 because we can't mutate the
    // segment register here. This allows the signed 32-bit offset to point at
    // valid segment-relative addresses and load them successfully.
    LLVM_DEBUG(
        dbgs() << " Cannot harden base of "
               << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
               << " address in a load!");
  } else {
    assert(BaseMO.isReg() &&
           "Only allowed to have a frame index or register base.");
    HardenOpRegs.push_back(&BaseMO);
  }

  if (IndexMO.getReg() != X86::NoRegister &&
      (HardenOpRegs.empty() ||
       HardenOpRegs.front()->getReg() != IndexMO.getReg()))
    HardenOpRegs.push_back(&IndexMO);

  assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
         "Should have exactly one or two registers to harden!");
  assert((HardenOpRegs.size() == 1 ||
          HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
         "Should not have two of the same registers!");

  // Remove any registers that have already been checked.
  llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
    // See if this operand's register has already been checked.
    auto It = AddrRegToHardenedReg.find(Op->getReg());
    if (It == AddrRegToHardenedReg.end())
      // Not checked, so retain this one.
      return false;

    // Otherwise, we can directly update this operand and remove it.
    Op->setReg(It->second);
    return true;
  });
  // If there are none left, we're done.
  if (HardenOpRegs.empty())
    return;

  // Compute the current predicate state.
  unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);

  auto InsertPt = MI.getIterator();

  // If EFLAGS are live and we don't have access to instructions that avoid
  // clobbering EFLAGS we need to save and restore them. This in turn makes
  // the EFLAGS no longer live.
  unsigned FlagsReg = 0;
  if (EFLAGSLive && !Subtarget->hasBMI2()) {
    EFLAGSLive = false;
    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
  }

  for (MachineOperand *Op : HardenOpRegs) {
    unsigned OpReg = Op->getReg();
    auto *OpRC = MRI->getRegClass(OpReg);
    unsigned TmpReg = MRI->createVirtualRegister(OpRC);

    // If this is a vector register, we'll need somewhat custom logic to handle
    // hardening it.
    if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
                                 OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
      assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
      bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);

      // Move our state into a vector register.
      // FIXME: We could skip this at the cost of longer encodings with AVX-512
      // but that doesn't seem likely worth it.
      unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
      auto MovI =
          BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
              .addReg(StateReg);
      (void)MovI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");

      // Broadcast it across the vector register.
      unsigned VBStateReg = MRI->createVirtualRegister(OpRC);
      auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
                                TII->get(Is128Bit ? X86::VPBROADCASTQrr
                                                  : X86::VPBROADCASTQYrr),
                                VBStateReg)
                            .addReg(VStateReg);
      (void)BroadcastI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
                 dbgs() << "\n");

      // Merge our potential poison state into the value with a vector or.
      auto OrI =
          BuildMI(MBB, InsertPt, Loc,
                  TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
              .addReg(VBStateReg)
              .addReg(OpReg);
      (void)OrI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
    } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
               OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
               OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
      assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
      bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
      bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
      if (Is128Bit || Is256Bit)
        assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");

      // Broadcast our state into a vector register.
      unsigned VStateReg = MRI->createVirtualRegister(OpRC);
      unsigned BroadcastOp =
          Is128Bit ? X86::VPBROADCASTQrZ128r
                   : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
      auto BroadcastI =
          BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
              .addReg(StateReg);
      (void)BroadcastI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
                 dbgs() << "\n");

      // Merge our potential poison state into the value with a vector or.
      unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
                               : Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
      auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
                     .addReg(VStateReg)
                     .addReg(OpReg);
      (void)OrI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
    } else {
      // FIXME: Need to support GR32 here for 32-bit code.
      assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
             "Not a supported register class for address hardening!");

      if (!EFLAGSLive) {
        // Merge our potential poison state into the value with an or.
        auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
                       .addReg(StateReg)
                       .addReg(OpReg);
        OrI->addRegisterDead(X86::EFLAGS, TRI);
        ++NumInstsInserted;
        LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
      } else {
        // We need to avoid touching EFLAGS so shift out all but the least
        // significant bit using the instruction that doesn't update flags.
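        // (When the predicate state is all ones, the shift amount is masked
        // to 63, collapsing the address to 0 or 1 rather than an
        // attacker-controlled value.)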
        auto ShiftI =
            BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
                .addReg(OpReg)
                .addReg(StateReg);
        (void)ShiftI;
        ++NumInstsInserted;
        LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
                   dbgs() << "\n");
      }
    }

    // Record this register as checked and update the operand.
    assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
           "Should not have checked this register yet!");
    AddrRegToHardenedReg[Op->getReg()] = TmpReg;
    Op->setReg(TmpReg);
    ++NumAddrRegsHardened;
  }

  // And restore the flags if needed.
  if (FlagsReg)
    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
}

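/// Sink the hardening of a post-load hardened, data-invariant load down to
/// its single use when that use is itself data invariant and lives in the
/// same basic block, so the poisoning `or` is emitted as late as possible.
///
/// Returns the instruction whose def should ultimately be hardened, or null
/// if every use turned out to already be checked and no extra hardening is
/// needed.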
MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
    MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
  assert(isDataInvariantLoad(InitialMI) &&
         "Cannot get here with a non-invariant load!");

  // See if we can sink hardening the loaded value.
  auto SinkCheckToSingleUse =
      [&](MachineInstr &MI) -> Optional<MachineInstr *> {
    unsigned DefReg = MI.getOperand(0).getReg();

    // We need to find a single use to which we can sink the check. We can
    // primarily do this because many uses may already end up checked on their
    // own.
    MachineInstr *SingleUseMI = nullptr;
    for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
      // If we're already going to harden this use, it is data invariant and
      // within our block.
      if (HardenedInstrs.count(&UseMI)) {
        if (!isDataInvariantLoad(UseMI)) {
          // If we've already decided to harden a non-load, we must have sunk
          // some other post-load hardened instruction to it and it must itself
          // be data-invariant.
          assert(isDataInvariant(UseMI) &&
                 "Data variant instruction being hardened!");
          continue;
        }

        // Otherwise, this is a load and the load component can't be data
        // invariant so check how this register is being used.
        const MCInstrDesc &Desc = UseMI.getDesc();
        int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
        assert(MemRefBeginIdx >= 0 &&
               "Should always have mem references here!");
        MemRefBeginIdx += X86II::getOperandBias(Desc);

        MachineOperand &BaseMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
        MachineOperand &IndexMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
        if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
            (IndexMO.isReg() && IndexMO.getReg() == DefReg))
          // The load uses the register as part of its address making it not
          // invariant.
          return {};

        continue;
      }

      if (SingleUseMI)
        // We already have a single use, this would make two. Bail.
        return {};

      // If this single use isn't data invariant or isn't in this block, we
      // can't sink the hardening to it.
      if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
        return {};

      // If this instruction defines multiple registers bail as we won't harden
      // all of them.
      if (UseMI.getDesc().getNumDefs() > 1)
        return {};

      // If this register isn't a virtual register we can't walk uses of
      // sanely, just bail. Also check that its register class is one of the
      // ones we can harden.
      unsigned UseDefReg = UseMI.getOperand(0).getReg();
      if (!TRI->isVirtualRegister(UseDefReg) ||
          !canHardenRegister(UseDefReg))
        return {};

      SingleUseMI = &UseMI;
    }

    // If SingleUseMI is still null, there is no use that needs its own
    // checking. Otherwise, it is the single use that needs checking.
    return {SingleUseMI};
  };

  MachineInstr *MI = &InitialMI;
  while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
    // Update which MI we're checking now.
    MI = *SingleUse;
    if (!MI)
      break;
  }

  return MI;
}

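/// Returns true if the def of \p Reg can be hardened directly with a GPR
/// `or`: the register must be at most 64 bits wide and must live in a plain
/// GPR register class (not a vector class and not a NOREX-constrained class).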
bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
  auto *RC = MRI->getRegClass(Reg);
  int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
  if (RegBytes > 8)
    // We don't support post-load hardening of vectors.
    return false;

  // If this register class is explicitly constrained to a class that doesn't
  // require REX prefix, we may not be able to satisfy that constraint when
  // emitting the hardening instructions, so bail out here.
  // FIXME: This seems like a pretty lame hack. The way this comes up is when
  // we end up both with a NOREX and REX-only register as operands to the
  // hardening instructions. It would be better to fix that code to handle this
  // situation rather than hack around it in this way.
  const TargetRegisterClass *NOREXRegClasses[] = {
      &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
      &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
  if (RC == NOREXRegClasses[Log2_32(RegBytes)])
    return false;

  const TargetRegisterClass *GPRRegClasses[] = {
      &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
      &X86::GR64RegClass};
  return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
}

// We can harden non-leaking loads into registers without touching the address
// by just hiding all of the loaded bits. We use an `or` instruction to do
// this because having the poison value be all ones allows us to use the same
// value below. And the goal is just for the loaded bits to not be exposed to
// execution; coercing them all to one is sufficient.
void X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();

  // For all of these, the def'ed register operand is operand zero.
  auto &DefOp = MI.getOperand(0);
  unsigned OldDefReg = DefOp.getReg();
  assert(canHardenRegister(OldDefReg) &&
         "Cannot harden this instruction's defined register!");

  auto *DefRC = MRI->getRegClass(OldDefReg);
  int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;

  unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
  unsigned OrOpCode = OrOpCodes[Log2_32(DefRegBytes)];

  unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};

  auto GetStateRegInRC = [&](const TargetRegisterClass &RC) {
    unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);

    int Bytes = TRI->getRegSizeInBits(RC) / 8;
    // FIXME: Need to teach this about 32-bit mode.
    if (Bytes != 8) {
      unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
      unsigned NarrowStateReg = MRI->createVirtualRegister(&RC);
      BuildMI(MBB, MI.getIterator(), Loc, TII->get(TargetOpcode::COPY),
              NarrowStateReg)
          .addReg(StateReg, 0, SubRegImm);
      StateReg = NarrowStateReg;
    }
    return StateReg;
  };

  auto InsertPt = std::next(MI.getIterator());
  unsigned FlagsReg = 0;
  if (isEFLAGSLive(MBB, InsertPt, *TRI))
    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);

  unsigned StateReg = GetStateRegInRC(*DefRC);
  unsigned NewDefReg = MRI->createVirtualRegister(DefRC);
  DefOp.setReg(NewDefReg);
  auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg)
                 .addReg(StateReg)
                 .addReg(NewDefReg);
  OrI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;
  LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");

  if (FlagsReg)
    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);

  ++NumPostLoadRegsHardened;
}

/// Harden a return instruction.
///
/// Returns implicitly perform a load which we need to harden. Without
/// hardening this load, an attacker may speculatively write over the return
/// address to steer speculation of the return to an attacker controlled
/// address. This is called Spectre v1.1 or Bounds Check Bypass Store (BCBS)
/// and is described in this paper:
/// https://people.csail.mit.edu/vlk/spectre11.pdf
///
/// We can harden this by introducing an LFENCE that will delay any load of the
/// return address until prior instructions have retired (and thus are not
/// being speculated), or we can harden the address used by the implicit load:
/// the stack pointer.
///
/// If we are not using an LFENCE, hardening the stack pointer has an
/// additional benefit: it allows us to pass the predicate state accumulated in
/// this function back to the caller. In the absence of a BCBS attack on the
/// return, the caller will typically be resumed and speculatively executed due
/// to the Return Stack Buffer (RSB) prediction, which is very accurate and has
/// a high priority. It is possible that some code from the caller will be
/// executed speculatively even during a BCBS-attacked return until the
/// steering takes effect. Whenever this happens, the caller can recover the
/// (poisoned) predicate state from the stack pointer and continue to harden
/// loads.
void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();
  auto InsertPt = MI.getIterator();

  if (FenceCallAndRet) {
    // Simply forcibly block speculation of loads out of the function by using
    // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
    // should be secure, is simple from an ABI perspective, and the cost can be
    // minimized through inlining.
    //
    // FIXME: We should investigate ways to establish a strong data-dependency
    // on the return. However, poisoning the stack pointer is unlikely to work
    // because the return is *predicted* rather than relying on the load of the
    // return address to actually resolve.
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
    ++NumInstsInserted;
    ++NumLFENCEsInserted;
    return;
  }

  // Take our predicate state, shift it to the high 17 bits (so that we keep
  // pointers canonical) and merge it into RSP. This will allow the caller to
  // extract it when we return (speculatively).
  mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
}

INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
                      "X86 speculative load hardener", false, false)
INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
                    "X86 speculative load hardener", false, false)

FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
  return new X86SpeculativeLoadHardeningPass();
}