//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// Provide a pass which mitigates speculative execution attacks which operate
/// by speculating incorrectly past some predicate (a type check, bounds check,
/// or other condition) to reach a load with invalid inputs and leak the data
/// accessed by that load using a side channel out of the speculative domain.
///
/// For details on the attacks, see the first variant in both the Project Zero
/// writeup and the Spectre paper:
/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
/// https://spectreattack.com/spectre.pdf
///
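/// As an illustrative sketch only (names are invented, not taken from the
/// papers), the classic vulnerable pattern looks like:
/// ```
///   if (idx < array_len)                // predicted taken even when false
///     value = array[idx];               // speculative out-of-bounds load
///   sink = probe_table[value * 4096];   // leaks `value` through the cache
/// ```
/// This pass tracks a predicate state that becomes all-ones whenever a block
/// is reached through misspeculation, and uses that state to force such
/// speculatively loaded values (or the addresses they are loaded from) to a
/// harmless all-ones value.
///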
//===----------------------------------------------------------------------===//
22
23#include "X86.h"
24#include "X86InstrBuilder.h"
25#include "X86InstrInfo.h"
26#include "X86Subtarget.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
Chandler Carruth4b0028a2018-07-19 11:13:58 +000029#include "llvm/ADT/Optional.h"
Chandler Carruth90358e12018-07-13 11:13:58 +000030#include "llvm/ADT/STLExtras.h"
31#include "llvm/ADT/ScopeExit.h"
32#include "llvm/ADT/SmallPtrSet.h"
33#include "llvm/ADT/SmallSet.h"
34#include "llvm/ADT/SmallVector.h"
35#include "llvm/ADT/SparseBitVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/CodeGen/MachineBasicBlock.h"
38#include "llvm/CodeGen/MachineConstantPool.h"
39#include "llvm/CodeGen/MachineFunction.h"
40#include "llvm/CodeGen/MachineFunctionPass.h"
41#include "llvm/CodeGen/MachineInstr.h"
42#include "llvm/CodeGen/MachineInstrBuilder.h"
43#include "llvm/CodeGen/MachineModuleInfo.h"
44#include "llvm/CodeGen/MachineOperand.h"
45#include "llvm/CodeGen/MachineRegisterInfo.h"
46#include "llvm/CodeGen/MachineSSAUpdater.h"
47#include "llvm/CodeGen/TargetInstrInfo.h"
48#include "llvm/CodeGen/TargetRegisterInfo.h"
49#include "llvm/CodeGen/TargetSchedule.h"
50#include "llvm/CodeGen/TargetSubtargetInfo.h"
51#include "llvm/IR/DebugLoc.h"
52#include "llvm/MC/MCSchedule.h"
53#include "llvm/Pass.h"
54#include "llvm/Support/CommandLine.h"
55#include "llvm/Support/Debug.h"
56#include "llvm/Support/raw_ostream.h"
57#include <algorithm>
58#include <cassert>
59#include <iterator>
60#include <utility>
61
62using namespace llvm;
63
64#define PASS_KEY "x86-speculative-load-hardening"
65#define DEBUG_TYPE PASS_KEY
66
67STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
68STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
STATISTIC(NumAddrRegsHardened,
          "Number of address mode used registers hardened");
71STATISTIC(NumPostLoadRegsHardened,
72 "Number of post-load register values hardened");
73STATISTIC(NumInstsInserted, "Number of instructions inserted");
74STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
75
76static cl::opt<bool> HardenEdgesWithLFENCE(
77 PASS_KEY "-lfence",
78 cl::desc(
79 "Use LFENCE along each conditional edge to harden against speculative "
80 "loads rather than conditional movs and poisoned pointers."),
81 cl::init(false), cl::Hidden);
82
83static cl::opt<bool> EnablePostLoadHardening(
84 PASS_KEY "-post-load",
85 cl::desc("Harden the value loaded *after* it is loaded by "
86 "flushing the loaded bits to 1. This is hard to do "
87 "in general but can be done easily for GPRs."),
88 cl::init(true), cl::Hidden);
89
90static cl::opt<bool> FenceCallAndRet(
91 PASS_KEY "-fence-call-and-ret",
92 cl::desc("Use a full speculation fence to harden both call and ret edges "
93 "rather than a lighter weight mitigation."),
94 cl::init(false), cl::Hidden);
95
96static cl::opt<bool> HardenInterprocedurally(
97 PASS_KEY "-ip",
98 cl::desc("Harden interprocedurally by passing our state in and out of "
99 "functions in the high bits of the stack pointer."),
100 cl::init(true), cl::Hidden);
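
// For illustration only -- a sketch of the interprocedural scheme, not the
// exact emitted sequence (see mergePredStateIntoSP and extractPredStateFromSP
// below): on call and return edges the zero/all-ones predicate state is
// shifted up into the high bits of %rsp, which are unused for canonical
// addresses, roughly:
//
//   shlq $47, %rax        # %rax holds the predicate state (0 or -1)
//   orq  %rax, %rsp       # merge it into the stack pointer
//   callq callee
//
// and recovered on the other side by arithmetically right-shifting a copy of
// %rsp so the high bit smears back out to 0 or -1.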
101
102static cl::opt<bool>
103 HardenLoads(PASS_KEY "-loads",
                cl::desc("Sanitize loads from memory. When disabled, no "
                         "significant security is provided."),
106 cl::init(true), cl::Hidden);
107
108namespace llvm {
109
110void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
111
112} // end namespace llvm
113
114namespace {
115
116class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
117public:
118 X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
119 initializeX86SpeculativeLoadHardeningPassPass(
120 *PassRegistry::getPassRegistry());
121 }
122
123 StringRef getPassName() const override {
124 return "X86 speculative load hardening";
125 }
126 bool runOnMachineFunction(MachineFunction &MF) override;
127 void getAnalysisUsage(AnalysisUsage &AU) const override;
128
129 /// Pass identification, replacement for typeid.
130 static char ID;
131
132private:
133 /// The information about a block's conditional terminators needed to trace
134 /// our predicate state through the exiting edges.
135 struct BlockCondInfo {
136 MachineBasicBlock *MBB;
137
138 // We mostly have one conditional branch, and in extremely rare cases have
139 // two. Three and more are so rare as to be unimportant for compile time.
140 SmallVector<MachineInstr *, 2> CondBrs;
141
142 MachineInstr *UncondBr;
143 };
144
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000145 /// Manages the predicate state traced through the program.
146 struct PredState {
147 unsigned InitialReg;
148 unsigned PoisonReg;
149
150 const TargetRegisterClass *RC;
151 MachineSSAUpdater SSA;
152
153 PredState(MachineFunction &MF, const TargetRegisterClass *RC)
154 : RC(RC), SSA(MF) {}
155 };
156
Chandler Carruth90358e12018-07-13 11:13:58 +0000157 const X86Subtarget *Subtarget;
158 MachineRegisterInfo *MRI;
159 const X86InstrInfo *TII;
160 const TargetRegisterInfo *TRI;
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000161
162 Optional<PredState> PS;
Chandler Carruth90358e12018-07-13 11:13:58 +0000163
164 void hardenEdgesWithLFENCE(MachineFunction &MF);
165
166 SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
167
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000168 SmallVector<MachineInstr *, 16>
169 tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
170
Chandler Carruth0477b402018-07-23 04:01:34 +0000171 void hardenAllLoads(MachineFunction &MF);
Chandler Carruth90358e12018-07-13 11:13:58 +0000172
173 unsigned saveEFLAGS(MachineBasicBlock &MBB,
174 MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
175 void restoreEFLAGS(MachineBasicBlock &MBB,
176 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
177 unsigned OFReg);
178
179 void mergePredStateIntoSP(MachineBasicBlock &MBB,
180 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
181 unsigned PredStateReg);
182 unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
183 MachineBasicBlock::iterator InsertPt,
184 DebugLoc Loc);
185
186 void
187 hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000188 MachineOperand &IndexMO,
Chandler Carruth90358e12018-07-13 11:13:58 +0000189 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
190 MachineInstr *
191 sinkPostLoadHardenedInst(MachineInstr &MI,
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000192 SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
193 bool canHardenRegister(unsigned Reg);
Chandler Carruth54529142018-07-24 12:44:00 +0000194 unsigned hardenValueInRegister(unsigned Reg, MachineBasicBlock &MBB,
195 MachineBasicBlock::iterator InsertPt,
196 DebugLoc Loc);
197 unsigned hardenPostLoad(MachineInstr &MI);
Chandler Carrutha3a03ac2018-07-19 23:46:24 +0000198 void hardenReturnInstr(MachineInstr &MI);
Chandler Carruth90358e12018-07-13 11:13:58 +0000199};
200
201} // end anonymous namespace
202
203char X86SpeculativeLoadHardeningPass::ID = 0;
204
205void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
206 AnalysisUsage &AU) const {
207 MachineFunctionPass::getAnalysisUsage(AU);
208}
209
210static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
211 MachineBasicBlock &Succ, int SuccCount,
212 MachineInstr *Br, MachineInstr *&UncondBr,
213 const X86InstrInfo &TII) {
214 assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
215
216 MachineFunction &MF = *MBB.getParent();
217
218 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
219
220 // We have to insert the new block immediately after the current one as we
221 // don't know what layout-successor relationships the successor has and we
222 // may not be able to (and generally don't want to) try to fix those up.
223 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
224
225 // Update the branch instruction if necessary.
226 if (Br) {
227 assert(Br->getOperand(0).getMBB() == &Succ &&
228 "Didn't start with the right target!");
229 Br->getOperand(0).setMBB(&NewMBB);
230
231 // If this successor was reached through a branch rather than fallthrough,
232 // we might have *broken* fallthrough and so need to inject a new
233 // unconditional branch.
234 if (!UncondBr) {
235 MachineBasicBlock &OldLayoutSucc =
236 *std::next(MachineFunction::iterator(&NewMBB));
237 assert(MBB.isSuccessor(&OldLayoutSucc) &&
238 "Without an unconditional branch, the old layout successor should "
239 "be an actual successor!");
240 auto BrBuilder =
241 BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
242 // Update the unconditional branch now that we've added one.
243 UncondBr = &*BrBuilder;
244 }
245
246 // Insert unconditional "jump Succ" instruction in the new block if
247 // necessary.
248 if (!NewMBB.isLayoutSuccessor(&Succ)) {
249 SmallVector<MachineOperand, 4> Cond;
250 TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
251 }
252 } else {
253 assert(!UncondBr &&
254 "Cannot have a branchless successor and an unconditional branch!");
255 assert(NewMBB.isLayoutSuccessor(&Succ) &&
256 "A non-branch successor must have been a layout successor before "
257 "and now is a layout successor of the new block.");
258 }
259
260 // If this is the only edge to the successor, we can just replace it in the
261 // CFG. Otherwise we need to add a new entry in the CFG for the new
262 // successor.
263 if (SuccCount == 1) {
264 MBB.replaceSuccessor(&Succ, &NewMBB);
265 } else {
266 MBB.splitSuccessor(&Succ, &NewMBB);
267 }
268
269 // Hook up the edge from the new basic block to the old successor in the CFG.
270 NewMBB.addSuccessor(&Succ);
271
272 // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
273 for (MachineInstr &MI : Succ) {
274 if (!MI.isPHI())
275 break;
276 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
277 OpIdx += 2) {
278 MachineOperand &OpV = MI.getOperand(OpIdx);
279 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
280 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
281 if (OpMBB.getMBB() != &MBB)
282 continue;
283
      // If this is the last edge to the successor, just replace MBB in the PHI
285 if (SuccCount == 1) {
286 OpMBB.setMBB(&NewMBB);
287 break;
288 }
289
290 // Otherwise, append a new pair of operands for the new incoming edge.
291 MI.addOperand(MF, OpV);
292 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
293 break;
294 }
295 }
296
297 // Inherit live-ins from the successor
298 for (auto &LI : Succ.liveins())
299 NewMBB.addLiveIn(LI);
300
301 LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
302 << Succ.getName() << "'.\n");
303 return NewMBB;
304}
305
/// Remove duplicate PHI operands to leave the PHI in a canonical and
/// predictable form.
///
/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
/// isn't what you might expect. We may have multiple entries in PHI nodes for
/// a single predecessor. This makes CFG-updating extremely complex, so here we
/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
/// one entry per predecessor, regardless of how many edges there are.
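///
/// For example (an MIR-flavored sketch; registers and block numbers are made
/// up), a PHI carrying a duplicated predecessor such as
/// ```
///   %v:gr64 = PHI %a:gr64, %bb.1, %b:gr64, %bb.2, %b:gr64, %bb.2
/// ```
/// is rewritten to have exactly one entry for %bb.2:
/// ```
///   %v:gr64 = PHI %a:gr64, %bb.1, %b:gr64, %bb.2
/// ```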
314static void canonicalizePHIOperands(MachineFunction &MF) {
315 SmallPtrSet<MachineBasicBlock *, 4> Preds;
316 SmallVector<int, 4> DupIndices;
317 for (auto &MBB : MF)
318 for (auto &MI : MBB) {
319 if (!MI.isPHI())
320 break;
321
      // First we scan the operands of the PHI looking for duplicate entries
      // for a particular predecessor. We retain the operand index of each
      // duplicate entry found.
325 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
326 OpIdx += 2)
327 if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
328 DupIndices.push_back(OpIdx);
329
      // Now walk the duplicate indices, removing both the block and value.
      // Note that these are stored as a vector, making this element-wise
      // removal potentially quadratic.
334 //
335 // FIXME: It is really frustrating that we have to use a quadratic
336 // removal algorithm here. There should be a better way, but the use-def
337 // updates required make that impossible using the public API.
338 //
339 // Note that we have to process these backwards so that we don't
340 // invalidate other indices with each removal.
341 while (!DupIndices.empty()) {
342 int OpIdx = DupIndices.pop_back_val();
343 // Remove both the block and value operand, again in reverse order to
344 // preserve indices.
345 MI.RemoveOperand(OpIdx + 1);
346 MI.RemoveOperand(OpIdx);
347 }
348
349 Preds.clear();
350 }
351}
352
Chandler Carruth3620b992018-07-16 10:46:16 +0000353/// Helper to scan a function for loads vulnerable to misspeculation that we
354/// want to harden.
355///
356/// We use this to avoid making changes to functions where there is nothing we
357/// need to do to harden against misspeculation.
358static bool hasVulnerableLoad(MachineFunction &MF) {
359 for (MachineBasicBlock &MBB : MF) {
360 for (MachineInstr &MI : MBB) {
361 // Loads within this basic block after an LFENCE are not at risk of
362 // speculatively executing with invalid predicates from prior control
363 // flow. So break out of this block but continue scanning the function.
364 if (MI.getOpcode() == X86::LFENCE)
365 break;
366
367 // Looking for loads only.
368 if (!MI.mayLoad())
369 continue;
370
371 // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
372 if (MI.getOpcode() == X86::MFENCE)
373 continue;
374
375 // We found a load.
376 return true;
377 }
378 }
379
380 // No loads found.
381 return false;
382}
383
Chandler Carruth90358e12018-07-13 11:13:58 +0000384bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
385 MachineFunction &MF) {
386 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
387 << " **********\n");
388
389 Subtarget = &MF.getSubtarget<X86Subtarget>();
390 MRI = &MF.getRegInfo();
391 TII = Subtarget->getInstrInfo();
392 TRI = Subtarget->getRegisterInfo();
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000393
Chandler Carruth90358e12018-07-13 11:13:58 +0000394 // FIXME: Support for 32-bit.
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000395 PS.emplace(MF, &X86::GR64_NOSPRegClass);
Chandler Carruth90358e12018-07-13 11:13:58 +0000396
397 if (MF.begin() == MF.end())
398 // Nothing to do for a degenerate empty function...
399 return false;
400
401 // We support an alternative hardening technique based on a debug flag.
402 if (HardenEdgesWithLFENCE) {
403 hardenEdgesWithLFENCE(MF);
404 return true;
405 }
406
407 // Create a dummy debug loc to use for all the generated code here.
408 DebugLoc Loc;
409
410 MachineBasicBlock &Entry = *MF.begin();
411 auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
412
413 // Do a quick scan to see if we have any checkable loads.
Chandler Carruth3620b992018-07-16 10:46:16 +0000414 bool HasVulnerableLoad = hasVulnerableLoad(MF);
Chandler Carruth90358e12018-07-13 11:13:58 +0000415
416 // See if we have any conditional branching blocks that we will need to trace
417 // predicate state through.
418 SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
419
420 // If we have no interesting conditions or loads, nothing to do here.
Chandler Carruth3620b992018-07-16 10:46:16 +0000421 if (!HasVulnerableLoad && Infos.empty())
Chandler Carruth90358e12018-07-13 11:13:58 +0000422 return true;
423
Chandler Carruth90358e12018-07-13 11:13:58 +0000424 // The poison value is required to be an all-ones value for many aspects of
425 // this mitigation.
426 const int PoisonVal = -1;
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000427 PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
428 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
Chandler Carruth90358e12018-07-13 11:13:58 +0000429 .addImm(PoisonVal);
430 ++NumInstsInserted;
431
432 // If we have loads being hardened and we've asked for call and ret edges to
433 // get a full fence-based mitigation, inject that fence.
Chandler Carruth3620b992018-07-16 10:46:16 +0000434 if (HasVulnerableLoad && FenceCallAndRet) {
Chandler Carruth90358e12018-07-13 11:13:58 +0000435 // We need to insert an LFENCE at the start of the function to suspend any
436 // incoming misspeculation from the caller. This helps two-fold: the caller
437 // may not have been protected as this code has been, and this code gets to
438 // not take any specific action to protect across calls.
439 // FIXME: We could skip this for functions which unconditionally return
440 // a constant.
441 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
442 ++NumInstsInserted;
443 ++NumLFENCEsInserted;
444 }
445
Chandler Carruthfb503ac2018-07-14 09:32:37 +0000446 // If we guarded the entry with an LFENCE and have no conditionals to protect
447 // in blocks, then we're done.
448 if (FenceCallAndRet && Infos.empty())
Chandler Carruth90358e12018-07-13 11:13:58 +0000449 // We may have changed the function's code at this point to insert fences.
450 return true;
451
  // Set up the initial predicate state in the entry block, either by
  // extracting it from the incoming stack pointer (when hardening
  // interprocedurally) or by zeroing a fresh register.
453 if (HardenInterprocedurally && !FenceCallAndRet) {
454 // Set up the predicate state by extracting it from the incoming stack
455 // pointer so we pick up any misspeculation in our caller.
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000456 PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
Chandler Carruth90358e12018-07-13 11:13:58 +0000457 } else {
458 // Otherwise, just build the predicate state itself by zeroing a register
459 // as we don't need any initial state.
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000460 PS->InitialReg = MRI->createVirtualRegister(PS->RC);
Chandler Carruth90358e12018-07-13 11:13:58 +0000461 unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
462 auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
463 PredStateSubReg);
464 ++NumInstsInserted;
465 MachineOperand *ZeroEFLAGSDefOp =
466 ZeroI->findRegisterDefOperand(X86::EFLAGS);
467 assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
468 "Must have an implicit def of EFLAGS!");
469 ZeroEFLAGSDefOp->setIsDead(true);
470 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000471 PS->InitialReg)
Chandler Carruth90358e12018-07-13 11:13:58 +0000472 .addImm(0)
473 .addReg(PredStateSubReg)
474 .addImm(X86::sub_32bit);
475 }
476
477 // We're going to need to trace predicate state throughout the function's
478 // CFG. Prepare for this by setting up our initial state of PHIs with unique
479 // predecessor entries and all the initial predicate state.
Chandler Carruth3ffcc032018-07-15 23:46:36 +0000480 canonicalizePHIOperands(MF);
Chandler Carruth90358e12018-07-13 11:13:58 +0000481
482 // Track the updated values in an SSA updater to rewrite into SSA form at the
483 // end.
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000484 PS->SSA.Initialize(PS->InitialReg);
485 PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);
Chandler Carruth90358e12018-07-13 11:13:58 +0000486
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000487 // Trace through the CFG.
488 auto CMovs = tracePredStateThroughCFG(MF, Infos);
Chandler Carruth90358e12018-07-13 11:13:58 +0000489
490 // We may also enter basic blocks in this function via exception handling
491 // control flow. Here, if we are hardening interprocedurally, we need to
492 // re-capture the predicate state from the throwing code. In the Itanium ABI,
493 // the throw will always look like a call to __cxa_throw and will have the
494 // predicate state in the stack pointer, so extract fresh predicate state from
495 // the stack pointer and make it available in SSA.
496 // FIXME: Handle non-itanium ABI EH models.
497 if (HardenInterprocedurally) {
498 for (MachineBasicBlock &MBB : MF) {
499 assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
500 assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
501 assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
502 if (!MBB.isEHPad())
503 continue;
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000504 PS->SSA.AddAvailableValue(
Chandler Carruth90358e12018-07-13 11:13:58 +0000505 &MBB,
506 extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
507 }
508 }
509
Chandler Carruth0477b402018-07-23 04:01:34 +0000510 // Now harden all of the loads in the function using the predicate state.
511 hardenAllLoads(MF);
Chandler Carruth90358e12018-07-13 11:13:58 +0000512
513 // Now rewrite all the uses of the pred state using the SSA updater so that
514 // we track updates through the CFG.
515 for (MachineInstr *CMovI : CMovs)
516 for (MachineOperand &Op : CMovI->operands()) {
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000517 if (!Op.isReg() || Op.getReg() != PS->InitialReg)
Chandler Carruth90358e12018-07-13 11:13:58 +0000518 continue;
519
Chandler Carruth4b0028a2018-07-19 11:13:58 +0000520 PS->SSA.RewriteUse(Op);
Chandler Carruth90358e12018-07-13 11:13:58 +0000521 }
522
523 // If we are hardening interprocedurally, find each returning block and
524 // protect the caller from being returned to through misspeculation.
525 if (HardenInterprocedurally)
526 for (MachineBasicBlock &MBB : MF) {
527 if (MBB.empty())
528 continue;
529
530 MachineInstr &MI = MBB.back();
Chandler Carruth1d926fb2018-07-23 07:56:15 +0000531
532 // We only care about returns that are not also calls. For calls, that
533 // happen to also be returns (tail calls) we will have already handled
534 // them as calls.
535 if (!MI.isReturn() || MI.isCall())
Chandler Carruth90358e12018-07-13 11:13:58 +0000536 continue;
537
Chandler Carrutha3a03ac2018-07-19 23:46:24 +0000538 hardenReturnInstr(MI);
Chandler Carruth90358e12018-07-13 11:13:58 +0000539 }
540
541 LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
542 dbgs() << "\n"; MF.verify(this));
543 return true;
544}
545
/// Implements the naive hardening approach of putting an LFENCE after every
/// potentially mis-predicted control flow construct.
///
/// We include this as an alternative mostly for the purpose of comparison. The
/// performance impact of this is expected to be extremely severe and not
/// practical for any real-world users.
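///
/// As a sketch of the transformation (assembly and labels are illustrative
/// only), for a conditional branch such as:
/// ```
///   cmpq %rsi, %rdi
///   jae  .Lout_of_bounds
/// .Lin_bounds:
///   movq (%rdx), %rax
/// ```
/// each non-EH-pad successor simply gains an LFENCE at its start:
/// ```
/// .Lin_bounds:
///   lfence
///   movq (%rdx), %rax
/// ```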
552void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
553 MachineFunction &MF) {
554 // First, we scan the function looking for blocks that are reached along edges
555 // that we might want to harden.
556 SmallSetVector<MachineBasicBlock *, 8> Blocks;
557 for (MachineBasicBlock &MBB : MF) {
558 // If there are no or only one successor, nothing to do here.
559 if (MBB.succ_size() <= 1)
560 continue;
561
562 // Skip blocks unless their terminators start with a branch. Other
563 // terminators don't seem interesting for guarding against misspeculation.
564 auto TermIt = MBB.getFirstTerminator();
565 if (TermIt == MBB.end() || !TermIt->isBranch())
566 continue;
567
    // Add all the non-EH-pad successors to the blocks we want to harden. We
569 // skip EH pads because there isn't really a condition of interest on
570 // entering.
571 for (MachineBasicBlock *SuccMBB : MBB.successors())
572 if (!SuccMBB->isEHPad())
573 Blocks.insert(SuccMBB);
574 }
575
576 for (MachineBasicBlock *MBB : Blocks) {
577 auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
578 BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
579 ++NumInstsInserted;
580 ++NumLFENCEsInserted;
581 }
582}
583
584SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
585X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
586 SmallVector<BlockCondInfo, 16> Infos;
587
588 // Walk the function and build up a summary for each block's conditions that
589 // we need to trace through.
590 for (MachineBasicBlock &MBB : MF) {
591 // If there are no or only one successor, nothing to do here.
592 if (MBB.succ_size() <= 1)
593 continue;
594
595 // We want to reliably handle any conditional branch terminators in the
596 // MBB, so we manually analyze the branch. We can handle all of the
597 // permutations here, including ones that analyze branch cannot.
598 //
599 // The approach is to walk backwards across the terminators, resetting at
600 // any unconditional non-indirect branch, and track all conditional edges
601 // to basic blocks as well as the fallthrough or unconditional successor
602 // edge. For each conditional edge, we track the target and the opposite
603 // condition code in order to inject a "no-op" cmov into that successor
604 // that will harden the predicate. For the fallthrough/unconditional
605 // edge, we inject a separate cmov for each conditional branch with
606 // matching condition codes. This effectively implements an "and" of the
607 // condition flags, even if there isn't a single condition flag that would
608 // directly implement that. We don't bother trying to optimize either of
609 // these cases because if such an optimization is possible, LLVM should
610 // have optimized the conditional *branches* in that way already to reduce
611 // instruction count. This late, we simply assume the minimal number of
612 // branch instructions is being emitted and use that to guide our cmov
613 // insertion.
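    //
    // For example (a sketch; labels are invented), a block ending in:
    // ```
    //   jne .Ltarget
    //   jmp .Lfallthrough
    // ```
    // records the `jne` in CondBrs and the `jmp` as UncondBr. Later we place a
    // `cmove` of the poison value on the edge to `.Ltarget` and a `cmovne` on
    // the edge to `.Lfallthrough`, so that whichever way the branch actually
    // resolved, an edge reached by misspeculation sees its predicate state
    // become all-ones.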
614
615 BlockCondInfo Info = {&MBB, {}, nullptr};
616
617 // Now walk backwards through the terminators and build up successors they
618 // reach and the conditions.
619 for (MachineInstr &MI : llvm::reverse(MBB)) {
620 // Once we've handled all the terminators, we're done.
621 if (!MI.isTerminator())
622 break;
623
624 // If we see a non-branch terminator, we can't handle anything so bail.
625 if (!MI.isBranch()) {
626 Info.CondBrs.clear();
627 break;
628 }
629
630 // If we see an unconditional branch, reset our state, clear any
631 // fallthrough, and set this is the "else" successor.
632 if (MI.getOpcode() == X86::JMP_1) {
633 Info.CondBrs.clear();
634 Info.UncondBr = &MI;
635 continue;
636 }
637
638 // If we get an invalid condition, we have an indirect branch or some
639 // other unanalyzable "fallthrough" case. We model this as a nullptr for
640 // the destination so we can still guard any conditional successors.
641 // Consider code sequences like:
642 // ```
643 // jCC L1
644 // jmpq *%rax
645 // ```
646 // We still want to harden the edge to `L1`.
647 if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
648 Info.CondBrs.clear();
649 Info.UncondBr = &MI;
650 continue;
651 }
652
653 // We have a vanilla conditional branch, add it to our list.
654 Info.CondBrs.push_back(&MI);
655 }
656 if (Info.CondBrs.empty()) {
657 ++NumBranchesUntraced;
658 LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
659 MBB.dump());
660 continue;
661 }
662
663 Infos.push_back(Info);
664 }
665
666 return Infos;
667}
668
/// Trace the predicate state through the CFG, instrumenting each conditional
/// branch such that misspeculation through an edge will poison the predicate
/// state.
///
/// Returns the list of inserted CMov instructions so that they can have their
/// uses of the predicate state rewritten into proper SSA form once it is
/// complete.
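///
/// As an illustrative sketch (registers and labels are invented; the real code
/// builds CMOV machine instructions on virtual registers), hardening the
/// fallthrough edge of:
/// ```
///   testq %rcx, %rcx
///   je    .Lelse
/// .Lthen:                        # fallthrough successor
/// ```
/// inserts a conditional move that poisons the predicate state exactly when
/// the flags say we should *not* be here:
/// ```
/// .Lthen:
///   cmove %r_poison, %r_state    # ZF set => we misspeculated into .Lthen
/// ```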
676SmallVector<MachineInstr *, 16>
677X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
678 MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
679 // Collect the inserted cmov instructions so we can rewrite their uses of the
680 // predicate state into SSA form.
681 SmallVector<MachineInstr *, 16> CMovs;
682
683 // Now walk all of the basic blocks looking for ones that end in conditional
684 // jumps where we need to update this register along each edge.
685 for (const BlockCondInfo &Info : Infos) {
686 MachineBasicBlock &MBB = *Info.MBB;
687 const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
688 MachineInstr *UncondBr = Info.UncondBr;
689
690 LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
691 << "\n");
692 ++NumCondBranchesTraced;
693
694 // Compute the non-conditional successor as either the target of any
695 // unconditional branch or the layout successor.
696 MachineBasicBlock *UncondSucc =
697 UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
698 ? UncondBr->getOperand(0).getMBB()
699 : nullptr)
700 : &*std::next(MachineFunction::iterator(&MBB));
701
702 // Count how many edges there are to any given successor.
703 SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
704 if (UncondSucc)
705 ++SuccCounts[UncondSucc];
706 for (auto *CondBr : CondBrs)
707 ++SuccCounts[CondBr->getOperand(0).getMBB()];
708
709 // A lambda to insert cmov instructions into a block checking all of the
710 // condition codes in a sequence.
711 auto BuildCheckingBlockForSuccAndConds =
712 [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
713 MachineInstr *Br, MachineInstr *&UncondBr,
714 ArrayRef<X86::CondCode> Conds) {
715 // First, we split the edge to insert the checking block into a safe
716 // location.
717 auto &CheckingMBB =
718 (SuccCount == 1 && Succ.pred_size() == 1)
719 ? Succ
720 : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
721
722 bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
723 if (!LiveEFLAGS)
724 CheckingMBB.addLiveIn(X86::EFLAGS);
725
726 // Now insert the cmovs to implement the checks.
727 auto InsertPt = CheckingMBB.begin();
728 assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
729 "Should never have a PHI in the initial checking block as it "
730 "always has a single predecessor!");
731
732 // We will wire each cmov to each other, but need to start with the
733 // incoming pred state.
734 unsigned CurStateReg = PS->InitialReg;
735
736 for (X86::CondCode Cond : Conds) {
737 int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
738 auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
739
740 unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
741 // Note that we intentionally use an empty debug location so that
742 // this picks up the preceding location.
743 auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
744 TII->get(CMovOp), UpdatedStateReg)
745 .addReg(CurStateReg)
746 .addReg(PS->PoisonReg);
747 // If this is the last cmov and the EFLAGS weren't originally
748 // live-in, mark them as killed.
749 if (!LiveEFLAGS && Cond == Conds.back())
750 CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
751
752 ++NumInstsInserted;
753 LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
754 dbgs() << "\n");
755
756 // The first one of the cmovs will be using the top level
757 // `PredStateReg` and need to get rewritten into SSA form.
758 if (CurStateReg == PS->InitialReg)
759 CMovs.push_back(&*CMovI);
760
761 // The next cmov should start from this one's def.
762 CurStateReg = UpdatedStateReg;
763 }
764
765 // And put the last one into the available values for SSA form of our
766 // predicate state.
767 PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
768 };
769
770 std::vector<X86::CondCode> UncondCodeSeq;
771 for (auto *CondBr : CondBrs) {
772 MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
773 int &SuccCount = SuccCounts[&Succ];
774
775 X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
776 X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
777 UncondCodeSeq.push_back(Cond);
778
779 BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
780 {InvCond});
781
782 // Decrement the successor count now that we've split one of the edges.
783 // We need to keep the count of edges to the successor accurate in order
784 // to know above when to *replace* the successor in the CFG vs. just
785 // adding the new successor.
786 --SuccCount;
787 }
788
789 // Since we may have split edges and changed the number of successors,
790 // normalize the probabilities. This avoids doing it each time we split an
791 // edge.
792 MBB.normalizeSuccProbs();
793
794 // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
795 // need to intersect the other condition codes. We can do this by just
796 // doing a cmov for each one.
797 if (!UncondSucc)
798 // If we have no fallthrough to protect (perhaps it is an indirect jump?)
799 // just skip this and continue.
800 continue;
801
802 assert(SuccCounts[UncondSucc] == 1 &&
803 "We should never have more than one edge to the unconditional "
804 "successor at this point because every other edge must have been "
805 "split above!");
806
807 // Sort and unique the codes to minimize them.
808 llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
809 UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
810 UncondCodeSeq.end());
811
812 // Build a checking version of the successor.
813 BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
814 UncondBr, UncondBr, UncondCodeSeq);
815 }
816
817 return CMovs;
818}
819
/// Returns true if the instruction has no behavior (specified or otherwise)
/// that is based on the value of any of its register operands.
///
/// A classical example of something that is inherently not data invariant is
/// an indirect jump -- the destination is loaded into icache based on the bits
/// set in the jump destination register.
///
/// FIXME: This should become part of our instruction tables.
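///
/// For example, `addq %rsi, %rdi` takes the same time regardless of which bits
/// are set in its operands, so it is data invariant, whereas an integer divide
/// (whose latency is typically data dependent) or an indirect `jmpq *%rax`
/// (which steers the front end based on the register value) is not.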
828static bool isDataInvariant(MachineInstr &MI) {
829 switch (MI.getOpcode()) {
830 default:
831 // By default, assume that the instruction is not data invariant.
832 return false;
833
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000834 // Some target-independent operations that trivially lower to data-invariant
835 // instructions.
Chandler Carruth90358e12018-07-13 11:13:58 +0000836 case TargetOpcode::COPY:
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000837 case TargetOpcode::INSERT_SUBREG:
838 case TargetOpcode::SUBREG_TO_REG:
839 return true;
840
  // On x86 it is believed that imul is constant time w.r.t. its inputs.
  // However, these instructions set flags and are perhaps the most
  // surprisingly constant time operations, so we call them out here
  // separately.
844 case X86::IMUL16rr:
845 case X86::IMUL16rri8:
846 case X86::IMUL16rri:
847 case X86::IMUL32rr:
848 case X86::IMUL32rri8:
849 case X86::IMUL32rri:
850 case X86::IMUL64rr:
851 case X86::IMUL64rri32:
852 case X86::IMUL64rri8:
853
854 // Bit scanning and counting instructions that are somewhat surprisingly
855 // constant time as they scan across bits and do other fairly complex
856 // operations like popcnt, but are believed to be constant time on x86.
857 // However, these set flags.
858 case X86::BSF16rr:
859 case X86::BSF32rr:
860 case X86::BSF64rr:
861 case X86::BSR16rr:
862 case X86::BSR32rr:
863 case X86::BSR64rr:
864 case X86::LZCNT16rr:
865 case X86::LZCNT32rr:
866 case X86::LZCNT64rr:
867 case X86::POPCNT16rr:
868 case X86::POPCNT32rr:
869 case X86::POPCNT64rr:
870 case X86::TZCNT16rr:
871 case X86::TZCNT32rr:
872 case X86::TZCNT64rr:
873
874 // Bit manipulation instructions are effectively combinations of basic
875 // arithmetic ops, and should still execute in constant time. These also
876 // set flags.
877 case X86::BLCFILL32rr:
878 case X86::BLCFILL64rr:
879 case X86::BLCI32rr:
880 case X86::BLCI64rr:
881 case X86::BLCIC32rr:
882 case X86::BLCIC64rr:
883 case X86::BLCMSK32rr:
884 case X86::BLCMSK64rr:
885 case X86::BLCS32rr:
886 case X86::BLCS64rr:
887 case X86::BLSFILL32rr:
888 case X86::BLSFILL64rr:
889 case X86::BLSI32rr:
890 case X86::BLSI64rr:
891 case X86::BLSIC32rr:
892 case X86::BLSIC64rr:
893 case X86::BLSMSK32rr:
894 case X86::BLSMSK64rr:
895 case X86::BLSR32rr:
896 case X86::BLSR64rr:
897 case X86::TZMSK32rr:
898 case X86::TZMSK64rr:
899
900 // Bit extracting and clearing instructions should execute in constant time,
901 // and set flags.
902 case X86::BEXTR32rr:
903 case X86::BEXTR64rr:
904 case X86::BEXTRI32ri:
905 case X86::BEXTRI64ri:
906 case X86::BZHI32rr:
907 case X86::BZHI64rr:
908
Chandler Carruthc0cb5732018-07-17 18:07:59 +0000909 // Shift and rotate.
910 case X86::ROL8r1: case X86::ROL16r1: case X86::ROL32r1: case X86::ROL64r1:
911 case X86::ROL8rCL: case X86::ROL16rCL: case X86::ROL32rCL: case X86::ROL64rCL:
912 case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
913 case X86::ROR8r1: case X86::ROR16r1: case X86::ROR32r1: case X86::ROR64r1:
914 case X86::ROR8rCL: case X86::ROR16rCL: case X86::ROR32rCL: case X86::ROR64rCL:
915 case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
916 case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1: case X86::SAR64r1:
917 case X86::SAR8rCL: case X86::SAR16rCL: case X86::SAR32rCL: case X86::SAR64rCL:
918 case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
919 case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1: case X86::SHL64r1:
920 case X86::SHL8rCL: case X86::SHL16rCL: case X86::SHL32rCL: case X86::SHL64rCL:
921 case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri:
922 case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1: case X86::SHR64r1:
923 case X86::SHR8rCL: case X86::SHR16rCL: case X86::SHR32rCL: case X86::SHR64rCL:
924 case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
925 case X86::SHLD16rrCL: case X86::SHLD32rrCL: case X86::SHLD64rrCL:
926 case X86::SHLD16rri8: case X86::SHLD32rri8: case X86::SHLD64rri8:
927 case X86::SHRD16rrCL: case X86::SHRD32rrCL: case X86::SHRD64rrCL:
928 case X86::SHRD16rri8: case X86::SHRD32rri8: case X86::SHRD64rri8:
929
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000930 // Basic arithmetic is constant time on the input but does set flags.
931 case X86::ADC8rr: case X86::ADC8ri:
932 case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8:
933 case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8:
934 case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32:
935 case X86::ADD8rr: case X86::ADD8ri:
936 case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8:
937 case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8:
938 case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32:
939 case X86::AND8rr: case X86::AND8ri:
940 case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8:
941 case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8:
942 case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32:
943 case X86::OR8rr: case X86::OR8ri:
944 case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8:
945 case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8:
946 case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32:
947 case X86::SBB8rr: case X86::SBB8ri:
948 case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8:
949 case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8:
950 case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32:
951 case X86::SUB8rr: case X86::SUB8ri:
952 case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8:
953 case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8:
954 case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32:
955 case X86::XOR8rr: case X86::XOR8ri:
956 case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8:
957 case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8:
958 case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32:
959 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
960 case X86::ADCX32rr: case X86::ADCX64rr:
961 case X86::ADOX32rr: case X86::ADOX64rr:
962 case X86::ANDN32rr: case X86::ANDN64rr:
Chandler Carruthc0cb5732018-07-17 18:07:59 +0000963 // Unary arithmetic operations.
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000964 case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
Chandler Carruthc0cb5732018-07-17 18:07:59 +0000965 case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
966 case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000967 // Check whether the EFLAGS implicit-def is dead. We assume that this will
968 // always find the implicit-def because this code should only be reached
969 // for instructions that do in fact implicitly def this.
970 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
971 // If we would clobber EFLAGS that are used, just bail for now.
972 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
973 MI.dump(); dbgs() << "\n");
974 return false;
975 }
976
977 // Otherwise, fallthrough to handle these the same as instructions that
978 // don't set EFLAGS.
979 LLVM_FALLTHROUGH;
980
Chandler Carruthc0cb5732018-07-17 18:07:59 +0000981 // Unlike other arithmetic, NOT doesn't set EFLAGS.
982 case X86::NOT8r: case X86::NOT16r: case X86::NOT32r: case X86::NOT64r:
983
984 // Various move instructions used to zero or sign extend things. Note that we
985 // intentionally don't support the _NOREX variants as we can't handle that
986 // register constraint anyways.
987 case X86::MOVSX16rr8:
988 case X86::MOVSX32rr8: case X86::MOVSX32rr16:
989 case X86::MOVSX64rr8: case X86::MOVSX64rr16: case X86::MOVSX64rr32:
990 case X86::MOVZX16rr8:
991 case X86::MOVZX32rr8: case X86::MOVZX32rr16:
992 case X86::MOVZX64rr8: case X86::MOVZX64rr16:
993 case X86::MOV32rr:
Chandler Carruthfa065aa2018-07-16 14:58:32 +0000994
995 // Arithmetic instructions that are both constant time and don't set flags.
996 case X86::RORX32ri:
997 case X86::RORX64ri:
998 case X86::SARX32rr:
999 case X86::SARX64rr:
1000 case X86::SHLX32rr:
1001 case X86::SHLX64rr:
1002 case X86::SHRX32rr:
1003 case X86::SHRX64rr:
1004
1005 // LEA doesn't actually access memory, and its arithmetic is constant time.
1006 case X86::LEA16r:
1007 case X86::LEA32r:
1008 case X86::LEA64_32r:
1009 case X86::LEA64r:
Chandler Carruth90358e12018-07-13 11:13:58 +00001010 return true;
1011 }
1012}
1013
1014/// Returns true if the instruction has no behavior (specified or otherwise)
1015/// that is based on the value loaded from memory or the value of any
1016/// non-address register operands.
1017///
/// For example, the latency of the instruction must not depend on the
/// particular bits set in any of its register operands *or* on any of the
/// bits loaded from memory.
1021///
1022/// A classical example of something that is inherently not data invariant is an
1023/// indirect jump -- the destination is loaded into icache based on the bits set
1024/// in the jump destination register.
1025///
1026/// FIXME: This should become part of our instruction tables.
1027static bool isDataInvariantLoad(MachineInstr &MI) {
1028 switch (MI.getOpcode()) {
1029 default:
1030 // By default, assume that the load will immediately leak.
1031 return false;
1032
  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
  // However, these instructions set flags and are perhaps the most
  // surprisingly constant time operations, so we call them out here
  // separately.
Chandler Carruth90358e12018-07-13 11:13:58 +00001036 case X86::IMUL16rm:
1037 case X86::IMUL16rmi8:
1038 case X86::IMUL16rmi:
1039 case X86::IMUL32rm:
1040 case X86::IMUL32rmi8:
1041 case X86::IMUL32rmi:
1042 case X86::IMUL64rm:
1043 case X86::IMUL64rmi32:
1044 case X86::IMUL64rmi8:
1045
Craig Topper445abf72018-07-13 22:41:46 +00001046 // Bit scanning and counting instructions that are somewhat surprisingly
1047 // constant time as they scan across bits and do other fairly complex
1048 // operations like popcnt, but are believed to be constant time on x86.
1049 // However, these set flags.
1050 case X86::BSF16rm:
1051 case X86::BSF32rm:
1052 case X86::BSF64rm:
1053 case X86::BSR16rm:
1054 case X86::BSR32rm:
1055 case X86::BSR64rm:
1056 case X86::LZCNT16rm:
1057 case X86::LZCNT32rm:
1058 case X86::LZCNT64rm:
1059 case X86::POPCNT16rm:
1060 case X86::POPCNT32rm:
1061 case X86::POPCNT64rm:
1062 case X86::TZCNT16rm:
1063 case X86::TZCNT32rm:
1064 case X86::TZCNT64rm:
1065
1066 // Bit manipulation instructions are effectively combinations of basic
1067 // arithmetic ops, and should still execute in constant time. These also
1068 // set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +00001069 case X86::BLCFILL32rm:
1070 case X86::BLCFILL64rm:
1071 case X86::BLCI32rm:
1072 case X86::BLCI64rm:
1073 case X86::BLCIC32rm:
1074 case X86::BLCIC64rm:
1075 case X86::BLCMSK32rm:
1076 case X86::BLCMSK64rm:
1077 case X86::BLCS32rm:
1078 case X86::BLCS64rm:
1079 case X86::BLSFILL32rm:
1080 case X86::BLSFILL64rm:
1081 case X86::BLSI32rm:
1082 case X86::BLSI64rm:
1083 case X86::BLSIC32rm:
1084 case X86::BLSIC64rm:
1085 case X86::BLSMSK32rm:
1086 case X86::BLSMSK64rm:
1087 case X86::BLSR32rm:
1088 case X86::BLSR64rm:
Chandler Carruth90358e12018-07-13 11:13:58 +00001089 case X86::TZMSK32rm:
1090 case X86::TZMSK64rm:
1091
Craig Topper445abf72018-07-13 22:41:46 +00001092 // Bit extracting and clearing instructions should execute in constant time,
1093 // and set flags.
1094 case X86::BEXTR32rm:
1095 case X86::BEXTR64rm:
1096 case X86::BEXTRI32mi:
1097 case X86::BEXTRI64mi:
1098 case X86::BZHI32rm:
1099 case X86::BZHI64rm:
1100
1101 // Basic arithmetic is constant time on the input but does set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +00001102 case X86::ADC8rm:
1103 case X86::ADC16rm:
1104 case X86::ADC32rm:
1105 case X86::ADC64rm:
1106 case X86::ADCX32rm:
1107 case X86::ADCX64rm:
1108 case X86::ADD8rm:
1109 case X86::ADD16rm:
1110 case X86::ADD32rm:
1111 case X86::ADD64rm:
1112 case X86::ADOX32rm:
1113 case X86::ADOX64rm:
1114 case X86::AND8rm:
1115 case X86::AND16rm:
1116 case X86::AND32rm:
1117 case X86::AND64rm:
1118 case X86::ANDN32rm:
1119 case X86::ANDN64rm:
Chandler Carruth90358e12018-07-13 11:13:58 +00001120 case X86::OR8rm:
1121 case X86::OR16rm:
1122 case X86::OR32rm:
1123 case X86::OR64rm:
1124 case X86::SBB8rm:
1125 case X86::SBB16rm:
1126 case X86::SBB32rm:
1127 case X86::SBB64rm:
1128 case X86::SUB8rm:
1129 case X86::SUB16rm:
1130 case X86::SUB32rm:
1131 case X86::SUB64rm:
1132 case X86::XOR8rm:
1133 case X86::XOR16rm:
1134 case X86::XOR32rm:
1135 case X86::XOR64rm:
Chandler Carruth90358e12018-07-13 11:13:58 +00001136 // Check whether the EFLAGS implicit-def is dead. We assume that this will
1137 // always find the implicit-def because this code should only be reached
1138 // for instructions that do in fact implicitly def this.
1139 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
1140 // If we would clobber EFLAGS that are used, just bail for now.
1141 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
1142 MI.dump(); dbgs() << "\n");
1143 return false;
1144 }
1145
1146 // Otherwise, fallthrough to handle these the same as instructions that
1147 // don't set EFLAGS.
1148 LLVM_FALLTHROUGH;
1149
Craig Topper445abf72018-07-13 22:41:46 +00001150 // Integer multiply w/o affecting flags is still believed to be constant
1151 // time on x86. Called out separately as this is among the most surprising
1152 // instructions to exhibit that behavior.
Chandler Carruth90358e12018-07-13 11:13:58 +00001153 case X86::MULX32rm:
1154 case X86::MULX64rm:
1155
Craig Topper445abf72018-07-13 22:41:46 +00001156 // Arithmetic instructions that are both constant time and don't set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +00001157 case X86::RORX32mi:
1158 case X86::RORX64mi:
1159 case X86::SARX32rm:
1160 case X86::SARX64rm:
1161 case X86::SHLX32rm:
1162 case X86::SHLX64rm:
1163 case X86::SHRX32rm:
1164 case X86::SHRX64rm:
1165
Craig Topper445abf72018-07-13 22:41:46 +00001166 // Conversions are believed to be constant time and don't set flags.
Craig Topper41fa8582018-07-13 22:41:50 +00001167 case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm:
1168 case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm:
1169 case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm:
1170 case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm:
1171 case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm:
1172 case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm:
1173 case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm:
1174 case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm:
1175 case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm:
1176 case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm:
1177 // AVX512 added unsigned integer conversions.
1178 case X86::VCVTTSD2USI64Zrm:
1179 case X86::VCVTTSD2USIZrm:
1180 case X86::VCVTTSS2USI64Zrm:
1181 case X86::VCVTTSS2USIZrm:
1182 case X86::VCVTUSI2SDZrm:
1183 case X86::VCVTUSI642SDZrm:
1184 case X86::VCVTUSI2SSZrm:
1185 case X86::VCVTUSI642SSZrm:
Chandler Carruth90358e12018-07-13 11:13:58 +00001186
Craig Topper445abf72018-07-13 22:41:46 +00001187 // Loads to register don't set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +00001188 case X86::MOV8rm:
1189 case X86::MOV8rm_NOREX:
1190 case X86::MOV16rm:
1191 case X86::MOV32rm:
1192 case X86::MOV64rm:
1193 case X86::MOVSX16rm8:
1194 case X86::MOVSX32rm16:
1195 case X86::MOVSX32rm8:
1196 case X86::MOVSX32rm8_NOREX:
1197 case X86::MOVSX64rm16:
1198 case X86::MOVSX64rm32:
1199 case X86::MOVSX64rm8:
1200 case X86::MOVZX16rm8:
1201 case X86::MOVZX32rm16:
1202 case X86::MOVZX32rm8:
1203 case X86::MOVZX32rm8_NOREX:
1204 case X86::MOVZX64rm16:
1205 case X86::MOVZX64rm8:
1206 return true;
1207 }
1208}
1209
1210static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
1211 const TargetRegisterInfo &TRI) {
1212 // Check if EFLAGS are alive by seeing if there is a def of them or they
1213 // live-in, and then seeing if that def is in turn used.
1214 for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
1215 if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
1216 // If the def is dead, then EFLAGS is not live.
1217 if (DefOp->isDead())
1218 return false;
1219
1220 // Otherwise we've def'ed it, and it is live.
1221 return true;
1222 }
1223 // While at this instruction, also check if we use and kill EFLAGS
1224 // which means it isn't live.
1225 if (MI.killsRegister(X86::EFLAGS, &TRI))
1226 return false;
1227 }
1228
1229 // If we didn't find anything conclusive (neither definitely alive or
1230 // definitely dead) return whether it lives into the block.
1231 return MBB.isLiveIn(X86::EFLAGS);
1232}
1233
/// Harden all of the loads (including implicit loads) in the function.
///
/// This operates in two passes. First, we analyze the loads to determine which
/// strategy will be used to harden them: hardening the address or hardening
/// the loaded value when it is loaded into a register amenable to hardening.
/// We have to process these first because the two strategies may interact --
/// later hardening may change what strategy we wish to use. We also analyze
/// data dependencies between loads and avoid hardening those loads that are
/// data dependent on a load with a hardened address. We also skip hardening
/// loads already behind an LFENCE as that is sufficient to harden them against
/// misspeculation.
///
/// Second, we apply the hardening steps we determined necessary in the first
/// pass.
///
/// These two passes are applied to each basic block. We operate on one block
/// at a time to simplify reasoning about reachability and sequencing.
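///
/// A rough sketch of the two strategies (assembly and register names are
/// illustrative only; %r_state stands for the zero/all-ones predicate state):
/// ```
///   # Address hardening: poison the pointer before the load so that under
///   # misspeculation the load uses an all-ones address rather than an
///   # attacker-controlled one.
///   orq  %r_state, %rdi
///   movq (%rdi), %rax
///
///   # Post-load (value) hardening: load first, then flush the loaded bits
///   # to all-ones under misspeculation.
///   movq (%rdi), %rax
///   orq  %r_state, %rax
/// ```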
1251void X86SpeculativeLoadHardeningPass::hardenAllLoads(MachineFunction &MF) {
Chandler Carruth90358e12018-07-13 11:13:58 +00001252 // If the actual checking of loads is disabled, skip doing anything here.
1253 if (!HardenLoads)
1254 return;
1255
1256 SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
1257 SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
1258
1259 SmallSet<unsigned, 16> HardenedAddrRegs;
1260
1261 SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
1262
1263 // Track the set of load-dependent registers through the basic block. Because
1264 // the values of these registers have an existing data dependency on a loaded
1265 // value which we would have checked, we can omit any checks on them.
1266 SparseBitVector<> LoadDepRegs;
1267
1268 for (MachineBasicBlock &MBB : MF) {
1269 // We harden the loads of a basic block in several passes:
1270 //
1271 // 1) Collect all the loads which can have their loaded value hardened
1272 // and all the loads that instead need their address hardened. During
1273 // this walk we propagate load dependence for address hardened loads and
1274 // also look for LFENCE to stop hardening wherever possible. When
1275 // deciding whether or not to harden the loaded value or not, we check
1276 // to see if any registers used in the address will have been hardened
1277 // at this point and if so, harden any remaining address registers as
1278 // that often successfully re-uses hardened addresses and minimizes
1279 // instructions. FIXME: We should consider an aggressive mode where we
1280 // continue to keep as many loads value hardened even when some address
1281 // register hardening would be free (due to reuse).
1282 for (MachineInstr &MI : MBB) {
1283 // We naively assume that all def'ed registers of an instruction have
1284 // a data dependency on all of their operands.
1285 // FIXME: Do a more careful analysis of x86 to build a conservative model
1286 // here.
1287 if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
1288 return Op.isReg() && LoadDepRegs.test(Op.getReg());
1289 }))
1290 for (MachineOperand &Def : MI.defs())
1291 if (Def.isReg())
1292 LoadDepRegs.set(Def.getReg());
1293
1294 // Both Intel and AMD are guiding that they will change the semantics of
1295 // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1296 // no more need to guard things in this block.
1297 if (MI.getOpcode() == X86::LFENCE)
1298 break;
1299
1300 // If this instruction cannot load, nothing to do.
1301 if (!MI.mayLoad())
1302 continue;
1303
1304 // Some instructions which "load" are trivially safe or unimportant.
1305 if (MI.getOpcode() == X86::MFENCE)
1306 continue;
1307
1308 // Extract the memory operand information about this instruction.
1309 // FIXME: This doesn't handle loading pseudo instructions which we often
1310 // could handle with similarly generic logic. We probably need to add an
1311 // MI-layer routine similar to the MC-layer one we use here which maps
1312 // pseudos much like this maps real instructions.
1313 const MCInstrDesc &Desc = MI.getDesc();
1314 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1315 if (MemRefBeginIdx < 0) {
1316 LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: ";
1317 MI.dump());
1318 continue;
1319 }
1320
1321 MemRefBeginIdx += X86II::getOperandBias(Desc);
1322
1323 MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1324 MachineOperand &IndexMO =
1325 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1326
1327 // If we have at least one (non-frame-index, non-RIP) register operand,
1328 // and neither operand is load-dependent, we need to check the load.
1329 unsigned BaseReg = 0, IndexReg = 0;
1330 if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1331 BaseMO.getReg() != X86::NoRegister)
1332 BaseReg = BaseMO.getReg();
1333 if (IndexMO.getReg() != X86::NoRegister)
1334 IndexReg = IndexMO.getReg();
1335
1336 if (!BaseReg && !IndexReg)
1337 // No register operands!
1338 continue;
1339
1340 // If any register operand is dependent, this load is dependent and we
1341 // needn't check it.
1342 // FIXME: Is this true in the case where we are hardening loads after
1343 // they complete? Unclear, need to investigate.
1344 if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
1345 (IndexReg && LoadDepRegs.test(IndexReg)))
1346 continue;
1347
Chandler Carruthe66a6f42018-07-16 11:38:48 +00001348 // If post-load hardening is enabled, this load is compatible with
1349 // post-load hardening, and we aren't already going to harden one of the
Chandler Carruth90358e12018-07-13 11:13:58 +00001350 // address registers, queue it up to be hardened post-load. Notably, even
1351 // once hardened this won't introduce a useful dependency that could prune
1352 // out subsequent loads.
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001353 if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
1354 MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
1355 canHardenRegister(MI.getOperand(0).getReg()) &&
Chandler Carruth90358e12018-07-13 11:13:58 +00001356 !HardenedAddrRegs.count(BaseReg) &&
1357 !HardenedAddrRegs.count(IndexReg)) {
1358 HardenPostLoad.insert(&MI);
1359 HardenedAddrRegs.insert(MI.getOperand(0).getReg());
1360 continue;
1361 }
1362
1363 // Record this instruction for address hardening and record its register
1364 // operands as being address-hardened.
1365 HardenLoadAddr.insert(&MI);
1366 if (BaseReg)
1367 HardenedAddrRegs.insert(BaseReg);
1368 if (IndexReg)
1369 HardenedAddrRegs.insert(IndexReg);
1370
1371 for (MachineOperand &Def : MI.defs())
1372 if (Def.isReg())
1373 LoadDepRegs.set(Def.getReg());
1374 }
1375
1376 // Now re-walk the instructions in the basic block, and apply whichever
1377 // hardening strategy we have elected. Note that we do this in a second
1378 // pass specifically so that we have the complete set of instructions for
1379 // which we will do post-load hardening and can defer it in certain
1380 // circumstances.
1381 //
1382 // FIXME: This could probably be made even more effective by doing it
1383 // across the entire function. Rather than just walking the flat list
1384 // backwards here, we could walk the function in PO and each block bottom
1385     // up, allowing us in some cases to sink hardening across basic blocks. As
1386 // long as the in-block predicate state is used at the eventual hardening
1387 // site, this remains safe.
1388 for (MachineInstr &MI : MBB) {
1389 // We cannot both require hardening the def of a load and its address.
1390 assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
1391 "Requested to harden both the address and def of a load!");
1392
1393 // Check if this is a load whose address needs to be hardened.
1394 if (HardenLoadAddr.erase(&MI)) {
1395 const MCInstrDesc &Desc = MI.getDesc();
1396 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1397 assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
1398
1399 MemRefBeginIdx += X86II::getOperandBias(Desc);
1400
1401 MachineOperand &BaseMO =
1402 MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1403 MachineOperand &IndexMO =
1404 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001405 hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
Chandler Carruth90358e12018-07-13 11:13:58 +00001406 continue;
1407 }
1408
1409 // Test if this instruction is one of our post load instructions (and
1410 // remove it from the set if so).
1411 if (HardenPostLoad.erase(&MI)) {
1412 assert(!MI.isCall() && "Must not try to post-load harden a call!");
1413
1414         // If this is a data-invariant load, we want to try to sink any
1415 // hardening as far as possible.
1416 if (isDataInvariantLoad(MI)) {
1417 // Sink the instruction we'll need to harden as far as we can down the
1418 // graph.
1419 MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
1420
1421 // If we managed to sink this instruction, update everything so we
1422 // harden that instruction when we reach it in the instruction
1423 // sequence.
1424 if (SunkMI != &MI) {
1425 // If in sinking there was no instruction needing to be hardened,
1426 // we're done.
1427 if (!SunkMI)
1428 continue;
1429
1430 // Otherwise, add this to the set of defs we harden.
1431 HardenPostLoad.insert(SunkMI);
1432 continue;
1433 }
1434 }
1435
Chandler Carruth54529142018-07-24 12:44:00 +00001436 unsigned HardenedReg = hardenPostLoad(MI);
Chandler Carruth90358e12018-07-13 11:13:58 +00001437
Chandler Carruth54529142018-07-24 12:44:00 +00001438 // Mark the resulting hardened register as such so we don't re-harden.
1439 AddrRegToHardenedReg[HardenedReg] = HardenedReg;
1440
Chandler Carruth90358e12018-07-13 11:13:58 +00001441 continue;
1442 }
1443
1444 // After we finish processing the instruction and doing any hardening
1445 // necessary for it, we need to handle transferring the predicate state
1446 // into a call and recovering it after the call returns (if it returns).
1447 if (!MI.isCall())
1448 continue;
1449
1450 // If we're not hardening interprocedurally, we can just skip calls.
1451 if (!HardenInterprocedurally)
1452 continue;
1453
1454 auto InsertPt = MI.getIterator();
1455 DebugLoc Loc = MI.getDebugLoc();
1456
1457 // First, we transfer the predicate state into the called function by
1458 // merging it into the stack pointer. This will kill the current def of
1459 // the state.
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001460 unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
Chandler Carruth90358e12018-07-13 11:13:58 +00001461 mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
1462
1463 // If this call is also a return (because it is a tail call) we're done.
1464 if (MI.isReturn())
1465 continue;
1466
1467 // Otherwise we need to step past the call and recover the predicate
1468 // state from SP after the return, and make this new state available.
1469 ++InsertPt;
1470 unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001471 PS->SSA.AddAvailableValue(&MBB, NewStateReg);
Chandler Carruth90358e12018-07-13 11:13:58 +00001472 }
1473
1474 HardenPostLoad.clear();
1475 HardenLoadAddr.clear();
1476 HardenedAddrRegs.clear();
1477 AddrRegToHardenedReg.clear();
1478
1479 // Currently, we only track data-dependent loads within a basic block.
1480 // FIXME: We should see if this is necessary or if we could be more
1481 // aggressive here without opening up attack avenues.
1482 LoadDepRegs.clear();
1483 }
1484}
1485
1486/// Save EFLAGS into the returned GPR. This can in turn be restored with
1487/// `restoreEFLAGS`.
1488///
1489/// Note that LLVM can only lower very simple patterns of saved and restored
1490/// EFLAGS registers. The restore should always be within the same basic block
1491/// as the save so that no PHI nodes are inserted.
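///
/// A typical pairing within a single block looks roughly like this (an
/// illustrative sketch, not a verbatim excerpt of this pass):
///   unsigned FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
///   // ... emit EFLAGS-clobbering hardening instructions ...
///   restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);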
1492unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1493 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1494 DebugLoc Loc) {
1495 // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1496 // what instruction selection does.
1497 unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
1498 // We directly copy the FLAGS register and rely on later lowering to clean
1499 // this up into the appropriate setCC instructions.
1500 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
1501 ++NumInstsInserted;
1502 return Reg;
1503}
1504
1505/// Restore EFLAGS from the provided GPR. This should be produced by
1506/// `saveEFLAGS`.
1507///
1508/// This must be done within the same basic block as the save in order to
1509/// reliably lower.
1510void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1511 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1512 unsigned Reg) {
1513 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
1514 ++NumInstsInserted;
1515}
1516
1517/// Takes the current predicate state (in a register) and merges it into the
1518/// stack pointer. The state is essentially a single bit, but we merge this in
1519/// a way that won't form non-canonical pointers and also will be preserved
1520/// across normal stack adjustments.
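///
/// Roughly, the emitted sequence looks like this (an illustrative sketch):
///   shlq $47, %state    # move the all-ones/zero state into the high 17 bits
///   orq  %state, %rsp   # a zero state leaves RSP unchanged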
1521void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
1522 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1523 unsigned PredStateReg) {
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001524 unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
Chandler Carruth90358e12018-07-13 11:13:58 +00001525 // FIXME: This hard codes a shift distance based on the number of bits needed
1526 // to stay canonical on 64-bit. We should compute this somehow and support
1527 // 32-bit as part of that.
1528 auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
1529 .addReg(PredStateReg, RegState::Kill)
1530 .addImm(47);
1531 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1532 ++NumInstsInserted;
1533 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
1534 .addReg(X86::RSP)
1535 .addReg(TmpReg, RegState::Kill);
1536 OrI->addRegisterDead(X86::EFLAGS, TRI);
1537 ++NumInstsInserted;
1538}
1539
1540/// Extracts the predicate state stored in the high bits of the stack pointer.
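///
/// Roughly, the emitted sequence looks like this (an illustrative sketch):
///   movq %rsp, %tmp
///   sarq $63, %tmp      # smear the high bit across all bits of the state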
1541unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
1542 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1543 DebugLoc Loc) {
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001544 unsigned PredStateReg = MRI->createVirtualRegister(PS->RC);
1545 unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
Chandler Carruth90358e12018-07-13 11:13:58 +00001546
1547 // We know that the stack pointer will have any preserved predicate state in
1548 // its high bit. We just want to smear this across the other bits. Turns out,
1549 // this is exactly what an arithmetic right shift does.
1550 BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
1551 .addReg(X86::RSP);
1552 auto ShiftI =
1553 BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
1554 .addReg(TmpReg, RegState::Kill)
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001555 .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
Chandler Carruth90358e12018-07-13 11:13:58 +00001556 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1557 ++NumInstsInserted;
1558
1559 return PredStateReg;
1560}
1561
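/// Harden the base and index registers feeding a load's address by merging the
/// predicate state into them, so that under misspeculation (an all-ones state)
/// the attacker-influenced address is destroyed before the load executes. Any
/// register already recorded in `AddrRegToHardenedReg` is simply rewritten to
/// its hardened copy.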
1562void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
1563 MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
Chandler Carruth90358e12018-07-13 11:13:58 +00001564 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
1565 MachineBasicBlock &MBB = *MI.getParent();
1566 DebugLoc Loc = MI.getDebugLoc();
1567
1568   // Check if EFLAGS are alive by seeing if there is a def of them or they are
1569   // live-in, and then seeing if that def is in turn used.
1570 bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
1571
1572 SmallVector<MachineOperand *, 2> HardenOpRegs;
1573
1574 if (BaseMO.isFI()) {
1575 // A frame index is never a dynamically controllable load, so only
1576 // harden it if we're covering fixed address loads as well.
1577 LLVM_DEBUG(
1578 dbgs() << " Skipping hardening base of explicit stack frame load: ";
1579 MI.dump(); dbgs() << "\n");
1580 } else if (BaseMO.getReg() == X86::RIP ||
1581 BaseMO.getReg() == X86::NoRegister) {
1582 // For both RIP-relative addressed loads or absolute loads, we cannot
1583 // meaningfully harden them because the address being loaded has no
1584 // dynamic component.
1585 //
1586 // FIXME: When using a segment base (like TLS does) we end up with the
1587 // dynamic address being the base plus -1 because we can't mutate the
1588 // segment register here. This allows the signed 32-bit offset to point at
1589 // valid segment-relative addresses and load them successfully.
1590 LLVM_DEBUG(
1591 dbgs() << " Cannot harden base of "
1592 << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
1593 << " address in a load!");
1594 } else {
1595 assert(BaseMO.isReg() &&
1596 "Only allowed to have a frame index or register base.");
1597 HardenOpRegs.push_back(&BaseMO);
1598 }
1599
1600 if (IndexMO.getReg() != X86::NoRegister &&
1601 (HardenOpRegs.empty() ||
1602 HardenOpRegs.front()->getReg() != IndexMO.getReg()))
1603 HardenOpRegs.push_back(&IndexMO);
1604
1605 assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
1606 "Should have exactly one or two registers to harden!");
1607 assert((HardenOpRegs.size() == 1 ||
1608 HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
1609 "Should not have two of the same registers!");
1610
1611   // Remove any registers that have already been checked.
1612 llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
1613 // See if this operand's register has already been checked.
1614 auto It = AddrRegToHardenedReg.find(Op->getReg());
1615 if (It == AddrRegToHardenedReg.end())
1616 // Not checked, so retain this one.
1617 return false;
1618
1619 // Otherwise, we can directly update this operand and remove it.
1620 Op->setReg(It->second);
1621 return true;
1622 });
1623 // If there are none left, we're done.
1624 if (HardenOpRegs.empty())
1625 return;
1626
1627 // Compute the current predicate state.
Chandler Carruth4b0028a2018-07-19 11:13:58 +00001628 unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
Chandler Carruth90358e12018-07-13 11:13:58 +00001629
1630 auto InsertPt = MI.getIterator();
1631
1632 // If EFLAGS are live and we don't have access to instructions that avoid
1633 // clobbering EFLAGS we need to save and restore them. This in turn makes
1634 // the EFLAGS no longer live.
1635 unsigned FlagsReg = 0;
1636 if (EFLAGSLive && !Subtarget->hasBMI2()) {
1637 EFLAGSLive = false;
1638 FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1639 }
1640
1641 for (MachineOperand *Op : HardenOpRegs) {
Chandler Carruth90358e12018-07-13 11:13:58 +00001642 unsigned OpReg = Op->getReg();
Chandler Carruthcdf0add2018-07-16 04:17:51 +00001643 auto *OpRC = MRI->getRegClass(OpReg);
Chandler Carruth90358e12018-07-13 11:13:58 +00001644 unsigned TmpReg = MRI->createVirtualRegister(OpRC);
1645
Chandler Carruthcdf0add2018-07-16 04:17:51 +00001646 // If this is a vector register, we'll need somewhat custom logic to handle
1647 // hardening it.
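    // On the AVX2 (no VLX) path below, the emitted sequence is roughly
    // (illustrative register names):
    //   vmovq        %state, %xmm_s
    //   vpbroadcastq %xmm_s, %ymm_s     # or the 128-bit form
    //   vpor         %ymm_s, %op, %tmp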
1648 if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
1649 OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
1650 assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
1651 bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);
1652
1653 // Move our state into a vector register.
1654 // FIXME: We could skip this at the cost of longer encodings with AVX-512
1655       // but that doesn't seem likely to be worth it.
1656 unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
1657 auto MovI =
1658 BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
1659 .addReg(StateReg);
1660 (void)MovI;
1661 ++NumInstsInserted;
1662 LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
1663
1664 // Broadcast it across the vector register.
1665 unsigned VBStateReg = MRI->createVirtualRegister(OpRC);
1666 auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
1667 TII->get(Is128Bit ? X86::VPBROADCASTQrr
1668 : X86::VPBROADCASTQYrr),
1669 VBStateReg)
1670 .addReg(VStateReg);
1671 (void)BroadcastI;
1672 ++NumInstsInserted;
1673 LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
1674 dbgs() << "\n");
1675
1676 // Merge our potential poison state into the value with a vector or.
1677 auto OrI =
1678 BuildMI(MBB, InsertPt, Loc,
1679 TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
1680 .addReg(VBStateReg)
1681 .addReg(OpReg);
1682 (void)OrI;
1683 ++NumInstsInserted;
1684 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1685 } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
1686 OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
1687 OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
1688 assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
1689 bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
1690 bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
1691 if (Is128Bit || Is256Bit)
1692 assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
1693
1694 // Broadcast our state into a vector register.
1695 unsigned VStateReg = MRI->createVirtualRegister(OpRC);
1696 unsigned BroadcastOp =
1697 Is128Bit ? X86::VPBROADCASTQrZ128r
1698 : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
1699 auto BroadcastI =
1700 BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
1701 .addReg(StateReg);
1702 (void)BroadcastI;
1703 ++NumInstsInserted;
1704 LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
1705 dbgs() << "\n");
1706
1707 // Merge our potential poison state into the value with a vector or.
1708 unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
1709 : Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
1710 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
1711 .addReg(VStateReg)
Chandler Carruth90358e12018-07-13 11:13:58 +00001712 .addReg(OpReg);
Chandler Carruthbc46bca2018-07-16 04:42:27 +00001713 (void)OrI;
Chandler Carruth90358e12018-07-13 11:13:58 +00001714 ++NumInstsInserted;
1715 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1716 } else {
Chandler Carruthcdf0add2018-07-16 04:17:51 +00001717 // FIXME: Need to support GR32 here for 32-bit code.
1718 assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
1719 "Not a supported register class for address hardening!");
1720
1721 if (!EFLAGSLive) {
1722 // Merge our potential poison state into the value with an or.
1723 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
1724 .addReg(StateReg)
1725 .addReg(OpReg);
1726 OrI->addRegisterDead(X86::EFLAGS, TRI);
1727 ++NumInstsInserted;
1728 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1729 } else {
1730         // We need to avoid touching EFLAGS, so use SHRX (no flags update) to
1731         // shift by the state; an all-ones poison shifts out all but the high bit.
1732 auto ShiftI =
1733 BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
1734 .addReg(OpReg)
1735 .addReg(StateReg);
1736 (void)ShiftI;
1737 ++NumInstsInserted;
1738 LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
1739 dbgs() << "\n");
1740 }
Chandler Carruth90358e12018-07-13 11:13:58 +00001741 }
1742
1743 // Record this register as checked and update the operand.
1744 assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
1745 "Should not have checked this register yet!");
1746 AddrRegToHardenedReg[Op->getReg()] = TmpReg;
1747 Op->setReg(TmpReg);
1748 ++NumAddrRegsHardened;
1749 }
1750
1751 // And restore the flags if needed.
1752 if (FlagsReg)
1753 restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
1754}
1755
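/// Starting from a data-invariant load, follow single-use chains of
/// data-invariant instructions within the block to find the latest point at
/// which hardening is actually needed. Returns the instruction to harden
/// instead (possibly the original), or null if no hardening turns out to be
/// needed at all.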
1756MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001757 MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
Chandler Carruth90358e12018-07-13 11:13:58 +00001758 assert(isDataInvariantLoad(InitialMI) &&
1759 "Cannot get here with a non-invariant load!");
1760
1761 // See if we can sink hardening the loaded value.
1762 auto SinkCheckToSingleUse =
1763 [&](MachineInstr &MI) -> Optional<MachineInstr *> {
1764 unsigned DefReg = MI.getOperand(0).getReg();
1765
1766     // We need to find a single use to which we can sink the check. We can
1767 // primarily do this because many uses may already end up checked on their
1768 // own.
1769 MachineInstr *SingleUseMI = nullptr;
1770 for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
1771 // If we're already going to harden this use, it is data invariant and
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001772 // within our block.
1773 if (HardenedInstrs.count(&UseMI)) {
1774 if (!isDataInvariantLoad(UseMI)) {
1775 // If we've already decided to harden a non-load, we must have sunk
1776 // some other post-load hardened instruction to it and it must itself
1777 // be data-invariant.
1778 assert(isDataInvariant(UseMI) &&
1779 "Data variant instruction being hardened!");
1780 continue;
1781 }
1782
1783 // Otherwise, this is a load and the load component can't be data
1784 // invariant so check how this register is being used.
Chandler Carruth90358e12018-07-13 11:13:58 +00001785 const MCInstrDesc &Desc = UseMI.getDesc();
1786 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1787 assert(MemRefBeginIdx >= 0 &&
1788 "Should always have mem references here!");
1789 MemRefBeginIdx += X86II::getOperandBias(Desc);
1790
1791 MachineOperand &BaseMO =
1792 UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1793 MachineOperand &IndexMO =
1794 UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1795 if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
1796 (IndexMO.isReg() && IndexMO.getReg() == DefReg))
1797 // The load uses the register as part of its address making it not
1798 // invariant.
1799 return {};
1800
1801 continue;
1802 }
1803
1804 if (SingleUseMI)
1805 // We already have a single use, this would make two. Bail.
1806 return {};
1807
1808 // If this single use isn't data invariant, isn't in this block, or has
1809 // interfering EFLAGS, we can't sink the hardening to it.
1810 if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
1811 return {};
1812
1813 // If this instruction defines multiple registers bail as we won't harden
1814 // all of them.
1815 if (UseMI.getDesc().getNumDefs() > 1)
1816 return {};
1817
1818     // If this register isn't a virtual register we can't sanely walk its uses,
1819 // just bail. Also check that its register class is one of the ones we
1820 // can harden.
1821 unsigned UseDefReg = UseMI.getOperand(0).getReg();
1822 if (!TRI->isVirtualRegister(UseDefReg) ||
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001823 !canHardenRegister(UseDefReg))
Chandler Carruth90358e12018-07-13 11:13:58 +00001824 return {};
1825
1826 SingleUseMI = &UseMI;
1827 }
1828
1829 // If SingleUseMI is still null, there is no use that needs its own
1830 // checking. Otherwise, it is the single use that needs checking.
1831 return {SingleUseMI};
1832 };
1833
1834 MachineInstr *MI = &InitialMI;
1835 while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
1836 // Update which MI we're checking now.
1837 MI = *SingleUse;
1838 if (!MI)
1839 break;
1840 }
1841
1842 return MI;
1843}
1844
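/// Returns true if `Reg` can be hardened by the generic GPR logic: at most
/// 64 bits wide, in (a subclass of) one of the GR8/GR16/GR32/GR64 classes, and
/// not constrained to the corresponding NOREX-only class.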
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001845bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
1846 auto *RC = MRI->getRegClass(Reg);
1847 int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
1848 if (RegBytes > 8)
1849 // We don't support post-load hardening of vectors.
Chandler Carruthe66a6f42018-07-16 11:38:48 +00001850 return false;
1851
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001852 // If this register class is explicitly constrained to a class that doesn't
1853   // require a REX prefix, we may not be able to satisfy that constraint when
1854 // emitting the hardening instructions, so bail out here.
1855 // FIXME: This seems like a pretty lame hack. The way this comes up is when we
1856 // end up both with a NOREX and REX-only register as operands to the hardening
1857 // instructions. It would be better to fix that code to handle this situation
1858 // rather than hack around it in this way.
1859 const TargetRegisterClass *NOREXRegClasses[] = {
1860 &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
1861 &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
1862 if (RC == NOREXRegClasses[Log2_32(RegBytes)])
Chandler Carruthe66a6f42018-07-16 11:38:48 +00001863 return false;
1864
1865 const TargetRegisterClass *GPRRegClasses[] = {
1866 &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
1867 &X86::GR64RegClass};
Chandler Carruthfa065aa2018-07-16 14:58:32 +00001868 return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
Chandler Carruthe66a6f42018-07-16 11:38:48 +00001869}
1870
Chandler Carruth54529142018-07-24 12:44:00 +00001871/// Harden a value in a register.
Chandler Carruth376113d2018-07-24 12:19:01 +00001872///
Chandler Carruth54529142018-07-24 12:44:00 +00001873/// This is the low-level logic to fully harden a value sitting in a register
1874/// against leaking during speculative execution.
1875///
1876/// Unlike hardening an address that is used by a load, this routine is required
1877/// to hide *all* incoming bits in the register.
1878///
1879/// `Reg` must be a virtual register. Currently, it is required to be a GPR no
1880/// larger than the predicate state register. FIXME: We should support vector
1881/// registers here by broadcasting the predicate state.
1882///
1883/// The new, hardened virtual register is returned. It will have the same
1884/// register class as `Reg`.
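///
/// For a 32-bit `Reg`, for example, the emitted pattern is roughly
/// (an illustrative sketch):
///   %narrow   = COPY %state.sub_32bit
///   %hardened = OR32rr %narrow, %reg
/// with EFLAGS saved and restored around the `or` if they are live.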
1885unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
1886 unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1887 DebugLoc Loc) {
1888 assert(canHardenRegister(Reg) && "Cannot harden this register!");
1889 assert(TRI->isVirtualRegister(Reg) && "Cannot harden a physical register!");
Chandler Carruth90358e12018-07-13 11:13:58 +00001890
Chandler Carruth54529142018-07-24 12:44:00 +00001891 auto *RC = MRI->getRegClass(Reg);
1892 int Bytes = TRI->getRegSizeInBits(*RC) / 8;
Chandler Carruth90358e12018-07-13 11:13:58 +00001893
Chandler Carruth66fbbbc2018-07-24 00:35:36 +00001894 unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
Chandler Carruth90358e12018-07-13 11:13:58 +00001895
Chandler Carruth66fbbbc2018-07-24 00:35:36 +00001896 // FIXME: Need to teach this about 32-bit mode.
Chandler Carruth54529142018-07-24 12:44:00 +00001897 if (Bytes != 8) {
Chandler Carruth66fbbbc2018-07-24 00:35:36 +00001898 unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
Chandler Carruth54529142018-07-24 12:44:00 +00001899 unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
1900 unsigned NarrowStateReg = MRI->createVirtualRegister(RC);
1901 BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
Chandler Carruth66fbbbc2018-07-24 00:35:36 +00001902 .addReg(StateReg, 0, SubRegImm);
1903 StateReg = NarrowStateReg;
1904 }
Chandler Carruth90358e12018-07-13 11:13:58 +00001905
Chandler Carruth90358e12018-07-13 11:13:58 +00001906 unsigned FlagsReg = 0;
Chandler Carruthb46c22d2018-07-24 00:21:59 +00001907 if (isEFLAGSLive(MBB, InsertPt, *TRI))
Chandler Carruth90358e12018-07-13 11:13:58 +00001908 FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
Chandler Carruth90358e12018-07-13 11:13:58 +00001909
Chandler Carruth54529142018-07-24 12:44:00 +00001910 unsigned NewReg = MRI->createVirtualRegister(RC);
Chandler Carruth66fbbbc2018-07-24 00:35:36 +00001911 unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
Chandler Carruth54529142018-07-24 12:44:00 +00001912 unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
1913 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
Chandler Carruthb46c22d2018-07-24 00:21:59 +00001914 .addReg(StateReg)
Chandler Carruth54529142018-07-24 12:44:00 +00001915 .addReg(Reg);
Chandler Carruthb46c22d2018-07-24 00:21:59 +00001916 OrI->addRegisterDead(X86::EFLAGS, TRI);
1917 ++NumInstsInserted;
1918 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
Chandler Carruth90358e12018-07-13 11:13:58 +00001919
1920 if (FlagsReg)
1921 restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
1922
Chandler Carruth54529142018-07-24 12:44:00 +00001923 return NewReg;
1924}
1925
1926/// Harden a load by hardening the loaded value in the defined register.
1927///
1928/// We can harden a non-leaking load into a register without touching the
1929/// address by just hiding all of the loaded bits during misspeculation. We use
1930/// an `or` instruction to do this because we set up our poison value as all
1931 /// ones. The goal is just to keep the loaded bits from being exposed to
1932 /// speculative execution, and coercing them all to ones is sufficient.
1933///
1934/// Returns the newly hardened register.
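///
/// Conceptually the rewrite looks like this (an illustrative sketch):
///   %unhardened = MOV64rm ...                  ; original load, fresh def reg
///   %hardened   = OR64rr %state, %unhardened   ; inserted hardening
///   ; all former uses of the load's def are rewritten to read %hardened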
1935unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
1936 MachineBasicBlock &MBB = *MI.getParent();
1937 DebugLoc Loc = MI.getDebugLoc();
1938
1939 auto &DefOp = MI.getOperand(0);
1940 unsigned OldDefReg = DefOp.getReg();
1941 auto *DefRC = MRI->getRegClass(OldDefReg);
1942
1943 // Because we want to completely replace the uses of this def'ed value with
1944 // the hardened value, create a dedicated new register that will only be used
1945 // to communicate the unhardened value to the hardening.
1946 unsigned UnhardenedReg = MRI->createVirtualRegister(DefRC);
1947 DefOp.setReg(UnhardenedReg);
1948
1949 // Now harden this register's value, getting a hardened reg that is safe to
1950 // use. Note that we insert the instructions to compute this *after* the
1951 // defining instruction, not before it.
1952 unsigned HardenedReg = hardenValueInRegister(
1953 UnhardenedReg, MBB, std::next(MI.getIterator()), Loc);
1954
1955 // Finally, replace the old register (which now only has the uses of the
1956 // original def) with the hardened register.
1957 MRI->replaceRegWith(/*FromReg*/ OldDefReg, /*ToReg*/ HardenedReg);
1958
Chandler Carruth90358e12018-07-13 11:13:58 +00001959 ++NumPostLoadRegsHardened;
Chandler Carruth54529142018-07-24 12:44:00 +00001960 return HardenedReg;
Chandler Carruth90358e12018-07-13 11:13:58 +00001961}
1962
Chandler Carrutha3a03ac2018-07-19 23:46:24 +00001963/// Harden a return instruction.
1964///
1965/// Returns implicitly perform a load which we need to harden. Without hardening
1966 /// this load, an attacker may speculatively write over the return address to
1967 /// steer speculation of the return to an attacker-controlled address. This is
1968/// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in
1969/// this paper:
1970/// https://people.csail.mit.edu/vlk/spectre11.pdf
1971///
1972/// We can harden this by introducing an LFENCE that will delay any load of the
1973/// return address until prior instructions have retired (and thus are not being
1974/// speculated), or we can harden the address used by the implicit load: the
1975/// stack pointer.
1976///
1977/// If we are not using an LFENCE, hardening the stack pointer has an additional
1978/// benefit: it allows us to pass the predicate state accumulated in this
1979/// function back to the caller. In the absence of a BCBS attack on the return,
1980/// the caller will typically be resumed and speculatively executed due to the
1981/// Return Stack Buffer (RSB) prediction which is very accurate and has a high
1982/// priority. It is possible that some code from the caller will be executed
1983/// speculatively even during a BCBS-attacked return until the steering takes
1984/// effect. Whenever this happens, the caller can recover the (poisoned)
1985/// predicate state from the stack pointer and continue to harden loads.
1986void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
Chandler Carruth90358e12018-07-13 11:13:58 +00001987 MachineBasicBlock &MBB = *MI.getParent();
1988 DebugLoc Loc = MI.getDebugLoc();
1989 auto InsertPt = MI.getIterator();
1990
1991 if (FenceCallAndRet) {
1992 // Simply forcibly block speculation of loads out of the function by using
1993 // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
1994 // should be secure, is simple from an ABI perspective, and the cost can be
1995 // minimized through inlining.
1996 //
1997 // FIXME: We should investigate ways to establish a strong data-dependency
1998 // on the return. However, poisoning the stack pointer is unlikely to work
1999 // because the return is *predicted* rather than relying on the load of the
2000 // return address to actually resolve.
2001 BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
2002 ++NumInstsInserted;
2003 ++NumLFENCEsInserted;
2004 return;
2005 }
2006
2007 // Take our predicate state, shift it to the high 17 bits (so that we keep
2008 // pointers canonical) and merge it into RSP. This will allow the caller to
2009 // extract it when we return (speculatively).
Chandler Carruth4b0028a2018-07-19 11:13:58 +00002010 mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
Chandler Carruth90358e12018-07-13 11:13:58 +00002011}
2012
2013INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
2014 "X86 speculative load hardener", false, false)
2015INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
2016 "X86 speculative load hardener", false, false)
2017
2018FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
2019 return new X86SpeculativeLoadHardeningPass();
2020}