blob: a4e9ead00709eeb998cc9f418667fb0713924514 [file] [log] [blame]
Chandler Carruth90358e12018-07-13 11:13:58 +00001//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9/// \file
10///
11/// Provide a pass which mitigates speculative execution attacks which operate
12/// by speculating incorrectly past some predicate (a type check, bounds check,
13/// or other condition) to reach a load with invalid inputs and leak the data
14/// accessed by that load using a side channel out of the speculative domain.
15///
16/// For details on the attacks, see the first variant in both the Project Zero
17/// writeup and the Spectre paper:
18/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
19/// https://spectreattack.com/spectre.pdf
20///
21//===----------------------------------------------------------------------===//
22
23#include "X86.h"
24#include "X86InstrBuilder.h"
25#include "X86InstrInfo.h"
26#include "X86Subtarget.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/ScopeExit.h"
31#include "llvm/ADT/SmallPtrSet.h"
32#include "llvm/ADT/SmallSet.h"
33#include "llvm/ADT/SmallVector.h"
34#include "llvm/ADT/SparseBitVector.h"
35#include "llvm/ADT/Statistic.h"
36#include "llvm/CodeGen/MachineBasicBlock.h"
37#include "llvm/CodeGen/MachineConstantPool.h"
38#include "llvm/CodeGen/MachineFunction.h"
39#include "llvm/CodeGen/MachineFunctionPass.h"
40#include "llvm/CodeGen/MachineInstr.h"
41#include "llvm/CodeGen/MachineInstrBuilder.h"
42#include "llvm/CodeGen/MachineModuleInfo.h"
43#include "llvm/CodeGen/MachineOperand.h"
44#include "llvm/CodeGen/MachineRegisterInfo.h"
45#include "llvm/CodeGen/MachineSSAUpdater.h"
46#include "llvm/CodeGen/TargetInstrInfo.h"
47#include "llvm/CodeGen/TargetRegisterInfo.h"
48#include "llvm/CodeGen/TargetSchedule.h"
49#include "llvm/CodeGen/TargetSubtargetInfo.h"
50#include "llvm/IR/DebugLoc.h"
51#include "llvm/MC/MCSchedule.h"
52#include "llvm/Pass.h"
53#include "llvm/Support/CommandLine.h"
54#include "llvm/Support/Debug.h"
55#include "llvm/Support/raw_ostream.h"
56#include <algorithm>
57#include <cassert>
58#include <iterator>
59#include <utility>
60
61using namespace llvm;
62
63#define PASS_KEY "x86-speculative-load-hardening"
64#define DEBUG_TYPE PASS_KEY
65
66STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
67STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
68STATISTIC(NumAddrRegsHardened,
69 "Number of address mode used registers hardaned");
70STATISTIC(NumPostLoadRegsHardened,
71 "Number of post-load register values hardened");
72STATISTIC(NumInstsInserted, "Number of instructions inserted");
73STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
74
75static cl::opt<bool> HardenEdgesWithLFENCE(
76 PASS_KEY "-lfence",
77 cl::desc(
78 "Use LFENCE along each conditional edge to harden against speculative "
79 "loads rather than conditional movs and poisoned pointers."),
80 cl::init(false), cl::Hidden);
81
82static cl::opt<bool> EnablePostLoadHardening(
83 PASS_KEY "-post-load",
84 cl::desc("Harden the value loaded *after* it is loaded by "
85 "flushing the loaded bits to 1. This is hard to do "
86 "in general but can be done easily for GPRs."),
87 cl::init(true), cl::Hidden);
88
89static cl::opt<bool> FenceCallAndRet(
90 PASS_KEY "-fence-call-and-ret",
91 cl::desc("Use a full speculation fence to harden both call and ret edges "
92 "rather than a lighter weight mitigation."),
93 cl::init(false), cl::Hidden);
94
95static cl::opt<bool> HardenInterprocedurally(
96 PASS_KEY "-ip",
97 cl::desc("Harden interprocedurally by passing our state in and out of "
98 "functions in the high bits of the stack pointer."),
99 cl::init(true), cl::Hidden);
100
101static cl::opt<bool>
102 HardenLoads(PASS_KEY "-loads",
103 cl::desc("Sanitize loads from memory. When disable, no "
104 "significant security is provided."),
105 cl::init(true), cl::Hidden);
106
107namespace llvm {
108
109void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
110
111} // end namespace llvm
112
113namespace {
114
115class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
116public:
117 X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
118 initializeX86SpeculativeLoadHardeningPassPass(
119 *PassRegistry::getPassRegistry());
120 }
121
122 StringRef getPassName() const override {
123 return "X86 speculative load hardening";
124 }
125 bool runOnMachineFunction(MachineFunction &MF) override;
126 void getAnalysisUsage(AnalysisUsage &AU) const override;
127
128 /// Pass identification, replacement for typeid.
129 static char ID;
130
131private:
132 /// The information about a block's conditional terminators needed to trace
133 /// our predicate state through the exiting edges.
134 struct BlockCondInfo {
135 MachineBasicBlock *MBB;
136
137 // We mostly have one conditional branch, and in extremely rare cases have
138 // two. Three and more are so rare as to be unimportant for compile time.
139 SmallVector<MachineInstr *, 2> CondBrs;
140
141 MachineInstr *UncondBr;
142 };
143
144 const X86Subtarget *Subtarget;
145 MachineRegisterInfo *MRI;
146 const X86InstrInfo *TII;
147 const TargetRegisterInfo *TRI;
148 const TargetRegisterClass *PredStateRC;
149
150 void hardenEdgesWithLFENCE(MachineFunction &MF);
151
152 SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
153
154 void checkAllLoads(MachineFunction &MF, MachineSSAUpdater &PredStateSSA);
155
156 unsigned saveEFLAGS(MachineBasicBlock &MBB,
157 MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
158 void restoreEFLAGS(MachineBasicBlock &MBB,
159 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
160 unsigned OFReg);
161
162 void mergePredStateIntoSP(MachineBasicBlock &MBB,
163 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
164 unsigned PredStateReg);
165 unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
166 MachineBasicBlock::iterator InsertPt,
167 DebugLoc Loc);
168
169 void
170 hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
171 MachineOperand &IndexMO, MachineSSAUpdater &PredStateSSA,
172 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
173 MachineInstr *
174 sinkPostLoadHardenedInst(MachineInstr &MI,
175 SmallPtrSetImpl<MachineInstr *> &HardenedLoads);
Chandler Carruthe66a6f42018-07-16 11:38:48 +0000176 bool canHardenPostLoad(MachineInstr &MI);
Chandler Carruth90358e12018-07-13 11:13:58 +0000177 void hardenPostLoad(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
178 void checkReturnInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
179 void checkCallInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
180};
181
182} // end anonymous namespace
183
184char X86SpeculativeLoadHardeningPass::ID = 0;
185
186void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
187 AnalysisUsage &AU) const {
188 MachineFunctionPass::getAnalysisUsage(AU);
189}
190
191static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
192 MachineBasicBlock &Succ, int SuccCount,
193 MachineInstr *Br, MachineInstr *&UncondBr,
194 const X86InstrInfo &TII) {
195 assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
196
197 MachineFunction &MF = *MBB.getParent();
198
199 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
200
201 // We have to insert the new block immediately after the current one as we
202 // don't know what layout-successor relationships the successor has and we
203 // may not be able to (and generally don't want to) try to fix those up.
204 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
205
206 // Update the branch instruction if necessary.
207 if (Br) {
208 assert(Br->getOperand(0).getMBB() == &Succ &&
209 "Didn't start with the right target!");
210 Br->getOperand(0).setMBB(&NewMBB);
211
212 // If this successor was reached through a branch rather than fallthrough,
213 // we might have *broken* fallthrough and so need to inject a new
214 // unconditional branch.
215 if (!UncondBr) {
216 MachineBasicBlock &OldLayoutSucc =
217 *std::next(MachineFunction::iterator(&NewMBB));
218 assert(MBB.isSuccessor(&OldLayoutSucc) &&
219 "Without an unconditional branch, the old layout successor should "
220 "be an actual successor!");
221 auto BrBuilder =
222 BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
223 // Update the unconditional branch now that we've added one.
224 UncondBr = &*BrBuilder;
225 }
226
227 // Insert unconditional "jump Succ" instruction in the new block if
228 // necessary.
229 if (!NewMBB.isLayoutSuccessor(&Succ)) {
230 SmallVector<MachineOperand, 4> Cond;
231 TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
232 }
233 } else {
234 assert(!UncondBr &&
235 "Cannot have a branchless successor and an unconditional branch!");
236 assert(NewMBB.isLayoutSuccessor(&Succ) &&
237 "A non-branch successor must have been a layout successor before "
238 "and now is a layout successor of the new block.");
239 }
240
241 // If this is the only edge to the successor, we can just replace it in the
242 // CFG. Otherwise we need to add a new entry in the CFG for the new
243 // successor.
244 if (SuccCount == 1) {
245 MBB.replaceSuccessor(&Succ, &NewMBB);
246 } else {
247 MBB.splitSuccessor(&Succ, &NewMBB);
248 }
249
250 // Hook up the edge from the new basic block to the old successor in the CFG.
251 NewMBB.addSuccessor(&Succ);
252
253 // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
254 for (MachineInstr &MI : Succ) {
255 if (!MI.isPHI())
256 break;
257 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
258 OpIdx += 2) {
259 MachineOperand &OpV = MI.getOperand(OpIdx);
260 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
261 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
262 if (OpMBB.getMBB() != &MBB)
263 continue;
264
265 // If this is the last edge to the succesor, just replace MBB in the PHI
266 if (SuccCount == 1) {
267 OpMBB.setMBB(&NewMBB);
268 break;
269 }
270
271 // Otherwise, append a new pair of operands for the new incoming edge.
272 MI.addOperand(MF, OpV);
273 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
274 break;
275 }
276 }
277
278 // Inherit live-ins from the successor
279 for (auto &LI : Succ.liveins())
280 NewMBB.addLiveIn(LI);
281
282 LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
283 << Succ.getName() << "'.\n");
284 return NewMBB;
285}
286
Chandler Carruth3ffcc032018-07-15 23:46:36 +0000287/// Removing duplicate PHI operands to leave the PHI in a canonical and
288/// predictable form.
289///
290/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
291/// isn't what you might expect. We may have multiple entries in PHI nodes for
292/// a single predecessor. This makes CFG-updating extremely complex, so here we
293/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
294/// one entry per predecessor, regardless of how many edges there are.
295static void canonicalizePHIOperands(MachineFunction &MF) {
296 SmallPtrSet<MachineBasicBlock *, 4> Preds;
297 SmallVector<int, 4> DupIndices;
298 for (auto &MBB : MF)
299 for (auto &MI : MBB) {
300 if (!MI.isPHI())
301 break;
302
303 // First we scan the operands of the PHI looking for duplicate entries
304 // a particular predecessor. We retain the operand index of each duplicate
305 // entry found.
306 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
307 OpIdx += 2)
308 if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
309 DupIndices.push_back(OpIdx);
310
311 // Now walk the duplicate indices, removing both the block and value. Note
312 // that these are stored as a vector making this element-wise removal
313 // :w
314 // potentially quadratic.
315 //
316 // FIXME: It is really frustrating that we have to use a quadratic
317 // removal algorithm here. There should be a better way, but the use-def
318 // updates required make that impossible using the public API.
319 //
320 // Note that we have to process these backwards so that we don't
321 // invalidate other indices with each removal.
322 while (!DupIndices.empty()) {
323 int OpIdx = DupIndices.pop_back_val();
324 // Remove both the block and value operand, again in reverse order to
325 // preserve indices.
326 MI.RemoveOperand(OpIdx + 1);
327 MI.RemoveOperand(OpIdx);
328 }
329
330 Preds.clear();
331 }
332}
333
Chandler Carruth3620b992018-07-16 10:46:16 +0000334/// Helper to scan a function for loads vulnerable to misspeculation that we
335/// want to harden.
336///
337/// We use this to avoid making changes to functions where there is nothing we
338/// need to do to harden against misspeculation.
339static bool hasVulnerableLoad(MachineFunction &MF) {
340 for (MachineBasicBlock &MBB : MF) {
341 for (MachineInstr &MI : MBB) {
342 // Loads within this basic block after an LFENCE are not at risk of
343 // speculatively executing with invalid predicates from prior control
344 // flow. So break out of this block but continue scanning the function.
345 if (MI.getOpcode() == X86::LFENCE)
346 break;
347
348 // Looking for loads only.
349 if (!MI.mayLoad())
350 continue;
351
352 // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
353 if (MI.getOpcode() == X86::MFENCE)
354 continue;
355
356 // We found a load.
357 return true;
358 }
359 }
360
361 // No loads found.
362 return false;
363}
364
Chandler Carruth90358e12018-07-13 11:13:58 +0000365bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
366 MachineFunction &MF) {
367 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
368 << " **********\n");
369
370 Subtarget = &MF.getSubtarget<X86Subtarget>();
371 MRI = &MF.getRegInfo();
372 TII = Subtarget->getInstrInfo();
373 TRI = Subtarget->getRegisterInfo();
374 // FIXME: Support for 32-bit.
375 PredStateRC = &X86::GR64_NOSPRegClass;
376
377 if (MF.begin() == MF.end())
378 // Nothing to do for a degenerate empty function...
379 return false;
380
381 // We support an alternative hardening technique based on a debug flag.
382 if (HardenEdgesWithLFENCE) {
383 hardenEdgesWithLFENCE(MF);
384 return true;
385 }
386
387 // Create a dummy debug loc to use for all the generated code here.
388 DebugLoc Loc;
389
390 MachineBasicBlock &Entry = *MF.begin();
391 auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
392
393 // Do a quick scan to see if we have any checkable loads.
Chandler Carruth3620b992018-07-16 10:46:16 +0000394 bool HasVulnerableLoad = hasVulnerableLoad(MF);
Chandler Carruth90358e12018-07-13 11:13:58 +0000395
396 // See if we have any conditional branching blocks that we will need to trace
397 // predicate state through.
398 SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
399
400 // If we have no interesting conditions or loads, nothing to do here.
Chandler Carruth3620b992018-07-16 10:46:16 +0000401 if (!HasVulnerableLoad && Infos.empty())
Chandler Carruth90358e12018-07-13 11:13:58 +0000402 return true;
403
404 unsigned PredStateReg;
405 unsigned PredStateSizeInBytes = TRI->getRegSizeInBits(*PredStateRC) / 8;
406
407 // The poison value is required to be an all-ones value for many aspects of
408 // this mitigation.
409 const int PoisonVal = -1;
410 unsigned PoisonReg = MRI->createVirtualRegister(PredStateRC);
411 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PoisonReg)
412 .addImm(PoisonVal);
413 ++NumInstsInserted;
414
415 // If we have loads being hardened and we've asked for call and ret edges to
416 // get a full fence-based mitigation, inject that fence.
Chandler Carruth3620b992018-07-16 10:46:16 +0000417 if (HasVulnerableLoad && FenceCallAndRet) {
Chandler Carruth90358e12018-07-13 11:13:58 +0000418 // We need to insert an LFENCE at the start of the function to suspend any
419 // incoming misspeculation from the caller. This helps two-fold: the caller
420 // may not have been protected as this code has been, and this code gets to
421 // not take any specific action to protect across calls.
422 // FIXME: We could skip this for functions which unconditionally return
423 // a constant.
424 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
425 ++NumInstsInserted;
426 ++NumLFENCEsInserted;
427 }
428
Chandler Carruthfb503ac2018-07-14 09:32:37 +0000429 // If we guarded the entry with an LFENCE and have no conditionals to protect
430 // in blocks, then we're done.
431 if (FenceCallAndRet && Infos.empty())
Chandler Carruth90358e12018-07-13 11:13:58 +0000432 // We may have changed the function's code at this point to insert fences.
433 return true;
434
435 // For every basic block in the function which can b
436 if (HardenInterprocedurally && !FenceCallAndRet) {
437 // Set up the predicate state by extracting it from the incoming stack
438 // pointer so we pick up any misspeculation in our caller.
439 PredStateReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
440 } else {
441 // Otherwise, just build the predicate state itself by zeroing a register
442 // as we don't need any initial state.
443 PredStateReg = MRI->createVirtualRegister(PredStateRC);
444 unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
445 auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
446 PredStateSubReg);
447 ++NumInstsInserted;
448 MachineOperand *ZeroEFLAGSDefOp =
449 ZeroI->findRegisterDefOperand(X86::EFLAGS);
450 assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
451 "Must have an implicit def of EFLAGS!");
452 ZeroEFLAGSDefOp->setIsDead(true);
453 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
454 PredStateReg)
455 .addImm(0)
456 .addReg(PredStateSubReg)
457 .addImm(X86::sub_32bit);
458 }
459
460 // We're going to need to trace predicate state throughout the function's
461 // CFG. Prepare for this by setting up our initial state of PHIs with unique
462 // predecessor entries and all the initial predicate state.
Chandler Carruth3ffcc032018-07-15 23:46:36 +0000463 canonicalizePHIOperands(MF);
Chandler Carruth90358e12018-07-13 11:13:58 +0000464
465 // Track the updated values in an SSA updater to rewrite into SSA form at the
466 // end.
467 MachineSSAUpdater PredStateSSA(MF);
468 PredStateSSA.Initialize(PredStateReg);
469 PredStateSSA.AddAvailableValue(&Entry, PredStateReg);
470 // Collect the inserted instructions so we can rewrite their uses of the
471 // predicate state into SSA form.
472 SmallVector<MachineInstr *, 16> CMovs;
473
474 // Now walk all of the basic blocks looking for ones that end in conditional
475 // jumps where we need to update this register along each edge.
476 for (BlockCondInfo &Info : Infos) {
477 MachineBasicBlock &MBB = *Info.MBB;
478 SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
479 MachineInstr *UncondBr = Info.UncondBr;
480
481 LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
482 << "\n");
483 ++NumCondBranchesTraced;
484
485 // Compute the non-conditional successor as either the target of any
486 // unconditional branch or the layout successor.
487 MachineBasicBlock *UncondSucc =
488 UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
489 ? UncondBr->getOperand(0).getMBB()
490 : nullptr)
491 : &*std::next(MachineFunction::iterator(&MBB));
492
493 // Count how many edges there are to any given successor.
494 SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
495 if (UncondSucc)
496 ++SuccCounts[UncondSucc];
497 for (auto *CondBr : CondBrs)
498 ++SuccCounts[CondBr->getOperand(0).getMBB()];
499
500 // A lambda to insert cmov instructions into a block checking all of the
501 // condition codes in a sequence.
502 auto BuildCheckingBlockForSuccAndConds =
503 [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
504 MachineInstr *Br, MachineInstr *&UncondBr,
505 ArrayRef<X86::CondCode> Conds) {
506 // First, we split the edge to insert the checking block into a safe
507 // location.
508 auto &CheckingMBB =
509 (SuccCount == 1 && Succ.pred_size() == 1)
510 ? Succ
511 : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
512
513 bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
514 if (!LiveEFLAGS)
515 CheckingMBB.addLiveIn(X86::EFLAGS);
516
517 // Now insert the cmovs to implement the checks.
518 auto InsertPt = CheckingMBB.begin();
Erich Keaneac8cb222018-07-13 14:43:20 +0000519 assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
520 "Should never have a PHI in the initial checking block as it "
521 "always has a single predecessor!");
Chandler Carruth90358e12018-07-13 11:13:58 +0000522
523 // We will wire each cmov to each other, but need to start with the
524 // incoming pred state.
525 unsigned CurStateReg = PredStateReg;
526
527 for (X86::CondCode Cond : Conds) {
528 auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
529
530 unsigned UpdatedStateReg = MRI->createVirtualRegister(PredStateRC);
531 auto CMovI = BuildMI(CheckingMBB, InsertPt, Loc, TII->get(CMovOp),
532 UpdatedStateReg)
533 .addReg(CurStateReg)
534 .addReg(PoisonReg);
535 // If this is the last cmov and the EFLAGS weren't originally
536 // live-in, mark them as killed.
537 if (!LiveEFLAGS && Cond == Conds.back())
538 CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
539
540 ++NumInstsInserted;
541 LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
542 dbgs() << "\n");
543
544 // The first one of the cmovs will be using the top level
545 // `PredStateReg` and need to get rewritten into SSA form.
546 if (CurStateReg == PredStateReg)
547 CMovs.push_back(&*CMovI);
548
549 // The next cmov should start from this one's def.
550 CurStateReg = UpdatedStateReg;
551 }
552
553 // And put the last one into the available values for PredStateSSA.
554 PredStateSSA.AddAvailableValue(&CheckingMBB, CurStateReg);
555 };
556
557 std::vector<X86::CondCode> UncondCodeSeq;
558 for (auto *CondBr : CondBrs) {
559 MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
560 int &SuccCount = SuccCounts[&Succ];
561
562 X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
563 X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
564 UncondCodeSeq.push_back(Cond);
565
566 BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
567 {InvCond});
568
569 // Decrement the successor count now that we've split one of the edges.
570 // We need to keep the count of edges to the successor accurate in order
571 // to know above when to *replace* the successor in the CFG vs. just
572 // adding the new successor.
573 --SuccCount;
574 }
575
576 // Since we may have split edges and changed the number of successors,
577 // normalize the probabilities. This avoids doing it each time we split an
578 // edge.
579 MBB.normalizeSuccProbs();
580
581 // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
582 // need to intersect the other condition codes. We can do this by just
583 // doing a cmov for each one.
584 if (!UncondSucc)
585 // If we have no fallthrough to protect (perhaps it is an indirect jump?)
586 // just skip this and continue.
587 continue;
588
589 assert(SuccCounts[UncondSucc] == 1 &&
590 "We should never have more than one edge to the unconditional "
591 "successor at this point because every other edge must have been "
592 "split above!");
593
594 // Sort and unique the codes to minimize them.
595 llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
596 UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
597 UncondCodeSeq.end());
598
599 // Build a checking version of the successor.
600 BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
601 UncondBr, UncondBr, UncondCodeSeq);
602 }
603
604 // We may also enter basic blocks in this function via exception handling
605 // control flow. Here, if we are hardening interprocedurally, we need to
606 // re-capture the predicate state from the throwing code. In the Itanium ABI,
607 // the throw will always look like a call to __cxa_throw and will have the
608 // predicate state in the stack pointer, so extract fresh predicate state from
609 // the stack pointer and make it available in SSA.
610 // FIXME: Handle non-itanium ABI EH models.
611 if (HardenInterprocedurally) {
612 for (MachineBasicBlock &MBB : MF) {
613 assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
614 assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
615 assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
616 if (!MBB.isEHPad())
617 continue;
618 PredStateSSA.AddAvailableValue(
619 &MBB,
620 extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
621 }
622 }
623
624 // Now check all of the loads using the predicate state.
625 checkAllLoads(MF, PredStateSSA);
626
627 // Now rewrite all the uses of the pred state using the SSA updater so that
628 // we track updates through the CFG.
629 for (MachineInstr *CMovI : CMovs)
630 for (MachineOperand &Op : CMovI->operands()) {
631 if (!Op.isReg() || Op.getReg() != PredStateReg)
632 continue;
633
634 PredStateSSA.RewriteUse(Op);
635 }
636
637 // If we are hardening interprocedurally, find each returning block and
638 // protect the caller from being returned to through misspeculation.
639 if (HardenInterprocedurally)
640 for (MachineBasicBlock &MBB : MF) {
641 if (MBB.empty())
642 continue;
643
644 MachineInstr &MI = MBB.back();
645 if (!MI.isReturn())
646 continue;
647
648 checkReturnInstr(MI, PredStateSSA);
649 }
650
651 LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
652 dbgs() << "\n"; MF.verify(this));
653 return true;
654}
655
656/// Implements the naive hardening approach of putting an LFENCE after every
657/// potentially mis-predicted control flow construct.
658///
659/// We include this as an alternative mostly for the purpose of comparison. The
660/// performance impact of this is expected to be extremely severe and not
661/// practical for any real-world users.
662void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
663 MachineFunction &MF) {
664 // First, we scan the function looking for blocks that are reached along edges
665 // that we might want to harden.
666 SmallSetVector<MachineBasicBlock *, 8> Blocks;
667 for (MachineBasicBlock &MBB : MF) {
668 // If there are no or only one successor, nothing to do here.
669 if (MBB.succ_size() <= 1)
670 continue;
671
672 // Skip blocks unless their terminators start with a branch. Other
673 // terminators don't seem interesting for guarding against misspeculation.
674 auto TermIt = MBB.getFirstTerminator();
675 if (TermIt == MBB.end() || !TermIt->isBranch())
676 continue;
677
678 // Add all the non-EH-pad succossors to the blocks we want to harden. We
679 // skip EH pads because there isn't really a condition of interest on
680 // entering.
681 for (MachineBasicBlock *SuccMBB : MBB.successors())
682 if (!SuccMBB->isEHPad())
683 Blocks.insert(SuccMBB);
684 }
685
686 for (MachineBasicBlock *MBB : Blocks) {
687 auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
688 BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
689 ++NumInstsInserted;
690 ++NumLFENCEsInserted;
691 }
692}
693
694SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
695X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
696 SmallVector<BlockCondInfo, 16> Infos;
697
698 // Walk the function and build up a summary for each block's conditions that
699 // we need to trace through.
700 for (MachineBasicBlock &MBB : MF) {
701 // If there are no or only one successor, nothing to do here.
702 if (MBB.succ_size() <= 1)
703 continue;
704
705 // We want to reliably handle any conditional branch terminators in the
706 // MBB, so we manually analyze the branch. We can handle all of the
707 // permutations here, including ones that analyze branch cannot.
708 //
709 // The approach is to walk backwards across the terminators, resetting at
710 // any unconditional non-indirect branch, and track all conditional edges
711 // to basic blocks as well as the fallthrough or unconditional successor
712 // edge. For each conditional edge, we track the target and the opposite
713 // condition code in order to inject a "no-op" cmov into that successor
714 // that will harden the predicate. For the fallthrough/unconditional
715 // edge, we inject a separate cmov for each conditional branch with
716 // matching condition codes. This effectively implements an "and" of the
717 // condition flags, even if there isn't a single condition flag that would
718 // directly implement that. We don't bother trying to optimize either of
719 // these cases because if such an optimization is possible, LLVM should
720 // have optimized the conditional *branches* in that way already to reduce
721 // instruction count. This late, we simply assume the minimal number of
722 // branch instructions is being emitted and use that to guide our cmov
723 // insertion.
724
725 BlockCondInfo Info = {&MBB, {}, nullptr};
726
727 // Now walk backwards through the terminators and build up successors they
728 // reach and the conditions.
729 for (MachineInstr &MI : llvm::reverse(MBB)) {
730 // Once we've handled all the terminators, we're done.
731 if (!MI.isTerminator())
732 break;
733
734 // If we see a non-branch terminator, we can't handle anything so bail.
735 if (!MI.isBranch()) {
736 Info.CondBrs.clear();
737 break;
738 }
739
740 // If we see an unconditional branch, reset our state, clear any
741 // fallthrough, and set this is the "else" successor.
742 if (MI.getOpcode() == X86::JMP_1) {
743 Info.CondBrs.clear();
744 Info.UncondBr = &MI;
745 continue;
746 }
747
748 // If we get an invalid condition, we have an indirect branch or some
749 // other unanalyzable "fallthrough" case. We model this as a nullptr for
750 // the destination so we can still guard any conditional successors.
751 // Consider code sequences like:
752 // ```
753 // jCC L1
754 // jmpq *%rax
755 // ```
756 // We still want to harden the edge to `L1`.
757 if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
758 Info.CondBrs.clear();
759 Info.UncondBr = &MI;
760 continue;
761 }
762
763 // We have a vanilla conditional branch, add it to our list.
764 Info.CondBrs.push_back(&MI);
765 }
766 if (Info.CondBrs.empty()) {
767 ++NumBranchesUntraced;
768 LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
769 MBB.dump());
770 continue;
771 }
772
773 Infos.push_back(Info);
774 }
775
776 return Infos;
777}
778
779/// Returns true if the instruction has no behavior (specified or otherwise)
780/// that is based on the value of any of its register operands
781///
782/// A classical example of something that is inherently not data invariant is an
783/// indirect jump -- the destination is loaded into icache based on the bits set
784/// in the jump destination register.
785///
786/// FIXME: This should become part of our instruction tables.
787static bool isDataInvariant(MachineInstr &MI) {
788 switch (MI.getOpcode()) {
789 default:
790 // By default, assume that the instruction is not data invariant.
791 return false;
792
793 // FIXME: For now, we just use a very boring, conservative set of unary
794 // instructions because we're mostly interested in handling simple
795 // transformations.
796 case TargetOpcode::COPY:
797 return true;
798 }
799}
800
801/// Returns true if the instruction has no behavior (specified or otherwise)
802/// that is based on the value loaded from memory or the value of any
803/// non-address register operands.
804///
805/// For example, if the latency of the instruction is dependent on the
806/// particular bits set in any of the registers *or* any of the bits loaded from
807/// memory.
808///
809/// A classical example of something that is inherently not data invariant is an
810/// indirect jump -- the destination is loaded into icache based on the bits set
811/// in the jump destination register.
812///
813/// FIXME: This should become part of our instruction tables.
814static bool isDataInvariantLoad(MachineInstr &MI) {
815 switch (MI.getOpcode()) {
816 default:
817 // By default, assume that the load will immediately leak.
818 return false;
819
Craig Topper445abf72018-07-13 22:41:46 +0000820 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
821 // However, they set flags and are perhaps the most surprisingly constant
822 // time operations so we call them out here separately.
Chandler Carruth90358e12018-07-13 11:13:58 +0000823 case X86::IMUL16rm:
824 case X86::IMUL16rmi8:
825 case X86::IMUL16rmi:
826 case X86::IMUL32rm:
827 case X86::IMUL32rmi8:
828 case X86::IMUL32rmi:
829 case X86::IMUL64rm:
830 case X86::IMUL64rmi32:
831 case X86::IMUL64rmi8:
832
Craig Topper445abf72018-07-13 22:41:46 +0000833 // Bit scanning and counting instructions that are somewhat surprisingly
834 // constant time as they scan across bits and do other fairly complex
835 // operations like popcnt, but are believed to be constant time on x86.
836 // However, these set flags.
837 case X86::BSF16rm:
838 case X86::BSF32rm:
839 case X86::BSF64rm:
840 case X86::BSR16rm:
841 case X86::BSR32rm:
842 case X86::BSR64rm:
843 case X86::LZCNT16rm:
844 case X86::LZCNT32rm:
845 case X86::LZCNT64rm:
846 case X86::POPCNT16rm:
847 case X86::POPCNT32rm:
848 case X86::POPCNT64rm:
849 case X86::TZCNT16rm:
850 case X86::TZCNT32rm:
851 case X86::TZCNT64rm:
852
853 // Bit manipulation instructions are effectively combinations of basic
854 // arithmetic ops, and should still execute in constant time. These also
855 // set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +0000856 case X86::BLCFILL32rm:
857 case X86::BLCFILL64rm:
858 case X86::BLCI32rm:
859 case X86::BLCI64rm:
860 case X86::BLCIC32rm:
861 case X86::BLCIC64rm:
862 case X86::BLCMSK32rm:
863 case X86::BLCMSK64rm:
864 case X86::BLCS32rm:
865 case X86::BLCS64rm:
866 case X86::BLSFILL32rm:
867 case X86::BLSFILL64rm:
868 case X86::BLSI32rm:
869 case X86::BLSI64rm:
870 case X86::BLSIC32rm:
871 case X86::BLSIC64rm:
872 case X86::BLSMSK32rm:
873 case X86::BLSMSK64rm:
874 case X86::BLSR32rm:
875 case X86::BLSR64rm:
Chandler Carruth90358e12018-07-13 11:13:58 +0000876 case X86::TZMSK32rm:
877 case X86::TZMSK64rm:
878
Craig Topper445abf72018-07-13 22:41:46 +0000879 // Bit extracting and clearing instructions should execute in constant time,
880 // and set flags.
881 case X86::BEXTR32rm:
882 case X86::BEXTR64rm:
883 case X86::BEXTRI32mi:
884 case X86::BEXTRI64mi:
885 case X86::BZHI32rm:
886 case X86::BZHI64rm:
887
888 // Basic arithmetic is constant time on the input but does set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +0000889 case X86::ADC8rm:
890 case X86::ADC16rm:
891 case X86::ADC32rm:
892 case X86::ADC64rm:
893 case X86::ADCX32rm:
894 case X86::ADCX64rm:
895 case X86::ADD8rm:
896 case X86::ADD16rm:
897 case X86::ADD32rm:
898 case X86::ADD64rm:
899 case X86::ADOX32rm:
900 case X86::ADOX64rm:
901 case X86::AND8rm:
902 case X86::AND16rm:
903 case X86::AND32rm:
904 case X86::AND64rm:
905 case X86::ANDN32rm:
906 case X86::ANDN64rm:
Chandler Carruth90358e12018-07-13 11:13:58 +0000907 case X86::OR8rm:
908 case X86::OR16rm:
909 case X86::OR32rm:
910 case X86::OR64rm:
911 case X86::SBB8rm:
912 case X86::SBB16rm:
913 case X86::SBB32rm:
914 case X86::SBB64rm:
915 case X86::SUB8rm:
916 case X86::SUB16rm:
917 case X86::SUB32rm:
918 case X86::SUB64rm:
919 case X86::XOR8rm:
920 case X86::XOR16rm:
921 case X86::XOR32rm:
922 case X86::XOR64rm:
Chandler Carruth90358e12018-07-13 11:13:58 +0000923 // Check whether the EFLAGS implicit-def is dead. We assume that this will
924 // always find the implicit-def because this code should only be reached
925 // for instructions that do in fact implicitly def this.
926 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
927 // If we would clobber EFLAGS that are used, just bail for now.
928 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
929 MI.dump(); dbgs() << "\n");
930 return false;
931 }
932
933 // Otherwise, fallthrough to handle these the same as instructions that
934 // don't set EFLAGS.
935 LLVM_FALLTHROUGH;
936
Craig Topper445abf72018-07-13 22:41:46 +0000937 // Integer multiply w/o affecting flags is still believed to be constant
938 // time on x86. Called out separately as this is among the most surprising
939 // instructions to exhibit that behavior.
Chandler Carruth90358e12018-07-13 11:13:58 +0000940 case X86::MULX32rm:
941 case X86::MULX64rm:
942
Craig Topper445abf72018-07-13 22:41:46 +0000943 // Arithmetic instructions that are both constant time and don't set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +0000944 case X86::RORX32mi:
945 case X86::RORX64mi:
946 case X86::SARX32rm:
947 case X86::SARX64rm:
948 case X86::SHLX32rm:
949 case X86::SHLX64rm:
950 case X86::SHRX32rm:
951 case X86::SHRX64rm:
952
Craig Topper445abf72018-07-13 22:41:46 +0000953 // Conversions are believed to be constant time and don't set flags.
Craig Topper41fa8582018-07-13 22:41:50 +0000954 case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm:
955 case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm:
956 case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm:
957 case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm:
958 case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm:
959 case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm:
960 case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm:
961 case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm:
962 case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm:
963 case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm:
964 // AVX512 added unsigned integer conversions.
965 case X86::VCVTTSD2USI64Zrm:
966 case X86::VCVTTSD2USIZrm:
967 case X86::VCVTTSS2USI64Zrm:
968 case X86::VCVTTSS2USIZrm:
969 case X86::VCVTUSI2SDZrm:
970 case X86::VCVTUSI642SDZrm:
971 case X86::VCVTUSI2SSZrm:
972 case X86::VCVTUSI642SSZrm:
Chandler Carruth90358e12018-07-13 11:13:58 +0000973
Craig Topper445abf72018-07-13 22:41:46 +0000974 // Loads to register don't set flags.
Chandler Carruth90358e12018-07-13 11:13:58 +0000975 case X86::MOV8rm:
976 case X86::MOV8rm_NOREX:
977 case X86::MOV16rm:
978 case X86::MOV32rm:
979 case X86::MOV64rm:
980 case X86::MOVSX16rm8:
981 case X86::MOVSX32rm16:
982 case X86::MOVSX32rm8:
983 case X86::MOVSX32rm8_NOREX:
984 case X86::MOVSX64rm16:
985 case X86::MOVSX64rm32:
986 case X86::MOVSX64rm8:
987 case X86::MOVZX16rm8:
988 case X86::MOVZX32rm16:
989 case X86::MOVZX32rm8:
990 case X86::MOVZX32rm8_NOREX:
991 case X86::MOVZX64rm16:
992 case X86::MOVZX64rm8:
993 return true;
994 }
995}
996
997static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
998 const TargetRegisterInfo &TRI) {
999 // Check if EFLAGS are alive by seeing if there is a def of them or they
1000 // live-in, and then seeing if that def is in turn used.
1001 for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
1002 if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
1003 // If the def is dead, then EFLAGS is not live.
1004 if (DefOp->isDead())
1005 return false;
1006
1007 // Otherwise we've def'ed it, and it is live.
1008 return true;
1009 }
1010 // While at this instruction, also check if we use and kill EFLAGS
1011 // which means it isn't live.
1012 if (MI.killsRegister(X86::EFLAGS, &TRI))
1013 return false;
1014 }
1015
1016 // If we didn't find anything conclusive (neither definitely alive or
1017 // definitely dead) return whether it lives into the block.
1018 return MBB.isLiveIn(X86::EFLAGS);
1019}
1020
1021void X86SpeculativeLoadHardeningPass::checkAllLoads(
1022 MachineFunction &MF, MachineSSAUpdater &PredStateSSA) {
1023 // If the actual checking of loads is disabled, skip doing anything here.
1024 if (!HardenLoads)
1025 return;
1026
1027 SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
1028 SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
1029
1030 SmallSet<unsigned, 16> HardenedAddrRegs;
1031
1032 SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
1033
1034 // Track the set of load-dependent registers through the basic block. Because
1035 // the values of these registers have an existing data dependency on a loaded
1036 // value which we would have checked, we can omit any checks on them.
1037 SparseBitVector<> LoadDepRegs;
1038
1039 for (MachineBasicBlock &MBB : MF) {
1040 // We harden the loads of a basic block in several passes:
1041 //
1042 // 1) Collect all the loads which can have their loaded value hardened
1043 // and all the loads that instead need their address hardened. During
1044 // this walk we propagate load dependence for address hardened loads and
1045 // also look for LFENCE to stop hardening wherever possible. When
1046 // deciding whether or not to harden the loaded value or not, we check
1047 // to see if any registers used in the address will have been hardened
1048 // at this point and if so, harden any remaining address registers as
1049 // that often successfully re-uses hardened addresses and minimizes
1050 // instructions. FIXME: We should consider an aggressive mode where we
1051 // continue to keep as many loads value hardened even when some address
1052 // register hardening would be free (due to reuse).
1053 for (MachineInstr &MI : MBB) {
1054 // We naively assume that all def'ed registers of an instruction have
1055 // a data dependency on all of their operands.
1056 // FIXME: Do a more careful analysis of x86 to build a conservative model
1057 // here.
1058 if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
1059 return Op.isReg() && LoadDepRegs.test(Op.getReg());
1060 }))
1061 for (MachineOperand &Def : MI.defs())
1062 if (Def.isReg())
1063 LoadDepRegs.set(Def.getReg());
1064
1065 // Both Intel and AMD are guiding that they will change the semantics of
1066 // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1067 // no more need to guard things in this block.
1068 if (MI.getOpcode() == X86::LFENCE)
1069 break;
1070
1071 // If this instruction cannot load, nothing to do.
1072 if (!MI.mayLoad())
1073 continue;
1074
1075 // Some instructions which "load" are trivially safe or unimportant.
1076 if (MI.getOpcode() == X86::MFENCE)
1077 continue;
1078
1079 // Extract the memory operand information about this instruction.
1080 // FIXME: This doesn't handle loading pseudo instructions which we often
1081 // could handle with similarly generic logic. We probably need to add an
1082 // MI-layer routine similar to the MC-layer one we use here which maps
1083 // pseudos much like this maps real instructions.
1084 const MCInstrDesc &Desc = MI.getDesc();
1085 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1086 if (MemRefBeginIdx < 0) {
1087 LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: ";
1088 MI.dump());
1089 continue;
1090 }
1091
1092 MemRefBeginIdx += X86II::getOperandBias(Desc);
1093
1094 MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1095 MachineOperand &IndexMO =
1096 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1097
1098 // If we have at least one (non-frame-index, non-RIP) register operand,
1099 // and neither operand is load-dependent, we need to check the load.
1100 unsigned BaseReg = 0, IndexReg = 0;
1101 if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1102 BaseMO.getReg() != X86::NoRegister)
1103 BaseReg = BaseMO.getReg();
1104 if (IndexMO.getReg() != X86::NoRegister)
1105 IndexReg = IndexMO.getReg();
1106
1107 if (!BaseReg && !IndexReg)
1108 // No register operands!
1109 continue;
1110
1111 // If any register operand is dependent, this load is dependent and we
1112 // needn't check it.
1113 // FIXME: Is this true in the case where we are hardening loads after
1114 // they complete? Unclear, need to investigate.
1115 if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
1116 (IndexReg && LoadDepRegs.test(IndexReg)))
1117 continue;
1118
Chandler Carruthe66a6f42018-07-16 11:38:48 +00001119 // If post-load hardening is enabled, this load is compatible with
1120 // post-load hardening, and we aren't already going to harden one of the
Chandler Carruth90358e12018-07-13 11:13:58 +00001121 // address registers, queue it up to be hardened post-load. Notably, even
1122 // once hardened this won't introduce a useful dependency that could prune
1123 // out subsequent loads.
Chandler Carruthe66a6f42018-07-16 11:38:48 +00001124 if (EnablePostLoadHardening && canHardenPostLoad(MI) &&
Chandler Carruth90358e12018-07-13 11:13:58 +00001125 !HardenedAddrRegs.count(BaseReg) &&
1126 !HardenedAddrRegs.count(IndexReg)) {
1127 HardenPostLoad.insert(&MI);
1128 HardenedAddrRegs.insert(MI.getOperand(0).getReg());
1129 continue;
1130 }
1131
1132 // Record this instruction for address hardening and record its register
1133 // operands as being address-hardened.
1134 HardenLoadAddr.insert(&MI);
1135 if (BaseReg)
1136 HardenedAddrRegs.insert(BaseReg);
1137 if (IndexReg)
1138 HardenedAddrRegs.insert(IndexReg);
1139
1140 for (MachineOperand &Def : MI.defs())
1141 if (Def.isReg())
1142 LoadDepRegs.set(Def.getReg());
1143 }
1144
1145 // Now re-walk the instructions in the basic block, and apply whichever
1146 // hardening strategy we have elected. Note that we do this in a second
1147 // pass specifically so that we have the complete set of instructions for
1148 // which we will do post-load hardening and can defer it in certain
1149 // circumstances.
1150 //
1151 // FIXME: This could probably be made even more effective by doing it
1152 // across the entire function. Rather than just walking the flat list
1153 // backwards here, we could walk the function in PO and each block bottom
1154 // up, allowing us to in some cases sink hardening across block blocks. As
1155 // long as the in-block predicate state is used at the eventual hardening
1156 // site, this remains safe.
1157 for (MachineInstr &MI : MBB) {
1158 // We cannot both require hardening the def of a load and its address.
1159 assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
1160 "Requested to harden both the address and def of a load!");
1161
1162 // Check if this is a load whose address needs to be hardened.
1163 if (HardenLoadAddr.erase(&MI)) {
1164 const MCInstrDesc &Desc = MI.getDesc();
1165 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1166 assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
1167
1168 MemRefBeginIdx += X86II::getOperandBias(Desc);
1169
1170 MachineOperand &BaseMO =
1171 MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1172 MachineOperand &IndexMO =
1173 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1174 hardenLoadAddr(MI, BaseMO, IndexMO, PredStateSSA, AddrRegToHardenedReg);
1175 continue;
1176 }
1177
1178 // Test if this instruction is one of our post load instructions (and
1179 // remove it from the set if so).
1180 if (HardenPostLoad.erase(&MI)) {
1181 assert(!MI.isCall() && "Must not try to post-load harden a call!");
1182
1183 // If this is a data-invariant load, we want to try and sink any
1184 // hardening as far as possible.
1185 if (isDataInvariantLoad(MI)) {
1186 // Sink the instruction we'll need to harden as far as we can down the
1187 // graph.
1188 MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
1189
1190 // If we managed to sink this instruction, update everything so we
1191 // harden that instruction when we reach it in the instruction
1192 // sequence.
1193 if (SunkMI != &MI) {
1194 // If in sinking there was no instruction needing to be hardened,
1195 // we're done.
1196 if (!SunkMI)
1197 continue;
1198
1199 // Otherwise, add this to the set of defs we harden.
1200 HardenPostLoad.insert(SunkMI);
1201 continue;
1202 }
1203 }
1204
1205 // The register def'ed by this instruction is trivially hardened so map
1206 // it to itself.
1207 AddrRegToHardenedReg[MI.getOperand(0).getReg()] =
1208 MI.getOperand(0).getReg();
1209
1210 hardenPostLoad(MI, PredStateSSA);
1211 continue;
1212 }
1213
1214 // After we finish processing the instruction and doing any hardening
1215 // necessary for it, we need to handle transferring the predicate state
1216 // into a call and recovering it after the call returns (if it returns).
1217 if (!MI.isCall())
1218 continue;
1219
1220 // If we're not hardening interprocedurally, we can just skip calls.
1221 if (!HardenInterprocedurally)
1222 continue;
1223
1224 auto InsertPt = MI.getIterator();
1225 DebugLoc Loc = MI.getDebugLoc();
1226
1227 // First, we transfer the predicate state into the called function by
1228 // merging it into the stack pointer. This will kill the current def of
1229 // the state.
1230 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1231 mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
1232
1233 // If this call is also a return (because it is a tail call) we're done.
1234 if (MI.isReturn())
1235 continue;
1236
1237 // Otherwise we need to step past the call and recover the predicate
1238 // state from SP after the return, and make this new state available.
1239 ++InsertPt;
1240 unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
1241 PredStateSSA.AddAvailableValue(&MBB, NewStateReg);
1242 }
1243
1244 HardenPostLoad.clear();
1245 HardenLoadAddr.clear();
1246 HardenedAddrRegs.clear();
1247 AddrRegToHardenedReg.clear();
1248
1249 // Currently, we only track data-dependent loads within a basic block.
1250 // FIXME: We should see if this is necessary or if we could be more
1251 // aggressive here without opening up attack avenues.
1252 LoadDepRegs.clear();
1253 }
1254}
1255
1256/// Save EFLAGS into the returned GPR. This can in turn be restored with
1257/// `restoreEFLAGS`.
1258///
1259/// Note that LLVM can only lower very simple patterns of saved and restored
1260/// EFLAGS registers. The restore should always be within the same basic block
1261/// as the save so that no PHI nodes are inserted.
1262unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1263 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1264 DebugLoc Loc) {
1265 // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1266 // what instruction selection does.
1267 unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
1268 // We directly copy the FLAGS register and rely on later lowering to clean
1269 // this up into the appropriate setCC instructions.
1270 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
1271 ++NumInstsInserted;
1272 return Reg;
1273}
1274
1275/// Restore EFLAGS from the provided GPR. This should be produced by
1276/// `saveEFLAGS`.
1277///
1278/// This must be done within the same basic block as the save in order to
1279/// reliably lower.
1280void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1281 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1282 unsigned Reg) {
1283 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
1284 ++NumInstsInserted;
1285}
1286
1287/// Takes the current predicate state (in a register) and merges it into the
1288/// stack pointer. The state is essentially a single bit, but we merge this in
1289/// a way that won't form non-canonical pointers and also will be preserved
1290/// across normal stack adjustments.
1291void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
1292 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1293 unsigned PredStateReg) {
1294 unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
1295 // FIXME: This hard codes a shift distance based on the number of bits needed
1296 // to stay canonical on 64-bit. We should compute this somehow and support
1297 // 32-bit as part of that.
1298 auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
1299 .addReg(PredStateReg, RegState::Kill)
1300 .addImm(47);
1301 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1302 ++NumInstsInserted;
1303 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
1304 .addReg(X86::RSP)
1305 .addReg(TmpReg, RegState::Kill);
1306 OrI->addRegisterDead(X86::EFLAGS, TRI);
1307 ++NumInstsInserted;
1308}
1309
1310/// Extracts the predicate state stored in the high bits of the stack pointer.
1311unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
1312 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1313 DebugLoc Loc) {
1314 unsigned PredStateReg = MRI->createVirtualRegister(PredStateRC);
1315 unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
1316
1317 // We know that the stack pointer will have any preserved predicate state in
1318 // its high bit. We just want to smear this across the other bits. Turns out,
1319 // this is exactly what an arithmetic right shift does.
1320 BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
1321 .addReg(X86::RSP);
1322 auto ShiftI =
1323 BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
1324 .addReg(TmpReg, RegState::Kill)
1325 .addImm(TRI->getRegSizeInBits(*PredStateRC) - 1);
1326 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1327 ++NumInstsInserted;
1328
1329 return PredStateReg;
1330}
1331
1332void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
1333 MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
1334 MachineSSAUpdater &PredStateSSA,
1335 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
1336 MachineBasicBlock &MBB = *MI.getParent();
1337 DebugLoc Loc = MI.getDebugLoc();
1338
1339 // Check if EFLAGS are alive by seeing if there is a def of them or they
1340 // live-in, and then seeing if that def is in turn used.
1341 bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
1342
1343 SmallVector<MachineOperand *, 2> HardenOpRegs;
1344
1345 if (BaseMO.isFI()) {
1346 // A frame index is never a dynamically controllable load, so only
1347 // harden it if we're covering fixed address loads as well.
1348 LLVM_DEBUG(
1349 dbgs() << " Skipping hardening base of explicit stack frame load: ";
1350 MI.dump(); dbgs() << "\n");
1351 } else if (BaseMO.getReg() == X86::RIP ||
1352 BaseMO.getReg() == X86::NoRegister) {
1353 // For both RIP-relative addressed loads or absolute loads, we cannot
1354 // meaningfully harden them because the address being loaded has no
1355 // dynamic component.
1356 //
1357 // FIXME: When using a segment base (like TLS does) we end up with the
1358 // dynamic address being the base plus -1 because we can't mutate the
1359 // segment register here. This allows the signed 32-bit offset to point at
1360 // valid segment-relative addresses and load them successfully.
1361 LLVM_DEBUG(
1362 dbgs() << " Cannot harden base of "
1363 << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
1364 << " address in a load!");
1365 } else {
1366 assert(BaseMO.isReg() &&
1367 "Only allowed to have a frame index or register base.");
1368 HardenOpRegs.push_back(&BaseMO);
1369 }
1370
1371 if (IndexMO.getReg() != X86::NoRegister &&
1372 (HardenOpRegs.empty() ||
1373 HardenOpRegs.front()->getReg() != IndexMO.getReg()))
1374 HardenOpRegs.push_back(&IndexMO);
1375
1376 assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
1377 "Should have exactly one or two registers to harden!");
1378 assert((HardenOpRegs.size() == 1 ||
1379 HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
1380 "Should not have two of the same registers!");
1381
1382 // Remove any registers that have alreaded been checked.
1383 llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
1384 // See if this operand's register has already been checked.
1385 auto It = AddrRegToHardenedReg.find(Op->getReg());
1386 if (It == AddrRegToHardenedReg.end())
1387 // Not checked, so retain this one.
1388 return false;
1389
1390 // Otherwise, we can directly update this operand and remove it.
1391 Op->setReg(It->second);
1392 return true;
1393 });
1394 // If there are none left, we're done.
1395 if (HardenOpRegs.empty())
1396 return;
1397
1398 // Compute the current predicate state.
1399 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1400
1401 auto InsertPt = MI.getIterator();
1402
1403 // If EFLAGS are live and we don't have access to instructions that avoid
1404 // clobbering EFLAGS we need to save and restore them. This in turn makes
1405 // the EFLAGS no longer live.
1406 unsigned FlagsReg = 0;
1407 if (EFLAGSLive && !Subtarget->hasBMI2()) {
1408 EFLAGSLive = false;
1409 FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1410 }
1411
1412 for (MachineOperand *Op : HardenOpRegs) {
1413 unsigned OpReg = Op->getReg();
1414 auto *OpRC = MRI->getRegClass(OpReg);
1415 unsigned TmpReg = MRI->createVirtualRegister(OpRC);
1416
1417 // If this is a vector register, we'll need somewhat custom logic to handle
1418 // hardening it.
1419 if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
1420 OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
1421 assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
1422 bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);
1423
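      // A rough sketch of what the three steps below emit for a 256-bit
      // operand (register names are illustrative only):
      //   vmovq        %rax, %xmm0          # move the predicate state
      //   vpbroadcastq %xmm0, %ymm0         # splat it across the vector
      //   vpor         %ymm1, %ymm0, %ymm2  # poison the operand when mis-speculating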
1424 // Move our state into a vector register.
1425 // FIXME: We could skip this at the cost of longer encodings with AVX-512
1426 // but that doesn't seem likely to be worth it.
1427 unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
1428 auto MovI =
1429 BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
1430 .addReg(StateReg);
1431 (void)MovI;
1432 ++NumInstsInserted;
1433 LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
1434
1435 // Broadcast it across the vector register.
1436 unsigned VBStateReg = MRI->createVirtualRegister(OpRC);
1437 auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
1438 TII->get(Is128Bit ? X86::VPBROADCASTQrr
1439 : X86::VPBROADCASTQYrr),
1440 VBStateReg)
1441 .addReg(VStateReg);
1442 (void)BroadcastI;
1443 ++NumInstsInserted;
1444 LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
1445 dbgs() << "\n");
1446
1447 // Merge our potential poison state into the value with a vector or.
1448 auto OrI =
1449 BuildMI(MBB, InsertPt, Loc,
1450 TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
1451 .addReg(VBStateReg)
1452 .addReg(OpReg);
1453 (void)OrI;
1454 ++NumInstsInserted;
1455 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1456 } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
1457 OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
1458 OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
1459 assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
1460 bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
1461 bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
1462 if (Is128Bit || Is256Bit)
1463 assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
1464
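      // With AVX-512 the state can be broadcast directly from the GPR; a rough
      // sketch for the 512-bit case (illustrative registers):
      //   vpbroadcastq %rax, %zmm0
      //   vporq        %zmm1, %zmm0, %zmm2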
1465 // Broadcast our state into a vector register.
1466 unsigned VStateReg = MRI->createVirtualRegister(OpRC);
1467 unsigned BroadcastOp =
1468 Is128Bit ? X86::VPBROADCASTQrZ128r
1469 : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
1470 auto BroadcastI =
1471 BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
1472 .addReg(StateReg);
1473 (void)BroadcastI;
1474 ++NumInstsInserted;
1475 LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
1476 dbgs() << "\n");
1477
1478 // Merge our potential poison state into the value with a vector or.
1479 unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
1480 : Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
1481 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
1482 .addReg(VStateReg)
1483 .addReg(OpReg);
1484 (void)OrI;
1485 ++NumInstsInserted;
1486 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1487 } else {
1488 // FIXME: Need to support GR32 here for 32-bit code.
1489 assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
1490 "Not a supported register class for address hardening!");
1491
1492 if (!EFLAGSLive) {
1493 // Merge our potential poison state into the value with an or.
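        // Roughly `orq <state>, <addr>`: a no-op when the state is zero, and
        // it forces the address register to all ones when mis-speculating so
        // the load can no longer read an attacker-controlled location.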
1494 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
1495 .addReg(StateReg)
1496 .addReg(OpReg);
1497 OrI->addRegisterDead(X86::EFLAGS, TRI);
1498 ++NumInstsInserted;
1499 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1500 } else {
1501 // We need to avoid touching EFLAGS, so instead of an `or` we shift the
1502 // value right by the predicate state with SHRX, which doesn't update flags.
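        // E.g. roughly `shrxq %state, %addr, %tmp`: a shift by 0 (state zero)
        // leaves the address intact, while a shift by 63 (state all ones,
        // masked to the low 6 bits) collapses it to at most one bit.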
1503 auto ShiftI =
1504 BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
1505 .addReg(OpReg)
1506 .addReg(StateReg);
1507 (void)ShiftI;
1508 ++NumInstsInserted;
1509 LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
1510 dbgs() << "\n");
1511 }
1512 }
1513
1514 // Record this register as checked and update the operand.
1515 assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
1516 "Should not have checked this register yet!");
1517 AddrRegToHardenedReg[Op->getReg()] = TmpReg;
1518 Op->setReg(TmpReg);
1519 ++NumAddrRegsHardened;
1520 }
1521
1522 // And restore the flags if needed.
1523 if (FlagsReg)
1524 restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
1525}
1526
1527MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
1528 MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedLoads) {
1529 assert(isDataInvariantLoad(InitialMI) &&
1530 "Cannot get here with a non-invariant load!");
1531
1532 // See if we can sink hardening the loaded value.
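  // Note: the lambda below returns an empty Optional when the check cannot be
  // sunk, a null MachineInstr* when no use needs its own check, and otherwise
  // the single use to which the check should be sunk.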
1533 auto SinkCheckToSingleUse =
1534 [&](MachineInstr &MI) -> Optional<MachineInstr *> {
1535 unsigned DefReg = MI.getOperand(0).getReg();
1536
1537 // We need to find a single use to which we can sink the check. We can
1538 // primarily do this because many uses may already end up checked on their
1539 // own.
1540 MachineInstr *SingleUseMI = nullptr;
1541 for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
1542 // If we're already going to harden this use, it is data invariant and
1543 // within our block and we just need to check that the use isn't in an
1544 // address.
1545 if (HardenedLoads.count(&UseMI)) {
1546 const MCInstrDesc &Desc = UseMI.getDesc();
1547 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1548 assert(MemRefBeginIdx >= 0 &&
1549 "Should always have mem references here!");
1550 MemRefBeginIdx += X86II::getOperandBias(Desc);
1551
1552 MachineOperand &BaseMO =
1553 UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1554 MachineOperand &IndexMO =
1555 UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1556 if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
1557 (IndexMO.isReg() && IndexMO.getReg() == DefReg))
1558 // The load uses the register as part of its address making it not
1559 // invariant.
1560 return {};
1561
1562 continue;
1563 }
1564
1565 if (SingleUseMI)
1566 // We already have a single use, this would make two. Bail.
1567 return {};
1568
1569 // If this single use isn't data invariant, isn't in this block, or has
1570 // interfering EFLAGS, we can't sink the hardening to it.
1571 if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
1572 return {};
1573
1574 // If this instruction defines multiple registers bail as we won't harden
1575 // all of them.
1576 if (UseMI.getDesc().getNumDefs() > 1)
1577 return {};
1578
1579 // If this register isn't a virtual register we can't sanely walk its uses, so
1580 // just bail. Also check that its register class is one of the ones we
1581 // can harden.
1582 unsigned UseDefReg = UseMI.getOperand(0).getReg();
1583 if (!TRI->isVirtualRegister(UseDefReg) ||
1584 !MRI->getRegClass(UseDefReg)->hasSubClassEq(&X86::GR64RegClass))
1585 return {};
1586
1587 SingleUseMI = &UseMI;
1588 }
1589
1590 // If SingleUseMI is still null, there is no use that needs its own
1591 // checking. Otherwise, it is the single use that needs checking.
1592 return {SingleUseMI};
1593 };
1594
1595 MachineInstr *MI = &InitialMI;
1596 while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
1597 // Update which MI we're checking now.
1598 MI = *SingleUse;
1599 if (!MI)
1600 break;
1601 }
1602
1603 return MI;
1604}
1605
1606 bool X86SpeculativeLoadHardeningPass::canHardenPostLoad(MachineInstr &MI) {
1607 if (!isDataInvariantLoad(MI))
1608 return false;
1609
1610 auto &DefOp = MI.getOperand(0);
1611 unsigned OldDefReg = DefOp.getReg();
1612
1613 auto *DefRC = MRI->getRegClass(OldDefReg);
1614 int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;
1615 if (DefRegBytes > 8)
1616 // We don't support post-load hardening of vectors.
1617 return false;
1618
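  // The array below is indexed by log2 of the def'ed byte width:
  // 1 -> GR8, 2 -> GR16, 4 -> GR32, 8 -> GR64.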
1619 const TargetRegisterClass *GPRRegClasses[] = {
1620 &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
1621 &X86::GR64RegClass};
1622 return DefRC->hasSuperClassEq(GPRRegClasses[Log2_32(DefRegBytes)]);
1623}
1624
1625 // We can harden a non-leaking load into a register without touching the
1626 // address by simply hiding all of the loaded bits. We use an `or` because
1627 // our poison value is all ones, which lets us reuse the same predicate-state
1628 // value below. The goal is only that the loaded bits are never exposed to
1629 // speculative execution, and coercing them all to one is sufficient for that.
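// For example, hardening an 8-byte load ends up looking roughly like
// (illustrative registers):
//   movq (%rdi), %rcx   # the load now defs a fresh temporary
//   orq  %rax, %rcx     # %rax is the predicate state: zero or all ones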
1630void X86SpeculativeLoadHardeningPass::hardenPostLoad(
1631 MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
1632 assert(canHardenPostLoad(MI) &&
1633 "Invalid instruction for post-load hardening!");
1634
1635 MachineBasicBlock &MBB = *MI.getParent();
1636 DebugLoc Loc = MI.getDebugLoc();
1637
1638 // For all of these, the def'ed register operand is operand zero.
1639 auto &DefOp = MI.getOperand(0);
1640 unsigned OldDefReg = DefOp.getReg();
1641
1642 auto *DefRC = MRI->getRegClass(OldDefReg);
1643 int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;
1644
1645 unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
1646 unsigned OrOpCode = OrOpCodes[Log2_32(DefRegBytes)];
1647
1648 unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
1649
1650 auto GetStateRegInRC = [&](const TargetRegisterClass &RC) {
1651 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1652
1653 int Bytes = TRI->getRegSizeInBits(RC) / 8;
1654 // FIXME: Need to teach this about 32-bit mode.
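    // The state is all zeros or all ones, so a plain subregister COPY to the
    // narrower class preserves its meaning.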
1655 if (Bytes != 8) {
1656 unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
1657 unsigned NarrowStateReg = MRI->createVirtualRegister(&RC);
1658 BuildMI(MBB, MI.getIterator(), Loc, TII->get(TargetOpcode::COPY),
1659 NarrowStateReg)
1660 .addReg(StateReg, 0, SubRegImm);
1661 StateReg = NarrowStateReg;
1662 }
1663 return StateReg;
1664 };
1665
1666 auto InsertPt = std::next(MI.getIterator());
1667 unsigned FlagsReg = 0;
1668 bool EFLAGSLive = isEFLAGSLive(MBB, InsertPt, *TRI);
1669 if (EFLAGSLive && !Subtarget->hasBMI2()) {
1670 FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1671 EFLAGSLive = false;
1672 }
1673
1674 if (!EFLAGSLive) {
1675 unsigned StateReg = GetStateRegInRC(*DefRC);
1676 unsigned NewDefReg = MRI->createVirtualRegister(DefRC);
1677 DefOp.setReg(NewDefReg);
1678 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg)
1679 .addReg(StateReg)
1680 .addReg(NewDefReg);
1681 OrI->addRegisterDead(X86::EFLAGS, TRI);
1682 ++NumInstsInserted;
1683 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1684 } else {
1685 assert(Subtarget->hasBMI2() &&
1686 "Cannot harden loads and preserve EFLAGS without BMI2!");
1687
1688 unsigned ShiftOpCode = DefRegBytes < 4 ? X86::SHRX32rr : X86::SHRX64rr;
1689 auto &ShiftRC =
1690 DefRegBytes < 4 ? X86::GR32_NOSPRegClass : X86::GR64_NOSPRegClass;
1691 int ShiftRegBytes = TRI->getRegSizeInBits(ShiftRC) / 8;
1692 unsigned DefSubRegImm = SubRegImms[Log2_32(DefRegBytes)];
1693
1694 unsigned StateReg = GetStateRegInRC(ShiftRC);
1695
1696 // First have the def instruction def a temporary register.
1697 unsigned TmpReg = MRI->createVirtualRegister(DefRC);
1698 DefOp.setReg(TmpReg);
1699 // Now copy it into a register of the shift RC.
1700 unsigned ShiftInputReg = TmpReg;
1701 if (DefRegBytes != ShiftRegBytes) {
1702 unsigned UndefReg = MRI->createVirtualRegister(&ShiftRC);
1703 BuildMI(MBB, InsertPt, Loc, TII->get(X86::IMPLICIT_DEF), UndefReg);
1704 ShiftInputReg = MRI->createVirtualRegister(&ShiftRC);
1705 BuildMI(MBB, InsertPt, Loc, TII->get(X86::INSERT_SUBREG), ShiftInputReg)
1706 .addReg(UndefReg)
1707 .addReg(TmpReg)
1708 .addImm(DefSubRegImm);
1709 }
1710
1711 // We shift this once if the shift is wider than the def and thus we can
1712 // shift *all* of the def'ed bits out. Otherwise we need to do two shifts.
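    // E.g. an 8-byte def with an all-ones state needs two `shrx` shifts by 63
    // (the count is masked to 6 bits): the first leaves at most one bit, the
    // second clears it. A narrower def sits in the low bits of a wider shift
    // register, so a single shift pushes all of its bits out.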
1713
1714 unsigned ShiftedReg = MRI->createVirtualRegister(&ShiftRC);
1715 auto Shift1I =
1716 BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), ShiftedReg)
1717 .addReg(ShiftInputReg)
1718 .addReg(StateReg);
1719 (void)Shift1I;
1720 ++NumInstsInserted;
1721 LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift1I->dump(); dbgs() << "\n");
1722
1723 // The only way we have a bit left is if all 8 bytes were defined. Do an
1724 // extra shift to get the last bit in this case.
1725 if (DefRegBytes == ShiftRegBytes) {
1726 // We can just directly def the old def register as it's the same size.
1727 ShiftInputReg = ShiftedReg;
1728 auto Shift2I =
1729 BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), OldDefReg)
1730 .addReg(ShiftInputReg)
1731 .addReg(StateReg);
1732 (void)Shift2I;
1733 ++NumInstsInserted;
1734 LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift2I->dump();
1735 dbgs() << "\n");
1736 } else {
1737 // When we have a different sized shift register we need to fix up the
1738 // register class. We can do that as we copy into the old def register.
1739 BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), OldDefReg)
1740 .addReg(ShiftedReg, 0, DefSubRegImm);
1741 }
1742 }
1743
1744 if (FlagsReg)
1745 restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
1746
1747 ++NumPostLoadRegsHardened;
1748}
1749
1750void X86SpeculativeLoadHardeningPass::checkReturnInstr(
1751 MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
1752 MachineBasicBlock &MBB = *MI.getParent();
1753 DebugLoc Loc = MI.getDebugLoc();
1754 auto InsertPt = MI.getIterator();
1755
1756 if (FenceCallAndRet) {
1757 // Simply forcibly block speculation of loads out of the function by using
1758 // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
1759 // should be secure, is simple from an ABI perspective, and the cost can be
1760 // minimized through inlining.
1761 //
1762 // FIXME: We should investigate ways to establish a strong data-dependency
1763 // on the return. However, poisoning the stack pointer is unlikely to work
1764 // because the return is *predicted* rather than relying on the load of the
1765 // return address to actually resolve.
1766 BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
1767 ++NumInstsInserted;
1768 ++NumLFENCEsInserted;
1769 return;
1770 }
1771
1772 // Take our predicate state, shift it to the high 17 bits (so that we keep
1773 // pointers canonical) and merge it into RSP. This will allow the caller to
1774 // extract it when we return (speculatively).
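  // Per the comment above, the merge is roughly (mergePredStateIntoSP does the
  // actual emission; registers are illustrative):
  //   shlq $47, %rax        # predicate state into the top 17 bits
  //   orq  %rax, %rsp
  // On the valid path the state is zero and RSP is unchanged; when poisoned,
  // the high bits of RSP are set and an arithmetic shift right by 63 on the
  // receiving side recovers the all-ones state.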
1775 mergePredStateIntoSP(MBB, InsertPt, Loc,
1776 PredStateSSA.GetValueAtEndOfBlock(&MBB));
1777}
1778
1779INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
1780 "X86 speculative load hardener", false, false)
1781INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
1782 "X86 speculative load hardener", false, false)
1783
1784FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
1785 return new X86SpeculativeLoadHardeningPass();
1786}