1//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9/// \file
10///
11/// Provide a pass which mitigates speculative execution attacks which operate
12/// by speculating incorrectly past some predicate (a type check, bounds check,
13/// or other condition) to reach a load with invalid inputs and leak the data
14/// accessed by that load using a side channel out of the speculative domain.
15///
16/// For details on the attacks, see the first variant in both the Project Zero
17/// writeup and the Spectre paper:
18/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
19/// https://spectreattack.com/spectre.pdf
20///
21//===----------------------------------------------------------------------===//
22
23#include "X86.h"
24#include "X86InstrBuilder.h"
25#include "X86InstrInfo.h"
26#include "X86Subtarget.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/ScopeExit.h"
31#include "llvm/ADT/SmallPtrSet.h"
32#include "llvm/ADT/SmallSet.h"
33#include "llvm/ADT/SmallVector.h"
34#include "llvm/ADT/SparseBitVector.h"
35#include "llvm/ADT/Statistic.h"
36#include "llvm/CodeGen/MachineBasicBlock.h"
37#include "llvm/CodeGen/MachineConstantPool.h"
38#include "llvm/CodeGen/MachineFunction.h"
39#include "llvm/CodeGen/MachineFunctionPass.h"
40#include "llvm/CodeGen/MachineInstr.h"
41#include "llvm/CodeGen/MachineInstrBuilder.h"
42#include "llvm/CodeGen/MachineModuleInfo.h"
43#include "llvm/CodeGen/MachineOperand.h"
44#include "llvm/CodeGen/MachineRegisterInfo.h"
45#include "llvm/CodeGen/MachineSSAUpdater.h"
46#include "llvm/CodeGen/TargetInstrInfo.h"
47#include "llvm/CodeGen/TargetRegisterInfo.h"
48#include "llvm/CodeGen/TargetSchedule.h"
49#include "llvm/CodeGen/TargetSubtargetInfo.h"
50#include "llvm/IR/DebugLoc.h"
51#include "llvm/MC/MCSchedule.h"
52#include "llvm/Pass.h"
53#include "llvm/Support/CommandLine.h"
54#include "llvm/Support/Debug.h"
55#include "llvm/Support/raw_ostream.h"
56#include <algorithm>
57#include <cassert>
58#include <iterator>
59#include <utility>
60
61using namespace llvm;
62
63#define PASS_KEY "x86-speculative-load-hardening"
64#define DEBUG_TYPE PASS_KEY
65
66STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
67STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
68STATISTIC(NumAddrRegsHardened,
69 "Number of address mode used registers hardaned");
70STATISTIC(NumPostLoadRegsHardened,
71 "Number of post-load register values hardened");
72STATISTIC(NumInstsInserted, "Number of instructions inserted");
73STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
74
75static cl::opt<bool> HardenEdgesWithLFENCE(
76 PASS_KEY "-lfence",
77 cl::desc(
78 "Use LFENCE along each conditional edge to harden against speculative "
79 "loads rather than conditional movs and poisoned pointers."),
80 cl::init(false), cl::Hidden);
81
82static cl::opt<bool> EnablePostLoadHardening(
83 PASS_KEY "-post-load",
84 cl::desc("Harden the value loaded *after* it is loaded by "
85 "flushing the loaded bits to 1. This is hard to do "
86 "in general but can be done easily for GPRs."),
87 cl::init(true), cl::Hidden);
88
89static cl::opt<bool> FenceCallAndRet(
90 PASS_KEY "-fence-call-and-ret",
91 cl::desc("Use a full speculation fence to harden both call and ret edges "
92 "rather than a lighter weight mitigation."),
93 cl::init(false), cl::Hidden);
94
95static cl::opt<bool> HardenInterprocedurally(
96 PASS_KEY "-ip",
97 cl::desc("Harden interprocedurally by passing our state in and out of "
98 "functions in the high bits of the stack pointer."),
99 cl::init(true), cl::Hidden);
100
101static cl::opt<bool>
102 HardenLoads(PASS_KEY "-loads",
103 cl::desc("Sanitize loads from memory. When disable, no "
104 "significant security is provided."),
105 cl::init(true), cl::Hidden);
106
107namespace llvm {
108
109void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
110
111} // end namespace llvm
112
113namespace {
114
115class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
116public:
117 X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
118 initializeX86SpeculativeLoadHardeningPassPass(
119 *PassRegistry::getPassRegistry());
120 }
121
122 StringRef getPassName() const override {
123 return "X86 speculative load hardening";
124 }
125 bool runOnMachineFunction(MachineFunction &MF) override;
126 void getAnalysisUsage(AnalysisUsage &AU) const override;
127
128 /// Pass identification, replacement for typeid.
129 static char ID;
130
131private:
132 /// The information about a block's conditional terminators needed to trace
133 /// our predicate state through the exiting edges.
134 struct BlockCondInfo {
135 MachineBasicBlock *MBB;
136
137 // We mostly have one conditional branch, and in extremely rare cases have
138 // two. Three and more are so rare as to be unimportant for compile time.
139 SmallVector<MachineInstr *, 2> CondBrs;
140
141 MachineInstr *UncondBr;
142 };
143
144 const X86Subtarget *Subtarget;
145 MachineRegisterInfo *MRI;
146 const X86InstrInfo *TII;
147 const TargetRegisterInfo *TRI;
148 const TargetRegisterClass *PredStateRC;
149
150 void hardenEdgesWithLFENCE(MachineFunction &MF);
151
152 SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
153
154 void checkAllLoads(MachineFunction &MF, MachineSSAUpdater &PredStateSSA);
155
156 unsigned saveEFLAGS(MachineBasicBlock &MBB,
157 MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
158 void restoreEFLAGS(MachineBasicBlock &MBB,
159 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
160 unsigned OFReg);
161
162 void mergePredStateIntoSP(MachineBasicBlock &MBB,
163 MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
164 unsigned PredStateReg);
165 unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
166 MachineBasicBlock::iterator InsertPt,
167 DebugLoc Loc);
168
169 void
170 hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
171 MachineOperand &IndexMO, MachineSSAUpdater &PredStateSSA,
172 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
173 MachineInstr *
174 sinkPostLoadHardenedInst(MachineInstr &MI,
175 SmallPtrSetImpl<MachineInstr *> &HardenedLoads);
176 void hardenPostLoad(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
177 void checkReturnInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
178 void checkCallInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
179};
180
181} // end anonymous namespace
182
183char X86SpeculativeLoadHardeningPass::ID = 0;
184
185void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
186 AnalysisUsage &AU) const {
187 MachineFunctionPass::getAnalysisUsage(AU);
188}
189
190static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
191 MachineBasicBlock &Succ, int SuccCount,
192 MachineInstr *Br, MachineInstr *&UncondBr,
193 const X86InstrInfo &TII) {
194 assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
195
196 MachineFunction &MF = *MBB.getParent();
197
198 MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
199
200 // We have to insert the new block immediately after the current one as we
201 // don't know what layout-successor relationships the successor has and we
202 // may not be able to (and generally don't want to) try to fix those up.
203 MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
204
205 // Update the branch instruction if necessary.
206 if (Br) {
207 assert(Br->getOperand(0).getMBB() == &Succ &&
208 "Didn't start with the right target!");
209 Br->getOperand(0).setMBB(&NewMBB);
210
211 // If this successor was reached through a branch rather than fallthrough,
212 // we might have *broken* fallthrough and so need to inject a new
213 // unconditional branch.
214 if (!UncondBr) {
215 MachineBasicBlock &OldLayoutSucc =
216 *std::next(MachineFunction::iterator(&NewMBB));
217 assert(MBB.isSuccessor(&OldLayoutSucc) &&
218 "Without an unconditional branch, the old layout successor should "
219 "be an actual successor!");
220 auto BrBuilder =
221 BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
222 // Update the unconditional branch now that we've added one.
223 UncondBr = &*BrBuilder;
224 }
225
226 // Insert unconditional "jump Succ" instruction in the new block if
227 // necessary.
228 if (!NewMBB.isLayoutSuccessor(&Succ)) {
229 SmallVector<MachineOperand, 4> Cond;
230 TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
231 }
232 } else {
233 assert(!UncondBr &&
234 "Cannot have a branchless successor and an unconditional branch!");
235 assert(NewMBB.isLayoutSuccessor(&Succ) &&
236 "A non-branch successor must have been a layout successor before "
237 "and now is a layout successor of the new block.");
238 }
239
240 // If this is the only edge to the successor, we can just replace it in the
241 // CFG. Otherwise we need to add a new entry in the CFG for the new
242 // successor.
243 if (SuccCount == 1) {
244 MBB.replaceSuccessor(&Succ, &NewMBB);
245 } else {
246 MBB.splitSuccessor(&Succ, &NewMBB);
247 }
248
249 // Hook up the edge from the new basic block to the old successor in the CFG.
250 NewMBB.addSuccessor(&Succ);
251
252 // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
253 for (MachineInstr &MI : Succ) {
254 if (!MI.isPHI())
255 break;
256 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
257 OpIdx += 2) {
258 MachineOperand &OpV = MI.getOperand(OpIdx);
259 MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
260 assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
261 if (OpMBB.getMBB() != &MBB)
262 continue;
263
264 // If this is the last edge to the successor, just replace MBB in the PHI
265 if (SuccCount == 1) {
266 OpMBB.setMBB(&NewMBB);
267 break;
268 }
269
270 // Otherwise, append a new pair of operands for the new incoming edge.
271 MI.addOperand(MF, OpV);
272 MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
273 break;
274 }
275 }
276
277 // Inherit live-ins from the successor
278 for (auto &LI : Succ.liveins())
279 NewMBB.addLiveIn(LI);
280
281 LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
282 << Succ.getName() << "'.\n");
283 return NewMBB;
284}
285
286bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
287 MachineFunction &MF) {
288 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
289 << " **********\n");
290
291 Subtarget = &MF.getSubtarget<X86Subtarget>();
292 MRI = &MF.getRegInfo();
293 TII = Subtarget->getInstrInfo();
294 TRI = Subtarget->getRegisterInfo();
295 // FIXME: Support for 32-bit.
296 PredStateRC = &X86::GR64_NOSPRegClass;
297
298 if (MF.begin() == MF.end())
299 // Nothing to do for a degenerate empty function...
300 return false;
301
302 // We support an alternative hardening technique based on a debug flag.
303 if (HardenEdgesWithLFENCE) {
304 hardenEdgesWithLFENCE(MF);
305 return true;
306 }
307
308 // Create a dummy debug loc to use for all the generated code here.
309 DebugLoc Loc;
310
311 MachineBasicBlock &Entry = *MF.begin();
312 auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
313
314 // Do a quick scan to see if we have any checkable loads.
315 bool HasCheckableLoad = false;
316 for (MachineBasicBlock &MBB : MF) {
317 for (MachineInstr &MI : MBB) {
318 // Stop searching blocks at an LFENCE.
319 if (MI.getOpcode() == X86::LFENCE)
320 break;
321
322 // Looking for loads only.
323 if (!MI.mayLoad())
324 continue;
325
326 // An MFENCE is modeled as a load but doesn't require hardening.
327 if (MI.getOpcode() == X86::MFENCE)
328 continue;
329
330 HasCheckableLoad = true;
331 break;
332 }
333 if (HasCheckableLoad)
334 break;
335 }
336
337 // See if we have any conditional branching blocks that we will need to trace
338 // predicate state through.
339 SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
340
341 // If we have no interesting conditions or loads, nothing to do here.
342 if (!HasCheckableLoad && Infos.empty())
343 return true;
344
345 unsigned PredStateReg;
346 unsigned PredStateSizeInBytes = TRI->getRegSizeInBits(*PredStateRC) / 8;
347
348 // The poison value is required to be an all-ones value for many aspects of
349 // this mitigation.
350 const int PoisonVal = -1;
351 unsigned PoisonReg = MRI->createVirtualRegister(PredStateRC);
352 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PoisonReg)
353 .addImm(PoisonVal);
354 ++NumInstsInserted;
355
356 // If we have loads being hardened and we've asked for call and ret edges to
357 // get a full fence-based mitigation, inject that fence.
358 if (HasCheckableLoad && FenceCallAndRet) {
359 // We need to insert an LFENCE at the start of the function to suspend any
360 // incoming misspeculation from the caller. This helps two-fold: the caller
361 // may not have been protected as this code has been, and this code gets to
362 // not take any specific action to protect across calls.
363 // FIXME: We could skip this for functions which unconditionally return
364 // a constant.
365 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
366 ++NumInstsInserted;
367 ++NumLFENCEsInserted;
368 }
369
370 // If we have no conditionals to protect in blocks, then all we needed to do
371 // was protect the entry and so we're done.
372 if (Infos.empty())
373 // We may have changed the function's code at this point to insert fences.
374 return true;
375
377 if (HardenInterprocedurally && !FenceCallAndRet) {
378 // Set up the predicate state by extracting it from the incoming stack
379 // pointer so we pick up any misspeculation in our caller.
380 PredStateReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
381 } else {
382 // Otherwise, just build the predicate state itself by zeroing a register
383 // as we don't need any initial state.
384 PredStateReg = MRI->createVirtualRegister(PredStateRC);
385 unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
386 auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
387 PredStateSubReg);
388 ++NumInstsInserted;
389 MachineOperand *ZeroEFLAGSDefOp =
390 ZeroI->findRegisterDefOperand(X86::EFLAGS);
391 assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
392 "Must have an implicit def of EFLAGS!");
393 ZeroEFLAGSDefOp->setIsDead(true);
394 BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
395 PredStateReg)
396 .addImm(0)
397 .addReg(PredStateSubReg)
398 .addImm(X86::sub_32bit);
399 }
400
401 // We're going to need to trace predicate state throughout the function's
402 // CFG. Prepare for this by setting up our initial state of PHIs with unique
403 // predecessor entries and all the initial predicate state.
404
405 // FIXME: It's really frustrating that we have to do this, but SSA-form in
406 // MIR isn't what you might expect. We may have multiple entries in PHI nodes
407 // for a single predecessor. This makes CFG-updating extremely complex, so
408 // here we simplify all PHI nodes to a model even simpler than the IR's
409 // model: exactly one entry per predecessor, regardless of how many edges
410 // there are.
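  // For example, a PHI with operands (%a, %bb1), (%a, %bb1), (%b, %bb2),
  // where %bb1 appears twice because two edges come from it, is rewritten
  // here to carry a single %bb1 entry. (Illustrative shape, not literal MIR.)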
411 SmallPtrSet<MachineBasicBlock *, 4> Preds;
412 SmallVector<int, 4> DupIndices;
413 for (auto &MBB : MF)
414 for (auto &MI : MBB) {
415 if (!MI.isPHI())
416 break;
417
418 // First we scan the operands of the PHI looking for duplicate entries
419 // for a particular predecessor. We retain the operand index of each duplicate
420 // entry found.
421 for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
422 OpIdx += 2)
423 if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
424 DupIndices.push_back(OpIdx);
425
426 // Now walk the duplicate indices, removing both the block and value. Note
427 // that these are stored as a vector making this element-wise removal
429 // potentially quadratic.
430 //
431 // FIXME: It is really frustrating that we have to use a quadratic
432 // removal algorithm here. There should be a better way, but the use-def
433 // updates required make that impossible using the public API.
434 //
435 // Note that we have to process these backwards so that we don't
436 // invalidate other indices with each removal.
437 while (!DupIndices.empty()) {
438 int OpIdx = DupIndices.pop_back_val();
439 // Remove both the block and value operand, again in reverse order to
440 // preserve indices.
441 MI.RemoveOperand(OpIdx + 1);
442 MI.RemoveOperand(OpIdx);
443 }
444
445 Preds.clear();
446 }
447
448 // Track the updated values in an SSA updater to rewrite into SSA form at the
449 // end.
450 MachineSSAUpdater PredStateSSA(MF);
451 PredStateSSA.Initialize(PredStateReg);
452 PredStateSSA.AddAvailableValue(&Entry, PredStateReg);
453 // Collect the inserted instructions so we can rewrite their uses of the
454 // predicate state into SSA form.
455 SmallVector<MachineInstr *, 16> CMovs;
456
457 // Now walk all of the basic blocks looking for ones that end in conditional
458 // jumps where we need to update this register along each edge.
459 for (BlockCondInfo &Info : Infos) {
460 MachineBasicBlock &MBB = *Info.MBB;
461 SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
462 MachineInstr *UncondBr = Info.UncondBr;
463
464 LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
465 << "\n");
466 ++NumCondBranchesTraced;
467
468 // Compute the non-conditional successor as either the target of any
469 // unconditional branch or the layout successor.
470 MachineBasicBlock *UncondSucc =
471 UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
472 ? UncondBr->getOperand(0).getMBB()
473 : nullptr)
474 : &*std::next(MachineFunction::iterator(&MBB));
475
476 // Count how many edges there are to any given successor.
477 SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
478 if (UncondSucc)
479 ++SuccCounts[UncondSucc];
480 for (auto *CondBr : CondBrs)
481 ++SuccCounts[CondBr->getOperand(0).getMBB()];
482
483 // A lambda to insert cmov instructions into a block checking all of the
484 // condition codes in a sequence.
485 auto BuildCheckingBlockForSuccAndConds =
486 [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
487 MachineInstr *Br, MachineInstr *&UncondBr,
488 ArrayRef<X86::CondCode> Conds) {
489 // First, we split the edge to insert the checking block into a safe
490 // location.
491 auto &CheckingMBB =
492 (SuccCount == 1 && Succ.pred_size() == 1)
493 ? Succ
494 : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
495
496 bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
497 if (!LiveEFLAGS)
498 CheckingMBB.addLiveIn(X86::EFLAGS);
499
500 // Now insert the cmovs to implement the checks.
501 auto InsertPt = CheckingMBB.begin();
502 assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
503 "Should never have a PHI in the initial checking block as it "
504 "always has a single predecessor!");
505
506 // We will wire each cmov to each other, but need to start with the
507 // incoming pred state.
508 unsigned CurStateReg = PredStateReg;
509
510 for (X86::CondCode Cond : Conds) {
511 auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
512
513 unsigned UpdatedStateReg = MRI->createVirtualRegister(PredStateRC);
514 auto CMovI = BuildMI(CheckingMBB, InsertPt, Loc, TII->get(CMovOp),
515 UpdatedStateReg)
516 .addReg(CurStateReg)
517 .addReg(PoisonReg);
518 // If this is the last cmov and the EFLAGS weren't originally
519 // live-in, mark them as killed.
520 if (!LiveEFLAGS && Cond == Conds.back())
521 CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
522
523 ++NumInstsInserted;
524 LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
525 dbgs() << "\n");
526
527 // The first one of the cmovs will be using the top level
528 // `PredStateReg` and need to get rewritten into SSA form.
529 if (CurStateReg == PredStateReg)
530 CMovs.push_back(&*CMovI);
531
532 // The next cmov should start from this one's def.
533 CurStateReg = UpdatedStateReg;
534 }
535
536 // And put the last one into the available values for PredStateSSA.
537 PredStateSSA.AddAvailableValue(&CheckingMBB, CurStateReg);
538 };
539
540 std::vector<X86::CondCode> UncondCodeSeq;
541 for (auto *CondBr : CondBrs) {
542 MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
543 int &SuccCount = SuccCounts[&Succ];
544
545 X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
546 X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
547 UncondCodeSeq.push_back(Cond);
548
549 BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
550 {InvCond});
551
552 // Decrement the successor count now that we've split one of the edges.
553 // We need to keep the count of edges to the successor accurate in order
554 // to know above when to *replace* the successor in the CFG vs. just
555 // adding the new successor.
556 --SuccCount;
557 }
558
559 // Since we may have split edges and changed the number of successors,
560 // normalize the probabilities. This avoids doing it each time we split an
561 // edge.
562 MBB.normalizeSuccProbs();
563
564 // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
565 // need to intersect the other condition codes. We can do this by just
566 // doing a cmov for each one.
567 if (!UncondSucc)
568 // If we have no fallthrough to protect (perhaps it is an indirect jump?)
569 // just skip this and continue.
570 continue;
571
572 assert(SuccCounts[UncondSucc] == 1 &&
573 "We should never have more than one edge to the unconditional "
574 "successor at this point because every other edge must have been "
575 "split above!");
576
577 // Sort and unique the codes to minimize them.
578 llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end());
579 UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
580 UncondCodeSeq.end());
581
582 // Build a checking version of the successor.
583 BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
584 UncondBr, UncondBr, UncondCodeSeq);
585 }
586
587 // We may also enter basic blocks in this function via exception handling
588 // control flow. Here, if we are hardening interprocedurally, we need to
589 // re-capture the predicate state from the throwing code. In the Itanium ABI,
590 // the throw will always look like a call to __cxa_throw and will have the
591 // predicate state in the stack pointer, so extract fresh predicate state from
592 // the stack pointer and make it available in SSA.
593 // FIXME: Handle non-itanium ABI EH models.
594 if (HardenInterprocedurally) {
595 for (MachineBasicBlock &MBB : MF) {
596 assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
597 assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
598 assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
599 if (!MBB.isEHPad())
600 continue;
601 PredStateSSA.AddAvailableValue(
602 &MBB,
603 extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
604 }
605 }
606
607 // Now check all of the loads using the predicate state.
608 checkAllLoads(MF, PredStateSSA);
609
610 // Now rewrite all the uses of the pred state using the SSA updater so that
611 // we track updates through the CFG.
612 for (MachineInstr *CMovI : CMovs)
613 for (MachineOperand &Op : CMovI->operands()) {
614 if (!Op.isReg() || Op.getReg() != PredStateReg)
615 continue;
616
617 PredStateSSA.RewriteUse(Op);
618 }
619
620 // If we are hardening interprocedurally, find each returning block and
621 // protect the caller from being returned to through misspeculation.
622 if (HardenInterprocedurally)
623 for (MachineBasicBlock &MBB : MF) {
624 if (MBB.empty())
625 continue;
626
627 MachineInstr &MI = MBB.back();
628 if (!MI.isReturn())
629 continue;
630
631 checkReturnInstr(MI, PredStateSSA);
632 }
633
634 LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
635 dbgs() << "\n"; MF.verify(this));
636 return true;
637}
638
639/// Implements the naive hardening approach of putting an LFENCE after every
640/// potentially mis-predicted control flow construct.
641///
642/// We include this as an alternative mostly for the purpose of comparison. The
643/// performance impact of this is expected to be extremely severe and not
644/// practical for any real-world users.
645void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
646 MachineFunction &MF) {
647 // First, we scan the function looking for blocks that are reached along edges
648 // that we might want to harden.
649 SmallSetVector<MachineBasicBlock *, 8> Blocks;
650 for (MachineBasicBlock &MBB : MF) {
651 // If there are no or only one successor, nothing to do here.
652 if (MBB.succ_size() <= 1)
653 continue;
654
655 // Skip blocks unless their terminators start with a branch. Other
656 // terminators don't seem interesting for guarding against misspeculation.
657 auto TermIt = MBB.getFirstTerminator();
658 if (TermIt == MBB.end() || !TermIt->isBranch())
659 continue;
660
661 // Add all the non-EH-pad successors to the blocks we want to harden. We
662 // skip EH pads because there isn't really a condition of interest on
663 // entering.
664 for (MachineBasicBlock *SuccMBB : MBB.successors())
665 if (!SuccMBB->isEHPad())
666 Blocks.insert(SuccMBB);
667 }
668
669 for (MachineBasicBlock *MBB : Blocks) {
670 auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
671 BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
672 ++NumInstsInserted;
673 ++NumLFENCEsInserted;
674 }
675}
676
677SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
678X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
679 SmallVector<BlockCondInfo, 16> Infos;
680
681 // Walk the function and build up a summary for each block's conditions that
682 // we need to trace through.
683 for (MachineBasicBlock &MBB : MF) {
684 // If there are no or only one successor, nothing to do here.
685 if (MBB.succ_size() <= 1)
686 continue;
687
688 // We want to reliably handle any conditional branch terminators in the
689 // MBB, so we manually analyze the branch. We can handle all of the
690 // permutations here, including ones that analyze branch cannot.
691 //
692 // The approach is to walk backwards across the terminators, resetting at
693 // any unconditional non-indirect branch, and track all conditional edges
694 // to basic blocks as well as the fallthrough or unconditional successor
695 // edge. For each conditional edge, we track the target and the opposite
696 // condition code in order to inject a "no-op" cmov into that successor
697 // that will harden the predicate. For the fallthrough/unconditional
698 // edge, we inject a separate cmov for each conditional branch with
699 // matching condition codes. This effectively implements an "and" of the
700 // condition flags, even if there isn't a single condition flag that would
701 // directly implement that. We don't bother trying to optimize either of
702 // these cases because if such an optimization is possible, LLVM should
703 // have optimized the conditional *branches* in that way already to reduce
704 // instruction count. This late, we simply assume the minimal number of
705 // branch instructions is being emitted and use that to guide our cmov
706 // insertion.
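    //
    // As a rough illustration (not literal syntax): for a block ending in
    //
    //   jne .Ltaken
    //   jmp .Lfallthrough
    //
    // the edge to .Ltaken gets a `cmove poison, state` (poison the state if
    // the condition was actually equal) and the edge to .Lfallthrough gets a
    // `cmovne poison, state` (poison it if the condition was actually
    // not-equal).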
707
708 BlockCondInfo Info = {&MBB, {}, nullptr};
709
710 // Now walk backwards through the terminators and build up successors they
711 // reach and the conditions.
712 for (MachineInstr &MI : llvm::reverse(MBB)) {
713 // Once we've handled all the terminators, we're done.
714 if (!MI.isTerminator())
715 break;
716
717 // If we see a non-branch terminator, we can't handle anything so bail.
718 if (!MI.isBranch()) {
719 Info.CondBrs.clear();
720 break;
721 }
722
723 // If we see an unconditional branch, reset our state, clear any
724 // fallthrough, and set this as the "else" successor.
725 if (MI.getOpcode() == X86::JMP_1) {
726 Info.CondBrs.clear();
727 Info.UncondBr = &MI;
728 continue;
729 }
730
731 // If we get an invalid condition, we have an indirect branch or some
732 // other unanalyzable "fallthrough" case. We model this as a nullptr for
733 // the destination so we can still guard any conditional successors.
734 // Consider code sequences like:
735 // ```
736 // jCC L1
737 // jmpq *%rax
738 // ```
739 // We still want to harden the edge to `L1`.
740 if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
741 Info.CondBrs.clear();
742 Info.UncondBr = &MI;
743 continue;
744 }
745
746 // We have a vanilla conditional branch, add it to our list.
747 Info.CondBrs.push_back(&MI);
748 }
749 if (Info.CondBrs.empty()) {
750 ++NumBranchesUntraced;
751 LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
752 MBB.dump());
753 continue;
754 }
755
756 Infos.push_back(Info);
757 }
758
759 return Infos;
760}
761
762/// Returns true if the instruction has no behavior (specified or otherwise)
763/// that is based on the value of any of its register operands
764///
765/// A classical example of something that is inherently not data invariant is an
766/// indirect jump -- the destination is loaded into icache based on the bits set
767/// in the jump destination register.
768///
769/// FIXME: This should become part of our instruction tables.
770static bool isDataInvariant(MachineInstr &MI) {
771 switch (MI.getOpcode()) {
772 default:
773 // By default, assume that the instruction is not data invariant.
774 return false;
775
776 // FIXME: For now, we just use a very boring, conservative set of unary
777 // instructions because we're mostly interested in handling simple
778 // transformations.
779 case TargetOpcode::COPY:
780 return true;
781 }
782}
783
784/// Returns true if the instruction has no behavior (specified or otherwise)
785/// that is based on the value loaded from memory or the value of any
786/// non-address register operands.
787///
788 /// For example, the latency of the instruction must not depend on the
789 /// particular bits set in any of the registers *or* any of the bits loaded
790 /// from memory.
791///
792/// A classical example of something that is inherently not data invariant is an
793/// indirect jump -- the destination is loaded into icache based on the bits set
794/// in the jump destination register.
795///
796/// FIXME: This should become part of our instruction tables.
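///
/// For example (illustrative only), an `add` whose source operand is loaded
/// from memory executes in the same time regardless of the loaded value, so
/// its result can be hardened after the fact; by contrast, a division fed
/// from memory has data-dependent latency on many implementations and is not
/// on this list.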
797static bool isDataInvariantLoad(MachineInstr &MI) {
798 switch (MI.getOpcode()) {
799 default:
800 // By default, assume that the load will immediately leak.
801 return false;
802
803 // On x86 it is believed that the imul instructions are constant time w.r.t.
804 // the loaded data. However, they set flags and are perhaps the most
805 // surprisingly constant-time operations, so we call them out here separately.
806 case X86::IMUL16rm:
807 case X86::IMUL16rmi8:
808 case X86::IMUL16rmi:
809 case X86::IMUL32rm:
810 case X86::IMUL32rmi8:
811 case X86::IMUL32rmi:
812 case X86::IMUL64rm:
813 case X86::IMUL64rmi32:
814 case X86::IMUL64rmi8:
815
816 // Bit scanning and counting instructions scan across bits and do other
817 // fairly complex operations (like popcnt), yet are somewhat surprisingly
818 // believed to be constant time on x86. However, these set flags.
820 case X86::BSF16rm:
821 case X86::BSF32rm:
822 case X86::BSF64rm:
823 case X86::BSR16rm:
824 case X86::BSR32rm:
825 case X86::BSR64rm:
826 case X86::LZCNT16rm:
827 case X86::LZCNT32rm:
828 case X86::LZCNT64rm:
829 case X86::POPCNT16rm:
830 case X86::POPCNT32rm:
831 case X86::POPCNT64rm:
832 case X86::TZCNT16rm:
833 case X86::TZCNT32rm:
834 case X86::TZCNT64rm:
835
836 // Bit manipulation instructions are effectively combinations of basic
837 // arithmetic ops, and should still execute in constant time. These also
838 // set flags.
839 case X86::BLCFILL32rm:
840 case X86::BLCFILL64rm:
841 case X86::BLCI32rm:
842 case X86::BLCI64rm:
843 case X86::BLCIC32rm:
844 case X86::BLCIC64rm:
845 case X86::BLCMSK32rm:
846 case X86::BLCMSK64rm:
847 case X86::BLCS32rm:
848 case X86::BLCS64rm:
849 case X86::BLSFILL32rm:
850 case X86::BLSFILL64rm:
851 case X86::BLSI32rm:
852 case X86::BLSI64rm:
853 case X86::BLSIC32rm:
854 case X86::BLSIC64rm:
855 case X86::BLSMSK32rm:
856 case X86::BLSMSK64rm:
857 case X86::BLSR32rm:
858 case X86::BLSR64rm:
859 case X86::TZMSK32rm:
860 case X86::TZMSK64rm:
861
862 // Bit extracting and clearing instructions should execute in constant time,
863 // and set flags.
864 case X86::BEXTR32rm:
865 case X86::BEXTR64rm:
866 case X86::BEXTRI32mi:
867 case X86::BEXTRI64mi:
868 case X86::BZHI32rm:
869 case X86::BZHI64rm:
870
871 // Basic arithmetic is constant time on the input but does set flags.
872 case X86::ADC8rm:
873 case X86::ADC16rm:
874 case X86::ADC32rm:
875 case X86::ADC64rm:
876 case X86::ADCX32rm:
877 case X86::ADCX64rm:
878 case X86::ADD8rm:
879 case X86::ADD16rm:
880 case X86::ADD32rm:
881 case X86::ADD64rm:
882 case X86::ADOX32rm:
883 case X86::ADOX64rm:
884 case X86::AND8rm:
885 case X86::AND16rm:
886 case X86::AND32rm:
887 case X86::AND64rm:
888 case X86::ANDN32rm:
889 case X86::ANDN64rm:
890 case X86::OR8rm:
891 case X86::OR16rm:
892 case X86::OR32rm:
893 case X86::OR64rm:
894 case X86::SBB8rm:
895 case X86::SBB16rm:
896 case X86::SBB32rm:
897 case X86::SBB64rm:
898 case X86::SUB8rm:
899 case X86::SUB16rm:
900 case X86::SUB32rm:
901 case X86::SUB64rm:
902 case X86::XOR8rm:
903 case X86::XOR16rm:
904 case X86::XOR32rm:
905 case X86::XOR64rm:
906 // Check whether the EFLAGS implicit-def is dead. We assume that this will
907 // always find the implicit-def because this code should only be reached
908 // for instructions that do in fact implicitly def this.
909 if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
910 // If we would clobber EFLAGS that are used, just bail for now.
911 LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
912 MI.dump(); dbgs() << "\n");
913 return false;
914 }
915
916 // Otherwise, fallthrough to handle these the same as instructions that
917 // don't set EFLAGS.
918 LLVM_FALLTHROUGH;
919
920 // Integer multiply w/o affecting flags is still believed to be constant
921 // time on x86. Called out separately as this is among the most surprising
922 // instructions to exhibit that behavior.
923 case X86::MULX32rm:
924 case X86::MULX64rm:
925
926 // Arithmetic instructions that are both constant time and don't set flags.
927 case X86::PDEP32rm:
928 case X86::PDEP64rm:
929 case X86::PEXT32rm:
930 case X86::PEXT64rm:
931 case X86::RORX32mi:
932 case X86::RORX64mi:
933 case X86::SARX32rm:
934 case X86::SARX64rm:
935 case X86::SHLX32rm:
936 case X86::SHLX64rm:
937 case X86::SHRX32rm:
938 case X86::SHRX64rm:
939
940 // Conversions are believed to be constant time and don't set flags.
941 // FIXME: Add AVX versions.
942 case X86::CVTSD2SI64rm_Int:
943 case X86::CVTSD2SIrm_Int:
944 case X86::CVTSS2SI64rm_Int:
945 case X86::CVTSS2SIrm_Int:
946 case X86::CVTTSD2SI64rm:
947 case X86::CVTTSD2SI64rm_Int:
948 case X86::CVTTSD2SIrm:
949 case X86::CVTTSD2SIrm_Int:
950 case X86::CVTTSS2SI64rm:
951 case X86::CVTTSS2SI64rm_Int:
952 case X86::CVTTSS2SIrm:
953 case X86::CVTTSS2SIrm_Int:
954
955 // Loads to register don't set flags.
956 case X86::MOV8rm:
957 case X86::MOV8rm_NOREX:
958 case X86::MOV16rm:
959 case X86::MOV32rm:
960 case X86::MOV64rm:
961 case X86::MOVSX16rm8:
962 case X86::MOVSX32rm16:
963 case X86::MOVSX32rm8:
964 case X86::MOVSX32rm8_NOREX:
965 case X86::MOVSX64rm16:
966 case X86::MOVSX64rm32:
967 case X86::MOVSX64rm8:
968 case X86::MOVZX16rm8:
969 case X86::MOVZX32rm16:
970 case X86::MOVZX32rm8:
971 case X86::MOVZX32rm8_NOREX:
972 case X86::MOVZX64rm16:
973 case X86::MOVZX64rm8:
974 return true;
975 }
976}
977
978static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
979 const TargetRegisterInfo &TRI) {
980 // Check if EFLAGS are alive by seeing if there is a def of them or they
981 // live-in, and then seeing if that def is in turn used.
982 for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
983 if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
984 // If the def is dead, then EFLAGS is not live.
985 if (DefOp->isDead())
986 return false;
987
988 // Otherwise we've def'ed it, and it is live.
989 return true;
990 }
991 // While at this instruction, also check if we use and kill EFLAGS
992 // which means it isn't live.
993 if (MI.killsRegister(X86::EFLAGS, &TRI))
994 return false;
995 }
996
997 // If we didn't find anything conclusive (neither definitely alive or
998 // definitely dead) return whether it lives into the block.
999 return MBB.isLiveIn(X86::EFLAGS);
1000}
1001
1002void X86SpeculativeLoadHardeningPass::checkAllLoads(
1003 MachineFunction &MF, MachineSSAUpdater &PredStateSSA) {
1004 // If the actual checking of loads is disabled, skip doing anything here.
1005 if (!HardenLoads)
1006 return;
1007
1008 SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
1009 SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
1010
1011 SmallSet<unsigned, 16> HardenedAddrRegs;
1012
1013 SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
1014
1015 // Track the set of load-dependent registers through the basic block. Because
1016 // the values of these registers have an existing data dependency on a loaded
1017 // value which we would have checked, we can omit any checks on them.
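  // For example (illustrative): if %r1 is defined by a load whose address we
  // hardened and %r2 is computed from %r1 by adding a constant, a later load
  // through %r2 is already data-dependent on the checked load and needn't be
  // checked again within this block.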
1018 SparseBitVector<> LoadDepRegs;
1019
1020 for (MachineBasicBlock &MBB : MF) {
1021 // We harden the loads of a basic block in several passes:
1022 //
1023 // 1) Collect all the loads which can have their loaded value hardened
1024 // and all the loads that instead need their address hardened. During
1025 // this walk we propagate load dependence for address hardened loads and
1026 // also look for LFENCE to stop hardening wherever possible. When
1027 // deciding whether or not to harden the loaded value, we check
1028 // to see if any registers used in the address will have been hardened
1029 // at this point and if so, harden any remaining address registers as
1030 // that often successfully re-uses hardened addresses and minimizes
1031 // instructions. FIXME: We should consider an aggressive mode where we
1032 // continue to keep as many loads value-hardened as possible even when some address
1033 // register hardening would be free (due to reuse).
1034 for (MachineInstr &MI : MBB) {
1035 // We naively assume that all def'ed registers of an instruction have
1036 // a data dependency on all of their operands.
1037 // FIXME: Do a more careful analysis of x86 to build a conservative model
1038 // here.
1039 if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
1040 return Op.isReg() && LoadDepRegs.test(Op.getReg());
1041 }))
1042 for (MachineOperand &Def : MI.defs())
1043 if (Def.isReg())
1044 LoadDepRegs.set(Def.getReg());
1045
1046 // Both Intel and AMD are guiding that they will change the semantics of
1047 // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1048 // no more need to guard things in this block.
1049 if (MI.getOpcode() == X86::LFENCE)
1050 break;
1051
1052 // If this instruction cannot load, nothing to do.
1053 if (!MI.mayLoad())
1054 continue;
1055
1056 // Some instructions which "load" are trivially safe or unimportant.
1057 if (MI.getOpcode() == X86::MFENCE)
1058 continue;
1059
1060 // Extract the memory operand information about this instruction.
1061 // FIXME: This doesn't handle loading pseudo instructions which we often
1062 // could handle with similarly generic logic. We probably need to add an
1063 // MI-layer routine similar to the MC-layer one we use here which maps
1064 // pseudos much like this maps real instructions.
1065 const MCInstrDesc &Desc = MI.getDesc();
1066 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1067 if (MemRefBeginIdx < 0) {
1068 LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: ";
1069 MI.dump());
1070 continue;
1071 }
1072
1073 MemRefBeginIdx += X86II::getOperandBias(Desc);
1074
1075 MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1076 MachineOperand &IndexMO =
1077 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1078
1079 // If we have at least one (non-frame-index, non-RIP) register operand,
1080 // and neither operand is load-dependent, we need to check the load.
1081 unsigned BaseReg = 0, IndexReg = 0;
1082 if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1083 BaseMO.getReg() != X86::NoRegister)
1084 BaseReg = BaseMO.getReg();
1085 if (IndexMO.getReg() != X86::NoRegister)
1086 IndexReg = IndexMO.getReg();
1087
1088 if (!BaseReg && !IndexReg)
1089 // No register operands!
1090 continue;
1091
1092 // If any register operand is dependent, this load is dependent and we
1093 // needn't check it.
1094 // FIXME: Is this true in the case where we are hardening loads after
1095 // they complete? Unclear, need to investigate.
1096 if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
1097 (IndexReg && LoadDepRegs.test(IndexReg)))
1098 continue;
1099
1100 // If post-load hardening is enabled, this load is known to be
1101 // data-invariant, and we aren't already going to harden one of the
1102 // address registers, queue it up to be hardened post-load. Notably, even
1103 // once hardened this won't introduce a useful dependency that could prune
1104 // out subsequent loads.
1105 if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
1106 !HardenedAddrRegs.count(BaseReg) &&
1107 !HardenedAddrRegs.count(IndexReg)) {
1108 HardenPostLoad.insert(&MI);
1109 HardenedAddrRegs.insert(MI.getOperand(0).getReg());
1110 continue;
1111 }
1112
1113 // Record this instruction for address hardening and record its register
1114 // operands as being address-hardened.
1115 HardenLoadAddr.insert(&MI);
1116 if (BaseReg)
1117 HardenedAddrRegs.insert(BaseReg);
1118 if (IndexReg)
1119 HardenedAddrRegs.insert(IndexReg);
1120
1121 for (MachineOperand &Def : MI.defs())
1122 if (Def.isReg())
1123 LoadDepRegs.set(Def.getReg());
1124 }
1125
1126 // Now re-walk the instructions in the basic block, and apply whichever
1127 // hardening strategy we have elected. Note that we do this in a second
1128 // pass specifically so that we have the complete set of instructions for
1129 // which we will do post-load hardening and can defer it in certain
1130 // circumstances.
1131 //
1132 // FIXME: This could probably be made even more effective by doing it
1133 // across the entire function. Rather than just walking the flat list
1134 // backwards here, we could walk the function in PO and each block bottom
1135 // up, allowing us to in some cases sink hardening across blocks. As
1136 // long as the in-block predicate state is used at the eventual hardening
1137 // site, this remains safe.
1138 for (MachineInstr &MI : MBB) {
1139 // We cannot both require hardening the def of a load and its address.
1140 assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
1141 "Requested to harden both the address and def of a load!");
1142
1143 // Check if this is a load whose address needs to be hardened.
1144 if (HardenLoadAddr.erase(&MI)) {
1145 const MCInstrDesc &Desc = MI.getDesc();
1146 int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1147 assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
1148
1149 MemRefBeginIdx += X86II::getOperandBias(Desc);
1150
1151 MachineOperand &BaseMO =
1152 MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1153 MachineOperand &IndexMO =
1154 MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1155 hardenLoadAddr(MI, BaseMO, IndexMO, PredStateSSA, AddrRegToHardenedReg);
1156 continue;
1157 }
1158
1159 // Test if this instruction is one of our post load instructions (and
1160 // remove it from the set if so).
1161 if (HardenPostLoad.erase(&MI)) {
1162 assert(!MI.isCall() && "Must not try to post-load harden a call!");
1163
1164 // If this is a data-invariant load, we want to try and sink any
1165 // hardening as far as possible.
1166 if (isDataInvariantLoad(MI)) {
1167 // Sink the instruction we'll need to harden as far as we can down the
1168 // graph.
1169 MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
1170
1171 // If we managed to sink this instruction, update everything so we
1172 // harden that instruction when we reach it in the instruction
1173 // sequence.
1174 if (SunkMI != &MI) {
1175 // If in sinking there was no instruction needing to be hardened,
1176 // we're done.
1177 if (!SunkMI)
1178 continue;
1179
1180 // Otherwise, add this to the set of defs we harden.
1181 HardenPostLoad.insert(SunkMI);
1182 continue;
1183 }
1184 }
1185
1186 // The register def'ed by this instruction is trivially hardened so map
1187 // it to itself.
1188 AddrRegToHardenedReg[MI.getOperand(0).getReg()] =
1189 MI.getOperand(0).getReg();
1190
1191 hardenPostLoad(MI, PredStateSSA);
1192 continue;
1193 }
1194
1195 // After we finish processing the instruction and doing any hardening
1196 // necessary for it, we need to handle transferring the predicate state
1197 // into a call and recovering it after the call returns (if it returns).
1198 if (!MI.isCall())
1199 continue;
1200
1201 // If we're not hardening interprocedurally, we can just skip calls.
1202 if (!HardenInterprocedurally)
1203 continue;
1204
1205 auto InsertPt = MI.getIterator();
1206 DebugLoc Loc = MI.getDebugLoc();
1207
1208 // First, we transfer the predicate state into the called function by
1209 // merging it into the stack pointer. This will kill the current def of
1210 // the state.
1211 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1212 mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
1213
1214 // If this call is also a return (because it is a tail call) we're done.
1215 if (MI.isReturn())
1216 continue;
1217
1218 // Otherwise we need to step past the call and recover the predicate
1219 // state from SP after the return, and make this new state available.
1220 ++InsertPt;
1221 unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
1222 PredStateSSA.AddAvailableValue(&MBB, NewStateReg);
1223 }
1224
1225 HardenPostLoad.clear();
1226 HardenLoadAddr.clear();
1227 HardenedAddrRegs.clear();
1228 AddrRegToHardenedReg.clear();
1229
1230 // Currently, we only track data-dependent loads within a basic block.
1231 // FIXME: We should see if this is necessary or if we could be more
1232 // aggressive here without opening up attack avenues.
1233 LoadDepRegs.clear();
1234 }
1235}
1236
1237/// Save EFLAGS into the returned GPR. This can in turn be restored with
1238/// `restoreEFLAGS`.
1239///
1240/// Note that LLVM can only lower very simple patterns of saved and restored
1241/// EFLAGS registers. The restore should always be within the same basic block
1242/// as the save so that no PHI nodes are inserted.
1243unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1244 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1245 DebugLoc Loc) {
1246 // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1247 // what instruction selection does.
1248 unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
1249 // We directly copy the FLAGS register and rely on later lowering to clean
1250 // this up into the appropriate setCC instructions.
1251 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
1252 ++NumInstsInserted;
1253 return Reg;
1254}
1255
1256/// Restore EFLAGS from the provided GPR. This should be produced by
1257/// `saveEFLAGS`.
1258///
1259/// This must be done within the same basic block as the save in order to
1260/// reliably lower.
1261void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1262 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1263 unsigned Reg) {
1264 BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
1265 ++NumInstsInserted;
1266}
1267
1268/// Takes the current predicate state (in a register) and merges it into the
1269/// stack pointer. The state is essentially a single bit, but we merge this in
1270/// a way that won't form non-canonical pointers and also will be preserved
1271/// across normal stack adjustments.
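/// Roughly (a sketch, not the exact lowering), the sequence emitted here is:
///
///   shlq $47, %state           # move the interesting bit up high
///   orq  %state, %rsp          # all-ones state taints RSP; zero is a no-op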
1272void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
1273 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1274 unsigned PredStateReg) {
1275 unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
1276 // FIXME: This hard codes a shift distance based on the number of bits needed
1277 // to stay canonical on 64-bit. We should compute this somehow and support
1278 // 32-bit as part of that.
1279 auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
1280 .addReg(PredStateReg, RegState::Kill)
1281 .addImm(47);
1282 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1283 ++NumInstsInserted;
1284 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
1285 .addReg(X86::RSP)
1286 .addReg(TmpReg, RegState::Kill);
1287 OrI->addRegisterDead(X86::EFLAGS, TRI);
1288 ++NumInstsInserted;
1289}
1290
1291/// Extracts the predicate state stored in the high bits of the stack pointer.
1292unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
1293 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1294 DebugLoc Loc) {
1295 unsigned PredStateReg = MRI->createVirtualRegister(PredStateRC);
1296 unsigned TmpReg = MRI->createVirtualRegister(PredStateRC);
1297
1298 // We know that the stack pointer will have any preserved predicate state in
1299 // its high bit. We just want to smear this across the other bits. Turns out,
1300 // this is exactly what an arithmetic right shift does.
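  // For example, if the high bit of RSP was poisoned, an arithmetic shift
  // right by 63 yields all-ones; otherwise it yields zero.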
1301 BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
1302 .addReg(X86::RSP);
1303 auto ShiftI =
1304 BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
1305 .addReg(TmpReg, RegState::Kill)
1306 .addImm(TRI->getRegSizeInBits(*PredStateRC) - 1);
1307 ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1308 ++NumInstsInserted;
1309
1310 return PredStateReg;
1311}
1312
1313void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
1314 MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
1315 MachineSSAUpdater &PredStateSSA,
1316 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
1317 MachineBasicBlock &MBB = *MI.getParent();
1318 DebugLoc Loc = MI.getDebugLoc();
1319
1320 // Check if EFLAGS are alive by seeing if there is a def of them or they
1321 // live-in, and then seeing if that def is in turn used.
1322 bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
1323
1324 SmallVector<MachineOperand *, 2> HardenOpRegs;
1325
1326 if (BaseMO.isFI()) {
1327 // A frame index is never a dynamically controllable load, so only
1328 // harden it if we're covering fixed address loads as well.
1329 LLVM_DEBUG(
1330 dbgs() << " Skipping hardening base of explicit stack frame load: ";
1331 MI.dump(); dbgs() << "\n");
1332 } else if (BaseMO.getReg() == X86::RIP ||
1333 BaseMO.getReg() == X86::NoRegister) {
1334 // For both RIP-relative addressed loads or absolute loads, we cannot
1335 // meaningfully harden them because the address being loaded has no
1336 // dynamic component.
1337 //
1338 // FIXME: When using a segment base (like TLS does) we end up with the
1339 // dynamic address being the base plus -1 because we can't mutate the
1340 // segment register here. This allows the signed 32-bit offset to point at
1341 // valid segment-relative addresses and load them successfully.
1342 LLVM_DEBUG(
1343 dbgs() << " Cannot harden base of "
1344 << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
1345 << " address in a load!");
1346 } else {
1347 assert(BaseMO.isReg() &&
1348 "Only allowed to have a frame index or register base.");
1349 HardenOpRegs.push_back(&BaseMO);
1350 }
1351
1352 if (IndexMO.getReg() != X86::NoRegister &&
1353 (HardenOpRegs.empty() ||
1354 HardenOpRegs.front()->getReg() != IndexMO.getReg()))
1355 HardenOpRegs.push_back(&IndexMO);
1356
1357 assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
1358 "Should have exactly one or two registers to harden!");
1359 assert((HardenOpRegs.size() == 1 ||
1360 HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
1361 "Should not have two of the same registers!");
1362
1363 // Remove any registers that have already been checked.
1364 llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
1365 // See if this operand's register has already been checked.
1366 auto It = AddrRegToHardenedReg.find(Op->getReg());
1367 if (It == AddrRegToHardenedReg.end())
1368 // Not checked, so retain this one.
1369 return false;
1370
1371 // Otherwise, we can directly update this operand and remove it.
1372 Op->setReg(It->second);
1373 return true;
1374 });
1375 // If there are none left, we're done.
1376 if (HardenOpRegs.empty())
1377 return;
1378
1379 // Compute the current predicate state.
1380 unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);
1381
1382 auto InsertPt = MI.getIterator();
1383
1384 // If EFLAGS are live and we don't have access to instructions that avoid
1385 // clobbering EFLAGS we need to save and restore them. This in turn makes
1386 // the EFLAGS no longer live.
1387 unsigned FlagsReg = 0;
1388 if (EFLAGSLive && !Subtarget->hasBMI2()) {
1389 EFLAGSLive = false;
1390 FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1391 }
1392
1393 for (MachineOperand *Op : HardenOpRegs) {
1394 auto *OpRC = MRI->getRegClass(Op->getReg());
1395
1396 unsigned OpReg = Op->getReg();
1397 unsigned TmpReg = MRI->createVirtualRegister(OpRC);
1398
1399 if (!EFLAGSLive) {
1400 // Merge our potential poison state into the value with an or.
1401 auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
1402 .addReg(StateReg)
1403 .addReg(OpReg);
1404 OrI->addRegisterDead(X86::EFLAGS, TRI);
1405 ++NumInstsInserted;
1406 LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1407 } else {
1408 // We need to avoid touching EFLAGS so shift out all but the least
1409 // significant bit using the instruction that doesn't update flags.
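      // Note: the state is either zero or all-ones, so this shifts the address
      // by 0 (leaving it intact) or, with the count masked to 6 bits, by 63
      // (collapsing it to a single bit) under misspeculation.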
1410 auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
1411 .addReg(OpReg)
1412 .addReg(StateReg);
1413 (void)ShiftI;
1414 ++NumInstsInserted;
1415 LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
1416 dbgs() << "\n");
1417 }
1418
1419 // Record this register as checked and update the operand.
1420 assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
1421 "Should not have checked this register yet!");
1422 AddrRegToHardenedReg[Op->getReg()] = TmpReg;
1423 Op->setReg(TmpReg);
1424 ++NumAddrRegsHardened;
1425 }
1426
1427 // And restore the flags if needed.
1428 if (FlagsReg)
1429 restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
1430}
1431
1432MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
1433 MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedLoads) {
1434 assert(isDataInvariantLoad(InitialMI) &&
1435 "Cannot get here with a non-invariant load!");
1436
1437 // See if we can sink hardening the loaded value.
1438 auto SinkCheckToSingleUse =
1439 [&](MachineInstr &MI) -> Optional<MachineInstr *> {
1440 unsigned DefReg = MI.getOperand(0).getReg();
1441
1442 // We need to find a single use to which we can sink the check. We can
1443 // primarily do this because many uses may already end up checked on their
1444 // own.
    MachineInstr *SingleUseMI = nullptr;
    for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
      // If we're already going to harden this use, it is data invariant and
      // within our block and we just need to check that the use isn't in an
      // address.
      if (HardenedLoads.count(&UseMI)) {
        const MCInstrDesc &Desc = UseMI.getDesc();
        int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
        assert(MemRefBeginIdx >= 0 &&
               "Should always have mem references here!");
        MemRefBeginIdx += X86II::getOperandBias(Desc);

        MachineOperand &BaseMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
        MachineOperand &IndexMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
        if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
            (IndexMO.isReg() && IndexMO.getReg() == DefReg))
          // The load uses the register as part of its address making it not
          // invariant.
          return {};

        continue;
      }

      if (SingleUseMI)
        // We already have a single use, this would make two. Bail.
        return {};

      // If this single use isn't data invariant, isn't in this block, or has
      // interfering EFLAGS, we can't sink the hardening to it.
      if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
        return {};

      // If this instruction defines multiple registers bail as we won't harden
      // all of them.
      if (UseMI.getDesc().getNumDefs() > 1)
        return {};

      // If this register isn't a virtual register, we can't sanely walk its
      // uses, so just bail. Also check that its register class is one of the
      // ones we can harden.
      unsigned UseDefReg = UseMI.getOperand(0).getReg();
      if (!TRI->isVirtualRegister(UseDefReg) ||
          !MRI->getRegClass(UseDefReg)->hasSubClassEq(&X86::GR64RegClass))
        return {};

      SingleUseMI = &UseMI;
    }

    // If SingleUseMI is still null, there is no use that needs its own
    // checking. Otherwise, it is the single use that needs checking.
    return {SingleUseMI};
  };

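  // Walk the chain of single uses, sinking the hardening point as far as we
  // can. An empty result stops the walk at the current instruction; a null
  // instruction means no remaining use needs its own check, and null is
  // returned to the caller.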
  MachineInstr *MI = &InitialMI;
  while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
    // Update which MI we're checking now.
    MI = *SingleUse;
    if (!MI)
      break;
  }

  return MI;
}

// We can harden non-leaking loads into registers without touching the address
// by just hiding all of the loaded bits. We use an `or` instruction to do
// this because having the poison value be all ones allows us to use the same
// value below. The goal is just that the loaded bits are not exposed to
// speculative execution, and coercing them all to one is sufficient for that.
void X86SpeculativeLoadHardeningPass::hardenPostLoad(
    MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
  assert(isDataInvariantLoad(MI) &&
         "Cannot get here with a non-invariant load!");

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();

  // For all of these, the def'ed register operand is operand zero.
  auto &DefOp = MI.getOperand(0);
  unsigned OldDefReg = DefOp.getReg();

  auto *DefRC = MRI->getRegClass(OldDefReg);
  int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;

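  // Select the OR opcode and sub-register index by the byte width of the
  // defined register (1, 2, 4, or 8 bytes).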
  unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
  unsigned OrOpCode = OrOpCodes[Log2_32(DefRegBytes)];

  unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};

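  // Materialize the predicate state in the given register class, copying it
  // into a narrower virtual register through a sub-register when the class is
  // smaller than 8 bytes.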
  auto GetStateRegInRC = [&](const TargetRegisterClass &RC) {
    unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB);

    int Bytes = TRI->getRegSizeInBits(RC) / 8;
    // FIXME: Need to teach this about 32-bit mode.
    if (Bytes != 8) {
      unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
      unsigned NarrowStateReg = MRI->createVirtualRegister(&RC);
      BuildMI(MBB, MI.getIterator(), Loc, TII->get(TargetOpcode::COPY),
              NarrowStateReg)
          .addReg(StateReg, 0, SubRegImm);
      StateReg = NarrowStateReg;
    }
    return StateReg;
  };

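  // The hardening instructions go immediately after the load. If EFLAGS are
  // live at that point and we can't use the flag-preserving BMI2 shifts, save
  // the flags now and restore them at the end.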
  auto InsertPt = std::next(MI.getIterator());
  unsigned FlagsReg = 0;
  bool EFLAGSLive = isEFLAGSLive(MBB, InsertPt, *TRI);
  if (EFLAGSLive && !Subtarget->hasBMI2()) {
    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
    EFLAGSLive = false;
  }

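  // When EFLAGS are dead (or have been saved), OR the all-ones poison state
  // into the loaded value. Otherwise fall back to the SHRX-based sequence
  // below, which leaves the flags untouched.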
  if (!EFLAGSLive) {
    unsigned StateReg = GetStateRegInRC(*DefRC);
    unsigned NewDefReg = MRI->createVirtualRegister(DefRC);
    DefOp.setReg(NewDefReg);
    auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg)
                   .addReg(StateReg)
                   .addReg(NewDefReg);
    OrI->addRegisterDead(X86::EFLAGS, TRI);
    ++NumInstsInserted;
    LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
  } else {
    assert(Subtarget->hasBMI2() &&
           "Cannot harden loads and preserve EFLAGS without BMI2!");

    unsigned ShiftOpCode = DefRegBytes < 4 ? X86::SHRX32rr : X86::SHRX64rr;
    auto &ShiftRC =
        DefRegBytes < 4 ? X86::GR32_NOSPRegClass : X86::GR64_NOSPRegClass;
    int ShiftRegBytes = TRI->getRegSizeInBits(ShiftRC) / 8;
    unsigned DefSubRegImm = SubRegImms[Log2_32(DefRegBytes)];

    unsigned StateReg = GetStateRegInRC(ShiftRC);

    // First have the def instruction def a temporary register.
    unsigned TmpReg = MRI->createVirtualRegister(DefRC);
    DefOp.setReg(TmpReg);
    // Now copy it into a register of the shift RC.
    unsigned ShiftInputReg = TmpReg;
    if (DefRegBytes != ShiftRegBytes) {
      unsigned UndefReg = MRI->createVirtualRegister(&ShiftRC);
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::IMPLICIT_DEF), UndefReg);
      ShiftInputReg = MRI->createVirtualRegister(&ShiftRC);
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::INSERT_SUBREG), ShiftInputReg)
          .addReg(UndefReg)
          .addReg(TmpReg)
          .addImm(DefSubRegImm);
    }

    // We shift this once if the shift is wider than the def and thus we can
    // shift *all* of the def'ed bytes out. Otherwise we need to do two shifts.

    unsigned ShiftedReg = MRI->createVirtualRegister(&ShiftRC);
    auto Shift1I =
        BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), ShiftedReg)
            .addReg(ShiftInputReg)
            .addReg(StateReg);
    (void)Shift1I;
    ++NumInstsInserted;
    LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift1I->dump(); dbgs() << "\n");

    // The only way we have a bit left is if all 8 bytes were defined. Do an
    // extra shift to get the last bit in this case.
    if (DefRegBytes == ShiftRegBytes) {
      // We can just directly def the old def register as it's the same size.
      ShiftInputReg = ShiftedReg;
      auto Shift2I =
          BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), OldDefReg)
              .addReg(ShiftInputReg)
              .addReg(StateReg);
      (void)Shift2I;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift2I->dump();
                 dbgs() << "\n");
    } else {
      // When the shift register is a different size we need to fix up the
      // register class. We can do that as we copy into the old def register.
      BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), OldDefReg)
          .addReg(ShiftedReg, 0, DefSubRegImm);
    }
  }

  if (FlagsReg)
    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);

  ++NumPostLoadRegsHardened;
}

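// Harden a return instruction. Depending on the strategy, either fence so
// that no load can speculate past the return, or fold the predicate state
// into RSP so the caller can recover it after a (possibly mispredicted)
// return.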
void X86SpeculativeLoadHardeningPass::checkReturnInstr(
    MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc Loc = MI.getDebugLoc();
  auto InsertPt = MI.getIterator();

  if (FenceCallAndRet) {
    // Simply forcibly block speculation of loads out of the function by using
    // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
    // should be secure, is simple from an ABI perspective, and the cost can be
    // minimized through inlining.
    //
    // FIXME: We should investigate ways to establish a strong data-dependency
    // on the return. However, poisoning the stack pointer is unlikely to work
    // because the return is *predicted* rather than relying on the load of the
    // return address to actually resolve.
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
    ++NumInstsInserted;
    ++NumLFENCEsInserted;
    return;
  }

  // Take our predicate state, shift it to the high 17 bits (so that we keep
  // pointers canonical) and merge it into RSP. This will allow the caller to
  // extract it when we return (speculatively).
  mergePredStateIntoSP(MBB, InsertPt, Loc,
                       PredStateSSA.GetValueAtEndOfBlock(&MBB));
}

INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
                      "X86 speculative load hardener", false, false)
INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
                    "X86 speculative load hardener", false, false)

FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
  return new X86SpeculativeLoadHardeningPass();
}