[hwasan] Stack instrumentation.

Summary:
Very basic stack instrumentation using tagged pointers.
The tag for the N'th alloca in a function is built as the XOR of:
 * a base tag for the function, which is just some bits of SP (poor
   man's random)
 * a small constant which is a function of N.

Allocas are aligned to 16 bytes. On every function exit (ReturnInst,
ResumeInst or CleanupReturnInst) allocas are re-tagged to catch
use-after-return.

This implementation has a bunch of issues that will be taken care of
later:
1. lifetime intrinsics referring to tagged pointers are not
   recognized in SDAG. This effectively disables stack coloring.
2. Generated code is quite inefficient. There is one extra
   instruction at each memory access that adds the base tag to the
   untagged alloca address. It would be better to keep the tagged SP
   in a callee-saved register and address each alloca as an offset
   from it, XORed with that alloca's retag mask, but that needs
   better coordination between the hwasan instrumentation pass and
   prologue/epilogue insertion.
3. Lifetime intrinsics are ignored and use-after-scope is not
   implemented. This would be harder to do than in ASan, because we
   need to use a differently tagged pointer depending on which
   lifetime.start / lifetime.end the current instruction is dominated
   / post-dominated by.

Reviewers: kcc, alekseyshl

Subscribers: srhines, kubamracek, javed.absar, hiraditya, llvm-commits

Differential Revision: https://reviews.llvm.org/D41602

llvm-svn: 322324
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 8e2833d..f9c3451 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -22,10 +22,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/IR/Function.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/InstVisitor.h"
@@ -34,6 +31,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -41,8 +39,11 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
 
 using namespace llvm;
 
@@ -55,6 +56,7 @@
 static const size_t kNumberOfAccessSizes = 5;
 
 static const size_t kShadowScale = 4;
+static const unsigned kAllocaAlignment = 1U << kShadowScale;
 static const unsigned kPointerTagShift = 56;
 
 static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
@@ -85,6 +87,10 @@
     cl::desc("Enable recovery mode (continue-after-error)."),
     cl::Hidden, cl::init(false));
 
+static cl::opt<bool> ClInstrumentStack("hwasan-instrument-stack",
+                                       cl::desc("instrument stack (allocas)"),
+                                       cl::Hidden, cl::init(true));
+
 namespace {
 
 /// \brief An instrumentation pass implementing detection of addressability bugs
@@ -111,9 +117,15 @@
                                    uint64_t *TypeSize, unsigned *Alignment,
                                    Value **MaybeMask);
 
+  bool isInterestingAlloca(const AllocaInst &AI);
+  bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag);
+  bool instrumentStack(SmallVectorImpl<AllocaInst *> &Allocas,
+                       SmallVectorImpl<Instruction *> &RetVec);
+
 private:
   LLVMContext *C;
   Type *IntptrTy;
+  Type *Int8Ty;
 
   bool Recover;
 
@@ -121,6 +133,8 @@
 
   Function *HwasanMemoryAccessCallback[2][kNumberOfAccessSizes];
   Function *HwasanMemoryAccessCallbackSized[2];
+
+  Function *HwasanTagMemoryFunc;
 };
 
 } // end anonymous namespace
@@ -150,6 +164,7 @@
   C = &(M.getContext());
   IRBuilder<> IRB(*C);
   IntptrTy = IRB.getIntPtrTy(DL);
+  Int8Ty = IRB.getInt8Ty();
 
   std::tie(HwasanCtorFunction, std::ignore) =
       createSanitizerCtorAndInitFunctions(M, kHwasanModuleCtorName,
@@ -180,6 +195,9 @@
               FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false)));
     }
   }
+
+  HwasanTagMemoryFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+      "__hwasan_tag_memory", IRB.getVoidTy(), IntptrTy, Int8Ty, IntptrTy));
 }
 
 Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I,
@@ -305,6 +323,133 @@
   return true;
 }
 
+static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) {
+  uint64_t ArraySize = 1; // Element count; stays 1 for non-array allocas.
+  if (AI.isArrayAllocation()) {
+    const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
+    assert(CI && "non-constant array size"); // Caller must rule this out.
+    ArraySize = CI->getZExtValue();
+  }
+  Type *Ty = AI.getAllocatedType();
+  uint64_t SizeInBytes = AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
+  return SizeInBytes * ArraySize; // Per-element alloc size times the count.
+}
+
+bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI,
+                                   Value *Tag) {
+  size_t Size = (getAllocaSizeInBytes(*AI) + kAllocaAlignment - 1) &
+                ~(kAllocaAlignment - 1); // Round up to kAllocaAlignment.
+
+  Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty()); // Low 8 bits only.
+  if (ClInstrumentWithCalls) { // Out of line: call __hwasan_tag_memory.
+    IRB.CreateCall(HwasanTagMemoryFunc,
+                   {IRB.CreatePointerCast(AI, IntptrTy), JustTag,
+                    ConstantInt::get(IntptrTy, Size)});
+  } else { // Inline: memset the shadow bytes that cover the alloca.
+    size_t ShadowSize = Size >> kShadowScale; // One shadow byte per granule.
+    Value *ShadowPtr = IRB.CreateIntToPtr(
+        IRB.CreateLShr(IRB.CreatePointerCast(AI, IntptrTy), kShadowScale),
+        IRB.getInt8PtrTy());
+    // If this memset is not inlined, it will be intercepted in the hwasan
+    // runtime library. That's OK, because the interceptor skips the checks if
+    // the address is in the shadow region.
+    // FIXME: the interceptor is not as fast as real memset. Consider lowering
+    // llvm.memset right here into either a sequence of stores, or a call to
+    // __hwasan_tag_memory.
+    IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1);
+  }
+  return true;
+}
+
+static unsigned RetagMask(unsigned AllocaNo) {
+  // The 36 8-bit values whose set bits form at most one contiguous run;
+  // x = x ^ (mask << 56) can be encoded as a single armv8 instruction for
+  // them. AllocaNo wraps modulo the table size, so masks repeat every 36.
+  // The list does not include the value 255, which is used for UAR.
+  static unsigned FastMasks[] = {
+      0,   1,   2,   3,   4,   6,   7,   8,   12,  14,  15, 16,  24,
+      28,  30,  31,  32,  48,  56,  60,  62,  63,  64,  96, 112, 120,
+      124, 126, 127, 128, 192, 224, 240, 248, 252, 254};
+  return FastMasks[AllocaNo % (sizeof(FastMasks) / sizeof(FastMasks[0]))];
+}
+
+bool HWAddressSanitizer::instrumentStack(
+    SmallVectorImpl<AllocaInst *> &Allocas,
+    SmallVectorImpl<Instruction *> &RetVec) {
+  Function *F = Allocas[0]->getParent()->getParent(); // Needs >= 1 alloca.
+  Module *M = F->getParent();
+  Instruction *InsertPt = &*F->getEntryBlock().begin();
+  IRBuilder<> IRB(InsertPt);
+
+  // FIXME: use addressofreturnaddress (but implement it in aarch64 backend
+  // first).
+  auto GetStackPointerFn = Intrinsic::getDeclaration(M, Intrinsic::frameaddress);
+  Value *StackPointer = IRB.CreateCall(GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())});
+
+  // Extract some entropy from the stack pointer for the tags.
+  // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
+  // between functions).
+  Value *StackPointerLong = IRB.CreatePointerCast(StackPointer, IntptrTy);
+  Value *StackTag =
+      IRB.CreateXor(StackPointerLong, IRB.CreateLShr(StackPointerLong, 20),
+                    "hwasan.stack.base.tag");
+
+  // Ideally, we want to calculate tagged stack base pointer, and rewrite all
+  // alloca addresses using that. Unfortunately, offsets are not known yet
+  // (unless we use ASan-style mega-alloca). Instead we keep the base tag in a
+  // temp, shift-OR it into each alloca address and xor with the retag mask.
+  // This generates one extra instruction per alloca use.
+  for (unsigned N = 0; N < Allocas.size(); ++N) {
+    auto *AI = Allocas[N];
+    IRB.SetInsertPoint(AI->getNextNode()); // Emit right after the alloca.
+
+    // Replace uses of the alloca with tagged address.
+    std::string Name =
+        AI->hasName() ? AI->getName().str() : "alloca." + itostr(N);
+    Value *Tag =
+        IRB.CreateXor(StackTag, ConstantInt::get(IntptrTy, RetagMask(N)));
+    Value *AILong = IRB.CreatePointerCast(AI, IntptrTy);
+    Value *Replacement = IRB.CreateIntToPtr(
+        IRB.CreateOr(AILong, IRB.CreateShl(Tag, kPointerTagShift)),
+        AI->getType(), Name + ".hwasan");
+
+    for (auto UI = AI->use_begin(), UE = AI->use_end();
+         UI != UE;) {
+      Use &U = *UI++; // Advance before U may be rewritten below.
+      if (U.getUser() != AILong) // Keep AILong's own use of the raw alloca.
+        U.set(Replacement);
+    }
+
+    tagAlloca(IRB, AI, Tag); // Tag the memory right after the alloca.
+
+    for (auto RI : RetVec) { // Every exit instruction the caller collected.
+      IRB.SetInsertPoint(RI);
+
+      // Re-tag alloca memory with the UAR tag (base tag ^ 0xFF) before exit.
+      Value *Tag = IRB.CreateXor(StackTag, ConstantInt::get(IntptrTy, 0xFFU));
+      tagAlloca(IRB, AI, Tag);
+    }
+  }
+
+  return true;
+}
+
+bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
+  return (AI.getAllocatedType()->isSized() && // Need a known size.
+          // FIXME: instrument dynamic allocas, too
+          AI.isStaticAlloca() &&
+          // alloca() may be called with 0 size, ignore it.
+          getAllocaSizeInBytes(AI) > 0 &&
+          // We are only interested in allocas not promotable to registers.
+          // Promotable allocas are common under -O0.
+          !isAllocaPromotable(&AI) &&
+          // inalloca allocas are not treated as static, and we don't want
+          // dynamic alloca instrumentation for them either.
+          !AI.isUsedWithInAlloca() &&
+          // swifterror allocas are register promoted by ISel
+          !AI.isSwiftError());
+}
+
 bool HWAddressSanitizer::runOnFunction(Function &F) {
   if (&F == HwasanCtorFunction)
     return false;
@@ -318,8 +463,25 @@
 
   bool Changed = false;
   SmallVector<Instruction*, 16> ToInstrument;
+  SmallVector<AllocaInst*, 8> AllocasToInstrument;
+  SmallVector<Instruction*, 8> RetVec;
   for (auto &BB : F) {
     for (auto &Inst : BB) {
+      if (ClInstrumentStack)
+        if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
+          // Realign all allocas. We don't want small uninteresting allocas to
+          // hide in instrumented alloca's padding.
+          if (AI->getAlignment() < kAllocaAlignment)
+            AI->setAlignment(kAllocaAlignment);
+          // Instrument some of them.
+          if (isInterestingAlloca(*AI))
+            AllocasToInstrument.push_back(AI);
+          continue;
+        }
+
+      if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst) || isa<CleanupReturnInst>(Inst))
+        RetVec.push_back(&Inst);
+
       Value *MaybeMask = nullptr;
       bool IsWrite;
       unsigned Alignment;
@@ -331,6 +493,9 @@
     }
   }
 
+  if (!AllocasToInstrument.empty())
+    Changed |= instrumentStack(AllocasToInstrument, RetVec);
+
   for (auto Inst : ToInstrument)
     Changed |= instrumentMemAccess(Inst);