Updated to Clang 3.5a.

Change-Id: I8127eb568f674c2e72635b639a3295381fe8af82
diff --git a/lib/CodeGen/ABIInfo.h b/lib/CodeGen/ABIInfo.h
index 468fe04..d3ec46c 100644
--- a/lib/CodeGen/ABIInfo.h
+++ b/lib/CodeGen/ABIInfo.h
@@ -11,8 +11,8 @@
 #define CLANG_CODEGEN_ABIINFO_H
 
 #include "clang/AST/Type.h"
-#include "llvm/IR/Type.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Type.h"
 
 namespace llvm {
   class Value;
diff --git a/lib/CodeGen/Android.mk b/lib/CodeGen/Android.mk
index 9108e24..1c4a0b7 100644
--- a/lib/CodeGen/Android.mk
+++ b/lib/CodeGen/Android.mk
@@ -3,6 +3,7 @@
 clang_codegen_TBLGEN_TABLES := \
   AttrList.inc \
   Attrs.inc \
+  AttrVisitor.inc \
   CommentCommandList.inc \
   CommentNodes.inc \
   DeclNodes.inc \
@@ -48,11 +49,11 @@
   CodeGenAction.cpp \
   CodeGenFunction.cpp \
   CodeGenModule.cpp \
+  CodeGenPGO.cpp \
   CodeGenTBAA.cpp \
   CodeGenTypes.cpp \
   ItaniumCXXABI.cpp \
   MicrosoftCXXABI.cpp \
-  MicrosoftVBTables.cpp \
   ModuleBuilder.cpp \
   TargetInfo.cpp
 
diff --git a/lib/CodeGen/BackendUtil.cpp b/lib/CodeGen/BackendUtil.cpp
index 90b0f68..17f5f4d 100644
--- a/lib/CodeGen/BackendUtil.cpp
+++ b/lib/CodeGen/BackendUtil.cpp
@@ -13,13 +13,14 @@
 #include "clang/Basic/TargetOptions.h"
 #include "clang/Frontend/CodeGenOptions.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Assembly/PrintModulePass.h"
-#include "llvm/Bitcode/ReaderWriter.h"
+#include "clang/Frontend/Utils.h"
+#include "llvm/Bitcode/BitcodeWriterPass.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
@@ -36,6 +37,7 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/ObjCARC.h"
 #include "llvm/Transforms/Scalar.h"
+#include <memory>
 using namespace clang;
 using namespace llvm;
 
@@ -55,38 +57,37 @@
   mutable FunctionPassManager *PerFunctionPasses;
 
 private:
-  PassManager *getCodeGenPasses(TargetMachine *TM) const {
+  PassManager *getCodeGenPasses() const {
     if (!CodeGenPasses) {
       CodeGenPasses = new PassManager();
-      CodeGenPasses->add(new DataLayout(TheModule));
+      CodeGenPasses->add(new DataLayoutPass(TheModule));
       if (TM)
         TM->addAnalysisPasses(*CodeGenPasses);
     }
     return CodeGenPasses;
   }
 
-  PassManager *getPerModulePasses(TargetMachine *TM) const {
+  PassManager *getPerModulePasses() const {
     if (!PerModulePasses) {
       PerModulePasses = new PassManager();
-      PerModulePasses->add(new DataLayout(TheModule));
+      PerModulePasses->add(new DataLayoutPass(TheModule));
       if (TM)
         TM->addAnalysisPasses(*PerModulePasses);
     }
     return PerModulePasses;
   }
 
-  FunctionPassManager *getPerFunctionPasses(TargetMachine *TM) const {
+  FunctionPassManager *getPerFunctionPasses() const {
     if (!PerFunctionPasses) {
       PerFunctionPasses = new FunctionPassManager(TheModule);
-      PerFunctionPasses->add(new DataLayout(TheModule));
+      PerFunctionPasses->add(new DataLayoutPass(TheModule));
       if (TM)
         TM->addAnalysisPasses(*PerFunctionPasses);
     }
     return PerFunctionPasses;
   }
 
-
-  void CreatePasses(TargetMachine *TM);
+  void CreatePasses();
 
   /// CreateTargetMachine - Generates the TargetMachine.
   /// Returns Null if it is unable to create the target machine.
@@ -101,8 +102,7 @@
   /// AddEmitPasses - Add passes necessary to emit assembly or LLVM IR.
   ///
   /// \return True on success.
-  bool AddEmitPasses(BackendAction Action, formatted_raw_ostream &OS,
-                     TargetMachine *TM);
+  bool AddEmitPasses(BackendAction Action, formatted_raw_ostream &OS);
 
 public:
   EmitAssemblyHelper(DiagnosticsEngine &_Diags,
@@ -118,8 +118,12 @@
     delete CodeGenPasses;
     delete PerModulePasses;
     delete PerFunctionPasses;
+    if (CodeGenOpts.DisableFree)
+      BuryPointer(TM.release());
   }
 
+  std::unique_ptr<TargetMachine> TM;
+
   void EmitAssembly(BackendAction Action, raw_ostream *OS);
 };
 
@@ -162,6 +166,11 @@
   PM.add(createSampleProfileLoaderPass(CGOpts.SampleProfileFile));
 }
 
+static void addAddDiscriminatorsPass(const PassManagerBuilder &Builder,
+                                     PassManagerBase &PM) {
+  PM.add(createAddDiscriminatorsPass());
+}
+
 static void addBoundsCheckingPass(const PassManagerBuilder &Builder,
                                     PassManagerBase &PM) {
   PM.add(createBoundsCheckingPass());
@@ -177,12 +186,10 @@
       LangOpts.Sanitize.InitOrder,
       LangOpts.Sanitize.UseAfterReturn,
       LangOpts.Sanitize.UseAfterScope,
-      CGOpts.SanitizerBlacklistFile,
-      CGOpts.SanitizeAddressZeroBaseShadow));
+      CGOpts.SanitizerBlacklistFile));
   PM.add(createAddressSanitizerModulePass(
       LangOpts.Sanitize.InitOrder,
-      CGOpts.SanitizerBlacklistFile,
-      CGOpts.SanitizeAddressZeroBaseShadow));
+      CGOpts.SanitizerBlacklistFile));
 }
 
 static void addMemorySanitizerPass(const PassManagerBuilder &Builder,
@@ -222,7 +229,7 @@
   PM.add(createDataFlowSanitizerPass(CGOpts.SanitizerBlacklistFile));
 }
 
-void EmitAssemblyHelper::CreatePasses(TargetMachine *TM) {
+void EmitAssemblyHelper::CreatePasses() {
   unsigned OptLevel = CodeGenOpts.OptimizationLevel;
   CodeGenOptions::InliningMethod Inlining = CodeGenOpts.getInlining();
 
@@ -244,6 +251,9 @@
   PMBuilder.DisableUnrollLoops = !CodeGenOpts.UnrollLoops;
   PMBuilder.RerollLoops = CodeGenOpts.RerollLoops;
 
+  PMBuilder.addExtension(PassManagerBuilder::EP_EarlyAsPossible,
+                         addAddDiscriminatorsPass);
+
   if (!CodeGenOpts.SampleProfileFile.empty())
     PMBuilder.addExtension(PassManagerBuilder::EP_EarlyAsPossible,
                            addSampleProfileLoaderPass);
@@ -298,19 +308,12 @@
   PMBuilder.LibraryInfo = new TargetLibraryInfo(TargetTriple);
   if (!CodeGenOpts.SimplifyLibCalls)
     PMBuilder.LibraryInfo->disableAllFunctions();
-  
+
   switch (Inlining) {
   case CodeGenOptions::NoInlining: break;
   case CodeGenOptions::NormalInlining: {
-    // FIXME: Derive these constants in a principled fashion.
-    unsigned Threshold = 225;
-    if (CodeGenOpts.OptimizeSize == 1)      // -Os
-      Threshold = 75;
-    else if (CodeGenOpts.OptimizeSize == 2) // -Oz
-      Threshold = 25;
-    else if (OptLevel > 2)
-      Threshold = 275;
-    PMBuilder.Inliner = createFunctionInliningPass(Threshold);
+    PMBuilder.Inliner =
+        createFunctionInliningPass(OptLevel, CodeGenOpts.OptimizeSize);
     break;
   }
   case CodeGenOptions::OnlyAlwaysInlining:
@@ -324,13 +327,13 @@
   }
 
   // Set up the per-function pass manager.
-  FunctionPassManager *FPM = getPerFunctionPasses(TM);
+  FunctionPassManager *FPM = getPerFunctionPasses();
   if (CodeGenOpts.VerifyModule)
     FPM->add(createVerifierPass());
   PMBuilder.populateFunctionPassManager(*FPM);
 
   // Set up the per-module pass manager.
-  PassManager *MPM = getPerModulePasses(TM);
+  PassManager *MPM = getPerModulePasses();
 
   if (!CodeGenOpts.DisableGCov &&
       (CodeGenOpts.EmitGcovArcs || CodeGenOpts.EmitGcovNotes)) {
@@ -437,6 +440,12 @@
 
   llvm::TargetOptions Options;
 
+  if (CodeGenOpts.DisableIntegratedAS)
+    Options.DisableIntegratedAS = true;
+
+  if (CodeGenOpts.CompressDebugSections)
+    Options.CompressDebugSections = true;
+
   // Set frame pointer elimination mode.
   if (!CodeGenOpts.DisableFPElim) {
     Options.NoFramePointerElim = false;
@@ -503,11 +512,10 @@
 }
 
 bool EmitAssemblyHelper::AddEmitPasses(BackendAction Action,
-                                       formatted_raw_ostream &OS,
-                                       TargetMachine *TM) {
+                                       formatted_raw_ostream &OS) {
 
   // Create the code generator passes.
-  PassManager *PM = getCodeGenPasses(TM);
+  PassManager *PM = getCodeGenPasses();
 
   // Add LibraryInfo.
   llvm::Triple TargetTriple(TheModule->getTargetTriple());
@@ -552,27 +560,28 @@
   bool UsesCodeGen = (Action != Backend_EmitNothing &&
                       Action != Backend_EmitBC &&
                       Action != Backend_EmitLL);
-  TargetMachine *TM = CreateTargetMachine(UsesCodeGen);
+  if (!TM)
+    TM.reset(CreateTargetMachine(UsesCodeGen));
+
   if (UsesCodeGen && !TM) return;
-  llvm::OwningPtr<TargetMachine> TMOwner(CodeGenOpts.DisableFree ? 0 : TM);
-  CreatePasses(TM);
+  CreatePasses();
 
   switch (Action) {
   case Backend_EmitNothing:
     break;
 
   case Backend_EmitBC:
-    getPerModulePasses(TM)->add(createBitcodeWriterPass(*OS));
+    getPerModulePasses()->add(createBitcodeWriterPass(*OS));
     break;
 
   case Backend_EmitLL:
     FormattedOS.setStream(*OS, formatted_raw_ostream::PRESERVE_STREAM);
-    getPerModulePasses(TM)->add(createPrintModulePass(&FormattedOS));
+    getPerModulePasses()->add(createPrintModulePass(FormattedOS));
     break;
 
   default:
     FormattedOS.setStream(*OS, formatted_raw_ostream::PRESERVE_STREAM);
-    if (!AddEmitPasses(Action, FormattedOS, TM))
+    if (!AddEmitPasses(Action, FormattedOS))
       return;
   }
 
@@ -607,10 +616,23 @@
 void clang::EmitBackendOutput(DiagnosticsEngine &Diags,
                               const CodeGenOptions &CGOpts,
                               const clang::TargetOptions &TOpts,
-                              const LangOptions &LOpts,
-                              Module *M,
-                              BackendAction Action, raw_ostream *OS) {
+                              const LangOptions &LOpts, StringRef TDesc,
+                              Module *M, BackendAction Action,
+                              raw_ostream *OS) {
   EmitAssemblyHelper AsmHelper(Diags, CGOpts, TOpts, LOpts, M);
 
   AsmHelper.EmitAssembly(Action, OS);
+
+  // If an optional clang TargetInfo description string was passed in, use it to
+  // verify the LLVM TargetMachine's DataLayout.
+  if (AsmHelper.TM && !TDesc.empty()) {
+    std::string DLDesc =
+        AsmHelper.TM->getDataLayout()->getStringRepresentation();
+    if (DLDesc != TDesc) {
+      unsigned DiagID = Diags.getCustomDiagID(
+          DiagnosticsEngine::Error, "backend data layout '%0' does not match "
+                                    "expected target description '%1'");
+      Diags.Report(DiagID) << DLDesc << TDesc;
+    }
+  }
 }
diff --git a/lib/CodeGen/CGAtomic.cpp b/lib/CodeGen/CGAtomic.cpp
index 0df2a40..7c7c80c 100644
--- a/lib/CodeGen/CGAtomic.cpp
+++ b/lib/CodeGen/CGAtomic.cpp
@@ -24,16 +24,6 @@
 using namespace clang;
 using namespace CodeGen;
 
-// The ABI values for various atomic memory orderings.
-enum AtomicOrderingKind {
-  AO_ABI_memory_order_relaxed = 0,
-  AO_ABI_memory_order_consume = 1,
-  AO_ABI_memory_order_acquire = 2,
-  AO_ABI_memory_order_release = 3,
-  AO_ABI_memory_order_acq_rel = 4,
-  AO_ABI_memory_order_seq_cst = 5
-};
-
 namespace {
   class AtomicInfo {
     CodeGenFunction &CGF;
@@ -57,10 +47,10 @@
       ASTContext &C = CGF.getContext();
 
       uint64_t valueAlignInBits;
-      llvm::tie(ValueSizeInBits, valueAlignInBits) = C.getTypeInfo(ValueTy);
+      std::tie(ValueSizeInBits, valueAlignInBits) = C.getTypeInfo(ValueTy);
 
       uint64_t atomicAlignInBits;
-      llvm::tie(AtomicSizeInBits, atomicAlignInBits) = C.getTypeInfo(AtomicTy);
+      std::tie(AtomicSizeInBits, atomicAlignInBits) = C.getTypeInfo(AtomicTy);
 
       assert(ValueSizeInBits <= AtomicSizeInBits);
       assert(valueAlignInBits <= atomicAlignInBits);
@@ -184,10 +174,134 @@
   return true;
 }
 
-static void
-EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, llvm::Value *Dest,
-             llvm::Value *Ptr, llvm::Value *Val1, llvm::Value *Val2,
-             uint64_t Size, unsigned Align, llvm::AtomicOrdering Order) {
+static void emitAtomicCmpXchg(CodeGenFunction &CGF, AtomicExpr *E,
+                              llvm::Value *Dest, llvm::Value *Ptr,
+                              llvm::Value *Val1, llvm::Value *Val2,
+                              uint64_t Size, unsigned Align,
+                              llvm::AtomicOrdering SuccessOrder,
+                              llvm::AtomicOrdering FailureOrder) {
+  // Note that cmpxchg doesn't support weak cmpxchg, at least at the moment.
+  llvm::LoadInst *Expected = CGF.Builder.CreateLoad(Val1);
+  Expected->setAlignment(Align);
+  llvm::LoadInst *Desired = CGF.Builder.CreateLoad(Val2);
+  Desired->setAlignment(Align);
+
+  llvm::AtomicCmpXchgInst *Old = CGF.Builder.CreateAtomicCmpXchg(
+      Ptr, Expected, Desired, SuccessOrder, FailureOrder);
+  Old->setVolatile(E->isVolatile());
+
+  // Cmp holds the result of the compare-exchange operation: true on success,
+  // false on failure.
+  llvm::Value *Cmp = CGF.Builder.CreateICmpEQ(Old, Expected);
+
+  // This basic block is used to hold the store instruction if the operation
+  // failed.
+  llvm::BasicBlock *StoreExpectedBB =
+      CGF.createBasicBlock("cmpxchg.store_expected", CGF.CurFn);
+
+  // This basic block is the exit point of the operation, we should end up
+  // here regardless of whether or not the operation succeeded.
+  llvm::BasicBlock *ContinueBB =
+      CGF.createBasicBlock("cmpxchg.continue", CGF.CurFn);
+
+  // Update Expected if Expected isn't equal to Old, otherwise branch to the
+  // exit point.
+  CGF.Builder.CreateCondBr(Cmp, ContinueBB, StoreExpectedBB);
+
+  CGF.Builder.SetInsertPoint(StoreExpectedBB);
+  // Update the memory at Expected with Old's value.
+  llvm::StoreInst *StoreExpected = CGF.Builder.CreateStore(Old, Val1);
+  StoreExpected->setAlignment(Align);
+  // Finally, branch to the exit point.
+  CGF.Builder.CreateBr(ContinueBB);
+
+  CGF.Builder.SetInsertPoint(ContinueBB);
+  // Update the memory at Dest with Cmp's value.
+  CGF.EmitStoreOfScalar(Cmp, CGF.MakeAddrLValue(Dest, E->getType()));
+  return;
+}
+
+/// Given an ordering required on success, emit all possible cmpxchg
+/// instructions to cope with the provided (but possibly only dynamically known)
+/// FailureOrder.
+static void emitAtomicCmpXchgFailureSet(CodeGenFunction &CGF, AtomicExpr *E,
+                                        llvm::Value *Dest, llvm::Value *Ptr,
+                                        llvm::Value *Val1, llvm::Value *Val2,
+                                        llvm::Value *FailureOrderVal,
+                                        uint64_t Size, unsigned Align,
+                                        llvm::AtomicOrdering SuccessOrder) {
+  llvm::AtomicOrdering FailureOrder;
+  if (llvm::ConstantInt *FO = dyn_cast<llvm::ConstantInt>(FailureOrderVal)) {
+    switch (FO->getSExtValue()) {
+    default:
+      FailureOrder = llvm::Monotonic;
+      break;
+    case AtomicExpr::AO_ABI_memory_order_consume:
+    case AtomicExpr::AO_ABI_memory_order_acquire:
+      FailureOrder = llvm::Acquire;
+      break;
+    case AtomicExpr::AO_ABI_memory_order_seq_cst:
+      FailureOrder = llvm::SequentiallyConsistent;
+      break;
+    }
+    if (FailureOrder >= SuccessOrder) {
+      // Don't assert on undefined behaviour.
+      FailureOrder =
+        llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrder);
+    }
+    emitAtomicCmpXchg(CGF, E, Dest, Ptr, Val1, Val2, Size, Align, SuccessOrder,
+                      FailureOrder);
+    return;
+  }
+
+  // Create all the relevant BB's
+  llvm::BasicBlock *MonotonicBB = 0, *AcquireBB = 0, *SeqCstBB = 0;
+  MonotonicBB = CGF.createBasicBlock("monotonic_fail", CGF.CurFn);
+  if (SuccessOrder != llvm::Monotonic && SuccessOrder != llvm::Release)
+    AcquireBB = CGF.createBasicBlock("acquire_fail", CGF.CurFn);
+  if (SuccessOrder == llvm::SequentiallyConsistent)
+    SeqCstBB = CGF.createBasicBlock("seqcst_fail", CGF.CurFn);
+
+  llvm::BasicBlock *ContBB = CGF.createBasicBlock("atomic.continue", CGF.CurFn);
+
+  llvm::SwitchInst *SI = CGF.Builder.CreateSwitch(FailureOrderVal, MonotonicBB);
+
+  // Emit all the different atomics
+
+  // MonotonicBB is arbitrarily chosen as the default case; in practice, this
+  // doesn't matter unless someone is crazy enough to use something that
+  // doesn't fold to a constant for the ordering.
+  CGF.Builder.SetInsertPoint(MonotonicBB);
+  emitAtomicCmpXchg(CGF, E, Dest, Ptr, Val1, Val2,
+                    Size, Align, SuccessOrder, llvm::Monotonic);
+  CGF.Builder.CreateBr(ContBB);
+
+  if (AcquireBB) {
+    CGF.Builder.SetInsertPoint(AcquireBB);
+    emitAtomicCmpXchg(CGF, E, Dest, Ptr, Val1, Val2,
+                      Size, Align, SuccessOrder, llvm::Acquire);
+    CGF.Builder.CreateBr(ContBB);
+    SI->addCase(CGF.Builder.getInt32(AtomicExpr::AO_ABI_memory_order_consume),
+                AcquireBB);
+    SI->addCase(CGF.Builder.getInt32(AtomicExpr::AO_ABI_memory_order_acquire),
+                AcquireBB);
+  }
+  if (SeqCstBB) {
+    CGF.Builder.SetInsertPoint(SeqCstBB);
+    emitAtomicCmpXchg(CGF, E, Dest, Ptr, Val1, Val2,
+                      Size, Align, SuccessOrder, llvm::SequentiallyConsistent);
+    CGF.Builder.CreateBr(ContBB);
+    SI->addCase(CGF.Builder.getInt32(AtomicExpr::AO_ABI_memory_order_seq_cst),
+                SeqCstBB);
+  }
+
+  CGF.Builder.SetInsertPoint(ContBB);
+}
+
+static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, llvm::Value *Dest,
+                         llvm::Value *Ptr, llvm::Value *Val1, llvm::Value *Val2,
+                         llvm::Value *FailureOrder, uint64_t Size,
+                         unsigned Align, llvm::AtomicOrdering Order) {
   llvm::AtomicRMWInst::BinOp Op = llvm::AtomicRMWInst::Add;
   llvm::Instruction::BinaryOps PostOp = (llvm::Instruction::BinaryOps)0;
 
@@ -198,23 +312,10 @@
   case AtomicExpr::AO__c11_atomic_compare_exchange_strong:
   case AtomicExpr::AO__c11_atomic_compare_exchange_weak:
   case AtomicExpr::AO__atomic_compare_exchange:
-  case AtomicExpr::AO__atomic_compare_exchange_n: {
-    // Note that cmpxchg only supports specifying one ordering and
-    // doesn't support weak cmpxchg, at least at the moment.
-    llvm::LoadInst *LoadVal1 = CGF.Builder.CreateLoad(Val1);
-    LoadVal1->setAlignment(Align);
-    llvm::LoadInst *LoadVal2 = CGF.Builder.CreateLoad(Val2);
-    LoadVal2->setAlignment(Align);
-    llvm::AtomicCmpXchgInst *CXI =
-        CGF.Builder.CreateAtomicCmpXchg(Ptr, LoadVal1, LoadVal2, Order);
-    CXI->setVolatile(E->isVolatile());
-    llvm::StoreInst *StoreVal1 = CGF.Builder.CreateStore(CXI, Val1);
-    StoreVal1->setAlignment(Align);
-    llvm::Value *Cmp = CGF.Builder.CreateICmpEQ(CXI, LoadVal1);
-    CGF.EmitStoreOfScalar(Cmp, CGF.MakeAddrLValue(Dest, E->getType()));
+  case AtomicExpr::AO__atomic_compare_exchange_n:
+    emitAtomicCmpXchgFailureSet(CGF, E, Dest, Ptr, Val1, Val2, FailureOrder,
+                                Size, Align, Order);
     return;
-  }
-
   case AtomicExpr::AO__c11_atomic_load:
   case AtomicExpr::AO__atomic_load_n:
   case AtomicExpr::AO__atomic_load: {
@@ -476,6 +577,8 @@
     Args.add(RValue::get(EmitCastToVoidPtr(Ptr)), getContext().VoidPtrTy);
 
     std::string LibCallName;
+    QualType LoweredMemTy =
+      MemTy->isPointerType() ? getContext().getIntPtrType() : MemTy;
     QualType RetTy;
     bool HaveRetTy = false;
     switch (E->getOp()) {
@@ -531,7 +634,7 @@
     case AtomicExpr::AO__c11_atomic_fetch_add:
     case AtomicExpr::AO__atomic_fetch_add:
       LibCallName = "__atomic_fetch_add";
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy,
+      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, LoweredMemTy,
                         E->getExprLoc());
       break;
     // T __atomic_fetch_and_N(T *mem, T val, int order)
@@ -552,7 +655,7 @@
     case AtomicExpr::AO__c11_atomic_fetch_sub:
     case AtomicExpr::AO__atomic_fetch_sub:
       LibCallName = "__atomic_fetch_sub";
-      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, MemTy,
+      AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1, LoweredMemTy,
                         E->getExprLoc());
       break;
     // T __atomic_fetch_xor_N(T *mem, T val, int order)
@@ -615,32 +718,32 @@
   if (isa<llvm::ConstantInt>(Order)) {
     int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
     switch (ord) {
-    case AO_ABI_memory_order_relaxed:
-      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align,
-                   llvm::Monotonic);
+    case AtomicExpr::AO_ABI_memory_order_relaxed:
+      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OrderFail,
+                   Size, Align, llvm::Monotonic);
       break;
-    case AO_ABI_memory_order_consume:
-    case AO_ABI_memory_order_acquire:
+    case AtomicExpr::AO_ABI_memory_order_consume:
+    case AtomicExpr::AO_ABI_memory_order_acquire:
       if (IsStore)
         break; // Avoid crashing on code with undefined behavior
-      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align,
-                   llvm::Acquire);
+      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OrderFail,
+                   Size, Align, llvm::Acquire);
       break;
-    case AO_ABI_memory_order_release:
+    case AtomicExpr::AO_ABI_memory_order_release:
       if (IsLoad)
         break; // Avoid crashing on code with undefined behavior
-      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align,
-                   llvm::Release);
+      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OrderFail,
+                   Size, Align, llvm::Release);
       break;
-    case AO_ABI_memory_order_acq_rel:
+    case AtomicExpr::AO_ABI_memory_order_acq_rel:
       if (IsLoad || IsStore)
         break; // Avoid crashing on code with undefined behavior
-      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align,
-                   llvm::AcquireRelease);
+      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OrderFail,
+                   Size, Align, llvm::AcquireRelease);
       break;
-    case AO_ABI_memory_order_seq_cst:
-      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align,
-                   llvm::SequentiallyConsistent);
+    case AtomicExpr::AO_ABI_memory_order_seq_cst:
+      EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OrderFail,
+                   Size, Align, llvm::SequentiallyConsistent);
       break;
     default: // invalid order
       // We should not ever get here normally, but it's hard to
@@ -676,36 +779,41 @@
 
   // Emit all the different atomics
   Builder.SetInsertPoint(MonotonicBB);
-  EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align,
-               llvm::Monotonic);
+  EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OrderFail,
+               Size, Align, llvm::Monotonic);
   Builder.CreateBr(ContBB);
   if (!IsStore) {
     Builder.SetInsertPoint(AcquireBB);
-    EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align,
-                 llvm::Acquire);
+    EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OrderFail,
+                 Size, Align, llvm::Acquire);
     Builder.CreateBr(ContBB);
-    SI->addCase(Builder.getInt32(1), AcquireBB);
-    SI->addCase(Builder.getInt32(2), AcquireBB);
+    SI->addCase(Builder.getInt32(AtomicExpr::AO_ABI_memory_order_consume),
+                AcquireBB);
+    SI->addCase(Builder.getInt32(AtomicExpr::AO_ABI_memory_order_acquire),
+                AcquireBB);
   }
   if (!IsLoad) {
     Builder.SetInsertPoint(ReleaseBB);
-    EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align,
-                 llvm::Release);
+    EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OrderFail,
+                 Size, Align, llvm::Release);
     Builder.CreateBr(ContBB);
-    SI->addCase(Builder.getInt32(3), ReleaseBB);
+    SI->addCase(Builder.getInt32(AtomicExpr::AO_ABI_memory_order_release),
+                ReleaseBB);
   }
   if (!IsLoad && !IsStore) {
     Builder.SetInsertPoint(AcqRelBB);
-    EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align,
-                 llvm::AcquireRelease);
+    EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OrderFail,
+                 Size, Align, llvm::AcquireRelease);
     Builder.CreateBr(ContBB);
-    SI->addCase(Builder.getInt32(4), AcqRelBB);
+    SI->addCase(Builder.getInt32(AtomicExpr::AO_ABI_memory_order_acq_rel),
+                AcqRelBB);
   }
   Builder.SetInsertPoint(SeqCstBB);
-  EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, Size, Align,
-               llvm::SequentiallyConsistent);
+  EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OrderFail,
+               Size, Align, llvm::SequentiallyConsistent);
   Builder.CreateBr(ContBB);
-  SI->addCase(Builder.getInt32(5), SeqCstBB);
+  SI->addCase(Builder.getInt32(AtomicExpr::AO_ABI_memory_order_seq_cst),
+              SeqCstBB);
 
   // Cleanup and return
   Builder.SetInsertPoint(ContBB);
@@ -761,8 +869,8 @@
              getContext().VoidPtrTy);
     args.add(RValue::get(EmitCastToVoidPtr(tempAddr)),
              getContext().VoidPtrTy);
-    args.add(RValue::get(llvm::ConstantInt::get(IntTy,
-                                                AO_ABI_memory_order_seq_cst)),
+    args.add(RValue::get(llvm::ConstantInt::get(
+                 IntTy, AtomicExpr::AO_ABI_memory_order_seq_cst)),
              getContext().IntTy);
     emitAtomicLibcall(*this, "__atomic_load", getContext().VoidTy, args);
 
@@ -911,8 +1019,8 @@
              getContext().VoidPtrTy);
     args.add(RValue::get(EmitCastToVoidPtr(srcAddr)),
              getContext().VoidPtrTy);
-    args.add(RValue::get(llvm::ConstantInt::get(IntTy,
-                                                AO_ABI_memory_order_seq_cst)),
+    args.add(RValue::get(llvm::ConstantInt::get(
+                 IntTy, AtomicExpr::AO_ABI_memory_order_seq_cst)),
              getContext().IntTy);
     emitAtomicLibcall(*this, "__atomic_store", getContext().VoidTy, args);
     return;
diff --git a/lib/CodeGen/CGBlocks.cpp b/lib/CodeGen/CGBlocks.cpp
index 692f9a0..15b08d4 100644
--- a/lib/CodeGen/CGBlocks.cpp
+++ b/lib/CodeGen/CGBlocks.cpp
@@ -18,9 +18,9 @@
 #include "CodeGenModule.h"
 #include "clang/AST/DeclObjC.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Support/CallSite.h"
 #include <algorithm>
 #include <cstdio>
 
@@ -53,7 +53,7 @@
   return CodeGenFunction(CGM).GenerateCopyHelperFunction(blockInfo);
 }
 
-/// Build the helper function to dipose of a block.
+/// Build the helper function to dispose of a block.
 static llvm::Constant *buildDisposeHelper(CodeGenModule &CGM,
                                           const CGBlockInfo &blockInfo) {
   return CodeGenFunction(CGM).GenerateDestroyHelperFunction(blockInfo);
@@ -300,8 +300,8 @@
 
   // The header is basically a 'struct { void *; int; int; void *; void *; }'.
   CharUnits ptrSize, ptrAlign, intSize, intAlign;
-  llvm::tie(ptrSize, ptrAlign) = C.getTypeInfoInChars(C.VoidPtrTy);
-  llvm::tie(intSize, intAlign) = C.getTypeInfoInChars(C.IntTy);
+  std::tie(ptrSize, ptrAlign) = C.getTypeInfoInChars(C.VoidPtrTy);
+  std::tie(intSize, intAlign) = C.getTypeInfoInChars(C.IntTy);
 
   // Are there crazy embedded platforms where this isn't true?
   assert(intSize <= ptrSize && "layout assumptions horribly violated");
@@ -370,11 +370,10 @@
   }
 
   // Next, all the block captures.
-  for (BlockDecl::capture_const_iterator ci = block->capture_begin(),
-         ce = block->capture_end(); ci != ce; ++ci) {
-    const VarDecl *variable = ci->getVariable();
+  for (const auto &CI : block->captures()) {
+    const VarDecl *variable = CI.getVariable();
 
-    if (ci->isByRef()) {
+    if (CI.isByRef()) {
       // We have to copy/dispose of the __block reference.
       info.NeedsCopyDispose = true;
 
@@ -387,8 +386,7 @@
       maxFieldAlign = std::max(maxFieldAlign, tinfo.second);
 
       layout.push_back(BlockLayoutChunk(tinfo.second, tinfo.first,
-                                        Qualifiers::OCL_None,
-                                        &*ci, llvmType));
+                                        Qualifiers::OCL_None, &CI, llvmType));
       continue;
     }
 
@@ -422,7 +420,7 @@
       lifetime = Qualifiers::OCL_Strong;
 
     // So do types that require non-trivial copy construction.
-    } else if (ci->hasCopyExpr()) {
+    } else if (CI.hasCopyExpr()) {
       info.NeedsCopyDispose = true;
       info.HasCXXObject = true;
 
@@ -446,7 +444,7 @@
     llvm::Type *llvmType =
       CGM.getTypes().ConvertTypeForMem(VT);
     
-    layout.push_back(BlockLayoutChunk(align, size, lifetime, &*ci, llvmType));
+    layout.push_back(BlockLayoutChunk(align, size, lifetime, &CI, llvmType));
   }
 
   // If that was everything, we're done here.
@@ -581,14 +579,13 @@
 
   // Walk through the captures (in order) and find the ones not
   // captured by constant.
-  for (BlockDecl::capture_const_iterator ci = block->capture_begin(),
-         ce = block->capture_end(); ci != ce; ++ci) {
+  for (const auto &CI : block->captures()) {
     // Ignore __block captures; there's nothing special in the
     // on-stack block that we need to do for them.
-    if (ci->isByRef()) continue;
+    if (CI.isByRef()) continue;
 
     // Ignore variables that are constant-captured.
-    const VarDecl *variable = ci->getVariable();
+    const VarDecl *variable = CI.getVariable();
     CGBlockInfo::Capture &capture = blockInfo.getCapture(variable);
     if (capture.isConstant()) continue;
 
@@ -679,7 +676,7 @@
   }
 
   // Find the block info for this block and take ownership of it.
-  OwningPtr<CGBlockInfo> blockInfo;
+  std::unique_ptr<CGBlockInfo> blockInfo;
   blockInfo.reset(findAndRemoveBlockInfo(&FirstBlockInfo,
                                          blockExpr->getBlockDecl()));
 
@@ -741,9 +738,8 @@
   }
 
   // Next, captured variables.
-  for (BlockDecl::capture_const_iterator ci = blockDecl->capture_begin(),
-         ce = blockDecl->capture_end(); ci != ce; ++ci) {
-    const VarDecl *variable = ci->getVariable();
+  for (const auto &CI : blockDecl->captures()) {
+    const VarDecl *variable = CI.getVariable();
     const CGBlockInfo::Capture &capture = blockInfo.getCapture(variable);
 
     // Ignore constant captures.
@@ -761,7 +757,7 @@
     // Compute the address of the thing we're going to move into the
     // block literal.
     llvm::Value *src;
-    if (BlockInfo && ci->isNested()) {
+    if (BlockInfo && CI.isNested()) {
       // We need to use the capture from the enclosing block.
       const CGBlockInfo::Capture &enclosingCapture =
         BlockInfo->getCapture(variable);
@@ -780,8 +776,8 @@
       // emission.
       src = LocalDeclMap.lookup(variable);
       if (!src) {
-        DeclRefExpr declRef(const_cast<VarDecl*>(variable),
-                            /*refersToEnclosing*/ ci->isNested(), type,
+        DeclRefExpr declRef(const_cast<VarDecl *>(variable),
+                            /*refersToEnclosing*/ CI.isNested(), type,
                             VK_LValue, SourceLocation());
         src = EmitDeclRefLValue(&declRef).getAddress();
       }
@@ -791,9 +787,9 @@
     // the block field.  There's no need to chase the forwarding
     // pointer at this point, since we're building something that will
     // live a shorter life than the stack byref anyway.
-    if (ci->isByRef()) {
+    if (CI.isByRef()) {
       // Get a void* that points to the byref struct.
-      if (ci->isNested())
+      if (CI.isNested())
         src = Builder.CreateAlignedLoad(src, align.getQuantity(),
                                         "byref.capture");
       else
@@ -803,7 +799,7 @@
       Builder.CreateAlignedStore(src, blockField, align.getQuantity());
 
     // If we have a copy constructor, evaluate that into the block field.
-    } else if (const Expr *copyExpr = ci->getCopyExpr()) {
+    } else if (const Expr *copyExpr = CI.getCopyExpr()) {
       if (blockDecl->isConversionFromLambda()) {
         // If we have a lambda conversion, emit the expression
         // directly into the block instead.
@@ -851,7 +847,7 @@
       // We use one of these or the other depending on whether the
       // reference is nested.
       DeclRefExpr declRef(const_cast<VarDecl*>(variable),
-                          /*refersToEnclosing*/ ci->isNested(), type,
+                          /*refersToEnclosing*/ CI.isNested(), type,
                           VK_LValue, SourceLocation());
 
       ImplicitCastExpr l2r(ImplicitCastExpr::OnStack, type, CK_LValueToRValue,
@@ -862,7 +858,7 @@
     }
 
     // Activate the cleanup if layout pushed one.
-    if (!ci->isByRef()) {
+    if (!CI.isByRef()) {
       EHScopeStack::stable_iterator cleanup = capture.getCleanup();
       if (cleanup.isValid())
         ActivateCleanupBlock(cleanup, blockInfo.DominatingIP);
@@ -1117,17 +1113,15 @@
   args.push_back(&selfDecl);
 
   // Now add the rest of the parameters.
-  for (BlockDecl::param_const_iterator i = blockDecl->param_begin(),
-       e = blockDecl->param_end(); i != e; ++i)
-    args.push_back(*i);
+  for (auto i : blockDecl->params())
+    args.push_back(i);
 
   // Create the function declaration.
   const FunctionProtoType *fnType = blockInfo.getBlockExpr()->getFunctionType();
-  const CGFunctionInfo &fnInfo =
-    CGM.getTypes().arrangeFunctionDeclaration(fnType->getResultType(), args,
-                                              fnType->getExtInfo(),
-                                              fnType->isVariadic());
-  if (CGM.ReturnTypeUsesSRet(fnInfo))
+  const CGFunctionInfo &fnInfo = CGM.getTypes().arrangeFreeFunctionDeclaration(
+      fnType->getReturnType(), args, fnType->getExtInfo(),
+      fnType->isVariadic());
+  if (CGM.ReturnSlotInterferesWithArgs(fnInfo))
     blockInfo.UsesStret = true;
 
   llvm::FunctionType *fnLLVMType = CGM.getTypes().GetFunctionType(fnInfo);
@@ -1140,7 +1134,7 @@
   CGM.SetInternalFunctionAttributes(blockDecl, fn, fnInfo);
 
   // Begin generating the function.
-  StartFunction(blockDecl, fnType->getResultType(), fn, fnInfo, args,
+  StartFunction(blockDecl, fnType->getReturnType(), fn, fnInfo, args,
                 blockInfo.getBlockExpr()->getBody()->getLocStart());
 
   // Okay.  Undo some of what StartFunction did.
@@ -1177,9 +1171,8 @@
   }
 
   // Also force all the constant captures.
-  for (BlockDecl::capture_const_iterator ci = blockDecl->capture_begin(),
-         ce = blockDecl->capture_end(); ci != ce; ++ci) {
-    const VarDecl *variable = ci->getVariable();
+  for (const auto &CI : blockDecl->captures()) {
+    const VarDecl *variable = CI.getVariable();
     const CGBlockInfo::Capture &capture = blockInfo.getCapture(variable);
     if (!capture.isConstant()) continue;
 
@@ -1201,8 +1194,14 @@
 
   if (IsLambdaConversionToBlock)
     EmitLambdaBlockInvokeBody();
-  else
+  else {
+    PGO.assignRegionCounters(blockDecl, fn);
+    RegionCounter Cnt = getPGORegionCounter(blockDecl->getBody());
+    Cnt.beginRegion(Builder);
     EmitStmt(blockDecl->getBody());
+    PGO.emitInstrumentationData();
+    PGO.destroyRegionCounters();
+  }
 
   // Remember where we were...
   llvm::BasicBlock *resume = Builder.GetInsertBlock();
@@ -1214,9 +1213,8 @@
   // Emit debug information for all the DeclRefExprs.
   // FIXME: also for 'this'
   if (CGDebugInfo *DI = getDebugInfo()) {
-    for (BlockDecl::capture_const_iterator ci = blockDecl->capture_begin(),
-           ce = blockDecl->capture_end(); ci != ce; ++ci) {
-      const VarDecl *variable = ci->getVariable();
+    for (const auto &CI : blockDecl->captures()) {
+      const VarDecl *variable = CI.getVariable();
       DI->EmitLocation(Builder, variable->getLocation());
 
       if (CGM.getCodeGenOpts().getDebugInfo()
@@ -1285,10 +1283,8 @@
   ImplicitParamDecl srcDecl(0, SourceLocation(), 0, C.VoidPtrTy);
   args.push_back(&srcDecl);
 
-  const CGFunctionInfo &FI =
-    CGM.getTypes().arrangeFunctionDeclaration(C.VoidTy, args,
-                                              FunctionType::ExtInfo(),
-                                              /*variadic*/ false);
+  const CGFunctionInfo &FI = CGM.getTypes().arrangeFreeFunctionDeclaration(
+      C.VoidTy, args, FunctionType::ExtInfo(), /*variadic=*/false);
 
   // FIXME: it would be nice if these were mergeable with things with
   // identical semantics.
@@ -1325,25 +1321,24 @@
 
   const BlockDecl *blockDecl = blockInfo.getBlockDecl();
 
-  for (BlockDecl::capture_const_iterator ci = blockDecl->capture_begin(),
-         ce = blockDecl->capture_end(); ci != ce; ++ci) {
-    const VarDecl *variable = ci->getVariable();
+  for (const auto &CI : blockDecl->captures()) {
+    const VarDecl *variable = CI.getVariable();
     QualType type = variable->getType();
 
     const CGBlockInfo::Capture &capture = blockInfo.getCapture(variable);
     if (capture.isConstant()) continue;
 
-    const Expr *copyExpr = ci->getCopyExpr();
+    const Expr *copyExpr = CI.getCopyExpr();
     BlockFieldFlags flags;
 
     bool useARCWeakCopy = false;
     bool useARCStrongCopy = false;
 
     if (copyExpr) {
-      assert(!ci->isByRef());
+      assert(!CI.isByRef());
       // don't bother computing flags
 
-    } else if (ci->isByRef()) {
+    } else if (CI.isByRef()) {
       flags = BLOCK_FIELD_IS_BYREF;
       if (type.isObjCGCWeak())
         flags |= BLOCK_FIELD_IS_WEAK;
@@ -1423,7 +1418,7 @@
         };
 
         bool copyCanThrow = false;
-        if (ci->isByRef() && variable->getType()->getAsCXXRecordDecl()) {
+        if (CI.isByRef() && variable->getType()->getAsCXXRecordDecl()) {
           const Expr *copyExpr =
             CGM.getContext().getBlockVarCopyInits(variable);
           if (copyExpr) {
@@ -1460,10 +1455,8 @@
   ImplicitParamDecl srcDecl(0, SourceLocation(), 0, C.VoidPtrTy);
   args.push_back(&srcDecl);
 
-  const CGFunctionInfo &FI =
-    CGM.getTypes().arrangeFunctionDeclaration(C.VoidTy, args,
-                                              FunctionType::ExtInfo(),
-                                              /*variadic*/ false);
+  const CGFunctionInfo &FI = CGM.getTypes().arrangeFreeFunctionDeclaration(
+      C.VoidTy, args, FunctionType::ExtInfo(), /*variadic=*/false);
 
   // FIXME: We'd like to put these into a mergable by content, with
   // internal linkage.
@@ -1496,9 +1489,8 @@
 
   CodeGenFunction::RunCleanupsScope cleanups(*this);
 
-  for (BlockDecl::capture_const_iterator ci = blockDecl->capture_begin(),
-         ce = blockDecl->capture_end(); ci != ce; ++ci) {
-    const VarDecl *variable = ci->getVariable();
+  for (const auto &CI : blockDecl->captures()) {
+    const VarDecl *variable = CI.getVariable();
     QualType type = variable->getType();
 
     const CGBlockInfo::Capture &capture = blockInfo.getCapture(variable);
@@ -1510,7 +1502,7 @@
     bool useARCWeakDestroy = false;
     bool useARCStrongDestroy = false;
 
-    if (ci->isByRef()) {
+    if (CI.isByRef()) {
       flags = BLOCK_FIELD_IS_BYREF;
       if (type.isObjCGCWeak())
         flags |= BLOCK_FIELD_IS_WEAK;
@@ -1587,7 +1579,7 @@
     : ByrefHelpers(alignment), Flags(flags) {}
 
   void emitCopy(CodeGenFunction &CGF, llvm::Value *destField,
-                llvm::Value *srcField) {
+                llvm::Value *srcField) override {
     destField = CGF.Builder.CreateBitCast(destField, CGF.VoidPtrTy);
 
     srcField = CGF.Builder.CreateBitCast(srcField, CGF.VoidPtrPtrTy);
@@ -1602,14 +1594,14 @@
     CGF.EmitNounwindRuntimeCall(fn, args);
   }
 
-  void emitDispose(CodeGenFunction &CGF, llvm::Value *field) {
+  void emitDispose(CodeGenFunction &CGF, llvm::Value *field) override {
     field = CGF.Builder.CreateBitCast(field, CGF.Int8PtrTy->getPointerTo(0));
     llvm::Value *value = CGF.Builder.CreateLoad(field);
 
     CGF.BuildBlockRelease(value, Flags | BLOCK_BYREF_CALLER);
   }
 
-  void profileImpl(llvm::FoldingSetNodeID &id) const {
+  void profileImpl(llvm::FoldingSetNodeID &id) const override {
     id.AddInteger(Flags.getBitMask());
   }
 };
@@ -1620,15 +1612,15 @@
   ARCWeakByrefHelpers(CharUnits alignment) : ByrefHelpers(alignment) {}
 
   void emitCopy(CodeGenFunction &CGF, llvm::Value *destField,
-                llvm::Value *srcField) {
+                llvm::Value *srcField) override {
     CGF.EmitARCMoveWeak(destField, srcField);
   }
 
-  void emitDispose(CodeGenFunction &CGF, llvm::Value *field) {
+  void emitDispose(CodeGenFunction &CGF, llvm::Value *field) override {
     CGF.EmitARCDestroyWeak(field);
   }
 
-  void profileImpl(llvm::FoldingSetNodeID &id) const {
+  void profileImpl(llvm::FoldingSetNodeID &id) const override {
     // 0 is distinguishable from all pointers and byref flags
     id.AddInteger(0);
   }
@@ -1641,7 +1633,7 @@
   ARCStrongByrefHelpers(CharUnits alignment) : ByrefHelpers(alignment) {}
 
   void emitCopy(CodeGenFunction &CGF, llvm::Value *destField,
-                llvm::Value *srcField) {
+                llvm::Value *srcField) override {
     // Do a "move" by copying the value and then zeroing out the old
     // variable.
 
@@ -1665,11 +1657,11 @@
     store->setAlignment(Alignment.getQuantity());
   }
 
-  void emitDispose(CodeGenFunction &CGF, llvm::Value *field) {
+  void emitDispose(CodeGenFunction &CGF, llvm::Value *field) override {
     CGF.EmitARCDestroyStrong(field, ARCImpreciseLifetime);
   }
 
-  void profileImpl(llvm::FoldingSetNodeID &id) const {
+  void profileImpl(llvm::FoldingSetNodeID &id) const override {
     // 1 is distinguishable from all pointers and byref flags
     id.AddInteger(1);
   }
@@ -1682,7 +1674,7 @@
   ARCStrongBlockByrefHelpers(CharUnits alignment) : ByrefHelpers(alignment) {}
 
   void emitCopy(CodeGenFunction &CGF, llvm::Value *destField,
-                llvm::Value *srcField) {
+                llvm::Value *srcField) override {
     // Do the copy with objc_retainBlock; that's all that
     // _Block_object_assign would do anyway, and we'd have to pass the
     // right arguments to make sure it doesn't get no-op'ed.
@@ -1695,11 +1687,11 @@
     store->setAlignment(Alignment.getQuantity());
   }
 
-  void emitDispose(CodeGenFunction &CGF, llvm::Value *field) {
+  void emitDispose(CodeGenFunction &CGF, llvm::Value *field) override {
     CGF.EmitARCDestroyStrong(field, ARCImpreciseLifetime);
   }
 
-  void profileImpl(llvm::FoldingSetNodeID &id) const {
+  void profileImpl(llvm::FoldingSetNodeID &id) const override {
     // 2 is distinguishable from all pointers and byref flags
     id.AddInteger(2);
   }
@@ -1716,20 +1708,20 @@
                   const Expr *copyExpr)
     : ByrefHelpers(alignment), VarType(type), CopyExpr(copyExpr) {}
 
-  bool needsCopy() const { return CopyExpr != 0; }
+  bool needsCopy() const override { return CopyExpr != 0; }
   void emitCopy(CodeGenFunction &CGF, llvm::Value *destField,
-                llvm::Value *srcField) {
+                llvm::Value *srcField) override {
     if (!CopyExpr) return;
     CGF.EmitSynthesizedCXXCopyCtor(destField, srcField, CopyExpr);
   }
 
-  void emitDispose(CodeGenFunction &CGF, llvm::Value *field) {
+  void emitDispose(CodeGenFunction &CGF, llvm::Value *field) override {
     EHScopeStack::stable_iterator cleanupDepth = CGF.EHStack.stable_begin();
     CGF.PushDestructorCleanup(VarType, field);
     CGF.PopCleanupBlocks(cleanupDepth);
   }
 
-  void profileImpl(llvm::FoldingSetNodeID &id) const {
+  void profileImpl(llvm::FoldingSetNodeID &id) const override {
     id.AddPointer(VarType.getCanonicalType().getAsOpaquePtr());
   }
 };
@@ -1751,10 +1743,8 @@
   ImplicitParamDecl src(0, SourceLocation(), 0, Context.VoidPtrTy);
   args.push_back(&src);
 
-  const CGFunctionInfo &FI =
-    CGF.CGM.getTypes().arrangeFunctionDeclaration(R, args,
-                                                  FunctionType::ExtInfo(),
-                                                  /*variadic*/ false);
+  const CGFunctionInfo &FI = CGF.CGM.getTypes().arrangeFreeFunctionDeclaration(
+      R, args, FunctionType::ExtInfo(), /*variadic=*/false);
 
   CodeGenTypes &Types = CGF.CGM.getTypes();
   llvm::FunctionType *LTy = Types.GetFunctionType(FI);
@@ -1822,10 +1812,8 @@
   ImplicitParamDecl src(0, SourceLocation(), 0, Context.VoidPtrTy);
   args.push_back(&src);
 
-  const CGFunctionInfo &FI =
-    CGF.CGM.getTypes().arrangeFunctionDeclaration(R, args,
-                                                  FunctionType::ExtInfo(),
-                                                  /*variadic*/ false);
+  const CGFunctionInfo &FI = CGF.CGM.getTypes().arrangeFreeFunctionDeclaration(
+      R, args, FunctionType::ExtInfo(), /*variadic=*/false);
 
   CodeGenTypes &Types = CGF.CGM.getTypes();
   llvm::FunctionType *LTy = Types.GetFunctionType(FI);
@@ -2218,7 +2206,7 @@
     llvm::Value *Addr;
     CallBlockRelease(llvm::Value *Addr) : Addr(Addr) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       // Should we be passing FIELD_IS_WEAK here?
       CGF.BuildBlockRelease(Addr, BLOCK_FIELD_IS_BYREF);
     }
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index 7726ad3..5a86bdd 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -215,8 +215,11 @@
     return RValue::get(CGM.EmitConstantExpr(E, E->getType(), 0));
   case Builtin::BI__builtin_stdarg_start:
   case Builtin::BI__builtin_va_start:
+  case Builtin::BI__va_start:
   case Builtin::BI__builtin_va_end: {
-    Value *ArgValue = EmitVAListRef(E->getArg(0));
+    Value *ArgValue = (BuiltinID == Builtin::BI__va_start)
+                          ? EmitScalarExpr(E->getArg(0))
+                          : EmitVAListRef(E->getArg(0));
     llvm::Type *DestType = Int8PtrTy;
     if (ArgValue->getType() != DestType)
       ArgValue = Builder.CreateBitCast(ArgValue, DestType,
@@ -429,6 +432,12 @@
     Value *F = CGM.getIntrinsic(Intrinsic::readcyclecounter);
     return RValue::get(Builder.CreateCall(F));
   }
+  case Builtin::BI__builtin___clear_cache: {
+    Value *Begin = EmitScalarExpr(E->getArg(0));
+    Value *End = EmitScalarExpr(E->getArg(1));
+    Value *F = CGM.getIntrinsic(Intrinsic::clear_cache);
+    return RValue::get(Builder.CreateCall2(F, Begin, End));
+  }
   case Builtin::BI__builtin_trap: {
     Value *F = CGM.getIntrinsic(Intrinsic::trap);
     return RValue::get(Builder.CreateCall(F));
@@ -964,6 +973,7 @@
     Args[2] = EmitToInt(*this, EmitScalarExpr(E->getArg(2)), T, IntType);
 
     Value *Result = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
+                                                llvm::SequentiallyConsistent,
                                                 llvm::SequentiallyConsistent);
     Result = EmitFromInt(*this, Result, T, ValueType);
     return RValue::get(Result);
@@ -990,6 +1000,7 @@
 
     Value *OldVal = Args[1];
     Value *PrevVal = Builder.CreateAtomicCmpXchg(Args[0], Args[1], Args[2],
+                                                 llvm::SequentiallyConsistent,
                                                  llvm::SequentiallyConsistent);
     Value *Result = Builder.CreateICmpEQ(PrevVal, OldVal);
     // zext bool to int.
@@ -1311,7 +1322,6 @@
     llvm::Type *ArgType = Base->getType();
     Value *F = CGM.getIntrinsic(Intrinsic::pow, ArgType);
     return RValue::get(Builder.CreateCall2(F, Base, Exponent));
-    break;
   }
 
   case Builtin::BIfma:
@@ -1500,6 +1510,43 @@
     return RValue::get(EmitLValue(E->getArg(0)).getAddress());
   case Builtin::BI__noop:
     return RValue::get(0);
+  case Builtin::BI_InterlockedCompareExchange: {
+    AtomicCmpXchgInst *CXI = Builder.CreateAtomicCmpXchg(
+        EmitScalarExpr(E->getArg(0)),
+        EmitScalarExpr(E->getArg(2)),
+        EmitScalarExpr(E->getArg(1)),
+        SequentiallyConsistent,
+        SequentiallyConsistent);
+    CXI->setVolatile(true);
+    return RValue::get(CXI);
+  }
+  case Builtin::BI_InterlockedIncrement: {
+    AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
+      AtomicRMWInst::Add,
+      EmitScalarExpr(E->getArg(0)),
+      ConstantInt::get(Int32Ty, 1),
+      llvm::SequentiallyConsistent);
+    RMWI->setVolatile(true);
+    return RValue::get(Builder.CreateAdd(RMWI, ConstantInt::get(Int32Ty, 1)));
+  }
+  case Builtin::BI_InterlockedDecrement: {
+    AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
+      AtomicRMWInst::Sub,
+      EmitScalarExpr(E->getArg(0)),
+      ConstantInt::get(Int32Ty, 1),
+      llvm::SequentiallyConsistent);
+    RMWI->setVolatile(true);
+    return RValue::get(Builder.CreateSub(RMWI, ConstantInt::get(Int32Ty, 1)));
+  }
+  case Builtin::BI_InterlockedExchangeAdd: {
+    AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
+      AtomicRMWInst::Add,
+      EmitScalarExpr(E->getArg(0)),
+      EmitScalarExpr(E->getArg(1)),
+      llvm::SequentiallyConsistent);
+    RMWI->setVolatile(true);
+    return RValue::get(RMWI);
+  }
   }
 
   // If this is an alias for a lib function (e.g. __builtin_sin), emit
@@ -1591,10 +1638,15 @@
                                               const CallExpr *E) {
   switch (getTarget().getTriple().getArch()) {
   case llvm::Triple::aarch64:
+  case llvm::Triple::aarch64_be:
     return EmitAArch64BuiltinExpr(BuiltinID, E);
   case llvm::Triple::arm:
+  case llvm::Triple::armeb:
   case llvm::Triple::thumb:
+  case llvm::Triple::thumbeb:
     return EmitARMBuiltinExpr(BuiltinID, E);
+  case llvm::Triple::arm64:
+    return EmitARM64BuiltinExpr(BuiltinID, E);
   case llvm::Triple::x86:
   case llvm::Triple::x86_64:
     return EmitX86BuiltinExpr(BuiltinID, E);
@@ -1624,6 +1676,11 @@
   case NeonTypeFlags::Int64:
   case NeonTypeFlags::Poly64:
     return llvm::VectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
+  case NeonTypeFlags::Poly128:
+    // FIXME: i128 and f128 are not fully supported in Clang and LLVM;
+    // a lot of the i128 and f128 API is missing. So we use v16i8 to
+    // represent poly128 and rely on pattern matching.
+    return llvm::VectorType::get(CGF->Int8Ty, 16);
   case NeonTypeFlags::Float32:
     return llvm::VectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
   case NeonTypeFlags::Float64:
@@ -1694,6 +1751,36 @@
     return Builder.CreateAShr(Vec, Shift, name);
 }
 
+Value *CodeGenFunction::EmitConcatVectors(Value *Lo, Value *Hi,
+                                          llvm::Type *ArgTy) {
+  unsigned NumElts = ArgTy->getVectorNumElements();
+  SmallVector<Constant *, 16> Indices;
+  for (unsigned i = 0; i < 2 * NumElts; ++i)
+    Indices.push_back(ConstantInt::get(Int32Ty, i));
+
+  Constant *Mask = ConstantVector::get(Indices);
+  Value *LoCast = Builder.CreateBitCast(Lo, ArgTy);
+  Value *HiCast = Builder.CreateBitCast(Hi, ArgTy);
+  return Builder.CreateShuffleVector(LoCast, HiCast, Mask, "concat");
+}
+
+Value *CodeGenFunction::EmitExtractHigh(Value *Vec, llvm::Type *ResTy) {
+  unsigned NumElts = ResTy->getVectorNumElements();
+  SmallVector<Constant *, 8> Indices;
+
+  llvm::Type *InTy = llvm::VectorType::get(ResTy->getVectorElementType(),
+                                           NumElts * 2);
+  Value *VecCast = Builder.CreateBitCast(Vec, InTy);
+
+  // extract_high is a shuffle on the second half of the input indices: E.g. 4,
+  // 5, 6, 7 if we're extracting <4 x i16> from <8 x i16>.
+  for (unsigned i = 0; i < NumElts; ++i)
+    Indices.push_back(ConstantInt::get(Int32Ty, NumElts + i));
+
+  Constant *Mask = ConstantVector::get(Indices);
+  return Builder.CreateShuffleVector(VecCast, VecCast, Mask, "concat");
+}
+
 /// GetPointeeAlignment - Given an expression with a pointer type, find the
 /// alignment of the type referenced by the pointer.  Skip over implicit
 /// casts.
@@ -1749,21 +1836,1044 @@
   return std::make_pair(EmitScalarExpr(Addr), Align);
 }
 
+enum {
+  AddRetType = (1 << 0),
+  Add1ArgType = (1 << 1),
+  Add2ArgTypes = (1 << 2),
+
+  VectorizeRetType = (1 << 3),
+  VectorizeArgTypes = (1 << 4),
+
+  InventFloatType = (1 << 5),
+  UnsignedAlts = (1 << 6),
+
+  Use64BitVectors = (1 << 7),
+  Use128BitVectors = (1 << 8),
+
+  Vectorize1ArgType = Add1ArgType | VectorizeArgTypes,
+  VectorRet = AddRetType | VectorizeRetType,
+  VectorRetGetArgs01 =
+      AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes,
+  FpCmpzModifiers =
+      AddRetType | VectorizeRetType | Add1ArgType | InventFloatType
+};
+
+struct NeonIntrinsicInfo {
+  unsigned BuiltinID;
+  unsigned LLVMIntrinsic;
+  unsigned AltLLVMIntrinsic;
+  const char *NameHint;
+  unsigned TypeModifier;
+
+  bool operator<(unsigned RHSBuiltinID) const {
+    return BuiltinID < RHSBuiltinID;
+  }
+};
+
+#define NEONMAP0(NameBase) \
+  { NEON::BI__builtin_neon_ ## NameBase, 0, 0, #NameBase, 0 }
+
+#define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
+  { NEON::BI__builtin_neon_ ## NameBase, \
+      Intrinsic::LLVMIntrinsic, 0, #NameBase, TypeModifier }
+
+#define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \
+  { NEON::BI__builtin_neon_ ## NameBase, \
+      Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
+      #NameBase, TypeModifier }
+
+static const NeonIntrinsicInfo AArch64SISDIntrinsicInfo[] = {
+  NEONMAP1(vabdd_f64, aarch64_neon_vabd, AddRetType),
+  NEONMAP1(vabds_f32, aarch64_neon_vabd, AddRetType),
+  NEONMAP1(vabsd_s64, aarch64_neon_vabs, 0),
+  NEONMAP1(vaddd_s64, aarch64_neon_vaddds, 0),
+  NEONMAP1(vaddd_u64, aarch64_neon_vadddu, 0),
+  NEONMAP1(vaddlv_s16, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlv_s8, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlv_u16, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlv_u8, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_s16, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_s8, aarch64_neon_saddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_u16, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddlvq_u8, aarch64_neon_uaddlv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
+  NEONMAP1(vaddv_s16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_s32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_s8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_u16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_u32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddv_u8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
+  NEONMAP1(vaddvq_f64, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
+  NEONMAP1(vaddvq_s16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_s32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_s64, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_s8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_u16, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_u32, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_u64, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vaddvq_u8, aarch64_neon_vaddv, VectorRet | Add1ArgType),
+  NEONMAP1(vcaged_f64, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcages_f32, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcagtd_f64, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcagts_f32, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcaled_f64, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcales_f32, aarch64_neon_fcage, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcaltd_f64, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcalts_f32, aarch64_neon_fcagt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vceqd_f64, aarch64_neon_fceq, VectorRet | Add2ArgTypes),
+  NEONMAP1(vceqd_s64, aarch64_neon_vceq, VectorRetGetArgs01),
+  NEONMAP1(vceqd_u64, aarch64_neon_vceq, VectorRetGetArgs01),
+  NEONMAP1(vceqs_f32, aarch64_neon_fceq, VectorRet | Add2ArgTypes),
+  NEONMAP1(vceqzd_f64, aarch64_neon_fceq, FpCmpzModifiers),
+  NEONMAP1(vceqzd_s64, aarch64_neon_vceq, VectorRetGetArgs01),
+  NEONMAP1(vceqzd_u64, aarch64_neon_vceq, VectorRetGetArgs01),
+  NEONMAP1(vceqzs_f32, aarch64_neon_fceq, FpCmpzModifiers),
+  NEONMAP1(vcged_f64, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcged_s64, aarch64_neon_vcge, VectorRetGetArgs01),
+  NEONMAP1(vcged_u64, aarch64_neon_vchs, VectorRetGetArgs01),
+  NEONMAP1(vcges_f32, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcgezd_f64, aarch64_neon_fcge, FpCmpzModifiers),
+  NEONMAP1(vcgezd_s64, aarch64_neon_vcge, VectorRetGetArgs01),
+  NEONMAP1(vcgezs_f32, aarch64_neon_fcge, FpCmpzModifiers),
+  NEONMAP1(vcgtd_f64, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcgtd_s64, aarch64_neon_vcgt, VectorRetGetArgs01),
+  NEONMAP1(vcgtd_u64, aarch64_neon_vchi, VectorRetGetArgs01),
+  NEONMAP1(vcgts_f32, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcgtzd_f64, aarch64_neon_fcgt, FpCmpzModifiers),
+  NEONMAP1(vcgtzd_s64, aarch64_neon_vcgt, VectorRetGetArgs01),
+  NEONMAP1(vcgtzs_f32, aarch64_neon_fcgt, FpCmpzModifiers),
+  NEONMAP1(vcled_f64, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcled_s64, aarch64_neon_vcge, VectorRetGetArgs01),
+  NEONMAP1(vcled_u64, aarch64_neon_vchs, VectorRetGetArgs01),
+  NEONMAP1(vcles_f32, aarch64_neon_fcge, VectorRet | Add2ArgTypes),
+  NEONMAP1(vclezd_f64, aarch64_neon_fclez, FpCmpzModifiers),
+  NEONMAP1(vclezd_s64, aarch64_neon_vclez, VectorRetGetArgs01),
+  NEONMAP1(vclezs_f32, aarch64_neon_fclez, FpCmpzModifiers),
+  NEONMAP1(vcltd_f64, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcltd_s64, aarch64_neon_vcgt, VectorRetGetArgs01),
+  NEONMAP1(vcltd_u64, aarch64_neon_vchi, VectorRetGetArgs01),
+  NEONMAP1(vclts_f32, aarch64_neon_fcgt, VectorRet | Add2ArgTypes),
+  NEONMAP1(vcltzd_f64, aarch64_neon_fcltz, FpCmpzModifiers),
+  NEONMAP1(vcltzd_s64, aarch64_neon_vcltz, VectorRetGetArgs01),
+  NEONMAP1(vcltzs_f32, aarch64_neon_fcltz, FpCmpzModifiers),
+  NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtd_f64_s64, aarch64_neon_vcvtint2fps, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvtd_f64_u64, aarch64_neon_vcvtint2fpu, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp_n, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp_n, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs_n, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu_n, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvts_f32_s32, aarch64_neon_vcvtint2fps, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvts_f32_u32, aarch64_neon_vcvtint2fpu, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp_n, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp_n, AddRetType | Vectorize1ArgType),
+  NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs_n, VectorRet | Add1ArgType),
+  NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu_n, VectorRet | Add1ArgType),
+  NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, VectorRet | Add1ArgType),
+  NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, VectorRet | Add1ArgType),
+  NEONMAP1(vcvtxd_f32_f64, aarch64_neon_fcvtxn, 0),
+  NEONMAP0(vdupb_lane_i8),
+  NEONMAP0(vdupb_laneq_i8),
+  NEONMAP0(vdupd_lane_f64),
+  NEONMAP0(vdupd_lane_i64),
+  NEONMAP0(vdupd_laneq_f64),
+  NEONMAP0(vdupd_laneq_i64),
+  NEONMAP0(vduph_lane_i16),
+  NEONMAP0(vduph_laneq_i16),
+  NEONMAP0(vdups_lane_f32),
+  NEONMAP0(vdups_lane_i32),
+  NEONMAP0(vdups_laneq_f32),
+  NEONMAP0(vdups_laneq_i32),
+  NEONMAP0(vfmad_lane_f64),
+  NEONMAP0(vfmad_laneq_f64),
+  NEONMAP0(vfmas_lane_f32),
+  NEONMAP0(vfmas_laneq_f32),
+  NEONMAP0(vget_lane_f32),
+  NEONMAP0(vget_lane_f64),
+  NEONMAP0(vget_lane_i16),
+  NEONMAP0(vget_lane_i32),
+  NEONMAP0(vget_lane_i64),
+  NEONMAP0(vget_lane_i8),
+  NEONMAP0(vgetq_lane_f32),
+  NEONMAP0(vgetq_lane_f64),
+  NEONMAP0(vgetq_lane_i16),
+  NEONMAP0(vgetq_lane_i32),
+  NEONMAP0(vgetq_lane_i64),
+  NEONMAP0(vgetq_lane_i8),
+  NEONMAP1(vmaxnmv_f32, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxnmvq_f32, aarch64_neon_vmaxnmv, 0),
+  NEONMAP1(vmaxnmvq_f64, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxv_f32, aarch64_neon_vpmax, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxv_s16, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxv_s8, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxv_u16, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxv_u8, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_f32, aarch64_neon_vmaxv, 0),
+  NEONMAP1(vmaxvq_f64, aarch64_neon_vpmax, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxvq_s16, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_s8, aarch64_neon_smaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_u16, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vmaxvq_u8, aarch64_neon_umaxv, VectorRet | Add1ArgType),
+  NEONMAP1(vminnmv_f32, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
+  NEONMAP1(vminnmvq_f32, aarch64_neon_vminnmv, 0),
+  NEONMAP1(vminnmvq_f64, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
+  NEONMAP1(vminv_f32, aarch64_neon_vpmin, AddRetType | Add1ArgType),
+  NEONMAP1(vminv_s16, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminv_s32, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminv_s8, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminv_u16, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminv_u32, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminv_u8, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_f32, aarch64_neon_vminv, 0),
+  NEONMAP1(vminvq_f64, aarch64_neon_vpmin, AddRetType | Add1ArgType),
+  NEONMAP1(vminvq_s16, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_s32, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_s8, aarch64_neon_sminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_u16, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_u32, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP1(vminvq_u8, aarch64_neon_uminv, VectorRet | Add1ArgType),
+  NEONMAP0(vmul_n_f64),
+  NEONMAP1(vmull_p64, aarch64_neon_vmull_p64, 0),
+  NEONMAP0(vmulxd_f64),
+  NEONMAP0(vmulxs_f32),
+  NEONMAP1(vnegd_s64, aarch64_neon_vneg, 0),
+  NEONMAP1(vpaddd_f64, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
+  NEONMAP1(vpaddd_s64, aarch64_neon_vpadd, 0),
+  NEONMAP1(vpaddd_u64, aarch64_neon_vpadd, 0),
+  NEONMAP1(vpadds_f32, aarch64_neon_vpfadd, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxnmqd_f64, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxnms_f32, aarch64_neon_vpfmaxnm, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxqd_f64, aarch64_neon_vpmax, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxs_f32, aarch64_neon_vpmax, AddRetType | Add1ArgType),
+  NEONMAP1(vpminnmqd_f64, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
+  NEONMAP1(vpminnms_f32, aarch64_neon_vpfminnm, AddRetType | Add1ArgType),
+  NEONMAP1(vpminqd_f64, aarch64_neon_vpmin, AddRetType | Add1ArgType),
+  NEONMAP1(vpmins_f32, aarch64_neon_vpmin, AddRetType | Add1ArgType),
+  NEONMAP1(vqabsb_s8, arm_neon_vqabs, VectorRet),
+  NEONMAP1(vqabsd_s64, arm_neon_vqabs, VectorRet),
+  NEONMAP1(vqabsh_s16, arm_neon_vqabs, VectorRet),
+  NEONMAP1(vqabss_s32, arm_neon_vqabs, VectorRet),
+  NEONMAP1(vqaddb_s8, arm_neon_vqadds, VectorRet),
+  NEONMAP1(vqaddb_u8, arm_neon_vqaddu, VectorRet),
+  NEONMAP1(vqaddd_s64, arm_neon_vqadds, VectorRet),
+  NEONMAP1(vqaddd_u64, arm_neon_vqaddu, VectorRet),
+  NEONMAP1(vqaddh_s16, arm_neon_vqadds, VectorRet),
+  NEONMAP1(vqaddh_u16, arm_neon_vqaddu, VectorRet),
+  NEONMAP1(vqadds_s32, arm_neon_vqadds, VectorRet),
+  NEONMAP1(vqadds_u32, arm_neon_vqaddu, VectorRet),
+  NEONMAP0(vqdmlalh_lane_s16),
+  NEONMAP0(vqdmlalh_laneq_s16),
+  NEONMAP1(vqdmlalh_s16, aarch64_neon_vqdmlal, VectorRet),
+  NEONMAP0(vqdmlals_lane_s32),
+  NEONMAP0(vqdmlals_laneq_s32),
+  NEONMAP1(vqdmlals_s32, aarch64_neon_vqdmlal, VectorRet),
+  NEONMAP0(vqdmlslh_lane_s16),
+  NEONMAP0(vqdmlslh_laneq_s16),
+  NEONMAP1(vqdmlslh_s16, aarch64_neon_vqdmlsl, VectorRet),
+  NEONMAP0(vqdmlsls_lane_s32),
+  NEONMAP0(vqdmlsls_laneq_s32),
+  NEONMAP1(vqdmlsls_s32, aarch64_neon_vqdmlsl, VectorRet),
+  NEONMAP1(vqdmulhh_s16, arm_neon_vqdmulh, VectorRet),
+  NEONMAP1(vqdmulhs_s32, arm_neon_vqdmulh, VectorRet),
+  NEONMAP1(vqdmullh_s16, arm_neon_vqdmull, VectorRet),
+  NEONMAP1(vqdmulls_s32, arm_neon_vqdmull, VectorRet),
+  NEONMAP1(vqmovnd_s64, arm_neon_vqmovns, VectorRet),
+  NEONMAP1(vqmovnd_u64, arm_neon_vqmovnu, VectorRet),
+  NEONMAP1(vqmovnh_s16, arm_neon_vqmovns, VectorRet),
+  NEONMAP1(vqmovnh_u16, arm_neon_vqmovnu, VectorRet),
+  NEONMAP1(vqmovns_s32, arm_neon_vqmovns, VectorRet),
+  NEONMAP1(vqmovns_u32, arm_neon_vqmovnu, VectorRet),
+  NEONMAP1(vqmovund_s64, arm_neon_vqmovnsu, VectorRet),
+  NEONMAP1(vqmovunh_s16, arm_neon_vqmovnsu, VectorRet),
+  NEONMAP1(vqmovuns_s32, arm_neon_vqmovnsu, VectorRet),
+  NEONMAP1(vqnegb_s8, arm_neon_vqneg, VectorRet),
+  NEONMAP1(vqnegd_s64, arm_neon_vqneg, VectorRet),
+  NEONMAP1(vqnegh_s16, arm_neon_vqneg, VectorRet),
+  NEONMAP1(vqnegs_s32, arm_neon_vqneg, VectorRet),
+  NEONMAP1(vqrdmulhh_s16, arm_neon_vqrdmulh, VectorRet),
+  NEONMAP1(vqrdmulhs_s32, arm_neon_vqrdmulh, VectorRet),
+  NEONMAP1(vqrshlb_s8, aarch64_neon_vqrshls, VectorRet),
+  NEONMAP1(vqrshlb_u8, aarch64_neon_vqrshlu, VectorRet),
+  NEONMAP1(vqrshld_s64, aarch64_neon_vqrshls, VectorRet),
+  NEONMAP1(vqrshld_u64, aarch64_neon_vqrshlu, VectorRet),
+  NEONMAP1(vqrshlh_s16, aarch64_neon_vqrshls, VectorRet),
+  NEONMAP1(vqrshlh_u16, aarch64_neon_vqrshlu, VectorRet),
+  NEONMAP1(vqrshls_s32, aarch64_neon_vqrshls, VectorRet),
+  NEONMAP1(vqrshls_u32, aarch64_neon_vqrshlu, VectorRet),
+  NEONMAP1(vqrshrnd_n_s64, aarch64_neon_vsqrshrn, VectorRet),
+  NEONMAP1(vqrshrnd_n_u64, aarch64_neon_vuqrshrn, VectorRet),
+  NEONMAP1(vqrshrnh_n_s16, aarch64_neon_vsqrshrn, VectorRet),
+  NEONMAP1(vqrshrnh_n_u16, aarch64_neon_vuqrshrn, VectorRet),
+  NEONMAP1(vqrshrns_n_s32, aarch64_neon_vsqrshrn, VectorRet),
+  NEONMAP1(vqrshrns_n_u32, aarch64_neon_vuqrshrn, VectorRet),
+  NEONMAP1(vqrshrund_n_s64, aarch64_neon_vsqrshrun, VectorRet),
+  NEONMAP1(vqrshrunh_n_s16, aarch64_neon_vsqrshrun, VectorRet),
+  NEONMAP1(vqrshruns_n_s32, aarch64_neon_vsqrshrun, VectorRet),
+  NEONMAP1(vqshlb_n_s8, aarch64_neon_vqshls_n, VectorRet),
+  NEONMAP1(vqshlb_n_u8, aarch64_neon_vqshlu_n, VectorRet),
+  NEONMAP1(vqshlb_s8, aarch64_neon_vqshls, VectorRet),
+  NEONMAP1(vqshlb_u8, aarch64_neon_vqshlu, VectorRet),
+  NEONMAP1(vqshld_n_s64, aarch64_neon_vqshls_n, VectorRet),
+  NEONMAP1(vqshld_n_u64, aarch64_neon_vqshlu_n, VectorRet),
+  NEONMAP1(vqshld_s64, aarch64_neon_vqshls, VectorRet),
+  NEONMAP1(vqshld_u64, aarch64_neon_vqshlu, VectorRet),
+  NEONMAP1(vqshlh_n_s16, aarch64_neon_vqshls_n, VectorRet),
+  NEONMAP1(vqshlh_n_u16, aarch64_neon_vqshlu_n, VectorRet),
+  NEONMAP1(vqshlh_s16, aarch64_neon_vqshls, VectorRet),
+  NEONMAP1(vqshlh_u16, aarch64_neon_vqshlu, VectorRet),
+  NEONMAP1(vqshls_n_s32, aarch64_neon_vqshls_n, VectorRet),
+  NEONMAP1(vqshls_n_u32, aarch64_neon_vqshlu_n, VectorRet),
+  NEONMAP1(vqshls_s32, aarch64_neon_vqshls, VectorRet),
+  NEONMAP1(vqshls_u32, aarch64_neon_vqshlu, VectorRet),
+  NEONMAP1(vqshlub_n_s8, aarch64_neon_vsqshlu, VectorRet),
+  NEONMAP1(vqshlud_n_s64, aarch64_neon_vsqshlu, VectorRet),
+  NEONMAP1(vqshluh_n_s16, aarch64_neon_vsqshlu, VectorRet),
+  NEONMAP1(vqshlus_n_s32, aarch64_neon_vsqshlu, VectorRet),
+  NEONMAP1(vqshrnd_n_s64, aarch64_neon_vsqshrn, VectorRet),
+  NEONMAP1(vqshrnd_n_u64, aarch64_neon_vuqshrn, VectorRet),
+  NEONMAP1(vqshrnh_n_s16, aarch64_neon_vsqshrn, VectorRet),
+  NEONMAP1(vqshrnh_n_u16, aarch64_neon_vuqshrn, VectorRet),
+  NEONMAP1(vqshrns_n_s32, aarch64_neon_vsqshrn, VectorRet),
+  NEONMAP1(vqshrns_n_u32, aarch64_neon_vuqshrn, VectorRet),
+  NEONMAP1(vqshrund_n_s64, aarch64_neon_vsqshrun, VectorRet),
+  NEONMAP1(vqshrunh_n_s16, aarch64_neon_vsqshrun, VectorRet),
+  NEONMAP1(vqshruns_n_s32, aarch64_neon_vsqshrun, VectorRet),
+  NEONMAP1(vqsubb_s8, arm_neon_vqsubs, VectorRet),
+  NEONMAP1(vqsubb_u8, arm_neon_vqsubu, VectorRet),
+  NEONMAP1(vqsubd_s64, arm_neon_vqsubs, VectorRet),
+  NEONMAP1(vqsubd_u64, arm_neon_vqsubu, VectorRet),
+  NEONMAP1(vqsubh_s16, arm_neon_vqsubs, VectorRet),
+  NEONMAP1(vqsubh_u16, arm_neon_vqsubu, VectorRet),
+  NEONMAP1(vqsubs_s32, arm_neon_vqsubs, VectorRet),
+  NEONMAP1(vqsubs_u32, arm_neon_vqsubu, VectorRet),
+  NEONMAP1(vrecped_f64, aarch64_neon_vrecpe, AddRetType),
+  NEONMAP1(vrecpes_f32, aarch64_neon_vrecpe, AddRetType),
+  NEONMAP1(vrecpsd_f64, aarch64_neon_vrecps, AddRetType),
+  NEONMAP1(vrecpss_f32, aarch64_neon_vrecps, AddRetType),
+  NEONMAP1(vrecpxd_f64, aarch64_neon_vrecpx, AddRetType),
+  NEONMAP1(vrecpxs_f32, aarch64_neon_vrecpx, AddRetType),
+  NEONMAP1(vrshld_s64, aarch64_neon_vrshlds, 0),
+  NEONMAP1(vrshld_u64, aarch64_neon_vrshldu, 0),
+  NEONMAP1(vrshrd_n_s64, aarch64_neon_vsrshr, VectorRet),
+  NEONMAP1(vrshrd_n_u64, aarch64_neon_vurshr, VectorRet),
+  NEONMAP1(vrsqrted_f64, aarch64_neon_vrsqrte, AddRetType),
+  NEONMAP1(vrsqrtes_f32, aarch64_neon_vrsqrte, AddRetType),
+  NEONMAP1(vrsqrtsd_f64, aarch64_neon_vrsqrts, AddRetType),
+  NEONMAP1(vrsqrtss_f32, aarch64_neon_vrsqrts, AddRetType),
+  NEONMAP1(vrsrad_n_s64, aarch64_neon_vrsrads_n, 0),
+  NEONMAP1(vrsrad_n_u64, aarch64_neon_vrsradu_n, 0),
+  NEONMAP0(vset_lane_f32),
+  NEONMAP0(vset_lane_f64),
+  NEONMAP0(vset_lane_i16),
+  NEONMAP0(vset_lane_i32),
+  NEONMAP0(vset_lane_i64),
+  NEONMAP0(vset_lane_i8),
+  NEONMAP0(vsetq_lane_f32),
+  NEONMAP0(vsetq_lane_f64),
+  NEONMAP0(vsetq_lane_i16),
+  NEONMAP0(vsetq_lane_i32),
+  NEONMAP0(vsetq_lane_i64),
+  NEONMAP0(vsetq_lane_i8),
+  NEONMAP1(vsha1cq_u32, arm_neon_sha1c, 0),
+  NEONMAP1(vsha1h_u32, arm_neon_sha1h, 0),
+  NEONMAP1(vsha1mq_u32, arm_neon_sha1m, 0),
+  NEONMAP1(vsha1pq_u32, arm_neon_sha1p, 0),
+  NEONMAP1(vshld_n_s64, aarch64_neon_vshld_n, 0),
+  NEONMAP1(vshld_n_u64, aarch64_neon_vshld_n, 0),
+  NEONMAP1(vshld_s64, aarch64_neon_vshlds, 0),
+  NEONMAP1(vshld_u64, aarch64_neon_vshldu, 0),
+  NEONMAP1(vshrd_n_s64, aarch64_neon_vshrds_n, 0),
+  NEONMAP1(vshrd_n_u64, aarch64_neon_vshrdu_n, 0),
+  NEONMAP1(vslid_n_s64, aarch64_neon_vsli, VectorRet),
+  NEONMAP1(vslid_n_u64, aarch64_neon_vsli, VectorRet),
+  NEONMAP1(vsqaddb_u8, aarch64_neon_vsqadd, VectorRet),
+  NEONMAP1(vsqaddd_u64, aarch64_neon_vsqadd, VectorRet),
+  NEONMAP1(vsqaddh_u16, aarch64_neon_vsqadd, VectorRet),
+  NEONMAP1(vsqadds_u32, aarch64_neon_vsqadd, VectorRet),
+  NEONMAP1(vsrad_n_s64, aarch64_neon_vsrads_n, 0),
+  NEONMAP1(vsrad_n_u64, aarch64_neon_vsradu_n, 0),
+  NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, VectorRet),
+  NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, VectorRet),
+  NEONMAP1(vsubd_s64, aarch64_neon_vsubds, 0),
+  NEONMAP1(vsubd_u64, aarch64_neon_vsubdu, 0),
+  NEONMAP1(vtstd_s64, aarch64_neon_vtstd, VectorRetGetArgs01),
+  NEONMAP1(vtstd_u64, aarch64_neon_vtstd, VectorRetGetArgs01),
+  NEONMAP1(vuqaddb_s8, aarch64_neon_vuqadd, VectorRet),
+  NEONMAP1(vuqaddd_s64, aarch64_neon_vuqadd, VectorRet),
+  NEONMAP1(vuqaddh_s16, aarch64_neon_vuqadd, VectorRet),
+  NEONMAP1(vuqadds_s32, aarch64_neon_vuqadd, VectorRet)
+};
+
+static NeonIntrinsicInfo ARMSIMDIntrinsicMap[] = {
+  NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
+  NEONMAP1(vabs_v, arm_neon_vabs, 0),
+  NEONMAP1(vabsq_v, arm_neon_vabs, 0),
+  NEONMAP0(vaddhn_v),
+  NEONMAP1(vaesdq_v, arm_neon_aesd, 0),
+  NEONMAP1(vaeseq_v, arm_neon_aese, 0),
+  NEONMAP1(vaesimcq_v, arm_neon_aesimc, 0),
+  NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0),
+  NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
+  NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
+  NEONMAP1(vcage_v, arm_neon_vacge, 0),
+  NEONMAP1(vcageq_v, arm_neon_vacge, 0),
+  NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
+  NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
+  NEONMAP1(vcale_v, arm_neon_vacge, 0),
+  NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
+  NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
+  NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
+  NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
+  NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
+  NEONMAP1(vclz_v, ctlz, Add1ArgType),
+  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
+  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
+  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
+  NEONMAP1(vcvt_f16_v, arm_neon_vcvtfp2hf, 0),
+  NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
+  NEONMAP0(vcvt_f32_v),
+  NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
+  NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
+  NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
+  NEONMAP0(vcvt_s32_v),
+  NEONMAP0(vcvt_s64_v),
+  NEONMAP0(vcvt_u32_v),
+  NEONMAP0(vcvt_u64_v),
+  NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
+  NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
+  NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
+  NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
+  NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
+  NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
+  NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
+  NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
+  NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
+  NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
+  NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
+  NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
+  NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
+  NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
+  NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
+  NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
+  NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
+  NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
+  NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
+  NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
+  NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
+  NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
+  NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
+  NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
+  NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
+  NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
+  NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
+  NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
+  NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
+  NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
+  NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
+  NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
+  NEONMAP0(vcvtq_f32_v),
+  NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
+  NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
+  NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
+  NEONMAP0(vcvtq_s32_v),
+  NEONMAP0(vcvtq_s64_v),
+  NEONMAP0(vcvtq_u32_v),
+  NEONMAP0(vcvtq_u64_v),
+  NEONMAP0(vext_v),
+  NEONMAP0(vextq_v),
+  NEONMAP0(vfma_v),
+  NEONMAP0(vfmaq_v),
+  NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
+  NEONMAP0(vld1_dup_v),
+  NEONMAP1(vld1_v, arm_neon_vld1, 0),
+  NEONMAP0(vld1q_dup_v),
+  NEONMAP1(vld1q_v, arm_neon_vld1, 0),
+  NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
+  NEONMAP1(vld2_v, arm_neon_vld2, 0),
+  NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
+  NEONMAP1(vld2q_v, arm_neon_vld2, 0),
+  NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
+  NEONMAP1(vld3_v, arm_neon_vld3, 0),
+  NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
+  NEONMAP1(vld3q_v, arm_neon_vld3, 0),
+  NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
+  NEONMAP1(vld4_v, arm_neon_vld4, 0),
+  NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
+  NEONMAP1(vld4q_v, arm_neon_vld4, 0),
+  NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
+  NEONMAP0(vmovl_v),
+  NEONMAP0(vmovn_v),
+  NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
+  NEONMAP0(vmull_v),
+  NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
+  NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
+  NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
+  NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
+  NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
+  NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
+  NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
+  NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
+  NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
+  NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
+  NEONMAP2(vqadd_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqaddq_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqdmlal_v, arm_neon_vqdmull, arm_neon_vqadds, 0),
+  NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, arm_neon_vqsubs, 0),
+  NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
+  NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
+  NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
+  NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
+  NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
+  NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
+  NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
+  NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
+  NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
+  NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
+  NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
+  NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqsub_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqsubq_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts),
+  NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
+  NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
+  NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
+  NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
+  NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
+  NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
+  NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
+  NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
+  NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
+  NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
+  NEONMAP1(vsha1su0q_v, arm_neon_sha1su0, 0),
+  NEONMAP1(vsha1su1q_v, arm_neon_sha1su1, 0),
+  NEONMAP1(vsha256h2q_v, arm_neon_sha256h2, 0),
+  NEONMAP1(vsha256hq_v, arm_neon_sha256h, 0),
+  NEONMAP1(vsha256su0q_v, arm_neon_sha256su0, 0),
+  NEONMAP1(vsha256su1q_v, arm_neon_sha256su1, 0),
+  NEONMAP0(vshl_n_v),
+  NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP0(vshll_n_v),
+  NEONMAP0(vshlq_n_v),
+  NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
+  NEONMAP0(vshr_n_v),
+  NEONMAP0(vshrn_n_v),
+  NEONMAP0(vshrq_n_v),
+  NEONMAP1(vst1_v, arm_neon_vst1, 0),
+  NEONMAP1(vst1q_v, arm_neon_vst1, 0),
+  NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
+  NEONMAP1(vst2_v, arm_neon_vst2, 0),
+  NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
+  NEONMAP1(vst2q_v, arm_neon_vst2, 0),
+  NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
+  NEONMAP1(vst3_v, arm_neon_vst3, 0),
+  NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
+  NEONMAP1(vst3q_v, arm_neon_vst3, 0),
+  NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
+  NEONMAP1(vst4_v, arm_neon_vst4, 0),
+  NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
+  NEONMAP1(vst4q_v, arm_neon_vst4, 0),
+  NEONMAP0(vsubhn_v),
+  NEONMAP0(vtrn_v),
+  NEONMAP0(vtrnq_v),
+  NEONMAP0(vtst_v),
+  NEONMAP0(vtstq_v),
+  NEONMAP0(vuzp_v),
+  NEONMAP0(vuzpq_v),
+  NEONMAP0(vzip_v),
+  NEONMAP0(vzipq_v)
+};
+
+static NeonIntrinsicInfo ARM64SIMDIntrinsicMap[] = {
+  NEONMAP1(vabs_v, arm64_neon_abs, 0),
+  NEONMAP1(vabsq_v, arm64_neon_abs, 0),
+  NEONMAP0(vaddhn_v),
+  NEONMAP1(vaesdq_v, arm64_crypto_aesd, 0),
+  NEONMAP1(vaeseq_v, arm64_crypto_aese, 0),
+  NEONMAP1(vaesimcq_v, arm64_crypto_aesimc, 0),
+  NEONMAP1(vaesmcq_v, arm64_crypto_aesmc, 0),
+  NEONMAP1(vcage_v, arm64_neon_facge, 0),
+  NEONMAP1(vcageq_v, arm64_neon_facge, 0),
+  NEONMAP1(vcagt_v, arm64_neon_facgt, 0),
+  NEONMAP1(vcagtq_v, arm64_neon_facgt, 0),
+  NEONMAP1(vcale_v, arm64_neon_facge, 0),
+  NEONMAP1(vcaleq_v, arm64_neon_facge, 0),
+  NEONMAP1(vcalt_v, arm64_neon_facgt, 0),
+  NEONMAP1(vcaltq_v, arm64_neon_facgt, 0),
+  NEONMAP1(vcls_v, arm64_neon_cls, Add1ArgType),
+  NEONMAP1(vclsq_v, arm64_neon_cls, Add1ArgType),
+  NEONMAP1(vclz_v, ctlz, Add1ArgType),
+  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
+  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
+  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
+  NEONMAP1(vcvt_f16_v, arm64_neon_vcvtfp2hf, 0),
+  NEONMAP1(vcvt_f32_f16, arm64_neon_vcvthf2fp, 0),
+  NEONMAP0(vcvt_f32_v),
+  NEONMAP2(vcvt_n_f32_v, arm64_neon_vcvtfxu2fp, arm64_neon_vcvtfxs2fp, 0),
+  NEONMAP2(vcvt_n_f64_v, arm64_neon_vcvtfxu2fp, arm64_neon_vcvtfxs2fp, 0),
+  NEONMAP1(vcvt_n_s32_v, arm64_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvt_n_s64_v, arm64_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvt_n_u32_v, arm64_neon_vcvtfp2fxu, 0),
+  NEONMAP1(vcvt_n_u64_v, arm64_neon_vcvtfp2fxu, 0),
+  NEONMAP0(vcvtq_f32_v),
+  NEONMAP2(vcvtq_n_f32_v, arm64_neon_vcvtfxu2fp, arm64_neon_vcvtfxs2fp, 0),
+  NEONMAP2(vcvtq_n_f64_v, arm64_neon_vcvtfxu2fp, arm64_neon_vcvtfxs2fp, 0),
+  NEONMAP1(vcvtq_n_s32_v, arm64_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvtq_n_s64_v, arm64_neon_vcvtfp2fxs, 0),
+  NEONMAP1(vcvtq_n_u32_v, arm64_neon_vcvtfp2fxu, 0),
+  NEONMAP1(vcvtq_n_u64_v, arm64_neon_vcvtfp2fxu, 0),
+  NEONMAP1(vcvtx_f32_v, arm64_neon_fcvtxn, AddRetType | Add1ArgType),
+  NEONMAP0(vext_v),
+  NEONMAP0(vextq_v),
+  NEONMAP0(vfma_v),
+  NEONMAP0(vfmaq_v),
+  NEONMAP2(vhadd_v, arm64_neon_uhadd, arm64_neon_shadd, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vhaddq_v, arm64_neon_uhadd, arm64_neon_shadd, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vhsub_v, arm64_neon_uhsub, arm64_neon_shsub, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vhsubq_v, arm64_neon_uhsub, arm64_neon_shsub, Add1ArgType | UnsignedAlts),
+  NEONMAP0(vmovl_v),
+  NEONMAP0(vmovn_v),
+  NEONMAP1(vmul_v, arm64_neon_pmul, Add1ArgType),
+  NEONMAP1(vmulq_v, arm64_neon_pmul, Add1ArgType),
+  NEONMAP1(vpadd_v, arm64_neon_addp, Add1ArgType),
+  NEONMAP2(vpaddl_v, arm64_neon_uaddlp, arm64_neon_saddlp, UnsignedAlts),
+  NEONMAP2(vpaddlq_v, arm64_neon_uaddlp, arm64_neon_saddlp, UnsignedAlts),
+  NEONMAP1(vpaddq_v, arm64_neon_addp, Add1ArgType),
+  NEONMAP1(vqabs_v, arm64_neon_sqabs, Add1ArgType),
+  NEONMAP1(vqabsq_v, arm64_neon_sqabs, Add1ArgType),
+  NEONMAP2(vqadd_v, arm64_neon_uqadd, arm64_neon_sqadd, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqaddq_v, arm64_neon_uqadd, arm64_neon_sqadd, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqdmlal_v, arm64_neon_sqdmull, arm64_neon_sqadd, 0),
+  NEONMAP2(vqdmlsl_v, arm64_neon_sqdmull, arm64_neon_sqsub, 0),
+  NEONMAP1(vqdmulh_v, arm64_neon_sqdmulh, Add1ArgType),
+  NEONMAP1(vqdmulhq_v, arm64_neon_sqdmulh, Add1ArgType),
+  NEONMAP1(vqdmull_v, arm64_neon_sqdmull, Add1ArgType),
+  NEONMAP2(vqmovn_v, arm64_neon_uqxtn, arm64_neon_sqxtn, Add1ArgType | UnsignedAlts),
+  NEONMAP1(vqmovun_v, arm64_neon_sqxtun, Add1ArgType),
+  NEONMAP1(vqneg_v, arm64_neon_sqneg, Add1ArgType),
+  NEONMAP1(vqnegq_v, arm64_neon_sqneg, Add1ArgType),
+  NEONMAP1(vqrdmulh_v, arm64_neon_sqrdmulh, Add1ArgType),
+  NEONMAP1(vqrdmulhq_v, arm64_neon_sqrdmulh, Add1ArgType),
+  NEONMAP2(vqrshl_v, arm64_neon_uqrshl, arm64_neon_sqrshl, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqrshlq_v, arm64_neon_uqrshl, arm64_neon_sqrshl, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqshl_n_v, arm64_neon_uqshl, arm64_neon_sqshl, UnsignedAlts),
+  NEONMAP2(vqshl_v, arm64_neon_uqshl, arm64_neon_sqshl, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqshlq_n_v, arm64_neon_uqshl, arm64_neon_sqshl, UnsignedAlts),
+  NEONMAP2(vqshlq_v, arm64_neon_uqshl, arm64_neon_sqshl, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqsub_v, arm64_neon_uqsub, arm64_neon_sqsub, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vqsubq_v, arm64_neon_uqsub, arm64_neon_sqsub, Add1ArgType | UnsignedAlts),
+  NEONMAP1(vraddhn_v, arm64_neon_raddhn, Add1ArgType),
+  NEONMAP2(vrecpe_v, arm64_neon_frecpe, arm64_neon_urecpe, 0),
+  NEONMAP2(vrecpeq_v, arm64_neon_frecpe, arm64_neon_urecpe, 0),
+  NEONMAP1(vrecps_v, arm64_neon_frecps, Add1ArgType),
+  NEONMAP1(vrecpsq_v, arm64_neon_frecps, Add1ArgType),
+  NEONMAP2(vrhadd_v, arm64_neon_urhadd, arm64_neon_srhadd, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrhaddq_v, arm64_neon_urhadd, arm64_neon_srhadd, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrshl_v, arm64_neon_urshl, arm64_neon_srshl, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrshlq_v, arm64_neon_urshl, arm64_neon_srshl, Add1ArgType | UnsignedAlts),
+  NEONMAP2(vrsqrte_v, arm64_neon_frsqrte, arm64_neon_ursqrte, 0),
+  NEONMAP2(vrsqrteq_v, arm64_neon_frsqrte, arm64_neon_ursqrte, 0),
+  NEONMAP1(vrsqrts_v, arm64_neon_frsqrts, Add1ArgType),
+  NEONMAP1(vrsqrtsq_v, arm64_neon_frsqrts, Add1ArgType),
+  NEONMAP1(vrsubhn_v, arm64_neon_rsubhn, Add1ArgType),
+  NEONMAP1(vsha1su0q_v, arm64_crypto_sha1su0, 0),
+  NEONMAP1(vsha1su1q_v, arm64_crypto_sha1su1, 0),
+  NEONMAP1(vsha256h2q_v, arm64_crypto_sha256h2, 0),
+  NEONMAP1(vsha256hq_v, arm64_crypto_sha256h, 0),
+  NEONMAP1(vsha256su0q_v, arm64_crypto_sha256su0, 0),
+  NEONMAP1(vsha256su1q_v, arm64_crypto_sha256su1, 0),
+  NEONMAP0(vshl_n_v),
+  NEONMAP2(vshl_v, arm64_neon_ushl, arm64_neon_sshl, Add1ArgType | UnsignedAlts),
+  NEONMAP0(vshll_n_v),
+  NEONMAP0(vshlq_n_v),
+  NEONMAP2(vshlq_v, arm64_neon_ushl, arm64_neon_sshl, Add1ArgType | UnsignedAlts),
+  NEONMAP0(vshr_n_v),
+  NEONMAP0(vshrn_n_v),
+  NEONMAP0(vshrq_n_v),
+  NEONMAP0(vsubhn_v),
+  NEONMAP0(vtst_v),
+  NEONMAP0(vtstq_v),
+};
+
+static NeonIntrinsicInfo ARM64SISDIntrinsicMap[] = {
+  NEONMAP1(vabdd_f64, arm64_sisd_fabd, Add1ArgType),
+  NEONMAP1(vabds_f32, arm64_sisd_fabd, Add1ArgType),
+  NEONMAP1(vabsd_s64, arm64_neon_abs, Add1ArgType),
+  NEONMAP1(vaddlv_s32, arm64_neon_saddlv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddlv_u32, arm64_neon_uaddlv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddlvq_s32, arm64_neon_saddlv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddlvq_u32, arm64_neon_uaddlv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddv_f32, arm64_neon_faddv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddv_s32, arm64_neon_saddv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddv_u32, arm64_neon_uaddv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddvq_f32, arm64_neon_faddv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddvq_f64, arm64_neon_faddv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddvq_s32, arm64_neon_saddv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddvq_s64, arm64_neon_saddv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddvq_u32, arm64_neon_uaddv, AddRetType | Add1ArgType),
+  NEONMAP1(vaddvq_u64, arm64_neon_uaddv, AddRetType | Add1ArgType),
+  NEONMAP1(vcaged_f64, arm64_neon_facge, AddRetType | Add1ArgType),
+  NEONMAP1(vcages_f32, arm64_neon_facge, AddRetType | Add1ArgType),
+  NEONMAP1(vcagtd_f64, arm64_neon_facgt, AddRetType | Add1ArgType),
+  NEONMAP1(vcagts_f32, arm64_neon_facgt, AddRetType | Add1ArgType),
+  NEONMAP1(vcaled_f64, arm64_neon_facge, AddRetType | Add1ArgType),
+  NEONMAP1(vcales_f32, arm64_neon_facge, AddRetType | Add1ArgType),
+  NEONMAP1(vcaltd_f64, arm64_neon_facgt, AddRetType | Add1ArgType),
+  NEONMAP1(vcalts_f32, arm64_neon_facgt, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtad_s64_f64, arm64_neon_fcvtas, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtad_u64_f64, arm64_neon_fcvtau, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtas_s32_f32, arm64_neon_fcvtas, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtas_u32_f32, arm64_neon_fcvtau, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtd_n_f64_s64, arm64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtd_n_f64_u64, arm64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtd_n_s64_f64, arm64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtd_n_u64_f64, arm64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmd_s64_f64, arm64_neon_fcvtms, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtmd_u64_f64, arm64_neon_fcvtmu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtms_s32_f32, arm64_neon_fcvtms, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtms_u32_f32, arm64_neon_fcvtmu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnd_s64_f64, arm64_neon_fcvtns, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtnd_u64_f64, arm64_neon_fcvtnu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtns_s32_f32, arm64_neon_fcvtns, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtns_u32_f32, arm64_neon_fcvtnu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtpd_s64_f64, arm64_neon_fcvtps, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtpd_u64_f64, arm64_neon_fcvtpu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtps_s32_f32, arm64_neon_fcvtps, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtps_u32_f32, arm64_neon_fcvtpu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvts_n_f32_s32, arm64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvts_n_f32_u32, arm64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
+  NEONMAP1(vcvts_n_s32_f32, arm64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
+  NEONMAP1(vcvts_n_u32_f32, arm64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvtxd_f32_f64, arm64_sisd_fcvtxn, 0),
+  NEONMAP1(vmaxnmv_f32, arm64_neon_fmaxnmv, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxnmvq_f32, arm64_neon_fmaxnmv, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxnmvq_f64, arm64_neon_fmaxnmv, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxv_f32, arm64_neon_fmaxv, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxv_s32, arm64_neon_smaxv, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxv_u32, arm64_neon_umaxv, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxvq_f32, arm64_neon_fmaxv, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxvq_f64, arm64_neon_fmaxv, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxvq_s32, arm64_neon_smaxv, AddRetType | Add1ArgType),
+  NEONMAP1(vmaxvq_u32, arm64_neon_umaxv, AddRetType | Add1ArgType),
+  NEONMAP1(vminnmv_f32, arm64_neon_fminnmv, AddRetType | Add1ArgType),
+  NEONMAP1(vminnmvq_f32, arm64_neon_fminnmv, AddRetType | Add1ArgType),
+  NEONMAP1(vminnmvq_f64, arm64_neon_fminnmv, AddRetType | Add1ArgType),
+  NEONMAP1(vminv_f32, arm64_neon_fminv, AddRetType | Add1ArgType),
+  NEONMAP1(vminv_s32, arm64_neon_sminv, AddRetType | Add1ArgType),
+  NEONMAP1(vminv_u32, arm64_neon_uminv, AddRetType | Add1ArgType),
+  NEONMAP1(vminvq_f32, arm64_neon_fminv, AddRetType | Add1ArgType),
+  NEONMAP1(vminvq_f64, arm64_neon_fminv, AddRetType | Add1ArgType),
+  NEONMAP1(vminvq_s32, arm64_neon_sminv, AddRetType | Add1ArgType),
+  NEONMAP1(vminvq_u32, arm64_neon_uminv, AddRetType | Add1ArgType),
+  NEONMAP1(vmull_p64, arm64_neon_pmull64, 0),
+  NEONMAP1(vmulxd_f64, arm64_neon_fmulx, Add1ArgType),
+  NEONMAP1(vmulxs_f32, arm64_neon_fmulx, Add1ArgType),
+  NEONMAP1(vpaddd_s64, arm64_neon_uaddv, AddRetType | Add1ArgType),
+  NEONMAP1(vpaddd_u64, arm64_neon_uaddv, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxnmqd_f64, arm64_neon_fmaxnmv, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxnms_f32, arm64_neon_fmaxnmv, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxqd_f64, arm64_neon_fmaxv, AddRetType | Add1ArgType),
+  NEONMAP1(vpmaxs_f32, arm64_neon_fmaxv, AddRetType | Add1ArgType),
+  NEONMAP1(vpminnmqd_f64, arm64_neon_fminnmv, AddRetType | Add1ArgType),
+  NEONMAP1(vpminnms_f32, arm64_neon_fminnmv, AddRetType | Add1ArgType),
+  NEONMAP1(vpminqd_f64, arm64_neon_fminv, AddRetType | Add1ArgType),
+  NEONMAP1(vpmins_f32, arm64_neon_fminv, AddRetType | Add1ArgType),
+  NEONMAP1(vqabsb_s8, arm64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqabsd_s64, arm64_neon_sqabs, Add1ArgType),
+  NEONMAP1(vqabsh_s16, arm64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqabss_s32, arm64_neon_sqabs, Add1ArgType),
+  NEONMAP1(vqaddb_s8, arm64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqaddb_u8, arm64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqaddd_s64, arm64_neon_sqadd, Add1ArgType),
+  NEONMAP1(vqaddd_u64, arm64_neon_uqadd, Add1ArgType),
+  NEONMAP1(vqaddh_s16, arm64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqaddh_u16, arm64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqadds_s32, arm64_neon_sqadd, Add1ArgType),
+  NEONMAP1(vqadds_u32, arm64_neon_uqadd, Add1ArgType),
+  NEONMAP1(vqdmulhh_s16, arm64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqdmulhs_s32, arm64_neon_sqdmulh, Add1ArgType),
+  NEONMAP1(vqdmullh_s16, arm64_neon_sqdmull, VectorRet | Use128BitVectors),
+  NEONMAP1(vqdmulls_s32, arm64_neon_sqdmulls_scalar, 0),
+  NEONMAP1(vqmovnd_s64, arm64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
+  NEONMAP1(vqmovnd_u64, arm64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
+  NEONMAP1(vqmovnh_s16, arm64_neon_sqxtn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqmovnh_u16, arm64_neon_uqxtn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqmovns_s32, arm64_neon_sqxtn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqmovns_u32, arm64_neon_uqxtn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqmovund_s64, arm64_neon_scalar_sqxtun, AddRetType | Add1ArgType),
+  NEONMAP1(vqmovunh_s16, arm64_neon_sqxtun, VectorRet | Use64BitVectors),
+  NEONMAP1(vqmovuns_s32, arm64_neon_sqxtun, VectorRet | Use64BitVectors),
+  NEONMAP1(vqnegb_s8, arm64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqnegd_s64, arm64_neon_sqneg, Add1ArgType),
+  NEONMAP1(vqnegh_s16, arm64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqnegs_s32, arm64_neon_sqneg, Add1ArgType),
+  NEONMAP1(vqrdmulhh_s16, arm64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqrdmulhs_s32, arm64_neon_sqrdmulh, Add1ArgType),
+  NEONMAP1(vqrshlb_s8, arm64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqrshlb_u8, arm64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqrshld_s64, arm64_neon_sqrshl, Add1ArgType),
+  NEONMAP1(vqrshld_u64, arm64_neon_uqrshl, Add1ArgType),
+  NEONMAP1(vqrshlh_s16, arm64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqrshlh_u16, arm64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqrshls_s32, arm64_neon_sqrshl, Add1ArgType),
+  NEONMAP1(vqrshls_u32, arm64_neon_uqrshl, Add1ArgType),
+  NEONMAP1(vqrshrnd_n_s64, arm64_neon_sqrshrn, AddRetType),
+  NEONMAP1(vqrshrnd_n_u64, arm64_neon_uqrshrn, AddRetType),
+  NEONMAP1(vqrshrnh_n_s16, arm64_neon_sqrshrn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqrshrnh_n_u16, arm64_neon_uqrshrn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqrshrns_n_s32, arm64_neon_sqrshrn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqrshrns_n_u32, arm64_neon_uqrshrn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqrshrund_n_s64, arm64_neon_sqrshrun, AddRetType),
+  NEONMAP1(vqrshrunh_n_s16, arm64_neon_sqrshrun, VectorRet | Use64BitVectors),
+  NEONMAP1(vqrshruns_n_s32, arm64_neon_sqrshrun, VectorRet | Use64BitVectors),
+  NEONMAP1(vqshlb_n_s8, arm64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqshlb_n_u8, arm64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqshlb_s8, arm64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqshlb_u8, arm64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqshld_s64, arm64_neon_sqshl, Add1ArgType),
+  NEONMAP1(vqshld_u64, arm64_neon_uqshl, Add1ArgType),
+  NEONMAP1(vqshlh_n_s16, arm64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqshlh_n_u16, arm64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqshlh_s16, arm64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqshlh_u16, arm64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqshls_n_s32, arm64_neon_sqshl, Add1ArgType),
+  NEONMAP1(vqshls_n_u32, arm64_neon_uqshl, Add1ArgType),
+  NEONMAP1(vqshls_s32, arm64_neon_sqshl, Add1ArgType),
+  NEONMAP1(vqshls_u32, arm64_neon_uqshl, Add1ArgType),
+  NEONMAP1(vqshlub_n_s8, arm64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqshluh_n_s16, arm64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqshlus_n_s32, arm64_neon_sqshlu, Add1ArgType),
+  NEONMAP1(vqshrnd_n_s64, arm64_neon_sqshrn, AddRetType),
+  NEONMAP1(vqshrnd_n_u64, arm64_neon_uqshrn, AddRetType),
+  NEONMAP1(vqshrnh_n_s16, arm64_neon_sqshrn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqshrnh_n_u16, arm64_neon_uqshrn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqshrns_n_s32, arm64_neon_sqshrn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqshrns_n_u32, arm64_neon_uqshrn, VectorRet | Use64BitVectors),
+  NEONMAP1(vqshrund_n_s64, arm64_neon_sqshrun, AddRetType),
+  NEONMAP1(vqshrunh_n_s16, arm64_neon_sqshrun, VectorRet | Use64BitVectors),
+  NEONMAP1(vqshruns_n_s32, arm64_neon_sqshrun, VectorRet | Use64BitVectors),
+  NEONMAP1(vqsubb_s8, arm64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqsubb_u8, arm64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqsubd_s64, arm64_neon_sqsub, Add1ArgType),
+  NEONMAP1(vqsubd_u64, arm64_neon_uqsub, Add1ArgType),
+  NEONMAP1(vqsubh_s16, arm64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqsubh_u16, arm64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vqsubs_s32, arm64_neon_sqsub, Add1ArgType),
+  NEONMAP1(vqsubs_u32, arm64_neon_uqsub, Add1ArgType),
+  NEONMAP1(vrecped_f64, arm64_neon_frecpe, Add1ArgType),
+  NEONMAP1(vrecpes_f32, arm64_neon_frecpe, Add1ArgType),
+  NEONMAP1(vrecpxd_f64, arm64_neon_frecpx, Add1ArgType),
+  NEONMAP1(vrecpxs_f32, arm64_neon_frecpx, Add1ArgType),
+  NEONMAP1(vrshld_s64, arm64_neon_srshl, Add1ArgType),
+  NEONMAP1(vrshld_u64, arm64_neon_urshl, Add1ArgType),
+  NEONMAP1(vrsqrted_f64, arm64_neon_frsqrte, Add1ArgType),
+  NEONMAP1(vrsqrtes_f32, arm64_neon_frsqrte, Add1ArgType),
+  NEONMAP1(vrsqrtsd_f64, arm64_neon_frsqrts, Add1ArgType),
+  NEONMAP1(vrsqrtss_f32, arm64_neon_frsqrts, Add1ArgType),
+  NEONMAP1(vsha1cq_u32, arm64_crypto_sha1c, 0),
+  NEONMAP1(vsha1h_u32, arm64_crypto_sha1h, 0),
+  NEONMAP1(vsha1mq_u32, arm64_crypto_sha1m, 0),
+  NEONMAP1(vsha1pq_u32, arm64_crypto_sha1p, 0),
+  NEONMAP1(vshld_s64, arm64_neon_sshl, Add1ArgType),
+  NEONMAP1(vshld_u64, arm64_neon_ushl, Add1ArgType),
+  NEONMAP1(vslid_n_s64, arm64_neon_vsli, Vectorize1ArgType),
+  NEONMAP1(vslid_n_u64, arm64_neon_vsli, Vectorize1ArgType),
+  NEONMAP1(vsqaddb_u8, arm64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vsqaddd_u64, arm64_neon_usqadd, Add1ArgType),
+  NEONMAP1(vsqaddh_u16, arm64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vsqadds_u32, arm64_neon_usqadd, Add1ArgType),
+  NEONMAP1(vsrid_n_s64, arm64_neon_vsri, Vectorize1ArgType),
+  NEONMAP1(vsrid_n_u64, arm64_neon_vsri, Vectorize1ArgType),
+  NEONMAP1(vuqaddb_s8, arm64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vuqaddd_s64, arm64_neon_suqadd, Add1ArgType),
+  NEONMAP1(vuqaddh_s16, arm64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
+  NEONMAP1(vuqadds_s32, arm64_neon_suqadd, Add1ArgType),
+};
+
+#undef NEONMAP0
+#undef NEONMAP1
+#undef NEONMAP2
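+
+// As a sketch (the exact macro definitions live above), a NEONMAP1 entry
+// expands to a NeonIntrinsicInfo initializer keyed by builtin ID, roughly:
+//
+//   NEONMAP1(vqaddd_s64, arm64_neon_sqadd, Add1ArgType)
+//     ==> { NEON::BI__builtin_neon_vqaddd_s64, Intrinsic::arm64_neon_sqadd,
+//           0, "vqaddd_s64", Add1ArgType }
+//
+// The tables only work with findNeonIntrinsicInMap() below while their
+// entries stay sorted by BuiltinID; the debug-only check there enforces it.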
+
+static bool NEONSIMDIntrinsicsProvenSorted = false;
+static bool AArch64SISDIntrinsicInfoProvenSorted = false;
+
+static bool ARM64SIMDIntrinsicsProvenSorted = false;
+static bool ARM64SISDIntrinsicsProvenSorted = false;
+
+static const NeonIntrinsicInfo *
+findNeonIntrinsicInMap(llvm::ArrayRef<NeonIntrinsicInfo> IntrinsicMap,
+                       unsigned BuiltinID, bool &MapProvenSorted) {
+
+#ifndef NDEBUG
+  if (!MapProvenSorted) {
+    // FIXME: use std::is_sorted once C++11 is allowed
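+    // i.e. (illustrative, assuming an operator< ordering entries by
+    // BuiltinID): assert(std::is_sorted(IntrinsicMap.begin(),
+    //                                   IntrinsicMap.end()));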
+    for (unsigned i = 0; i < IntrinsicMap.size() - 1; ++i)
+      assert(IntrinsicMap[i].BuiltinID <= IntrinsicMap[i + 1].BuiltinID);
+    MapProvenSorted = true;
+  }
+#endif
+
+  const NeonIntrinsicInfo *Builtin =
+      std::lower_bound(IntrinsicMap.begin(), IntrinsicMap.end(), BuiltinID);
+
+  if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
+    return Builtin;
+
+  return 0;
+}
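+
+// Illustrative use (a sketch; real callers pass the map for their target):
+//
+//   const NeonIntrinsicInfo *Info =
+//       findNeonIntrinsicInMap(ARM64SISDIntrinsicMap,
+//                              NEON::BI__builtin_neon_vqaddd_s64,
+//                              ARM64SISDIntrinsicsProvenSorted);
+//   if (Info) // Info->LLVMIntrinsic is Intrinsic::arm64_neon_sqadd here.
+//     ...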
+
+Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
+                                                   unsigned Modifier,
+                                                   llvm::Type *ArgType,
+                                                   const CallExpr *E) {
+  int VectorSize = 0;
+  if (Modifier & Use64BitVectors)
+    VectorSize = 64;
+  else if (Modifier & Use128BitVectors)
+    VectorSize = 128;
+
+  // Return type.
+  SmallVector<llvm::Type *, 3> Tys;
+  if (Modifier & AddRetType) {
+    llvm::Type *Ty = ConvertType(E->getCallReturnType());
+    if (Modifier & VectorizeRetType)
+      Ty = llvm::VectorType::get(
+          Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);
+
+    Tys.push_back(Ty);
+  }
+
+  // Arguments.
+  if (Modifier & VectorizeArgTypes) {
+    int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
+    ArgType = llvm::VectorType::get(ArgType, Elts);
+  }
+
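+  // Add2ArgTypes (e.g. the fcage/fcagt comparison entries) overloads on the
+  // argument type twice, hence the second push below.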
+  if (Modifier & (Add1ArgType | Add2ArgTypes))
+    Tys.push_back(ArgType);
+
+  if (Modifier & Add2ArgTypes)
+    Tys.push_back(ArgType);
+
+  if (Modifier & InventFloatType)
+    Tys.push_back(FloatTy);
+
+  return CGM.getIntrinsic(IntrinsicID, Tys);
+}
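+
+// Worked example (a sketch): vqaddh_s16 maps in the ARM64 scalar table to
+// arm64_neon_sqadd with Vectorize1ArgType | Use64BitVectors. Assuming
+// Vectorize1ArgType combines Add1ArgType with VectorizeArgTypes, an i16
+// ArgType is widened to <4 x i16> (64 bits / 16 bits), so the lookup
+// returns the llvm.arm64.neon.sqadd.v4i16 overload.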
+
+static Value *EmitCommonNeonSISDBuiltinExpr(CodeGenFunction &CGF,
+                                            const NeonIntrinsicInfo &SISDInfo,
+                                            SmallVectorImpl<Value *> &Ops,
+                                            const CallExpr *E) {
+  unsigned BuiltinID = SISDInfo.BuiltinID;
+  unsigned Int = SISDInfo.LLVMIntrinsic;
+  unsigned Modifier = SISDInfo.TypeModifier;
+  const char *s = SISDInfo.NameHint;
+
+  switch (BuiltinID) {
+  case NEON::BI__builtin_neon_vcled_s64:
+  case NEON::BI__builtin_neon_vcled_u64:
+  case NEON::BI__builtin_neon_vcles_f32:
+  case NEON::BI__builtin_neon_vcled_f64:
+  case NEON::BI__builtin_neon_vcltd_s64:
+  case NEON::BI__builtin_neon_vcltd_u64:
+  case NEON::BI__builtin_neon_vclts_f32:
+  case NEON::BI__builtin_neon_vcltd_f64:
+  case NEON::BI__builtin_neon_vcales_f32:
+  case NEON::BI__builtin_neon_vcaled_f64:
+  case NEON::BI__builtin_neon_vcalts_f32:
+  case NEON::BI__builtin_neon_vcaltd_f64:
+    // Only one direction of comparisons actually exists: cmle is a cmge with
+    // swapped operands. The table gives us the right intrinsic, but we still
+    // need to do the swap, e.g. vcled_s64(a, b) is emitted as vcged_s64(b, a).
+    std::swap(Ops[0], Ops[1]);
+    break;
+  }
+
+  assert(Int && "Generic code assumes a valid intrinsic");
+
+  // Determine the type(s) of this overloaded AArch64 intrinsic.
+  const Expr *Arg = E->getArg(0);
+  llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
+  Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E);
+
+  int j = 0;
+  ConstantInt *C0 = ConstantInt::get(CGF.Int32Ty, 0);
+  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+       ai != ae; ++ai, ++j) {
+    llvm::Type *ArgTy = ai->getType();
+    if (Ops[j]->getType()->getPrimitiveSizeInBits() ==
+             ArgTy->getPrimitiveSizeInBits())
+      continue;
+
+    assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy());
+    // The constant argument to an _n_ intrinsic always has Int32Ty, so truncate
+    // it before inserting.
+    Ops[j] =
+        CGF.Builder.CreateTruncOrBitCast(Ops[j], ArgTy->getVectorElementType());
+    Ops[j] =
+        CGF.Builder.CreateInsertElement(UndefValue::get(ArgTy), Ops[j], C0);
+  }
+
+  Value *Result = CGF.EmitNeonCall(F, Ops, s);
+  llvm::Type *ResultType = CGF.ConvertType(E->getType());
+  if (ResultType->getPrimitiveSizeInBits() <
+      Result->getType()->getPrimitiveSizeInBits())
+    return CGF.Builder.CreateExtractElement(Result, C0);
+
+  return CGF.Builder.CreateBitCast(Result, ResultType, s);
+}
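+
+// Continuing the vqaddh_s16 sketch, the IR emitted here has this shape
+// (value names illustrative):
+//
+//   %lhs = insertelement <4 x i16> undef, i16 %a, i32 0
+//   %rhs = insertelement <4 x i16> undef, i16 %b, i32 0
+//   %v = call <4 x i16> @llvm.arm64.neon.sqadd.v4i16(<4 x i16> %lhs,
+//                                                    <4 x i16> %rhs)
+//   %r = extractelement <4 x i16> %v, i32 0
+//
+// Scalar operands are widened to single-significant-lane vectors, the vector
+// intrinsic is called, and lane 0 is extracted as the scalar result.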
+
 static Value *EmitAArch64ScalarBuiltinExpr(CodeGenFunction &CGF,
-                                           unsigned BuiltinID,
+                                           const NeonIntrinsicInfo &SISDInfo,
                                            const CallExpr *E) {
-  unsigned int Int = 0;
-  // Scalar result generated across vectors
-  bool AcrossVec = false;
-  // Extend element of one-element vector
-  bool ExtendEle = false;
-  bool OverloadInt = false;
-  bool OverloadCmpInt = false;
-  bool IsFpCmpZInt = false;
-  bool OverloadCvtInt = false;
-  bool OverloadWideInt = false;
-  bool OverloadNarrowInt = false;
-  const char *s = NULL;
+  unsigned BuiltinID = SISDInfo.BuiltinID;
+  unsigned Int = SISDInfo.LLVMIntrinsic;
+  const char *s = SISDInfo.NameHint;
 
   SmallVector<Value *, 4> Ops;
   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
@@ -1774,20 +2884,20 @@
   // argument that specifies the vector type, need to handle each case.
   switch (BuiltinID) {
   default: break;
-  case AArch64::BI__builtin_neon_vdups_lane_f32:
-  case AArch64::BI__builtin_neon_vdupd_lane_f64:
-  case AArch64::BI__builtin_neon_vdups_laneq_f32:
-  case AArch64::BI__builtin_neon_vdupd_laneq_f64: {
+  case NEON::BI__builtin_neon_vdups_lane_f32:
+  case NEON::BI__builtin_neon_vdupd_lane_f64:
+  case NEON::BI__builtin_neon_vdups_laneq_f32:
+  case NEON::BI__builtin_neon_vdupd_laneq_f64: {
     return CGF.Builder.CreateExtractElement(Ops[0], Ops[1], "vdup_lane");
   }
-  case AArch64::BI__builtin_neon_vdupb_lane_i8:
-  case AArch64::BI__builtin_neon_vduph_lane_i16:
-  case AArch64::BI__builtin_neon_vdups_lane_i32:
-  case AArch64::BI__builtin_neon_vdupd_lane_i64:
-  case AArch64::BI__builtin_neon_vdupb_laneq_i8:
-  case AArch64::BI__builtin_neon_vduph_laneq_i16:
-  case AArch64::BI__builtin_neon_vdups_laneq_i32:
-  case AArch64::BI__builtin_neon_vdupd_laneq_i64: {
+  case NEON::BI__builtin_neon_vdupb_lane_i8:
+  case NEON::BI__builtin_neon_vduph_lane_i16:
+  case NEON::BI__builtin_neon_vdups_lane_i32:
+  case NEON::BI__builtin_neon_vdupd_lane_i64:
+  case NEON::BI__builtin_neon_vdupb_laneq_i8:
+  case NEON::BI__builtin_neon_vduph_laneq_i16:
+  case NEON::BI__builtin_neon_vdups_laneq_i32:
+  case NEON::BI__builtin_neon_vdupd_laneq_i64: {
     // The backend treats Neon scalar types as v1ix types
     // So we want to dup lane from any vector to v1ix vector
     // with shufflevector
@@ -1799,19 +2909,19 @@
     // scalar type expected by the builtin
     return CGF.Builder.CreateBitCast(Result, Ty, s);
   }
-  case AArch64::BI__builtin_neon_vqdmlalh_lane_s16 :
-  case AArch64::BI__builtin_neon_vqdmlalh_laneq_s16 :
-  case AArch64::BI__builtin_neon_vqdmlals_lane_s32 :
-  case AArch64::BI__builtin_neon_vqdmlals_laneq_s32 :
-  case AArch64::BI__builtin_neon_vqdmlslh_lane_s16 :
-  case AArch64::BI__builtin_neon_vqdmlslh_laneq_s16 :
-  case AArch64::BI__builtin_neon_vqdmlsls_lane_s32 :
-  case AArch64::BI__builtin_neon_vqdmlsls_laneq_s32 : {
+  case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
+  case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
+  case NEON::BI__builtin_neon_vqdmlals_lane_s32:
+  case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
+  case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
+  case NEON::BI__builtin_neon_vqdmlslh_laneq_s16:
+  case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
+  case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
     Int = Intrinsic::arm_neon_vqadds;
-    if (BuiltinID == AArch64::BI__builtin_neon_vqdmlslh_lane_s16 ||
-        BuiltinID == AArch64::BI__builtin_neon_vqdmlslh_laneq_s16 ||
-        BuiltinID == AArch64::BI__builtin_neon_vqdmlsls_lane_s32 ||
-        BuiltinID == AArch64::BI__builtin_neon_vqdmlsls_laneq_s32) {
+    if (BuiltinID == NEON::BI__builtin_neon_vqdmlslh_lane_s16 ||
+        BuiltinID == NEON::BI__builtin_neon_vqdmlslh_laneq_s16 ||
+        BuiltinID == NEON::BI__builtin_neon_vqdmlsls_lane_s32 ||
+        BuiltinID == NEON::BI__builtin_neon_vqdmlsls_laneq_s32) {
       Int = Intrinsic::arm_neon_vqsubs;
     }
     // create vqdmull call with b * c[i]
@@ -1839,23 +2949,23 @@
     Value *AddRes = CGF.Builder.CreateCall2(F, AddOps[0], AddOps[1]);
     return CGF.Builder.CreateBitCast(AddRes, Ty);
   }
-  case AArch64::BI__builtin_neon_vfmas_lane_f32:
-  case AArch64::BI__builtin_neon_vfmas_laneq_f32:
-  case AArch64::BI__builtin_neon_vfmad_lane_f64:
-  case AArch64::BI__builtin_neon_vfmad_laneq_f64: {
+  case NEON::BI__builtin_neon_vfmas_lane_f32:
+  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+  case NEON::BI__builtin_neon_vfmad_lane_f64:
+  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
     llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
     Value *F = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
     Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
     return CGF.Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
   }
   // Scalar Floating-point Multiply Extended
-  case AArch64::BI__builtin_neon_vmulxs_f32:
-  case AArch64::BI__builtin_neon_vmulxd_f64: {
+  case NEON::BI__builtin_neon_vmulxs_f32:
+  case NEON::BI__builtin_neon_vmulxd_f64: {
     Int = Intrinsic::aarch64_neon_vmulx;
     llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
     return CGF.EmitNeonCall(CGF.CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
   }
-  case AArch64::BI__builtin_neon_vmul_n_f64: {
+  case NEON::BI__builtin_neon_vmul_n_f64: {
     // v1f64 vmul_n_f64  should be mapped to Neon scalar mul lane
     llvm::Type *VTy = GetNeonType(&CGF,
       NeonTypeFlags(NeonTypeFlags::Float64, false, false));
@@ -1865,788 +2975,523 @@
     Value *Result = CGF.Builder.CreateFMul(Ops[0], Ops[1]);
     return CGF.Builder.CreateBitCast(Result, VTy);
   }
-  case AArch64::BI__builtin_neon_vget_lane_i8:
-  case AArch64::BI__builtin_neon_vget_lane_i16:
-  case AArch64::BI__builtin_neon_vget_lane_i32:
-  case AArch64::BI__builtin_neon_vget_lane_i64:
-  case AArch64::BI__builtin_neon_vget_lane_f32:
-  case AArch64::BI__builtin_neon_vget_lane_f64:
-  case AArch64::BI__builtin_neon_vgetq_lane_i8:
-  case AArch64::BI__builtin_neon_vgetq_lane_i16:
-  case AArch64::BI__builtin_neon_vgetq_lane_i32:
-  case AArch64::BI__builtin_neon_vgetq_lane_i64:
-  case AArch64::BI__builtin_neon_vgetq_lane_f32:
-  case AArch64::BI__builtin_neon_vgetq_lane_f64:
-    return CGF.EmitARMBuiltinExpr(ARM::BI__builtin_neon_vget_lane_i8, E);
-  case AArch64::BI__builtin_neon_vset_lane_i8:
-  case AArch64::BI__builtin_neon_vset_lane_i16:
-  case AArch64::BI__builtin_neon_vset_lane_i32:
-  case AArch64::BI__builtin_neon_vset_lane_i64:
-  case AArch64::BI__builtin_neon_vset_lane_f32:
-  case AArch64::BI__builtin_neon_vset_lane_f64:
-  case AArch64::BI__builtin_neon_vsetq_lane_i8:
-  case AArch64::BI__builtin_neon_vsetq_lane_i16:
-  case AArch64::BI__builtin_neon_vsetq_lane_i32:
-  case AArch64::BI__builtin_neon_vsetq_lane_i64:
-  case AArch64::BI__builtin_neon_vsetq_lane_f32:
-  case AArch64::BI__builtin_neon_vsetq_lane_f64:
-    return CGF.EmitARMBuiltinExpr(ARM::BI__builtin_neon_vset_lane_i8, E);
-  // Crypto
-  case AArch64::BI__builtin_neon_vsha1h_u32:
-    Int = Intrinsic::arm_neon_sha1h;
-    s = "sha1h"; OverloadInt = true; break;
-  case AArch64::BI__builtin_neon_vsha1cq_u32:
-    Int = Intrinsic::aarch64_neon_sha1c;
-    s = "sha1c"; break;
-  case AArch64::BI__builtin_neon_vsha1pq_u32:
-    Int = Intrinsic::aarch64_neon_sha1p;
-    s = "sha1p"; break;
-  case AArch64::BI__builtin_neon_vsha1mq_u32:
-    Int = Intrinsic::aarch64_neon_sha1m;
-    s = "sha1m"; break;
-  // Scalar Add
-  case AArch64::BI__builtin_neon_vaddd_s64:
-    Int = Intrinsic::aarch64_neon_vaddds;
-    s = "vaddds"; break;
-  case AArch64::BI__builtin_neon_vaddd_u64:
-    Int = Intrinsic::aarch64_neon_vadddu;
-    s = "vadddu"; break;
-  // Scalar Sub
-  case AArch64::BI__builtin_neon_vsubd_s64:
-    Int = Intrinsic::aarch64_neon_vsubds;
-    s = "vsubds"; break;
-  case AArch64::BI__builtin_neon_vsubd_u64:
-    Int = Intrinsic::aarch64_neon_vsubdu;
-    s = "vsubdu"; break;
-  // Scalar Saturating Add
-  case AArch64::BI__builtin_neon_vqaddb_s8:
-  case AArch64::BI__builtin_neon_vqaddh_s16:
-  case AArch64::BI__builtin_neon_vqadds_s32:
-  case AArch64::BI__builtin_neon_vqaddd_s64:
-    Int = Intrinsic::arm_neon_vqadds;
-    s = "vqadds"; OverloadInt = true; break;
-  case AArch64::BI__builtin_neon_vqaddb_u8:
-  case AArch64::BI__builtin_neon_vqaddh_u16:
-  case AArch64::BI__builtin_neon_vqadds_u32:
-  case AArch64::BI__builtin_neon_vqaddd_u64:
-    Int = Intrinsic::arm_neon_vqaddu;
-    s = "vqaddu"; OverloadInt = true; break;
-  // Scalar Saturating Sub
-  case AArch64::BI__builtin_neon_vqsubb_s8:
-  case AArch64::BI__builtin_neon_vqsubh_s16:
-  case AArch64::BI__builtin_neon_vqsubs_s32:
-  case AArch64::BI__builtin_neon_vqsubd_s64:
-    Int = Intrinsic::arm_neon_vqsubs;
-    s = "vqsubs"; OverloadInt = true; break;
-  case AArch64::BI__builtin_neon_vqsubb_u8:
-  case AArch64::BI__builtin_neon_vqsubh_u16:
-  case AArch64::BI__builtin_neon_vqsubs_u32:
-  case AArch64::BI__builtin_neon_vqsubd_u64:
-    Int = Intrinsic::arm_neon_vqsubu;
-    s = "vqsubu"; OverloadInt = true; break;
-  // Scalar Shift Left
-  case AArch64::BI__builtin_neon_vshld_s64:
-    Int = Intrinsic::aarch64_neon_vshlds;
-    s = "vshlds"; break;
-  case AArch64::BI__builtin_neon_vshld_u64:
-    Int = Intrinsic::aarch64_neon_vshldu;
-    s = "vshldu"; break;
-  // Scalar Saturating Shift Left
-  case AArch64::BI__builtin_neon_vqshlb_s8:
-  case AArch64::BI__builtin_neon_vqshlh_s16:
-  case AArch64::BI__builtin_neon_vqshls_s32:
-  case AArch64::BI__builtin_neon_vqshld_s64:
-    Int = Intrinsic::aarch64_neon_vqshls;
-    s = "vqshls"; OverloadInt = true; break;
-  case AArch64::BI__builtin_neon_vqshlb_u8:
-  case AArch64::BI__builtin_neon_vqshlh_u16:
-  case AArch64::BI__builtin_neon_vqshls_u32:
-  case AArch64::BI__builtin_neon_vqshld_u64:
-    Int = Intrinsic::aarch64_neon_vqshlu;
-    s = "vqshlu"; OverloadInt = true; break;
-  // Scalar Rouding Shift Left
-  case AArch64::BI__builtin_neon_vrshld_s64:
-    Int = Intrinsic::aarch64_neon_vrshlds;
-    s = "vrshlds"; break;
-  case AArch64::BI__builtin_neon_vrshld_u64:
-    Int = Intrinsic::aarch64_neon_vrshldu;
-    s = "vrshldu"; break;
-  // Scalar Saturating Rouding Shift Left
-  case AArch64::BI__builtin_neon_vqrshlb_s8:
-  case AArch64::BI__builtin_neon_vqrshlh_s16:
-  case AArch64::BI__builtin_neon_vqrshls_s32:
-  case AArch64::BI__builtin_neon_vqrshld_s64:
-    Int = Intrinsic::aarch64_neon_vqrshls;
-    s = "vqrshls"; OverloadInt = true; break;
-  case AArch64::BI__builtin_neon_vqrshlb_u8:
-  case AArch64::BI__builtin_neon_vqrshlh_u16:
-  case AArch64::BI__builtin_neon_vqrshls_u32:
-  case AArch64::BI__builtin_neon_vqrshld_u64:
-    Int = Intrinsic::aarch64_neon_vqrshlu;
-    s = "vqrshlu"; OverloadInt = true; break;
-  // Scalar Reduce Pairwise Add
-  case AArch64::BI__builtin_neon_vpaddd_s64:
-  case AArch64::BI__builtin_neon_vpaddd_u64:
-    Int = Intrinsic::aarch64_neon_vpadd; s = "vpadd";
-    break;
-  case AArch64::BI__builtin_neon_vpadds_f32:
-    Int = Intrinsic::aarch64_neon_vpfadd; s = "vpfadd";
-    break;
-  case AArch64::BI__builtin_neon_vpaddd_f64:
-    Int = Intrinsic::aarch64_neon_vpfaddq; s = "vpfaddq";
-    break;
-  // Scalar Reduce Pairwise Floating Point Max
-  case AArch64::BI__builtin_neon_vpmaxs_f32:
-    Int = Intrinsic::aarch64_neon_vpmax; s = "vpmax";
-    break;
-  case AArch64::BI__builtin_neon_vpmaxqd_f64:
-    Int = Intrinsic::aarch64_neon_vpmaxq; s = "vpmaxq";
-    break;
-  // Scalar Reduce Pairwise Floating Point Min
-  case AArch64::BI__builtin_neon_vpmins_f32:
-    Int = Intrinsic::aarch64_neon_vpmin; s = "vpmin";
-    break;
-  case AArch64::BI__builtin_neon_vpminqd_f64:
-    Int = Intrinsic::aarch64_neon_vpminq; s = "vpminq";
-    break;
-  // Scalar Reduce Pairwise Floating Point Maxnm
-  case AArch64::BI__builtin_neon_vpmaxnms_f32:
-    Int = Intrinsic::aarch64_neon_vpfmaxnm; s = "vpfmaxnm";
-    break;
-  case AArch64::BI__builtin_neon_vpmaxnmqd_f64:
-    Int = Intrinsic::aarch64_neon_vpfmaxnmq; s = "vpfmaxnmq";
-    break;
-  // Scalar Reduce Pairwise Floating Point Minnm
-  case AArch64::BI__builtin_neon_vpminnms_f32:
-    Int = Intrinsic::aarch64_neon_vpfminnm; s = "vpfminnm";
-    break;
-  case AArch64::BI__builtin_neon_vpminnmqd_f64:
-    Int = Intrinsic::aarch64_neon_vpfminnmq; s = "vpfminnmq";
-    break;
-  // The followings are intrinsics with scalar results generated AcrossVec vectors
-  case AArch64::BI__builtin_neon_vaddlv_s8:
-  case AArch64::BI__builtin_neon_vaddlv_s16:
-  case AArch64::BI__builtin_neon_vaddlvq_s8:
-  case AArch64::BI__builtin_neon_vaddlvq_s16:
-  case AArch64::BI__builtin_neon_vaddlvq_s32:
-    Int = Intrinsic::aarch64_neon_saddlv;
-    AcrossVec = true; ExtendEle = true; s = "saddlv"; break;
-  case AArch64::BI__builtin_neon_vaddlv_u8:
-  case AArch64::BI__builtin_neon_vaddlv_u16:
-  case AArch64::BI__builtin_neon_vaddlvq_u8:
-  case AArch64::BI__builtin_neon_vaddlvq_u16:
-  case AArch64::BI__builtin_neon_vaddlvq_u32:
-    Int = Intrinsic::aarch64_neon_uaddlv;
-    AcrossVec = true; ExtendEle = true; s = "uaddlv"; break;
-  case AArch64::BI__builtin_neon_vmaxv_s8:
-  case AArch64::BI__builtin_neon_vmaxv_s16:
-  case AArch64::BI__builtin_neon_vmaxvq_s8:
-  case AArch64::BI__builtin_neon_vmaxvq_s16:
-  case AArch64::BI__builtin_neon_vmaxvq_s32:
-    Int = Intrinsic::aarch64_neon_smaxv;
-    AcrossVec = true; ExtendEle = false; s = "smaxv"; break;
-  case AArch64::BI__builtin_neon_vmaxv_u8:
-  case AArch64::BI__builtin_neon_vmaxv_u16:
-  case AArch64::BI__builtin_neon_vmaxvq_u8:
-  case AArch64::BI__builtin_neon_vmaxvq_u16:
-  case AArch64::BI__builtin_neon_vmaxvq_u32:
-    Int = Intrinsic::aarch64_neon_umaxv;
-    AcrossVec = true; ExtendEle = false; s = "umaxv"; break;
-  case AArch64::BI__builtin_neon_vminv_s8:
-  case AArch64::BI__builtin_neon_vminv_s16:
-  case AArch64::BI__builtin_neon_vminvq_s8:
-  case AArch64::BI__builtin_neon_vminvq_s16:
-  case AArch64::BI__builtin_neon_vminvq_s32:
-    Int = Intrinsic::aarch64_neon_sminv;
-    AcrossVec = true; ExtendEle = false; s = "sminv"; break;
-  case AArch64::BI__builtin_neon_vminv_u8:
-  case AArch64::BI__builtin_neon_vminv_u16:
-  case AArch64::BI__builtin_neon_vminvq_u8:
-  case AArch64::BI__builtin_neon_vminvq_u16:
-  case AArch64::BI__builtin_neon_vminvq_u32:
-    Int = Intrinsic::aarch64_neon_uminv;
-    AcrossVec = true; ExtendEle = false; s = "uminv"; break;
-  case AArch64::BI__builtin_neon_vaddv_s8:
-  case AArch64::BI__builtin_neon_vaddv_s16:
-  case AArch64::BI__builtin_neon_vaddvq_s8:
-  case AArch64::BI__builtin_neon_vaddvq_s16:
-  case AArch64::BI__builtin_neon_vaddvq_s32:
-  case AArch64::BI__builtin_neon_vaddvq_s64:
-  case AArch64::BI__builtin_neon_vaddv_u8:
-  case AArch64::BI__builtin_neon_vaddv_u16:
-  case AArch64::BI__builtin_neon_vaddvq_u8:
-  case AArch64::BI__builtin_neon_vaddvq_u16:
-  case AArch64::BI__builtin_neon_vaddvq_u32:
-  case AArch64::BI__builtin_neon_vaddvq_u64:
-  case AArch64::BI__builtin_neon_vaddv_f32:
-  case AArch64::BI__builtin_neon_vaddvq_f32:
-  case AArch64::BI__builtin_neon_vaddvq_f64:
-    Int = Intrinsic::aarch64_neon_vaddv;
-    AcrossVec = true; ExtendEle = false; s = "vaddv"; break;      
-  case AArch64::BI__builtin_neon_vmaxv_f32:
-  case AArch64::BI__builtin_neon_vmaxvq_f32:
-  case AArch64::BI__builtin_neon_vmaxvq_f64:
-    Int = Intrinsic::aarch64_neon_vmaxv;
-    AcrossVec = true; ExtendEle = false; s = "vmaxv"; break;
-  case AArch64::BI__builtin_neon_vminv_f32:
-  case AArch64::BI__builtin_neon_vminvq_f32:
-  case AArch64::BI__builtin_neon_vminvq_f64:
-    Int = Intrinsic::aarch64_neon_vminv;
-    AcrossVec = true; ExtendEle = false; s = "vminv"; break;
-  case AArch64::BI__builtin_neon_vmaxnmv_f32:
-  case AArch64::BI__builtin_neon_vmaxnmvq_f32:
-  case AArch64::BI__builtin_neon_vmaxnmvq_f64:
-    Int = Intrinsic::aarch64_neon_vmaxnmv;
-    AcrossVec = true; ExtendEle = false; s = "vmaxnmv"; break;
-  case AArch64::BI__builtin_neon_vminnmv_f32:
-  case AArch64::BI__builtin_neon_vminnmvq_f32:
-  case AArch64::BI__builtin_neon_vminnmvq_f64:
-    Int = Intrinsic::aarch64_neon_vminnmv;
-    AcrossVec = true; ExtendEle = false; s = "vminnmv"; break;
-  // Scalar Integer Saturating Doubling Multiply Half High
-  case AArch64::BI__builtin_neon_vqdmulhh_s16:
-  case AArch64::BI__builtin_neon_vqdmulhs_s32:
-    Int = Intrinsic::arm_neon_vqdmulh;
-    s = "vqdmulh"; OverloadInt = true; break;
-  // Scalar Integer Saturating Rounding Doubling Multiply Half High
-  case AArch64::BI__builtin_neon_vqrdmulhh_s16:
-  case AArch64::BI__builtin_neon_vqrdmulhs_s32:
-    Int = Intrinsic::arm_neon_vqrdmulh;
-    s = "vqrdmulh"; OverloadInt = true; break;
-  // Scalar Floating-point Reciprocal Step and
-  case AArch64::BI__builtin_neon_vrecpss_f32:
-  case AArch64::BI__builtin_neon_vrecpsd_f64:
-    Int = Intrinsic::arm_neon_vrecps;
-    s = "vrecps"; OverloadInt = true; break;
-  // Scalar Floating-point Reciprocal Square Root Step
-  case AArch64::BI__builtin_neon_vrsqrtss_f32:
-  case AArch64::BI__builtin_neon_vrsqrtsd_f64:
-    Int = Intrinsic::arm_neon_vrsqrts;
-    s = "vrsqrts"; OverloadInt = true; break;
-  // Scalar Signed Integer Convert To Floating-point
-  case AArch64::BI__builtin_neon_vcvts_f32_s32:
-    Int = Intrinsic::aarch64_neon_vcvtf32_s32,
-    s = "vcvtf"; OverloadInt = false; break;
-  case AArch64::BI__builtin_neon_vcvtd_f64_s64:
-    Int = Intrinsic::aarch64_neon_vcvtf64_s64,
-    s = "vcvtf"; OverloadInt = false; break;
-  // Scalar Unsigned Integer Convert To Floating-point
-  case AArch64::BI__builtin_neon_vcvts_f32_u32:
-    Int = Intrinsic::aarch64_neon_vcvtf32_u32,
-    s = "vcvtf"; OverloadInt = false; break;
-  case AArch64::BI__builtin_neon_vcvtd_f64_u64:
-    Int = Intrinsic::aarch64_neon_vcvtf64_u64,
-    s = "vcvtf"; OverloadInt = false; break;
-  // Scalar Floating-point Converts
-  case AArch64::BI__builtin_neon_vcvtxd_f32_f64:
-    Int = Intrinsic::aarch64_neon_fcvtxn;
-    s = "vcvtxn"; OverloadCvtInt = true; break;
-  case AArch64::BI__builtin_neon_vcvtas_s32_f32:
-  case AArch64::BI__builtin_neon_vcvtad_s64_f64:
-    Int = Intrinsic::aarch64_neon_fcvtas;
-    s = "vcvtas"; OverloadCvtInt = true; break;
-  case AArch64::BI__builtin_neon_vcvtas_u32_f32:
-  case AArch64::BI__builtin_neon_vcvtad_u64_f64:
-    Int = Intrinsic::aarch64_neon_fcvtau;
-    s = "vcvtau"; OverloadCvtInt = true; break;
-  case AArch64::BI__builtin_neon_vcvtms_s32_f32:
-  case AArch64::BI__builtin_neon_vcvtmd_s64_f64:
-    Int = Intrinsic::aarch64_neon_fcvtms;
-    s = "vcvtms"; OverloadCvtInt = true; break;
-  case AArch64::BI__builtin_neon_vcvtms_u32_f32:
-  case AArch64::BI__builtin_neon_vcvtmd_u64_f64:
-    Int = Intrinsic::aarch64_neon_fcvtmu;
-    s = "vcvtmu"; OverloadCvtInt = true; break;
-  case AArch64::BI__builtin_neon_vcvtns_s32_f32:
-  case AArch64::BI__builtin_neon_vcvtnd_s64_f64:
-    Int = Intrinsic::aarch64_neon_fcvtns;
-    s = "vcvtns"; OverloadCvtInt = true; break;
-  case AArch64::BI__builtin_neon_vcvtns_u32_f32:
-  case AArch64::BI__builtin_neon_vcvtnd_u64_f64:
-    Int = Intrinsic::aarch64_neon_fcvtnu;
-    s = "vcvtnu"; OverloadCvtInt = true; break;
-  case AArch64::BI__builtin_neon_vcvtps_s32_f32:
-  case AArch64::BI__builtin_neon_vcvtpd_s64_f64:
-    Int = Intrinsic::aarch64_neon_fcvtps;
-    s = "vcvtps"; OverloadCvtInt = true; break;
-  case AArch64::BI__builtin_neon_vcvtps_u32_f32:
-  case AArch64::BI__builtin_neon_vcvtpd_u64_f64:
-    Int = Intrinsic::aarch64_neon_fcvtpu;
-    s = "vcvtpu"; OverloadCvtInt = true; break;
-  case AArch64::BI__builtin_neon_vcvts_s32_f32:
-  case AArch64::BI__builtin_neon_vcvtd_s64_f64:
-    Int = Intrinsic::aarch64_neon_fcvtzs;
-    s = "vcvtzs"; OverloadCvtInt = true; break;
-  case AArch64::BI__builtin_neon_vcvts_u32_f32:
-  case AArch64::BI__builtin_neon_vcvtd_u64_f64:
-    Int = Intrinsic::aarch64_neon_fcvtzu;
-    s = "vcvtzu"; OverloadCvtInt = true; break;
-  // Scalar Floating-point Reciprocal Estimate
-  case AArch64::BI__builtin_neon_vrecpes_f32:
-  case AArch64::BI__builtin_neon_vrecped_f64:
-    Int = Intrinsic::arm_neon_vrecpe;
-    s = "vrecpe"; OverloadInt = true; break;
-  // Scalar Floating-point Reciprocal Exponent
-  case AArch64::BI__builtin_neon_vrecpxs_f32:
-  case AArch64::BI__builtin_neon_vrecpxd_f64:
-    Int = Intrinsic::aarch64_neon_vrecpx;
-    s = "vrecpx"; OverloadInt = true; break;
-  // Scalar Floating-point Reciprocal Square Root Estimate
-  case AArch64::BI__builtin_neon_vrsqrtes_f32:
-  case AArch64::BI__builtin_neon_vrsqrted_f64:
-    Int = Intrinsic::arm_neon_vrsqrte;
-    s = "vrsqrte"; OverloadInt = true; break;
-  // Scalar Compare Equal
-  case AArch64::BI__builtin_neon_vceqd_s64:
-  case AArch64::BI__builtin_neon_vceqd_u64:
-    Int = Intrinsic::aarch64_neon_vceq; s = "vceq";
-    OverloadCmpInt = true; break;
-  // Scalar Compare Equal To Zero
-  case AArch64::BI__builtin_neon_vceqzd_s64:
-  case AArch64::BI__builtin_neon_vceqzd_u64:
-    Int = Intrinsic::aarch64_neon_vceq; s = "vceq";
+  case NEON::BI__builtin_neon_vget_lane_i8:
+  case NEON::BI__builtin_neon_vget_lane_i16:
+  case NEON::BI__builtin_neon_vget_lane_i32:
+  case NEON::BI__builtin_neon_vget_lane_i64:
+  case NEON::BI__builtin_neon_vget_lane_f32:
+  case NEON::BI__builtin_neon_vget_lane_f64:
+  case NEON::BI__builtin_neon_vgetq_lane_i8:
+  case NEON::BI__builtin_neon_vgetq_lane_i16:
+  case NEON::BI__builtin_neon_vgetq_lane_i32:
+  case NEON::BI__builtin_neon_vgetq_lane_i64:
+  case NEON::BI__builtin_neon_vgetq_lane_f32:
+  case NEON::BI__builtin_neon_vgetq_lane_f64:
+    return CGF.EmitARMBuiltinExpr(NEON::BI__builtin_neon_vget_lane_i8, E);
+  case NEON::BI__builtin_neon_vset_lane_i8:
+  case NEON::BI__builtin_neon_vset_lane_i16:
+  case NEON::BI__builtin_neon_vset_lane_i32:
+  case NEON::BI__builtin_neon_vset_lane_i64:
+  case NEON::BI__builtin_neon_vset_lane_f32:
+  case NEON::BI__builtin_neon_vset_lane_f64:
+  case NEON::BI__builtin_neon_vsetq_lane_i8:
+  case NEON::BI__builtin_neon_vsetq_lane_i16:
+  case NEON::BI__builtin_neon_vsetq_lane_i32:
+  case NEON::BI__builtin_neon_vsetq_lane_i64:
+  case NEON::BI__builtin_neon_vsetq_lane_f32:
+  case NEON::BI__builtin_neon_vsetq_lane_f64:
+    return CGF.EmitARMBuiltinExpr(NEON::BI__builtin_neon_vset_lane_i8, E);
+
+  case NEON::BI__builtin_neon_vceqzd_s64:
+  case NEON::BI__builtin_neon_vceqzd_u64:
+  case NEON::BI__builtin_neon_vcgezd_s64:
+  case NEON::BI__builtin_neon_vcgtzd_s64:
+  case NEON::BI__builtin_neon_vclezd_s64:
+  case NEON::BI__builtin_neon_vcltzd_s64:
     // Add implicit zero operand.
     Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType()));
-    OverloadCmpInt = true; break;
-  // Scalar Compare Greater Than or Equal
-  case AArch64::BI__builtin_neon_vcged_s64:
-    Int = Intrinsic::aarch64_neon_vcge; s = "vcge";
-    OverloadCmpInt = true; break;
-  case AArch64::BI__builtin_neon_vcged_u64:
-    Int = Intrinsic::aarch64_neon_vchs; s = "vcge";
-    OverloadCmpInt = true; break;
-  // Scalar Compare Greater Than or Equal To Zero
-  case AArch64::BI__builtin_neon_vcgezd_s64:
-    Int = Intrinsic::aarch64_neon_vcge; s = "vcge";
-    // Add implicit zero operand.
-    Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType()));
-    OverloadCmpInt = true; break;
-  // Scalar Compare Greater Than
-  case AArch64::BI__builtin_neon_vcgtd_s64:
-    Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt";
-    OverloadCmpInt = true; break;
-  case AArch64::BI__builtin_neon_vcgtd_u64:
-    Int = Intrinsic::aarch64_neon_vchi; s = "vcgt";
-    OverloadCmpInt = true; break;
-  // Scalar Compare Greater Than Zero
-  case AArch64::BI__builtin_neon_vcgtzd_s64:
-    Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt";
-    // Add implicit zero operand.
-    Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType()));
-    OverloadCmpInt = true; break;
-  // Scalar Compare Less Than or Equal
-  case AArch64::BI__builtin_neon_vcled_s64:
-    Int = Intrinsic::aarch64_neon_vcge; s = "vcge";
-    OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break;
-  case AArch64::BI__builtin_neon_vcled_u64:
-    Int = Intrinsic::aarch64_neon_vchs; s = "vchs";
-    OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break;
-  // Scalar Compare Less Than or Equal To Zero
-  case AArch64::BI__builtin_neon_vclezd_s64:
-    Int = Intrinsic::aarch64_neon_vclez; s = "vcle";
-    // Add implicit zero operand.
-    Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType()));
-    OverloadCmpInt = true; break;
-  // Scalar Compare Less Than
-  case AArch64::BI__builtin_neon_vcltd_s64:
-    Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt";
-    OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break;
-  case AArch64::BI__builtin_neon_vcltd_u64:
-    Int = Intrinsic::aarch64_neon_vchi; s = "vchi";
-    OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break;
-  // Scalar Compare Less Than Zero
-  case AArch64::BI__builtin_neon_vcltzd_s64:
-    Int = Intrinsic::aarch64_neon_vcltz; s = "vclt";
-    // Add implicit zero operand.
-    Ops.push_back(llvm::Constant::getNullValue(Ops[0]->getType()));
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Compare Equal
-  case AArch64::BI__builtin_neon_vceqs_f32:
-  case AArch64::BI__builtin_neon_vceqd_f64:
-    Int = Intrinsic::aarch64_neon_vceq; s = "vceq";
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Compare Equal To Zero
-  case AArch64::BI__builtin_neon_vceqzs_f32:
-  case AArch64::BI__builtin_neon_vceqzd_f64:
-    Int = Intrinsic::aarch64_neon_vceq; s = "vceq";
+    break;
+  case NEON::BI__builtin_neon_vceqzs_f32:
+  case NEON::BI__builtin_neon_vceqzd_f64:
+  case NEON::BI__builtin_neon_vcgezs_f32:
+  case NEON::BI__builtin_neon_vcgezd_f64:
+  case NEON::BI__builtin_neon_vcgtzs_f32:
+  case NEON::BI__builtin_neon_vcgtzd_f64:
+  case NEON::BI__builtin_neon_vclezs_f32:
+  case NEON::BI__builtin_neon_vclezd_f64:
+  case NEON::BI__builtin_neon_vcltzs_f32:
+  case NEON::BI__builtin_neon_vcltzd_f64:
     // Add implicit zero operand.
     Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy));
-    IsFpCmpZInt = true;
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Compare Greater Than Or Equal
-  case AArch64::BI__builtin_neon_vcges_f32:
-  case AArch64::BI__builtin_neon_vcged_f64:
-    Int = Intrinsic::aarch64_neon_vcge; s = "vcge";
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Compare Greater Than Or Equal To Zero
-  case AArch64::BI__builtin_neon_vcgezs_f32:
-  case AArch64::BI__builtin_neon_vcgezd_f64:
-    Int = Intrinsic::aarch64_neon_vcge; s = "vcge";
-    // Add implicit zero operand.
-    Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy));
-    IsFpCmpZInt = true;
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Compare Greather Than
-  case AArch64::BI__builtin_neon_vcgts_f32:
-  case AArch64::BI__builtin_neon_vcgtd_f64:
-    Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt";
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Compare Greather Than Zero
-  case AArch64::BI__builtin_neon_vcgtzs_f32:
-  case AArch64::BI__builtin_neon_vcgtzd_f64:
-    Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt";
-    // Add implicit zero operand.
-    Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy));
-    IsFpCmpZInt = true;
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Compare Less Than or Equal
-  case AArch64::BI__builtin_neon_vcles_f32:
-  case AArch64::BI__builtin_neon_vcled_f64:
-    Int = Intrinsic::aarch64_neon_vcge; s = "vcge";
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Compare Less Than Or Equal To Zero
-  case AArch64::BI__builtin_neon_vclezs_f32:
-  case AArch64::BI__builtin_neon_vclezd_f64:
-    Int = Intrinsic::aarch64_neon_vclez; s = "vcle";
-    // Add implicit zero operand.
-    Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy));
-    IsFpCmpZInt = true;
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Compare Less Than Zero
-  case AArch64::BI__builtin_neon_vclts_f32:
-  case AArch64::BI__builtin_neon_vcltd_f64:
-    Int = Intrinsic::aarch64_neon_vcgt; s = "vcgt";
-    OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break;
-  // Scalar Floating-point Compare Less Than Zero
-  case AArch64::BI__builtin_neon_vcltzs_f32:
-  case AArch64::BI__builtin_neon_vcltzd_f64:
-    Int = Intrinsic::aarch64_neon_vcltz; s = "vclt";
-    // Add implicit zero operand.
-    Ops.push_back(llvm::Constant::getNullValue(CGF.FloatTy));
-    IsFpCmpZInt = true;
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Absolute Compare Greater Than Or Equal
-  case AArch64::BI__builtin_neon_vcages_f32:
-  case AArch64::BI__builtin_neon_vcaged_f64:
-    Int = Intrinsic::aarch64_neon_vcage; s = "vcage";
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Absolute Compare Greater Than
-  case AArch64::BI__builtin_neon_vcagts_f32:
-  case AArch64::BI__builtin_neon_vcagtd_f64:
-    Int = Intrinsic::aarch64_neon_vcagt; s = "vcagt";
-    OverloadCmpInt = true; break;
-  // Scalar Floating-point Absolute Compare Less Than Or Equal
-  case AArch64::BI__builtin_neon_vcales_f32:
-  case AArch64::BI__builtin_neon_vcaled_f64:
-    Int = Intrinsic::aarch64_neon_vcage; s = "vcage";
-    OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break;
-  // Scalar Floating-point Absolute Compare Less Than
-  case AArch64::BI__builtin_neon_vcalts_f32:
-  case AArch64::BI__builtin_neon_vcaltd_f64:
-    Int = Intrinsic::aarch64_neon_vcagt; s = "vcalt";
-    OverloadCmpInt = true; std::swap(Ops[0], Ops[1]); break;
-  // Scalar Compare Bitwise Test Bits
-  case AArch64::BI__builtin_neon_vtstd_s64:
-  case AArch64::BI__builtin_neon_vtstd_u64:
-    Int = Intrinsic::aarch64_neon_vtstd; s = "vtst";
-    OverloadCmpInt = true; break;
-  // Scalar Absolute Value
-  case AArch64::BI__builtin_neon_vabsd_s64:
-    Int = Intrinsic::aarch64_neon_vabs;
-    s = "vabs"; OverloadInt = false; break;
-  // Scalar Absolute Difference
-  case AArch64::BI__builtin_neon_vabds_f32:
-  case AArch64::BI__builtin_neon_vabdd_f64:
-    Int = Intrinsic::aarch64_neon_vabd;
-    s = "vabd"; OverloadInt = true; break;
-  // Scalar Signed Saturating Absolute Value
-  case AArch64::BI__builtin_neon_vqabsb_s8:
-  case AArch64::BI__builtin_neon_vqabsh_s16:
-  case AArch64::BI__builtin_neon_vqabss_s32:
-  case AArch64::BI__builtin_neon_vqabsd_s64:
-    Int = Intrinsic::arm_neon_vqabs;
-    s = "vqabs"; OverloadInt = true; break;
-  // Scalar Negate
-  case AArch64::BI__builtin_neon_vnegd_s64:
-    Int = Intrinsic::aarch64_neon_vneg;
-    s = "vneg"; OverloadInt = false; break;
-  // Scalar Signed Saturating Negate
-  case AArch64::BI__builtin_neon_vqnegb_s8:
-  case AArch64::BI__builtin_neon_vqnegh_s16:
-  case AArch64::BI__builtin_neon_vqnegs_s32:
-  case AArch64::BI__builtin_neon_vqnegd_s64:
-    Int = Intrinsic::arm_neon_vqneg;
-    s = "vqneg"; OverloadInt = true; break;
-  // Scalar Signed Saturating Accumulated of Unsigned Value
-  case AArch64::BI__builtin_neon_vuqaddb_s8:
-  case AArch64::BI__builtin_neon_vuqaddh_s16:
-  case AArch64::BI__builtin_neon_vuqadds_s32:
-  case AArch64::BI__builtin_neon_vuqaddd_s64:
-    Int = Intrinsic::aarch64_neon_vuqadd;
-    s = "vuqadd"; OverloadInt = true; break;
-  // Scalar Unsigned Saturating Accumulated of Signed Value
-  case AArch64::BI__builtin_neon_vsqaddb_u8:
-  case AArch64::BI__builtin_neon_vsqaddh_u16:
-  case AArch64::BI__builtin_neon_vsqadds_u32:
-  case AArch64::BI__builtin_neon_vsqaddd_u64:
-    Int = Intrinsic::aarch64_neon_vsqadd;
-    s = "vsqadd"; OverloadInt = true; break;
-  // Signed Saturating Doubling Multiply-Add Long
-  case AArch64::BI__builtin_neon_vqdmlalh_s16:
-  case AArch64::BI__builtin_neon_vqdmlals_s32:
-    Int = Intrinsic::aarch64_neon_vqdmlal;
-    s = "vqdmlal"; OverloadWideInt = true; break;
-  // Signed Saturating Doubling Multiply-Subtract Long
-  case AArch64::BI__builtin_neon_vqdmlslh_s16:
-  case AArch64::BI__builtin_neon_vqdmlsls_s32:
-    Int = Intrinsic::aarch64_neon_vqdmlsl;
-    s = "vqdmlsl"; OverloadWideInt = true; break;
-  // Signed Saturating Doubling Multiply Long
-  case AArch64::BI__builtin_neon_vqdmullh_s16:
-  case AArch64::BI__builtin_neon_vqdmulls_s32:
-    Int = Intrinsic::arm_neon_vqdmull;
-    s = "vqdmull"; OverloadWideInt = true; break;
-  // Scalar Signed Saturating Extract Unsigned Narrow
-  case AArch64::BI__builtin_neon_vqmovunh_s16:
-  case AArch64::BI__builtin_neon_vqmovuns_s32:
-  case AArch64::BI__builtin_neon_vqmovund_s64:
-    Int = Intrinsic::arm_neon_vqmovnsu;
-    s = "vqmovun"; OverloadNarrowInt = true; break;
-  // Scalar Signed Saturating Extract Narrow
-  case AArch64::BI__builtin_neon_vqmovnh_s16:
-  case AArch64::BI__builtin_neon_vqmovns_s32:
-  case AArch64::BI__builtin_neon_vqmovnd_s64:
-    Int = Intrinsic::arm_neon_vqmovns;
-    s = "vqmovn"; OverloadNarrowInt = true; break;
-  // Scalar Unsigned Saturating Extract Narrow
-  case AArch64::BI__builtin_neon_vqmovnh_u16:
-  case AArch64::BI__builtin_neon_vqmovns_u32:
-  case AArch64::BI__builtin_neon_vqmovnd_u64:
-    Int = Intrinsic::arm_neon_vqmovnu;
-    s = "vqmovn"; OverloadNarrowInt = true; break;
-  // Scalar Signed Shift Right (Immediate)
-  case AArch64::BI__builtin_neon_vshrd_n_s64:
-    Int = Intrinsic::aarch64_neon_vshrds_n;
-    s = "vsshr"; OverloadInt = false; break;
-  // Scalar Unsigned Shift Right (Immediate)
-  case AArch64::BI__builtin_neon_vshrd_n_u64:
-    Int = Intrinsic::aarch64_neon_vshrdu_n;
-    s = "vushr"; OverloadInt = false; break;
-  // Scalar Signed Rounding Shift Right (Immediate)
-  case AArch64::BI__builtin_neon_vrshrd_n_s64:
-    Int = Intrinsic::aarch64_neon_vsrshr;
-    s = "vsrshr"; OverloadInt = true; break;
-  // Scalar Unsigned Rounding Shift Right (Immediate)
-  case AArch64::BI__builtin_neon_vrshrd_n_u64:
-    Int = Intrinsic::aarch64_neon_vurshr;
-    s = "vurshr"; OverloadInt = true; break;
-  // Scalar Signed Shift Right and Accumulate (Immediate)
-  case AArch64::BI__builtin_neon_vsrad_n_s64:
-    Int = Intrinsic::aarch64_neon_vsrads_n;
-    s = "vssra"; OverloadInt = false; break;
-  // Scalar Unsigned Shift Right and Accumulate (Immediate)
-  case AArch64::BI__builtin_neon_vsrad_n_u64:
-    Int = Intrinsic::aarch64_neon_vsradu_n;
-    s = "vusra"; OverloadInt = false; break;
-  // Scalar Signed Rounding Shift Right and Accumulate (Immediate)
-  case AArch64::BI__builtin_neon_vrsrad_n_s64:
-    Int = Intrinsic::aarch64_neon_vrsrads_n;
-    s = "vsrsra"; OverloadInt = false; break;
-  // Scalar Unsigned Rounding Shift Right and Accumulate (Immediate)
-  case AArch64::BI__builtin_neon_vrsrad_n_u64:
-    Int = Intrinsic::aarch64_neon_vrsradu_n;
-    s = "vursra"; OverloadInt = false; break;
-  // Scalar Signed/Unsigned Shift Left (Immediate)
-  case AArch64::BI__builtin_neon_vshld_n_s64:
-  case AArch64::BI__builtin_neon_vshld_n_u64:
-    Int = Intrinsic::aarch64_neon_vshld_n;
-    s = "vshl"; OverloadInt = false; break;
-  // Signed Saturating Shift Left (Immediate)
-  case AArch64::BI__builtin_neon_vqshlb_n_s8:
-  case AArch64::BI__builtin_neon_vqshlh_n_s16:
-  case AArch64::BI__builtin_neon_vqshls_n_s32:
-  case AArch64::BI__builtin_neon_vqshld_n_s64:
-    Int = Intrinsic::aarch64_neon_vqshls_n;
-    s = "vsqshl"; OverloadInt = true; break;
-  // Unsigned Saturating Shift Left (Immediate)
-  case AArch64::BI__builtin_neon_vqshlb_n_u8:
-  case AArch64::BI__builtin_neon_vqshlh_n_u16:
-  case AArch64::BI__builtin_neon_vqshls_n_u32:
-  case AArch64::BI__builtin_neon_vqshld_n_u64:
-    Int = Intrinsic::aarch64_neon_vqshlu_n;
-    s = "vuqshl"; OverloadInt = true; break;
-  // Signed Saturating Shift Left Unsigned (Immediate)
-  case AArch64::BI__builtin_neon_vqshlub_n_s8:
-  case AArch64::BI__builtin_neon_vqshluh_n_s16:
-  case AArch64::BI__builtin_neon_vqshlus_n_s32:
-  case AArch64::BI__builtin_neon_vqshlud_n_s64:
-    Int = Intrinsic::aarch64_neon_vsqshlu;
-    s = "vsqshlu"; OverloadInt = true; break;
-  // Shift Right And Insert (Immediate)
-  case AArch64::BI__builtin_neon_vsrid_n_s64:
-  case AArch64::BI__builtin_neon_vsrid_n_u64:
-    Int = Intrinsic::aarch64_neon_vsri;
-    s = "vsri"; OverloadInt = true; break;
-  // Shift Left And Insert (Immediate)
-  case AArch64::BI__builtin_neon_vslid_n_s64:
-  case AArch64::BI__builtin_neon_vslid_n_u64:
-    Int = Intrinsic::aarch64_neon_vsli;
-    s = "vsli"; OverloadInt = true; break;
-  // Signed Saturating Shift Right Narrow (Immediate)
-  case AArch64::BI__builtin_neon_vqshrnh_n_s16:
-  case AArch64::BI__builtin_neon_vqshrns_n_s32:
-  case AArch64::BI__builtin_neon_vqshrnd_n_s64:
-    Int = Intrinsic::aarch64_neon_vsqshrn;
-    s = "vsqshrn"; OverloadInt = true; break;
-  // Unsigned Saturating Shift Right Narrow (Immediate)
-  case AArch64::BI__builtin_neon_vqshrnh_n_u16:
-  case AArch64::BI__builtin_neon_vqshrns_n_u32:
-  case AArch64::BI__builtin_neon_vqshrnd_n_u64:
-    Int = Intrinsic::aarch64_neon_vuqshrn;
-    s = "vuqshrn"; OverloadInt = true; break;
-  // Signed Saturating Rounded Shift Right Narrow (Immediate)
-  case AArch64::BI__builtin_neon_vqrshrnh_n_s16:
-  case AArch64::BI__builtin_neon_vqrshrns_n_s32:
-  case AArch64::BI__builtin_neon_vqrshrnd_n_s64:
-    Int = Intrinsic::aarch64_neon_vsqrshrn;
-    s = "vsqrshrn"; OverloadInt = true; break;
-  // Unsigned Saturating Rounded Shift Right Narrow (Immediate)
-  case AArch64::BI__builtin_neon_vqrshrnh_n_u16:
-  case AArch64::BI__builtin_neon_vqrshrns_n_u32:
-  case AArch64::BI__builtin_neon_vqrshrnd_n_u64:
-    Int = Intrinsic::aarch64_neon_vuqrshrn;
-    s = "vuqrshrn"; OverloadInt = true; break;
-  // Signed Saturating Shift Right Unsigned Narrow (Immediate)
-  case AArch64::BI__builtin_neon_vqshrunh_n_s16:
-  case AArch64::BI__builtin_neon_vqshruns_n_s32:
-  case AArch64::BI__builtin_neon_vqshrund_n_s64:
-    Int = Intrinsic::aarch64_neon_vsqshrun;
-    s = "vsqshrun"; OverloadInt = true; break;
-  // Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate)
-  case AArch64::BI__builtin_neon_vqrshrunh_n_s16:
-  case AArch64::BI__builtin_neon_vqrshruns_n_s32:
-  case AArch64::BI__builtin_neon_vqrshrund_n_s64:
-    Int = Intrinsic::aarch64_neon_vsqrshrun;
-    s = "vsqrshrun"; OverloadInt = true; break;
-  // Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
-  case AArch64::BI__builtin_neon_vcvts_n_f32_s32:
-    Int = Intrinsic::aarch64_neon_vcvtf32_n_s32;
-    s = "vcvtf"; OverloadInt = false; break;
-  case AArch64::BI__builtin_neon_vcvtd_n_f64_s64:
-    Int = Intrinsic::aarch64_neon_vcvtf64_n_s64;
-    s = "vcvtf"; OverloadInt = false; break;
-  // Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
-  case AArch64::BI__builtin_neon_vcvts_n_f32_u32:
-    Int = Intrinsic::aarch64_neon_vcvtf32_n_u32;
-    s = "vcvtf"; OverloadInt = false; break;
-  case AArch64::BI__builtin_neon_vcvtd_n_f64_u64:
-    Int = Intrinsic::aarch64_neon_vcvtf64_n_u64;
-    s = "vcvtf"; OverloadInt = false; break;
-  // Scalar Floating-point Convert To Signed Fixed-point (Immediate)
-  case AArch64::BI__builtin_neon_vcvts_n_s32_f32:
-    Int = Intrinsic::aarch64_neon_vcvts_n_s32_f32;
-    s = "fcvtzs"; OverloadInt = false; break;
-  case AArch64::BI__builtin_neon_vcvtd_n_s64_f64:
-    Int = Intrinsic::aarch64_neon_vcvtd_n_s64_f64;
-    s = "fcvtzs"; OverloadInt = false; break;
-  // Scalar Floating-point Convert To Unsigned Fixed-point (Immediate)
-  case AArch64::BI__builtin_neon_vcvts_n_u32_f32:
-    Int = Intrinsic::aarch64_neon_vcvts_n_u32_f32;
-    s = "fcvtzu"; OverloadInt = false; break;
-  case AArch64::BI__builtin_neon_vcvtd_n_u64_f64:
-    Int = Intrinsic::aarch64_neon_vcvtd_n_u64_f64;
-    s = "fcvtzu"; OverloadInt = false; break;
+    break;
   }
 
-  if (!Int)
+  // It didn't need any handling specific to the AArch64 backend, so defer to
+  // common code.
+  return EmitCommonNeonSISDBuiltinExpr(CGF, SISDInfo, Ops, E);
+}
+
+Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
+    unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
+    const char *NameHint, unsigned Modifier, const CallExpr *E,
+    SmallVectorImpl<llvm::Value *> &Ops, llvm::Value *Align) {
+  // Get the last argument, which specifies the vector type.
+  llvm::APSInt NeonTypeConst;
+  const Expr *Arg = E->getArg(E->getNumArgs() - 1);
+  if (!Arg->isIntegerConstantExpr(NeonTypeConst, getContext()))
     return 0;
 
-  // AArch64 scalar builtin that returns scalar type
-  // and should be mapped to AArch64 intrinsic that returns
-  // one-element vector type.
-  Function *F = 0;
-  if (AcrossVec) {
-    // Gen arg type
-    const Expr *Arg = E->getArg(E->getNumArgs()-1);
-    llvm::Type *Ty = CGF.ConvertType(Arg->getType());
-    llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
-    llvm::Type *ETy = VTy->getElementType();
-    llvm::VectorType *RTy = llvm::VectorType::get(ETy, 1);
-  
-    if (ExtendEle) {
-      assert(!ETy->isFloatingPointTy());
-      RTy = llvm::VectorType::getExtendedElementVectorType(RTy);
+  // Determine the type of this overloaded NEON intrinsic.
+  NeonTypeFlags Type(NeonTypeConst.getZExtValue());
+  bool Usgn = Type.isUnsigned();
+  bool Quad = Type.isQuad();
+
+  llvm::VectorType *VTy = GetNeonType(this, Type);
+  llvm::Type *Ty = VTy;
+  if (!Ty)
+    return 0;
+
+  unsigned Int = LLVMIntrinsic;
+  if ((Modifier & UnsignedAlts) && !Usgn)
+    Int = AltLLVMIntrinsic;
+
+  switch (BuiltinID) {
+  default: break;
+  case NEON::BI__builtin_neon_vabs_v:
+  case NEON::BI__builtin_neon_vabsq_v:
+    if (VTy->getElementType()->isFloatingPointTy())
+      return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
+    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
+  case NEON::BI__builtin_neon_vaddhn_v: {
+    llvm::VectorType *SrcTy =
+        llvm::VectorType::getExtendedElementVectorType(VTy);
+
+    // %sum = add <4 x i32> %lhs, %rhs
+    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
+    Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
+    Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn");
+
+    // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
+    Constant *ShiftAmt = ConstantInt::get(SrcTy->getElementType(),
+                                       SrcTy->getScalarSizeInBits() / 2);
+    ShiftAmt = ConstantVector::getSplat(VTy->getNumElements(), ShiftAmt);
+    Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn");
+
+    // %res = trunc <4 x i32> %high to <4 x i16>
+    return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
+  }
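
The IR comments above give the lowering; at the source level each result lane is the high half of the widened sum. A minimal sketch, assuming arm_neon.h on an ARM or AArch64 target (demo name illustrative):

  #include <arm_neon.h>

  // Each lane: (int16_t)((a[i] + b[i]) >> 16), i.e. the add/lshr/trunc above.
  int16x4_t addhn_demo(int32x4_t a, int32x4_t b) {
    return vaddhn_s32(a, b);
  }
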
+  case NEON::BI__builtin_neon_vcale_v:
+  case NEON::BI__builtin_neon_vcaleq_v:
+  case NEON::BI__builtin_neon_vcalt_v:
+  case NEON::BI__builtin_neon_vcaltq_v:
+    std::swap(Ops[0], Ops[1]);
+  case NEON::BI__builtin_neon_vcage_v:
+  case NEON::BI__builtin_neon_vcageq_v:
+  case NEON::BI__builtin_neon_vcagt_v:
+  case NEON::BI__builtin_neon_vcagtq_v: {
+    llvm::Type *VecFlt = llvm::VectorType::get(
+        VTy->getScalarSizeInBits() == 32 ? FloatTy : DoubleTy,
+        VTy->getNumElements());
+    llvm::Type *Tys[] = { VTy, VecFlt };
+    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
+    return EmitNeonCall(F, Ops, NameHint);
+  }
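
These are the absolute comparisons; the vcale/vcalt cases fall through after swapping operands so that only the vcage/vcagt intrinsics are needed. A usage sketch (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // Lane i is all-ones when |a[i]| >= |b[i]|, else zero.
  uint32x4_t cage_demo(float32x4_t a, float32x4_t b) {
    return vcageq_f32(a, b);
  }
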
+  case NEON::BI__builtin_neon_vclz_v:
+  case NEON::BI__builtin_neon_vclzq_v:
+    // We generate a target-independent intrinsic, which needs a second
+    // argument for whether clz of zero is undefined; on ARM it isn't.
+    Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
+    break;
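
A sketch of why that flag is false on ARM (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // vclz of a zero lane is defined as the lane width (32 here), which is why
  // llvm.ctlz is called with an is_zero_undef flag of false above.
  int32x4_t clz_demo(int32x4_t x) {
    return vclzq_s32(x);
  }
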
+  case NEON::BI__builtin_neon_vcvt_f32_v:
+  case NEON::BI__builtin_neon_vcvtq_f32_v:
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad));
+    return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
+                : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
+  case NEON::BI__builtin_neon_vcvt_n_f32_v:
+  case NEON::BI__builtin_neon_vcvt_n_f64_v:
+  case NEON::BI__builtin_neon_vcvtq_n_f32_v:
+  case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
+    bool Double =
+      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
+    llvm::Type *FloatTy =
+        GetNeonType(this, NeonTypeFlags(Double ? NeonTypeFlags::Float64
+                                               : NeonTypeFlags::Float32,
+                                        false, Quad));
+    llvm::Type *Tys[2] = { FloatTy, Ty };
+    Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
+    Function *F = CGM.getIntrinsic(Int, Tys);
+    return EmitNeonCall(F, Ops, "vcvt_n");
+  }
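
vcvt_n converts from fixed point, with the immediate giving the number of fractional bits. A usage sketch (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // Each lane: (float)x[i] / 256.0f, i.e. 8 fractional bits.
  float32x4_t cvt_n_demo(int32x4_t x) {
    return vcvtq_n_f32_s32(x, 8);
  }
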
+  case NEON::BI__builtin_neon_vcvt_n_s32_v:
+  case NEON::BI__builtin_neon_vcvt_n_u32_v:
+  case NEON::BI__builtin_neon_vcvt_n_s64_v:
+  case NEON::BI__builtin_neon_vcvt_n_u64_v:
+  case NEON::BI__builtin_neon_vcvtq_n_s32_v:
+  case NEON::BI__builtin_neon_vcvtq_n_u32_v:
+  case NEON::BI__builtin_neon_vcvtq_n_s64_v:
+  case NEON::BI__builtin_neon_vcvtq_n_u64_v: {
+    bool Double =
+      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
+    llvm::Type *FloatTy =
+        GetNeonType(this, NeonTypeFlags(Double ? NeonTypeFlags::Float64
+                                               : NeonTypeFlags::Float32,
+                                        false, Quad));
+    llvm::Type *Tys[2] = { Ty, FloatTy };
+    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
+    return EmitNeonCall(F, Ops, "vcvt_n");
+  }
+  case NEON::BI__builtin_neon_vcvt_s32_v:
+  case NEON::BI__builtin_neon_vcvt_u32_v:
+  case NEON::BI__builtin_neon_vcvt_s64_v:
+  case NEON::BI__builtin_neon_vcvt_u64_v:
+  case NEON::BI__builtin_neon_vcvtq_s32_v:
+  case NEON::BI__builtin_neon_vcvtq_u32_v:
+  case NEON::BI__builtin_neon_vcvtq_s64_v:
+  case NEON::BI__builtin_neon_vcvtq_u64_v: {
+    bool Double =
+      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
+    llvm::Type *FloatTy =
+        GetNeonType(this, NeonTypeFlags(Double ? NeonTypeFlags::Float64
+                                               : NeonTypeFlags::Float32,
+                                        false, Quad));
+    Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
+    return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
+                : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
+  }
+  case NEON::BI__builtin_neon_vcvta_s32_v:
+  case NEON::BI__builtin_neon_vcvta_s64_v:
+  case NEON::BI__builtin_neon_vcvta_u32_v:
+  case NEON::BI__builtin_neon_vcvta_u64_v:
+  case NEON::BI__builtin_neon_vcvtaq_s32_v:
+  case NEON::BI__builtin_neon_vcvtaq_s64_v:
+  case NEON::BI__builtin_neon_vcvtaq_u32_v:
+  case NEON::BI__builtin_neon_vcvtaq_u64_v:
+  case NEON::BI__builtin_neon_vcvtn_s32_v:
+  case NEON::BI__builtin_neon_vcvtn_s64_v:
+  case NEON::BI__builtin_neon_vcvtn_u32_v:
+  case NEON::BI__builtin_neon_vcvtn_u64_v:
+  case NEON::BI__builtin_neon_vcvtnq_s32_v:
+  case NEON::BI__builtin_neon_vcvtnq_s64_v:
+  case NEON::BI__builtin_neon_vcvtnq_u32_v:
+  case NEON::BI__builtin_neon_vcvtnq_u64_v:
+  case NEON::BI__builtin_neon_vcvtp_s32_v:
+  case NEON::BI__builtin_neon_vcvtp_s64_v:
+  case NEON::BI__builtin_neon_vcvtp_u32_v:
+  case NEON::BI__builtin_neon_vcvtp_u64_v:
+  case NEON::BI__builtin_neon_vcvtpq_s32_v:
+  case NEON::BI__builtin_neon_vcvtpq_s64_v:
+  case NEON::BI__builtin_neon_vcvtpq_u32_v:
+  case NEON::BI__builtin_neon_vcvtpq_u64_v:
+  case NEON::BI__builtin_neon_vcvtm_s32_v:
+  case NEON::BI__builtin_neon_vcvtm_s64_v:
+  case NEON::BI__builtin_neon_vcvtm_u32_v:
+  case NEON::BI__builtin_neon_vcvtm_u64_v:
+  case NEON::BI__builtin_neon_vcvtmq_s32_v:
+  case NEON::BI__builtin_neon_vcvtmq_s64_v:
+  case NEON::BI__builtin_neon_vcvtmq_u32_v:
+  case NEON::BI__builtin_neon_vcvtmq_u64_v: {
+    bool Double =
+      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
+    llvm::Type *InTy =
+      GetNeonType(this,
+                  NeonTypeFlags(Double ? NeonTypeFlags::Float64
+                                : NeonTypeFlags::Float32, false, Quad));
+    llvm::Type *Tys[2] = { Ty, InTy };
+    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
+  }
+  case NEON::BI__builtin_neon_vext_v:
+  case NEON::BI__builtin_neon_vextq_v: {
+    int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
+    SmallVector<Constant*, 16> Indices;
+    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
+      Indices.push_back(ConstantInt::get(Int32Ty, i+CV));
+
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Value *SV = llvm::ConstantVector::get(Indices);
+    return Builder.CreateShuffleVector(Ops[0], Ops[1], SV, "vext");
+  }
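
vext is pure data movement, so it lowers to a single shufflevector whose indices start at the immediate. A usage sketch (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // Result: { a[3], a[4], a[5], a[6], a[7], b[0], b[1], b[2] }.
  int8x8_t ext_demo(int8x8_t a, int8x8_t b) {
    return vext_s8(a, b, 3);
  }
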
+  case NEON::BI__builtin_neon_vfma_v:
+  case NEON::BI__builtin_neon_vfmaq_v: {
+    Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+
+    // The NEON intrinsic puts the accumulator first, unlike LLVM's fma.
+    return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
+  }
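
The operand swap matters because the builtin's first argument is the addend. A sketch of the equivalence (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // vfmaq_f32(acc, x, y) == acc + x * y per lane, so Ops[1] and Ops[2] become
  // llvm.fma's multiplicands and Ops[0] its addend.
  float32x4_t fma_demo(float32x4_t acc, float32x4_t x, float32x4_t y) {
    return vfmaq_f32(acc, x, y);
  }
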
+  case NEON::BI__builtin_neon_vld1_v:
+  case NEON::BI__builtin_neon_vld1q_v:
+    Ops.push_back(Align);
+    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vld1");
+  case NEON::BI__builtin_neon_vld2_v:
+  case NEON::BI__builtin_neon_vld2q_v:
+  case NEON::BI__builtin_neon_vld3_v:
+  case NEON::BI__builtin_neon_vld3q_v:
+  case NEON::BI__builtin_neon_vld4_v:
+  case NEON::BI__builtin_neon_vld4q_v: {
+    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Ty);
+    Ops[1] = Builder.CreateCall2(F, Ops[1], Align, NameHint);
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vld1_dup_v:
+  case NEON::BI__builtin_neon_vld1q_dup_v: {
+    Value *V = UndefValue::get(Ty);
+    Ty = llvm::PointerType::getUnqual(VTy->getElementType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    LoadInst *Ld = Builder.CreateLoad(Ops[0]);
+    Ld->setAlignment(cast<ConstantInt>(Align)->getZExtValue());
+    llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
+    Ops[0] = Builder.CreateInsertElement(V, Ld, CI);
+    return EmitNeonSplat(Ops[0], CI);
+  }
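
vld1_dup loads a single element and broadcasts it; the insertelement above feeds EmitNeonSplat, which emits the splatting shufflevector. A usage sketch (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // All four lanes receive *p.
  float32x4_t ld1_dup_demo(const float *p) {
    return vld1q_dup_f32(p);
  }
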
+  case NEON::BI__builtin_neon_vld2_lane_v:
+  case NEON::BI__builtin_neon_vld2q_lane_v:
+  case NEON::BI__builtin_neon_vld3_lane_v:
+  case NEON::BI__builtin_neon_vld3q_lane_v:
+  case NEON::BI__builtin_neon_vld4_lane_v:
+  case NEON::BI__builtin_neon_vld4q_lane_v: {
+    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Ty);
+    for (unsigned I = 2; I < Ops.size() - 1; ++I)
+      Ops[I] = Builder.CreateBitCast(Ops[I], Ty);
+    Ops.push_back(Align);
+    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), NameHint);
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vmovl_v: {
+    llvm::Type *DTy = llvm::VectorType::getTruncatedElementVectorType(VTy);
+    Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
+    if (Usgn)
+      return Builder.CreateZExt(Ops[0], Ty, "vmovl");
+    return Builder.CreateSExt(Ops[0], Ty, "vmovl");
+  }
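
vmovl is a pure widening and needs no intrinsic at all, only a sign or zero extension. A usage sketch (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // Sign-extends each i16 lane to i32; vmovl_u16 would zero-extend instead.
  int32x4_t movl_demo(int16x4_t x) {
    return vmovl_s16(x);
  }
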
+  case NEON::BI__builtin_neon_vmovn_v: {
+    llvm::Type *QTy = llvm::VectorType::getExtendedElementVectorType(VTy);
+    Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
+    return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
+  }
+  case NEON::BI__builtin_neon_vmull_v:
+    // FIXME: the integer vmull operations could be emitted in terms of pure
+    // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
+    // hoisting the exts outside loops. Until global ISel comes along that can
+    // see through such movement, this leads to bad CodeGen. So we need an
+    // intrinsic for now.
+    Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
+    Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
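
For reference, the pure-IR lowering the FIXME alludes to would look roughly like the following hypothetical helper (a sketch against the LLVM C++ API, not code from this patch):

  #include "llvm/IR/IRBuilder.h"

  // Hypothetical alternative: widen both operands, then multiply wide.
  llvm::Value *emitVmullAsPureIR(llvm::IRBuilder<> &B, llvm::Value *L,
                                 llvm::Value *R, llvm::Type *WideTy,
                                 bool Usgn) {
    L = Usgn ? B.CreateZExt(L, WideTy) : B.CreateSExt(L, WideTy);
    R = Usgn ? B.CreateZExt(R, WideTy) : B.CreateSExt(R, WideTy);
    return B.CreateMul(L, R, "vmull");
  }
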
+  case NEON::BI__builtin_neon_vpadal_v:
+  case NEON::BI__builtin_neon_vpadalq_v: {
+    // The source operand type has twice as many elements of half the size.
+    unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
+    llvm::Type *EltTy =
+      llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
+    llvm::Type *NarrowTy =
+      llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
+    llvm::Type *Tys[2] = { Ty, NarrowTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
+  }
+  case NEON::BI__builtin_neon_vpaddl_v:
+  case NEON::BI__builtin_neon_vpaddlq_v: {
+    // The source operand type has twice as many elements of half the size.
+    unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
+    llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
+    llvm::Type *NarrowTy =
+      llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
+    llvm::Type *Tys[2] = { Ty, NarrowTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
+  }
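
vpaddl widens by summing adjacent pairs, which is why the intrinsic is overloaded on both the wide result and the narrow source type. A usage sketch (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // Result lane i: (int16_t)x[2*i] + x[2*i + 1].
  int16x4_t paddl_demo(int8x8_t x) {
    return vpaddl_s8(x);
  }
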
+  case NEON::BI__builtin_neon_vqdmlal_v:
+  case NEON::BI__builtin_neon_vqdmlsl_v: {
+    SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
+    Value *Mul = EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty),
+                              MulOps, "vqdmlal");
+
+    SmallVector<Value *, 2> AccumOps;
+    AccumOps.push_back(Ops[0]);
+    AccumOps.push_back(Mul);
+    return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty),
+                        AccumOps, NameHint);
+  }
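
The two-step emission above mirrors the architecture: a saturating-doubling widening multiply followed by a saturating accumulate. A usage sketch (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // Lane i: saturate(a[i] + saturate(2 * (int32_t)b[i] * c[i])).
  int32x4_t qdmlal_demo(int32x4_t a, int16x4_t b, int16x4_t c) {
    return vqdmlal_s16(a, b, c);
  }
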
+  case NEON::BI__builtin_neon_vqshl_n_v:
+  case NEON::BI__builtin_neon_vqshlq_n_v:
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
+                        1, false);
+  case NEON::BI__builtin_neon_vrecpe_v:
+  case NEON::BI__builtin_neon_vrecpeq_v:
+  case NEON::BI__builtin_neon_vrsqrte_v:
+  case NEON::BI__builtin_neon_vrsqrteq_v:
+    Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
+
+  case NEON::BI__builtin_neon_vshl_n_v:
+  case NEON::BI__builtin_neon_vshlq_n_v:
+    Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false);
+    return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1],
+                             "vshl_n");
+  case NEON::BI__builtin_neon_vshll_n_v: {
+    llvm::Type *SrcTy = llvm::VectorType::getTruncatedElementVectorType(VTy);
+    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
+    if (Usgn)
+      Ops[0] = Builder.CreateZExt(Ops[0], VTy);
+    else
+      Ops[0] = Builder.CreateSExt(Ops[0], VTy);
+    Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false);
+    return Builder.CreateShl(Ops[0], Ops[1], "vshll_n");
+  }
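
vshll_n widens before shifting, so no bits are lost for shift amounts below the source element width. A usage sketch (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // Each lane: ((int32_t)x[i]) << 4.
  int32x4_t shll_demo(int16x4_t x) {
    return vshll_n_s16(x, 4);
  }
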
+  case NEON::BI__builtin_neon_vshrn_n_v: {
+    llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy);
+    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
+    Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false);
+    if (Usgn)
+      Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]);
+    else
+      Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]);
+    return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n");
+  }
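
vshrn_n is the mirror image: shift in the wide type, then truncate. A usage sketch (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // Each lane: (int16_t)(x[i] >> 8).
  int16x4_t shrn_demo(int32x4_t x) {
    return vshrn_n_s32(x, 8);
  }
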
+  case NEON::BI__builtin_neon_vshr_n_v:
+  case NEON::BI__builtin_neon_vshrq_n_v:
+    return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n");
+  case NEON::BI__builtin_neon_vst1_v:
+  case NEON::BI__builtin_neon_vst1q_v:
+  case NEON::BI__builtin_neon_vst2_v:
+  case NEON::BI__builtin_neon_vst2q_v:
+  case NEON::BI__builtin_neon_vst3_v:
+  case NEON::BI__builtin_neon_vst3q_v:
+  case NEON::BI__builtin_neon_vst4_v:
+  case NEON::BI__builtin_neon_vst4q_v:
+  case NEON::BI__builtin_neon_vst2_lane_v:
+  case NEON::BI__builtin_neon_vst2q_lane_v:
+  case NEON::BI__builtin_neon_vst3_lane_v:
+  case NEON::BI__builtin_neon_vst3q_lane_v:
+  case NEON::BI__builtin_neon_vst4_lane_v:
+  case NEON::BI__builtin_neon_vst4q_lane_v:
+    Ops.push_back(Align);
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "");
+  case NEON::BI__builtin_neon_vsubhn_v: {
+    llvm::VectorType *SrcTy =
+        llvm::VectorType::getExtendedElementVectorType(VTy);
+
+    // %diff = sub <4 x i32> %lhs, %rhs
+    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
+    Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
+    Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn");
+
+    // %high = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
+    Constant *ShiftAmt = ConstantInt::get(SrcTy->getElementType(),
+                                       SrcTy->getScalarSizeInBits() / 2);
+    ShiftAmt = ConstantVector::getSplat(VTy->getNumElements(), ShiftAmt);
+    Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn");
+
+    // %res = trunc <4 x i32> %high to <4 x i16>
+    return Builder.CreateTrunc(Ops[0], VTy, "vsubhn");
+  }
+  case NEON::BI__builtin_neon_vtrn_v:
+  case NEON::BI__builtin_neon_vtrnq_v: {
+    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Value *SV = 0;
+
+    for (unsigned vi = 0; vi != 2; ++vi) {
+      SmallVector<Constant*, 16> Indices;
+      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
+        Indices.push_back(Builder.getInt32(i+vi));
+        Indices.push_back(Builder.getInt32(i+e+vi));
+      }
+      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
+      SV = llvm::ConstantVector::get(Indices);
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vtrn");
+      SV = Builder.CreateStore(SV, Addr);
     }
+    return SV;
+  }
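
vtrn (and vuzp/vzip below) produce two result vectors, so both shuffles are stored through the result pointer in Ops[0]. A usage sketch of the transpose pattern (arm_neon.h assumed; demo name illustrative):

  #include <arm_neon.h>

  // r.val[0] = { a[0], b[0] }, r.val[1] = { a[1], b[1] }.
  int32x2x2_t trn_demo(int32x2_t a, int32x2_t b) {
    return vtrn_s32(a, b);
  }
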
+  case NEON::BI__builtin_neon_vtst_v:
+  case NEON::BI__builtin_neon_vtstq_v: {
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
+    Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
+                                ConstantAggregateZero::get(Ty));
+    return Builder.CreateSExt(Ops[0], Ty, "vtst");
+  }
+  case NEON::BI__builtin_neon_vuzp_v:
+  case NEON::BI__builtin_neon_vuzpq_v: {
+    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Value *SV = 0;
 
-    llvm::Type *Tys[2] = {RTy, VTy};
-    F = CGF.CGM.getIntrinsic(Int, Tys);
-    assert(E->getNumArgs() == 1);
-  } else if (OverloadInt) {
-    // Determine the type of this overloaded AArch64 intrinsic
-    llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
-    llvm::VectorType *VTy = llvm::VectorType::get(Ty, 1);
-    assert(VTy);
+    for (unsigned vi = 0; vi != 2; ++vi) {
+      SmallVector<Constant*, 16> Indices;
+      for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
+        Indices.push_back(ConstantInt::get(Int32Ty, 2*i+vi));
 
-    F = CGF.CGM.getIntrinsic(Int, VTy);
-  } else if (OverloadWideInt || OverloadNarrowInt) {
-    // Determine the type of this overloaded AArch64 intrinsic
-    const Expr *Arg = E->getArg(E->getNumArgs()-1);
-    llvm::Type *Ty = CGF.ConvertType(Arg->getType());
-    llvm::VectorType *VTy = llvm::VectorType::get(Ty, 1);
-    llvm::VectorType *RTy = OverloadWideInt ? 
-      llvm::VectorType::getExtendedElementVectorType(VTy) :
-      llvm::VectorType::getTruncatedElementVectorType(VTy);
-    F = CGF.CGM.getIntrinsic(Int, RTy);
-  } else if (OverloadCmpInt) {
-    // Determine the types of this overloaded AArch64 intrinsic
-    SmallVector<llvm::Type *, 3> Tys;
-    const Expr *Arg = E->getArg(E->getNumArgs()-1);
-    llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
-    llvm::VectorType *VTy = llvm::VectorType::get(Ty, 1);
-    Tys.push_back(VTy);
-    Ty = CGF.ConvertType(Arg->getType());
-    VTy = llvm::VectorType::get(Ty, 1);
-    Tys.push_back(VTy);
-    if(IsFpCmpZInt)
-      VTy = llvm::VectorType::get(CGF.FloatTy, 1);
-    Tys.push_back(VTy);
+      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
+      SV = llvm::ConstantVector::get(Indices);
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vuzp");
+      SV = Builder.CreateStore(SV, Addr);
+    }
+    return SV;
+  }
+  case NEON::BI__builtin_neon_vzip_v:
+  case NEON::BI__builtin_neon_vzipq_v: {
+    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Value *SV = 0;
 
-    F = CGF.CGM.getIntrinsic(Int, Tys);
-  } else if (OverloadCvtInt) {
-    // Determine the types of this overloaded AArch64 intrinsic
-    SmallVector<llvm::Type *, 2> Tys;
-    const Expr *Arg = E->getArg(E->getNumArgs()-1);
-    llvm::Type *Ty = CGF.ConvertType(E->getCallReturnType());
-    llvm::VectorType *VTy = llvm::VectorType::get(Ty, 1);
-    Tys.push_back(VTy);
-    Ty = CGF.ConvertType(Arg->getType());
-    VTy = llvm::VectorType::get(Ty, 1);
-    Tys.push_back(VTy);
+    for (unsigned vi = 0; vi != 2; ++vi) {
+      SmallVector<Constant*, 16> Indices;
+      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
+        Indices.push_back(ConstantInt::get(Int32Ty, (i + vi*e) >> 1));
+        Indices.push_back(ConstantInt::get(Int32Ty, ((i + vi*e) >> 1)+e));
+      }
+      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
+      SV = llvm::ConstantVector::get(Indices);
+      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], SV, "vzip");
+      SV = Builder.CreateStore(SV, Addr);
+    }
+    return SV;
+  }
+  }
 
-    F = CGF.CGM.getIntrinsic(Int, Tys);
-  } else
-    F = CGF.CGM.getIntrinsic(Int);
+  assert(Int && "Expected valid intrinsic number");
 
-  Value *Result = CGF.EmitNeonCall(F, Ops, s);
-  llvm::Type *ResultType = CGF.ConvertType(E->getType());
+  // Determine the type(s) of this overloaded NEON intrinsic.
+  Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);
+
+  Value *Result = EmitNeonCall(F, Ops, NameHint);
+  llvm::Type *ResultType = ConvertType(E->getType());
   // AArch64 intrinsic one-element vector type cast to
   // scalar type expected by the builtin
-  return CGF.Builder.CreateBitCast(Result, ResultType, s);
+  return Builder.CreateBitCast(Result, ResultType, NameHint);
 }
 
 Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
     Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp,
     const CmpInst::Predicate Ip, const Twine &Name) {
-  llvm::Type *OTy = ((llvm::User *)Op)->getOperand(0)->getType();
-  if (OTy->isPointerTy())
-    OTy = Ty;
+  llvm::Type *OTy = Op->getType();
+
+  // FIXME: this is utterly horrific. We should not be looking at previous
+  // codegen context to find out what needs doing. Unfortunately TableGen
+  // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
+  // (etc).
+  if (BitCastInst *BI = dyn_cast<BitCastInst>(Op))
+    OTy = BI->getOperand(0)->getType();
+
   Op = Builder.CreateBitCast(Op, OTy);
-  if (((llvm::VectorType *)OTy)->getElementType()->isFloatingPointTy()) {
-    Op = Builder.CreateFCmp(Fp, Op, ConstantAggregateZero::get(OTy));
+  if (OTy->getScalarType()->isFloatingPointTy()) {
+    Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy));
   } else {
-    Op = Builder.CreateICmp(Ip, Op, ConstantAggregateZero::get(OTy));
+    Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy));
   }
-  return Builder.CreateZExt(Op, Ty, Name);
+  return Builder.CreateSExt(Op, Ty, Name);
 }
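
The zext-to-sext change is what gives these comparisons their all-ones/all-zeros lane masks. A usage sketch (assuming an AArch64 target with arm_neon.h; demo name illustrative):

  #include <arm_neon.h>

  // Each lane: 0xFFFFFFFF when x[i] == 0.0f, else 0 -- an i1 sign-extended to
  // the lane width, matching the CreateSExt above.
  uint32x2_t ceqz_demo(float32x2_t x) {
    return vceqz_f32(x);
  }
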
 
 static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
@@ -2681,13 +3526,9 @@
                                                      ZeroTbl, SV, Name));
   }
 
-  TblTy = llvm::VectorType::get(TblTy->getElementType(),
-                                2*TblTy->getNumElements());
-  llvm::Type *Tys[2] = { ResTy, TblTy };
-
   Function *TblF;
   TblOps.push_back(IndexOp);
-  TblF = CGF.CGM.getIntrinsic(IntID, Tys);
+  TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
   
   return CGF.EmitNeonCall(TblF, TblOps, Name);
 }
@@ -2698,37 +3539,33 @@
   unsigned int Int = 0;
   const char *s = NULL;
 
-  unsigned TblPos;
   switch (BuiltinID) {
   default:
     return 0;
-  case AArch64::BI__builtin_neon_vtbl1_v:
-  case AArch64::BI__builtin_neon_vqtbl1_v:
-  case AArch64::BI__builtin_neon_vqtbl1q_v:
-  case AArch64::BI__builtin_neon_vtbl2_v:
-  case AArch64::BI__builtin_neon_vqtbl2_v:
-  case AArch64::BI__builtin_neon_vqtbl2q_v:
-  case AArch64::BI__builtin_neon_vtbl3_v:
-  case AArch64::BI__builtin_neon_vqtbl3_v:
-  case AArch64::BI__builtin_neon_vqtbl3q_v:
-  case AArch64::BI__builtin_neon_vtbl4_v:
-  case AArch64::BI__builtin_neon_vqtbl4_v:
-  case AArch64::BI__builtin_neon_vqtbl4q_v:
-    TblPos = 0;
-    break;
-  case AArch64::BI__builtin_neon_vtbx1_v:
-  case AArch64::BI__builtin_neon_vqtbx1_v:
-  case AArch64::BI__builtin_neon_vqtbx1q_v:
-  case AArch64::BI__builtin_neon_vtbx2_v:
-  case AArch64::BI__builtin_neon_vqtbx2_v:
-  case AArch64::BI__builtin_neon_vqtbx2q_v:
-  case AArch64::BI__builtin_neon_vtbx3_v:
-  case AArch64::BI__builtin_neon_vqtbx3_v:
-  case AArch64::BI__builtin_neon_vqtbx3q_v:
-  case AArch64::BI__builtin_neon_vtbx4_v:
-  case AArch64::BI__builtin_neon_vqtbx4_v:
-  case AArch64::BI__builtin_neon_vqtbx4q_v:
-    TblPos = 1;
+  case NEON::BI__builtin_neon_vtbl1_v:
+  case NEON::BI__builtin_neon_vqtbl1_v:
+  case NEON::BI__builtin_neon_vqtbl1q_v:
+  case NEON::BI__builtin_neon_vtbl2_v:
+  case NEON::BI__builtin_neon_vqtbl2_v:
+  case NEON::BI__builtin_neon_vqtbl2q_v:
+  case NEON::BI__builtin_neon_vtbl3_v:
+  case NEON::BI__builtin_neon_vqtbl3_v:
+  case NEON::BI__builtin_neon_vqtbl3q_v:
+  case NEON::BI__builtin_neon_vtbl4_v:
+  case NEON::BI__builtin_neon_vqtbl4_v:
+  case NEON::BI__builtin_neon_vqtbl4q_v:
+  case NEON::BI__builtin_neon_vtbx1_v:
+  case NEON::BI__builtin_neon_vqtbx1_v:
+  case NEON::BI__builtin_neon_vqtbx1q_v:
+  case NEON::BI__builtin_neon_vtbx2_v:
+  case NEON::BI__builtin_neon_vqtbx2_v:
+  case NEON::BI__builtin_neon_vqtbx2q_v:
+  case NEON::BI__builtin_neon_vtbx3_v:
+  case NEON::BI__builtin_neon_vqtbx3_v:
+  case NEON::BI__builtin_neon_vqtbx3q_v:
+  case NEON::BI__builtin_neon_vtbx4_v:
+  case NEON::BI__builtin_neon_vqtbx4_v:
+  case NEON::BI__builtin_neon_vqtbx4q_v:
     break;
   }
 
@@ -2752,35 +3589,31 @@
     Ops.push_back(CGF.EmitScalarExpr(E->getArg(i)));
   }
 
-  Arg = E->getArg(TblPos);
-  llvm::Type *TblTy = CGF.ConvertType(Arg->getType());
-  llvm::VectorType *VTblTy = cast<llvm::VectorType>(TblTy);
-  llvm::Type *Tys[2] = { Ty, VTblTy };
   unsigned nElts = VTy->getNumElements();

   // AArch64 scalar builtins are not overloaded; they do not have an extra
   // argument specifying the vector type, so each case must be handled
   // individually.
   SmallVector<Value *, 2> TblOps;
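   // packTBLDVectorList concatenates the 64-bit table registers pairwise into
   // 128-bit vectors (padding an odd count with a zero table), so the
   // d-register tbl/tbx forms below lower to intrinsics taking half as many
   // table operands.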
   switch (BuiltinID) {
-  case AArch64::BI__builtin_neon_vtbl1_v: {
+  case NEON::BI__builtin_neon_vtbl1_v: {
     TblOps.push_back(Ops[0]);
     return packTBLDVectorList(CGF, TblOps, 0, Ops[1], Ty,
                               Intrinsic::aarch64_neon_vtbl1, "vtbl1");
   }
-  case AArch64::BI__builtin_neon_vtbl2_v: {
+  case NEON::BI__builtin_neon_vtbl2_v: {
     TblOps.push_back(Ops[0]);
     TblOps.push_back(Ops[1]);
     return packTBLDVectorList(CGF, TblOps, 0, Ops[2], Ty,
                               Intrinsic::aarch64_neon_vtbl1, "vtbl1");
   }
-  case AArch64::BI__builtin_neon_vtbl3_v: {
+  case NEON::BI__builtin_neon_vtbl3_v: {
     TblOps.push_back(Ops[0]);
     TblOps.push_back(Ops[1]);
     TblOps.push_back(Ops[2]);
     return packTBLDVectorList(CGF, TblOps, 0, Ops[3], Ty,
                               Intrinsic::aarch64_neon_vtbl2, "vtbl2");
   }
-  case AArch64::BI__builtin_neon_vtbl4_v: {
+  case NEON::BI__builtin_neon_vtbl4_v: {
     TblOps.push_back(Ops[0]);
     TblOps.push_back(Ops[1]);
     TblOps.push_back(Ops[2]);
@@ -2788,7 +3621,7 @@
     return packTBLDVectorList(CGF, TblOps, 0, Ops[4], Ty,
                               Intrinsic::aarch64_neon_vtbl2, "vtbl2");
   }
-  case AArch64::BI__builtin_neon_vtbx1_v: {
+  case NEON::BI__builtin_neon_vtbx1_v: {
     TblOps.push_back(Ops[1]);
     Value *TblRes = packTBLDVectorList(CGF, TblOps, 0, Ops[2], Ty,
                                     Intrinsic::aarch64_neon_vtbl1, "vtbl1");
@@ -2805,13 +3638,13 @@
     Function *BslF = CGF.CGM.getIntrinsic(Intrinsic::arm_neon_vbsl, Ty);
     return CGF.EmitNeonCall(BslF, BslOps, "vbsl");
   }
-  case AArch64::BI__builtin_neon_vtbx2_v: {
+  case NEON::BI__builtin_neon_vtbx2_v: {
     TblOps.push_back(Ops[1]);
     TblOps.push_back(Ops[2]);
     return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[3], Ty,
                               Intrinsic::aarch64_neon_vtbx1, "vtbx1");
   }
-  case AArch64::BI__builtin_neon_vtbx3_v: {
+  case NEON::BI__builtin_neon_vtbx3_v: {
     TblOps.push_back(Ops[1]);
     TblOps.push_back(Ops[2]);
     TblOps.push_back(Ops[3]);
@@ -2831,7 +3664,7 @@
     Function *BslF = CGF.CGM.getIntrinsic(Intrinsic::arm_neon_vbsl, Ty);
     return CGF.EmitNeonCall(BslF, BslOps, "vbsl");
   }
-  case AArch64::BI__builtin_neon_vtbx4_v: {
+  case NEON::BI__builtin_neon_vtbx4_v: {
     TblOps.push_back(Ops[1]);
     TblOps.push_back(Ops[2]);
     TblOps.push_back(Ops[3]);
@@ -2839,29 +3672,29 @@
     return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[5], Ty,
                               Intrinsic::aarch64_neon_vtbx2, "vtbx2");
   }
-  case AArch64::BI__builtin_neon_vqtbl1_v:
-  case AArch64::BI__builtin_neon_vqtbl1q_v:
+  case NEON::BI__builtin_neon_vqtbl1_v:
+  case NEON::BI__builtin_neon_vqtbl1q_v:
     Int = Intrinsic::aarch64_neon_vtbl1; s = "vtbl1"; break;
-  case AArch64::BI__builtin_neon_vqtbl2_v:
-  case AArch64::BI__builtin_neon_vqtbl2q_v:
+  case NEON::BI__builtin_neon_vqtbl2_v:
+  case NEON::BI__builtin_neon_vqtbl2q_v:
     Int = Intrinsic::aarch64_neon_vtbl2; s = "vtbl2"; break;
-  case AArch64::BI__builtin_neon_vqtbl3_v:
-  case AArch64::BI__builtin_neon_vqtbl3q_v:
+  case NEON::BI__builtin_neon_vqtbl3_v:
+  case NEON::BI__builtin_neon_vqtbl3q_v:
     Int = Intrinsic::aarch64_neon_vtbl3; s = "vtbl3"; break;
-  case AArch64::BI__builtin_neon_vqtbl4_v:
-  case AArch64::BI__builtin_neon_vqtbl4q_v:
+  case NEON::BI__builtin_neon_vqtbl4_v:
+  case NEON::BI__builtin_neon_vqtbl4q_v:
     Int = Intrinsic::aarch64_neon_vtbl4; s = "vtbl4"; break;
-  case AArch64::BI__builtin_neon_vqtbx1_v:
-  case AArch64::BI__builtin_neon_vqtbx1q_v:
+  case NEON::BI__builtin_neon_vqtbx1_v:
+  case NEON::BI__builtin_neon_vqtbx1q_v:
     Int = Intrinsic::aarch64_neon_vtbx1; s = "vtbx1"; break;
-  case AArch64::BI__builtin_neon_vqtbx2_v:
-  case AArch64::BI__builtin_neon_vqtbx2q_v:
+  case NEON::BI__builtin_neon_vqtbx2_v:
+  case NEON::BI__builtin_neon_vqtbx2q_v:
     Int = Intrinsic::aarch64_neon_vtbx2; s = "vtbx2"; break;
-  case AArch64::BI__builtin_neon_vqtbx3_v:
-  case AArch64::BI__builtin_neon_vqtbx3q_v:
+  case NEON::BI__builtin_neon_vqtbx3_v:
+  case NEON::BI__builtin_neon_vqtbx3q_v:
     Int = Intrinsic::aarch64_neon_vtbx3; s = "vtbx3"; break;
-  case AArch64::BI__builtin_neon_vqtbx4_v:
-  case AArch64::BI__builtin_neon_vqtbx4q_v:
+  case NEON::BI__builtin_neon_vqtbx4_v:
+  case NEON::BI__builtin_neon_vqtbx4q_v:
     Int = Intrinsic::aarch64_neon_vtbx4; s = "vtbx4"; break;
   }
@@ -2869,15 +3702,23 @@
   if (!Int)
     return 0;
 
-  Function *F = CGF.CGM.getIntrinsic(Int, Tys);
+  Function *F = CGF.CGM.getIntrinsic(Int, Ty);
   return CGF.EmitNeonCall(F, Ops, s);
 }
 
 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
                                                const CallExpr *E) {
   // Process AArch64 scalar builtins
-  if (Value *Result = EmitAArch64ScalarBuiltinExpr(*this, BuiltinID, E))
+  llvm::ArrayRef<NeonIntrinsicInfo> SISDInfo(AArch64SISDIntrinsicInfo);
+  const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
+      SISDInfo, BuiltinID, AArch64SISDIntrinsicInfoProvenSorted);
+
+  if (Builtin) {
+    Value *Result = EmitAArch64ScalarBuiltinExpr(*this, *Builtin, E);
+    assert(Result && "SISD intrinsic should have been handled");
     return Result;
+  }
 
   // Process AArch64 table lookup builtins
   if (Value *Result = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E))
@@ -2899,20 +3740,60 @@
 
   SmallVector<Value *, 4> Ops;
   llvm::Value *Align = 0; // Alignment for load/store
+
+  if (BuiltinID == NEON::BI__builtin_neon_vldrq_p128) {
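+    // Load the value through an fp128 pointer, then bitcast it to the i128
+    // that poly128_t lowers to.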
+    Value *Op = EmitScalarExpr(E->getArg(0));
+    unsigned addressSpace =
+      cast<llvm::PointerType>(Op->getType())->getAddressSpace();
+    llvm::Type *Ty = llvm::Type::getFP128PtrTy(getLLVMContext(), addressSpace);
+    Op = Builder.CreateBitCast(Op, Ty);
+    Op = Builder.CreateLoad(Op);
+    Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
+    return Builder.CreateBitCast(Op, Ty);
+  }
+  if (BuiltinID == NEON::BI__builtin_neon_vstrq_p128) {
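+    // Mirror of vldrq_p128: bitcast the i128 operand to fp128 and store it
+    // through an fp128 pointer.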
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    unsigned addressSpace =
+      cast<llvm::PointerType>(Op0->getType())->getAddressSpace();
+    llvm::Type *PTy = llvm::Type::getFP128PtrTy(getLLVMContext(), addressSpace);
+    Op0 = Builder.CreateBitCast(Op0, PTy);
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    llvm::Type *Ty = llvm::Type::getFP128Ty(getLLVMContext());
+    Op1 = Builder.CreateBitCast(Op1, Ty);
+    return Builder.CreateStore(Op1, Op0);
+  }
   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
     if (i == 0) {
       switch (BuiltinID) {
-      case AArch64::BI__builtin_neon_vst1_x2_v:
-      case AArch64::BI__builtin_neon_vst1q_x2_v:
-      case AArch64::BI__builtin_neon_vst1_x3_v:
-      case AArch64::BI__builtin_neon_vst1q_x3_v:
-      case AArch64::BI__builtin_neon_vst1_x4_v:
-      case AArch64::BI__builtin_neon_vst1q_x4_v:
+      case NEON::BI__builtin_neon_vld1_v:
+      case NEON::BI__builtin_neon_vld1q_v:
+      case NEON::BI__builtin_neon_vst1_v:
+      case NEON::BI__builtin_neon_vst1q_v:
+      case NEON::BI__builtin_neon_vst2_v:
+      case NEON::BI__builtin_neon_vst2q_v:
+      case NEON::BI__builtin_neon_vst3_v:
+      case NEON::BI__builtin_neon_vst3q_v:
+      case NEON::BI__builtin_neon_vst4_v:
+      case NEON::BI__builtin_neon_vst4q_v:
+      case NEON::BI__builtin_neon_vst1_x2_v:
+      case NEON::BI__builtin_neon_vst1q_x2_v:
+      case NEON::BI__builtin_neon_vst1_x3_v:
+      case NEON::BI__builtin_neon_vst1q_x3_v:
+      case NEON::BI__builtin_neon_vst1_x4_v:
+      case NEON::BI__builtin_neon_vst1q_x4_v:
       // Handle ld1/st1 lane in this function slightly differently from ARM.
-      case AArch64::BI__builtin_neon_vld1_lane_v:
-      case AArch64::BI__builtin_neon_vld1q_lane_v:
-      case AArch64::BI__builtin_neon_vst1_lane_v:
-      case AArch64::BI__builtin_neon_vst1q_lane_v:
+      case NEON::BI__builtin_neon_vld1_lane_v:
+      case NEON::BI__builtin_neon_vld1q_lane_v:
+      case NEON::BI__builtin_neon_vst1_lane_v:
+      case NEON::BI__builtin_neon_vst1q_lane_v:
+      case NEON::BI__builtin_neon_vst2_lane_v:
+      case NEON::BI__builtin_neon_vst2q_lane_v:
+      case NEON::BI__builtin_neon_vst3_lane_v:
+      case NEON::BI__builtin_neon_vst3q_lane_v:
+      case NEON::BI__builtin_neon_vst4_lane_v:
+      case NEON::BI__builtin_neon_vst4q_lane_v:
+      case NEON::BI__builtin_neon_vld1_dup_v:
+      case NEON::BI__builtin_neon_vld1q_dup_v:
         // Get the alignment for the argument in addition to the value;
         // we'll use it later.
         std::pair<llvm::Value *, unsigned> Src =
@@ -2924,21 +3805,31 @@
     }
     if (i == 1) {
       switch (BuiltinID) {
-      case AArch64::BI__builtin_neon_vld1_x2_v:
-      case AArch64::BI__builtin_neon_vld1q_x2_v:
-      case AArch64::BI__builtin_neon_vld1_x3_v:
-      case AArch64::BI__builtin_neon_vld1q_x3_v:
-      case AArch64::BI__builtin_neon_vld1_x4_v:
-      case AArch64::BI__builtin_neon_vld1q_x4_v:
+      case NEON::BI__builtin_neon_vld2_v:
+      case NEON::BI__builtin_neon_vld2q_v:
+      case NEON::BI__builtin_neon_vld3_v:
+      case NEON::BI__builtin_neon_vld3q_v:
+      case NEON::BI__builtin_neon_vld4_v:
+      case NEON::BI__builtin_neon_vld4q_v:
+      case NEON::BI__builtin_neon_vld1_x2_v:
+      case NEON::BI__builtin_neon_vld1q_x2_v:
+      case NEON::BI__builtin_neon_vld1_x3_v:
+      case NEON::BI__builtin_neon_vld1q_x3_v:
+      case NEON::BI__builtin_neon_vld1_x4_v:
+      case NEON::BI__builtin_neon_vld1q_x4_v:
       // Handle ld1/st1 dup lane in this function slightly differently from ARM.
-      case AArch64::BI__builtin_neon_vld2_dup_v:
-      case AArch64::BI__builtin_neon_vld2q_dup_v:
-      case AArch64::BI__builtin_neon_vld3_dup_v:
-      case AArch64::BI__builtin_neon_vld3q_dup_v:
-      case AArch64::BI__builtin_neon_vld4_dup_v:
-      case AArch64::BI__builtin_neon_vld4q_dup_v:
-      case AArch64::BI__builtin_neon_vld2_lane_v:
-      case AArch64::BI__builtin_neon_vld2q_lane_v:
+      case NEON::BI__builtin_neon_vld2_dup_v:
+      case NEON::BI__builtin_neon_vld2q_dup_v:
+      case NEON::BI__builtin_neon_vld3_dup_v:
+      case NEON::BI__builtin_neon_vld3q_dup_v:
+      case NEON::BI__builtin_neon_vld4_dup_v:
+      case NEON::BI__builtin_neon_vld4q_dup_v:
+      case NEON::BI__builtin_neon_vld2_lane_v:
+      case NEON::BI__builtin_neon_vld2q_lane_v:
+      case NEON::BI__builtin_neon_vld3_lane_v:
+      case NEON::BI__builtin_neon_vld3q_lane_v:
+      case NEON::BI__builtin_neon_vld4_lane_v:
+      case NEON::BI__builtin_neon_vld4q_lane_v:
         // Get the alignment for the argument in addition to the value;
         // we'll use it later.
         std::pair<llvm::Value *, unsigned> Src =
@@ -2967,6 +3858,17 @@
   if (!Ty)
     return 0;
 
+  // Many NEON builtins have identical semantics and uses in ARM and
+  // AArch64. Emit these in a single function.
+  llvm::ArrayRef<NeonIntrinsicInfo> IntrinsicMap(ARMSIMDIntrinsicMap);
+  Builtin = findNeonIntrinsicInMap(IntrinsicMap, BuiltinID,
+                                   NEONSIMDIntrinsicsProvenSorted);
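+  // The map is kept sorted by builtin ID so findNeonIntrinsicInMap can
+  // binary-search it; the ProvenSorted flag caches a one-time verification
+  // of that ordering.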
+  if (Builtin)
+    return EmitCommonNeonBuiltinExpr(
+        Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
+        Builtin->NameHint, Builtin->TypeModifier, E, Ops, Align);
+
   unsigned Int;
   switch (BuiltinID) {
   default:
@@ -2975,208 +3877,30 @@
   // AArch64 builtins mapping to legacy ARM v7 builtins.
   // FIXME: the mapped builtins listed correspond to what has been tested
   // in aarch64-neon-intrinsics.c so far.
-  case AArch64::BI__builtin_neon_vuzp_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vuzp_v, E);
-  case AArch64::BI__builtin_neon_vuzpq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vuzpq_v, E);
-  case AArch64::BI__builtin_neon_vzip_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vzip_v, E);
-  case AArch64::BI__builtin_neon_vzipq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vzipq_v, E);
-  case AArch64::BI__builtin_neon_vtrn_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vtrn_v, E);
-  case AArch64::BI__builtin_neon_vtrnq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vtrnq_v, E);
-  case AArch64::BI__builtin_neon_vext_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vext_v, E);
-  case AArch64::BI__builtin_neon_vextq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vextq_v, E);
-  case AArch64::BI__builtin_neon_vmul_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmul_v, E);
-  case AArch64::BI__builtin_neon_vmulq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmulq_v, E);
-  case AArch64::BI__builtin_neon_vabd_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vabd_v, E);
-  case AArch64::BI__builtin_neon_vabdq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vabdq_v, E);
-  case AArch64::BI__builtin_neon_vfma_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vfma_v, E);
-  case AArch64::BI__builtin_neon_vfmaq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vfmaq_v, E);
-  case AArch64::BI__builtin_neon_vbsl_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vbsl_v, E);
-  case AArch64::BI__builtin_neon_vbslq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vbslq_v, E);
-  case AArch64::BI__builtin_neon_vrsqrts_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrsqrts_v, E);
-  case AArch64::BI__builtin_neon_vrsqrtsq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrsqrtsq_v, E);
-  case AArch64::BI__builtin_neon_vrecps_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrecps_v, E);
-  case AArch64::BI__builtin_neon_vrecpsq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrecpsq_v, E);
-  case AArch64::BI__builtin_neon_vcale_v:
-    if (VTy->getVectorNumElements() == 1) {
-      std::swap(Ops[0], Ops[1]);
-    } else {
-      return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcale_v, E);
-    }
-  case AArch64::BI__builtin_neon_vcage_v:
-    if (VTy->getVectorNumElements() == 1) {
-      // Determine the types of this overloaded AArch64 intrinsic
-      SmallVector<llvm::Type *, 3> Tys;
-      Tys.push_back(VTy);
-      VTy = llvm::VectorType::get(DoubleTy, 1);
-      Tys.push_back(VTy);
-      Tys.push_back(VTy);
-      Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_vcage, Tys);
-      return EmitNeonCall(F, Ops, "vcage");
-    }
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcage_v, E);
-  case AArch64::BI__builtin_neon_vcaleq_v:
-    std::swap(Ops[0], Ops[1]);
-  case AArch64::BI__builtin_neon_vcageq_v: {
-    Function *F;
-    if (VTy->getElementType()->isIntegerTy(64))
-      F = CGM.getIntrinsic(Intrinsic::aarch64_neon_vacgeq);
-    else
-      F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgeq);
-    return EmitNeonCall(F, Ops, "vcage");
-  }
-  case AArch64::BI__builtin_neon_vcalt_v:
-    if (VTy->getVectorNumElements() == 1) {
-      std::swap(Ops[0], Ops[1]);
-    } else {
-      return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcalt_v, E);
-    }
-  case AArch64::BI__builtin_neon_vcagt_v:
-    if (VTy->getVectorNumElements() == 1) {
-      // Determine the types of this overloaded AArch64 intrinsic
-      SmallVector<llvm::Type *, 3> Tys;
-      Tys.push_back(VTy);
-      VTy = llvm::VectorType::get(DoubleTy, 1);
-      Tys.push_back(VTy);
-      Tys.push_back(VTy);
-      Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_vcagt, Tys);
-      return EmitNeonCall(F, Ops, "vcagt");
-    }
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcagt_v, E);
-  case AArch64::BI__builtin_neon_vcaltq_v:
-    std::swap(Ops[0], Ops[1]);
-  case AArch64::BI__builtin_neon_vcagtq_v: {
-    Function *F;
-    if (VTy->getElementType()->isIntegerTy(64))
-      F = CGM.getIntrinsic(Intrinsic::aarch64_neon_vacgtq);
-    else
-      F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgtq);
-    return EmitNeonCall(F, Ops, "vcagt");
-  }
-  case AArch64::BI__builtin_neon_vtst_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vtst_v, E);
-  case AArch64::BI__builtin_neon_vtstq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vtstq_v, E);
-  case AArch64::BI__builtin_neon_vhadd_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhadd_v, E);
-  case AArch64::BI__builtin_neon_vhaddq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhaddq_v, E);
-  case AArch64::BI__builtin_neon_vhsub_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhsub_v, E);
-  case AArch64::BI__builtin_neon_vhsubq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vhsubq_v, E);
-  case AArch64::BI__builtin_neon_vrhadd_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrhadd_v, E);
-  case AArch64::BI__builtin_neon_vrhaddq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrhaddq_v, E);
-  case AArch64::BI__builtin_neon_vqadd_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqadd_v, E);
-  case AArch64::BI__builtin_neon_vqaddq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqaddq_v, E);
-  case AArch64::BI__builtin_neon_vqsub_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqsub_v, E);
-  case AArch64::BI__builtin_neon_vqsubq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqsubq_v, E);
-  case AArch64::BI__builtin_neon_vshl_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshl_v, E);
-  case AArch64::BI__builtin_neon_vshlq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshlq_v, E);
-  case AArch64::BI__builtin_neon_vqshl_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqshl_v, E);
-  case AArch64::BI__builtin_neon_vqshlq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqshlq_v, E);
-  case AArch64::BI__builtin_neon_vrshl_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrshl_v, E);
-  case AArch64::BI__builtin_neon_vrshlq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrshlq_v, E);
-  case AArch64::BI__builtin_neon_vqrshl_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrshl_v, E);
-  case AArch64::BI__builtin_neon_vqrshlq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrshlq_v, E);
-  case AArch64::BI__builtin_neon_vaddhn_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vaddhn_v, E);
-  case AArch64::BI__builtin_neon_vraddhn_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vraddhn_v, E);
-  case AArch64::BI__builtin_neon_vsubhn_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vsubhn_v, E);
-  case AArch64::BI__builtin_neon_vrsubhn_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrsubhn_v, E);
-  case AArch64::BI__builtin_neon_vmull_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmull_v, E);
-  case AArch64::BI__builtin_neon_vqdmull_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmull_v, E);
-  case AArch64::BI__builtin_neon_vqdmlal_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmlal_v, E);
-  case AArch64::BI__builtin_neon_vqdmlsl_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmlsl_v, E);
-  case AArch64::BI__builtin_neon_vmax_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmax_v, E);
-  case AArch64::BI__builtin_neon_vmaxq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmaxq_v, E);
-  case AArch64::BI__builtin_neon_vmin_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmin_v, E);
-  case AArch64::BI__builtin_neon_vminq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vminq_v, E);
-  case AArch64::BI__builtin_neon_vpmax_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpmax_v, E);
-  case AArch64::BI__builtin_neon_vpmin_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpmin_v, E);
-  case AArch64::BI__builtin_neon_vpadd_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpadd_v, E);
-  case AArch64::BI__builtin_neon_vqdmulh_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmulh_v, E);
-  case AArch64::BI__builtin_neon_vqdmulhq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmulhq_v, E);
-  case AArch64::BI__builtin_neon_vqrdmulh_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrdmulh_v, E);
-  case AArch64::BI__builtin_neon_vqrdmulhq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrdmulhq_v, E);
 
   // Shift by immediate
-  case AArch64::BI__builtin_neon_vshr_n_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshr_n_v, E);
-  case AArch64::BI__builtin_neon_vshrq_n_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshrq_n_v, E);
-  case AArch64::BI__builtin_neon_vrshr_n_v:
-  case AArch64::BI__builtin_neon_vrshrq_n_v:
+  case NEON::BI__builtin_neon_vrshr_n_v:
+  case NEON::BI__builtin_neon_vrshrq_n_v:
     Int = usgn ? Intrinsic::aarch64_neon_vurshr
                : Intrinsic::aarch64_neon_vsrshr;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n");
-  case AArch64::BI__builtin_neon_vsra_n_v:
+  case NEON::BI__builtin_neon_vsra_n_v:
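     // Only the 64-bit element form needs the AArch64 shift-and-accumulate
     // intrinsics; other element types reuse the ARM lowering.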
     if (VTy->getElementType()->isIntegerTy(64)) {
       Int = usgn ? Intrinsic::aarch64_neon_vsradu_n
                  : Intrinsic::aarch64_neon_vsrads_n;
       return EmitNeonCall(CGM.getIntrinsic(Int), Ops, "vsra_n");
     }
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vsra_n_v, E);
-  case AArch64::BI__builtin_neon_vsraq_n_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vsraq_n_v, E);
-  case AArch64::BI__builtin_neon_vrsra_n_v:
+    return EmitARMBuiltinExpr(NEON::BI__builtin_neon_vsra_n_v, E);
+  case NEON::BI__builtin_neon_vsraq_n_v:
+    return EmitARMBuiltinExpr(NEON::BI__builtin_neon_vsraq_n_v, E);
+  case NEON::BI__builtin_neon_vrsra_n_v:
     if (VTy->getElementType()->isIntegerTy(64)) {
       Int = usgn ? Intrinsic::aarch64_neon_vrsradu_n
                  : Intrinsic::aarch64_neon_vrsrads_n;
       return EmitNeonCall(CGM.getIntrinsic(Int), Ops, "vrsra_n");
     }
     // fall through
-  case AArch64::BI__builtin_neon_vrsraq_n_v: {
+  case NEON::BI__builtin_neon_vrsraq_n_v: {
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Int = usgn ? Intrinsic::aarch64_neon_vurshr
@@ -3184,73 +3908,39 @@
     Ops[1] = Builder.CreateCall2(CGM.getIntrinsic(Int, Ty), Ops[1], Ops[2]);
     return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
   }
-  case AArch64::BI__builtin_neon_vshl_n_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshl_n_v, E);
-  case AArch64::BI__builtin_neon_vshlq_n_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vshlq_n_v, E);
-  case AArch64::BI__builtin_neon_vqshl_n_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqshl_n_v, E);
-  case AArch64::BI__builtin_neon_vqshlq_n_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqshlq_n_v, E);
-  case AArch64::BI__builtin_neon_vqshlu_n_v:
-  case AArch64::BI__builtin_neon_vqshluq_n_v:
+  case NEON::BI__builtin_neon_vqshlu_n_v:
+  case NEON::BI__builtin_neon_vqshluq_n_v:
     Int = Intrinsic::aarch64_neon_vsqshlu;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n");
-  case AArch64::BI__builtin_neon_vsri_n_v:
-  case AArch64::BI__builtin_neon_vsriq_n_v:
+  case NEON::BI__builtin_neon_vsri_n_v:
+  case NEON::BI__builtin_neon_vsriq_n_v:
     Int = Intrinsic::aarch64_neon_vsri;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsri_n");
-  case AArch64::BI__builtin_neon_vsli_n_v:
-  case AArch64::BI__builtin_neon_vsliq_n_v:
+  case NEON::BI__builtin_neon_vsli_n_v:
+  case NEON::BI__builtin_neon_vsliq_n_v:
     Int = Intrinsic::aarch64_neon_vsli;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsli_n");
-  case AArch64::BI__builtin_neon_vshll_n_v: {
-    llvm::Type *SrcTy = llvm::VectorType::getTruncatedElementVectorType(VTy);
-    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
-    if (usgn)
-      Ops[0] = Builder.CreateZExt(Ops[0], VTy);
-    else
-      Ops[0] = Builder.CreateSExt(Ops[0], VTy);
-    Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false);
-    return Builder.CreateShl(Ops[0], Ops[1], "vshll_n");
-  }
-  case AArch64::BI__builtin_neon_vshrn_n_v: {
-    llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy);
-    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
-    Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false);
-    if (usgn)
-      Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]);
-    else
-      Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]);
-    return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n");
-  }
-  case AArch64::BI__builtin_neon_vqshrun_n_v:
+  case NEON::BI__builtin_neon_vqshrun_n_v:
     Int = Intrinsic::aarch64_neon_vsqshrun;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
-  case AArch64::BI__builtin_neon_vrshrn_n_v:
+  case NEON::BI__builtin_neon_vrshrn_n_v:
     Int = Intrinsic::aarch64_neon_vrshrn;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
-  case AArch64::BI__builtin_neon_vqrshrun_n_v:
+  case NEON::BI__builtin_neon_vqrshrun_n_v:
     Int = Intrinsic::aarch64_neon_vsqrshrun;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
-  case AArch64::BI__builtin_neon_vqshrn_n_v:
+  case NEON::BI__builtin_neon_vqshrn_n_v:
     Int = usgn ? Intrinsic::aarch64_neon_vuqshrn
                : Intrinsic::aarch64_neon_vsqshrn;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
-  case AArch64::BI__builtin_neon_vqrshrn_n_v:
+  case NEON::BI__builtin_neon_vqrshrn_n_v:
     Int = usgn ? Intrinsic::aarch64_neon_vuqrshrn
                : Intrinsic::aarch64_neon_vsqrshrn;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
 
   // Convert
-  case AArch64::BI__builtin_neon_vmovl_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmovl_v, E);
-  case AArch64::BI__builtin_neon_vcvt_n_f32_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_n_f32_v, E);
-  case AArch64::BI__builtin_neon_vcvtq_n_f32_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvtq_n_f32_v, E);
-  case AArch64::BI__builtin_neon_vcvt_n_f64_v:
-  case AArch64::BI__builtin_neon_vcvtq_n_f64_v: {
+  case NEON::BI__builtin_neon_vcvt_n_f64_v:
+  case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
     llvm::Type *FloatTy =
         GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
     llvm::Type *Tys[2] = { FloatTy, Ty };
@@ -3259,78 +3949,26 @@
     Function *F = CGM.getIntrinsic(Int, Tys);
     return EmitNeonCall(F, Ops, "vcvt_n");
   }
-  case AArch64::BI__builtin_neon_vcvt_n_s32_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_n_s32_v, E);
-  case AArch64::BI__builtin_neon_vcvtq_n_s32_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvtq_n_s32_v, E);
-  case AArch64::BI__builtin_neon_vcvt_n_u32_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_n_u32_v, E);
-  case AArch64::BI__builtin_neon_vcvtq_n_u32_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvtq_n_u32_v, E);
-  case AArch64::BI__builtin_neon_vcvt_n_s64_v:
-  case AArch64::BI__builtin_neon_vcvt_n_u64_v:
-  case AArch64::BI__builtin_neon_vcvtq_n_s64_v:
-  case AArch64::BI__builtin_neon_vcvtq_n_u64_v: {
-    llvm::Type *FloatTy =
-        GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
-    llvm::Type *Tys[2] = { Ty, FloatTy };
-    Int = usgn ? Intrinsic::arm_neon_vcvtfp2fxu
-               : Intrinsic::arm_neon_vcvtfp2fxs;
-    Function *F = CGM.getIntrinsic(Int, Tys);
-    return EmitNeonCall(F, Ops, "vcvt_n");
-  }
 
   // Load/Store
-  case AArch64::BI__builtin_neon_vld1_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld1_v, E);
-  case AArch64::BI__builtin_neon_vld1q_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld1q_v, E);
-  case AArch64::BI__builtin_neon_vld2_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld2_v, E);
-  case AArch64::BI__builtin_neon_vld2q_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld2q_v, E);
-  case AArch64::BI__builtin_neon_vld3_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld3_v, E);
-  case AArch64::BI__builtin_neon_vld3q_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld3q_v, E);
-  case AArch64::BI__builtin_neon_vld4_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld4_v, E);
-  case AArch64::BI__builtin_neon_vld4q_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld4q_v, E);
-  case AArch64::BI__builtin_neon_vst1_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst1_v, E);
-  case AArch64::BI__builtin_neon_vst1q_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst1q_v, E);
-  case AArch64::BI__builtin_neon_vst2_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst2_v, E);
-  case AArch64::BI__builtin_neon_vst2q_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst2q_v, E);
-  case AArch64::BI__builtin_neon_vst3_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst3_v, E);
-  case AArch64::BI__builtin_neon_vst3q_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst3q_v, E);
-  case AArch64::BI__builtin_neon_vst4_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst4_v, E);
-  case AArch64::BI__builtin_neon_vst4q_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst4q_v, E);
-  case AArch64::BI__builtin_neon_vld1_x2_v:
-  case AArch64::BI__builtin_neon_vld1q_x2_v:
-  case AArch64::BI__builtin_neon_vld1_x3_v:
-  case AArch64::BI__builtin_neon_vld1q_x3_v:
-  case AArch64::BI__builtin_neon_vld1_x4_v:
-  case AArch64::BI__builtin_neon_vld1q_x4_v: {
+  case NEON::BI__builtin_neon_vld1_x2_v:
+  case NEON::BI__builtin_neon_vld1q_x2_v:
+  case NEON::BI__builtin_neon_vld1_x3_v:
+  case NEON::BI__builtin_neon_vld1q_x3_v:
+  case NEON::BI__builtin_neon_vld1_x4_v:
+  case NEON::BI__builtin_neon_vld1q_x4_v: {
     unsigned Int;
     switch (BuiltinID) {
-    case AArch64::BI__builtin_neon_vld1_x2_v:
-    case AArch64::BI__builtin_neon_vld1q_x2_v:
+    case NEON::BI__builtin_neon_vld1_x2_v:
+    case NEON::BI__builtin_neon_vld1q_x2_v:
       Int = Intrinsic::aarch64_neon_vld1x2;
       break;
-    case AArch64::BI__builtin_neon_vld1_x3_v:
-    case AArch64::BI__builtin_neon_vld1q_x3_v:
+    case NEON::BI__builtin_neon_vld1_x3_v:
+    case NEON::BI__builtin_neon_vld1q_x3_v:
       Int = Intrinsic::aarch64_neon_vld1x3;
       break;
-    case AArch64::BI__builtin_neon_vld1_x4_v:
-    case AArch64::BI__builtin_neon_vld1q_x4_v:
+    case NEON::BI__builtin_neon_vld1_x4_v:
+    case NEON::BI__builtin_neon_vld1q_x4_v:
       Int = Intrinsic::aarch64_neon_vld1x4;
       break;
     }
@@ -3340,32 +3978,32 @@
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     return Builder.CreateStore(Ops[1], Ops[0]);
   }
-  case AArch64::BI__builtin_neon_vst1_x2_v:
-  case AArch64::BI__builtin_neon_vst1q_x2_v:
-  case AArch64::BI__builtin_neon_vst1_x3_v:
-  case AArch64::BI__builtin_neon_vst1q_x3_v:
-  case AArch64::BI__builtin_neon_vst1_x4_v:
-  case AArch64::BI__builtin_neon_vst1q_x4_v: {
+  case NEON::BI__builtin_neon_vst1_x2_v:
+  case NEON::BI__builtin_neon_vst1q_x2_v:
+  case NEON::BI__builtin_neon_vst1_x3_v:
+  case NEON::BI__builtin_neon_vst1q_x3_v:
+  case NEON::BI__builtin_neon_vst1_x4_v:
+  case NEON::BI__builtin_neon_vst1q_x4_v: {
     Ops.push_back(Align);
     unsigned Int;
     switch (BuiltinID) {
-    case AArch64::BI__builtin_neon_vst1_x2_v:
-    case AArch64::BI__builtin_neon_vst1q_x2_v:
+    case NEON::BI__builtin_neon_vst1_x2_v:
+    case NEON::BI__builtin_neon_vst1q_x2_v:
       Int = Intrinsic::aarch64_neon_vst1x2;
       break;
-    case AArch64::BI__builtin_neon_vst1_x3_v:
-    case AArch64::BI__builtin_neon_vst1q_x3_v:
+    case NEON::BI__builtin_neon_vst1_x3_v:
+    case NEON::BI__builtin_neon_vst1q_x3_v:
       Int = Intrinsic::aarch64_neon_vst1x3;
       break;
-    case AArch64::BI__builtin_neon_vst1_x4_v:
-    case AArch64::BI__builtin_neon_vst1q_x4_v:
+    case NEON::BI__builtin_neon_vst1_x4_v:
+    case NEON::BI__builtin_neon_vst1q_x4_v:
       Int = Intrinsic::aarch64_neon_vst1x4;
       break;
     }
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "");
   }
-  case AArch64::BI__builtin_neon_vld1_lane_v:
-  case AArch64::BI__builtin_neon_vld1q_lane_v: {
+  case NEON::BI__builtin_neon_vld1_lane_v:
+  case NEON::BI__builtin_neon_vld1q_lane_v: {
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
@@ -3373,20 +4011,8 @@
     Ld->setAlignment(cast<ConstantInt>(Align)->getZExtValue());
     return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
   }
-  case AArch64::BI__builtin_neon_vld2_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld2q_lane_v, E);
-  case AArch64::BI__builtin_neon_vld2q_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld2q_lane_v, E);
-  case AArch64::BI__builtin_neon_vld3_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld3_lane_v, E);
-  case AArch64::BI__builtin_neon_vld3q_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld3q_lane_v, E);
-  case AArch64::BI__builtin_neon_vld4_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld4_lane_v, E);
-  case AArch64::BI__builtin_neon_vld4q_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld4q_lane_v, E);
-  case AArch64::BI__builtin_neon_vst1_lane_v:
-  case AArch64::BI__builtin_neon_vst1q_lane_v: {
+  case NEON::BI__builtin_neon_vst1_lane_v:
+  case NEON::BI__builtin_neon_vst1q_lane_v: {
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
@@ -3395,39 +4021,23 @@
     St->setAlignment(cast<ConstantInt>(Align)->getZExtValue());
     return St;
   }
-  case AArch64::BI__builtin_neon_vst2_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst2_lane_v, E);
-  case AArch64::BI__builtin_neon_vst2q_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst2q_lane_v, E);
-  case AArch64::BI__builtin_neon_vst3_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst3_lane_v, E);
-  case AArch64::BI__builtin_neon_vst3q_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst3q_lane_v, E);
-  case AArch64::BI__builtin_neon_vst4_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst4_lane_v, E);
-  case AArch64::BI__builtin_neon_vst4q_lane_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vst4q_lane_v, E);
-  case AArch64::BI__builtin_neon_vld1_dup_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld1_dup_v, E);
-  case AArch64::BI__builtin_neon_vld1q_dup_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vld1q_dup_v, E);
-  case AArch64::BI__builtin_neon_vld2_dup_v:
-  case AArch64::BI__builtin_neon_vld2q_dup_v:
-  case AArch64::BI__builtin_neon_vld3_dup_v:
-  case AArch64::BI__builtin_neon_vld3q_dup_v:
-  case AArch64::BI__builtin_neon_vld4_dup_v:
-  case AArch64::BI__builtin_neon_vld4q_dup_v: {
+  case NEON::BI__builtin_neon_vld2_dup_v:
+  case NEON::BI__builtin_neon_vld2q_dup_v:
+  case NEON::BI__builtin_neon_vld3_dup_v:
+  case NEON::BI__builtin_neon_vld3q_dup_v:
+  case NEON::BI__builtin_neon_vld4_dup_v:
+  case NEON::BI__builtin_neon_vld4q_dup_v: {
     // Handle 64-bit x 1 elements as a special case: no "dup" is needed.
     if (VTy->getElementType()->getPrimitiveSizeInBits() == 64 &&
         VTy->getNumElements() == 1) {
       switch (BuiltinID) {
-      case AArch64::BI__builtin_neon_vld2_dup_v:
+      case NEON::BI__builtin_neon_vld2_dup_v:
         Int = Intrinsic::arm_neon_vld2;
         break;
-      case AArch64::BI__builtin_neon_vld3_dup_v:
+      case NEON::BI__builtin_neon_vld3_dup_v:
         Int = Intrinsic::arm_neon_vld3;
         break;
-      case AArch64::BI__builtin_neon_vld4_dup_v:
+      case NEON::BI__builtin_neon_vld4_dup_v:
         Int = Intrinsic::arm_neon_vld4;
         break;
       default:
@@ -3440,16 +4050,16 @@
       return Builder.CreateStore(Ops[1], Ops[0]);
     }
     switch (BuiltinID) {
-    case AArch64::BI__builtin_neon_vld2_dup_v:
-    case AArch64::BI__builtin_neon_vld2q_dup_v:
+    case NEON::BI__builtin_neon_vld2_dup_v:
+    case NEON::BI__builtin_neon_vld2q_dup_v:
       Int = Intrinsic::arm_neon_vld2lane;
       break;
-    case AArch64::BI__builtin_neon_vld3_dup_v:
-    case AArch64::BI__builtin_neon_vld3q_dup_v:
+    case NEON::BI__builtin_neon_vld3_dup_v:
+    case NEON::BI__builtin_neon_vld3q_dup_v:
       Int = Intrinsic::arm_neon_vld3lane;
       break;
-    case AArch64::BI__builtin_neon_vld4_dup_v:
-    case AArch64::BI__builtin_neon_vld4q_dup_v:
+    case NEON::BI__builtin_neon_vld4_dup_v:
+    case NEON::BI__builtin_neon_vld4q_dup_v:
       Int = Intrinsic::arm_neon_vld4lane;
       break;
     }
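     // vldN_dup is emitted as a single-lane vldNlane; lane 0 of each result
     // is then splatted across the full vector.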
@@ -3478,42 +4088,11 @@
     return Builder.CreateStore(Ops[1], Ops[0]);
   }
 
-  // Crypto
-  case AArch64::BI__builtin_neon_vaeseq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aese, Ty),
-                        Ops, "aese");
-  case AArch64::BI__builtin_neon_vaesdq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aesd, Ty),
-                        Ops, "aesd");
-  case AArch64::BI__builtin_neon_vaesmcq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aesmc, Ty),
-                        Ops, "aesmc");
-  case AArch64::BI__builtin_neon_vaesimcq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_aesimc, Ty),
-                        Ops, "aesimc");
-  case AArch64::BI__builtin_neon_vsha1su1q_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1su1, Ty),
-                        Ops, "sha1su1");
-  case AArch64::BI__builtin_neon_vsha256su0q_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256su0, Ty),
-                        Ops, "sha256su0");
-  case AArch64::BI__builtin_neon_vsha1su0q_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1su0, Ty),
-                        Ops, "sha1su0");
-  case AArch64::BI__builtin_neon_vsha256hq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256h, Ty),
-                        Ops, "sha256h");
-  case AArch64::BI__builtin_neon_vsha256h2q_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256h2, Ty),
-                        Ops, "sha256h2");
-  case AArch64::BI__builtin_neon_vsha256su1q_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha256su1, Ty),
-                        Ops, "sha256su1");
-  case AArch64::BI__builtin_neon_vmul_lane_v:
-  case AArch64::BI__builtin_neon_vmul_laneq_v: {
+  case NEON::BI__builtin_neon_vmul_lane_v:
+  case NEON::BI__builtin_neon_vmul_laneq_v: {
     // v1f64 vmul_lane should be mapped to Neon scalar mul lane
     bool Quad = false;
-    if (BuiltinID == AArch64::BI__builtin_neon_vmul_laneq_v)
+    if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
       Quad = true;
     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
     llvm::Type *VTy = GetNeonType(this,
@@ -3525,7 +4104,7 @@
   }
 
   // AArch64-only builtins
-  case AArch64::BI__builtin_neon_vfmaq_laneq_v: {
+  case NEON::BI__builtin_neon_vfmaq_laneq_v: {
     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
@@ -3534,7 +4113,7 @@
     Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
     return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
   }
-  case AArch64::BI__builtin_neon_vfmaq_lane_v: {
+  case NEON::BI__builtin_neon_vfmaq_lane_v: {
     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
@@ -3549,7 +4128,7 @@
 
     return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
   }
-  case AArch64::BI__builtin_neon_vfma_lane_v: {
+  case NEON::BI__builtin_neon_vfma_lane_v: {
     llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
     // v1f64 fma should be mapped to Neon scalar f64 fma
     if (VTy && VTy->getElementType() == DoubleTy) {
@@ -3571,7 +4150,7 @@
     Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
     return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
   }
-  case AArch64::BI__builtin_neon_vfma_laneq_v: {
+  case NEON::BI__builtin_neon_vfma_laneq_v: {
     llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
     // v1f64 fma should be mapped to Neon scalar f64 fma
     if (VTy && VTy->getElementType() == DoubleTy) {
@@ -3598,8 +4177,8 @@
 
     return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
   }
-  case AArch64::BI__builtin_neon_vfms_v:
-  case AArch64::BI__builtin_neon_vfmsq_v: {
+  case NEON::BI__builtin_neon_vfms_v:
+  case NEON::BI__builtin_neon_vfmsq_v: {
     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
@@ -3610,314 +4189,136 @@
     // AArch64 intrinsic has it first.
     return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
   }
-  case AArch64::BI__builtin_neon_vmaxnm_v:
-  case AArch64::BI__builtin_neon_vmaxnmq_v: {
+  case NEON::BI__builtin_neon_vmaxnm_v:
+  case NEON::BI__builtin_neon_vmaxnmq_v: {
     Int = Intrinsic::aarch64_neon_vmaxnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
   }
-  case AArch64::BI__builtin_neon_vminnm_v:
-  case AArch64::BI__builtin_neon_vminnmq_v: {
+  case NEON::BI__builtin_neon_vminnm_v:
+  case NEON::BI__builtin_neon_vminnmq_v: {
     Int = Intrinsic::aarch64_neon_vminnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
   }
-  case AArch64::BI__builtin_neon_vpmaxnm_v:
-  case AArch64::BI__builtin_neon_vpmaxnmq_v: {
+  case NEON::BI__builtin_neon_vpmaxnm_v:
+  case NEON::BI__builtin_neon_vpmaxnmq_v: {
     Int = Intrinsic::aarch64_neon_vpmaxnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
   }
-  case AArch64::BI__builtin_neon_vpminnm_v:
-  case AArch64::BI__builtin_neon_vpminnmq_v: {
+  case NEON::BI__builtin_neon_vpminnm_v:
+  case NEON::BI__builtin_neon_vpminnmq_v: {
     Int = Intrinsic::aarch64_neon_vpminnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
   }
-  case AArch64::BI__builtin_neon_vpmaxq_v: {
+  case NEON::BI__builtin_neon_vpmaxq_v: {
     Int = usgn ? Intrinsic::arm_neon_vpmaxu : Intrinsic::arm_neon_vpmaxs;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
   }
-  case AArch64::BI__builtin_neon_vpminq_v: {
+  case NEON::BI__builtin_neon_vpminq_v: {
     Int = usgn ? Intrinsic::arm_neon_vpminu : Intrinsic::arm_neon_vpmins;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
   }
-  case AArch64::BI__builtin_neon_vpaddq_v: {
-    Int = Intrinsic::arm_neon_vpadd;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpadd");
-  }
-  case AArch64::BI__builtin_neon_vmulx_v:
-  case AArch64::BI__builtin_neon_vmulxq_v: {
+  case NEON::BI__builtin_neon_vmulx_v:
+  case NEON::BI__builtin_neon_vmulxq_v: {
     Int = Intrinsic::aarch64_neon_vmulx;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
   }
-  case AArch64::BI__builtin_neon_vpaddl_v:
-  case AArch64::BI__builtin_neon_vpaddlq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpaddl_v, E);
-  case AArch64::BI__builtin_neon_vpadal_v:
-  case AArch64::BI__builtin_neon_vpadalq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vpadal_v, E);
-  case AArch64::BI__builtin_neon_vqabs_v:
-  case AArch64::BI__builtin_neon_vqabsq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqabs_v, E);
-  case AArch64::BI__builtin_neon_vqneg_v:
-  case AArch64::BI__builtin_neon_vqnegq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqneg_v, E);
-  case AArch64::BI__builtin_neon_vabs_v:
-  case AArch64::BI__builtin_neon_vabsq_v: {
-    if (VTy->getElementType()->isFloatingPointTy()) {
-      return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
-    }
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vabs_v, E);
-  }
-  case AArch64::BI__builtin_neon_vsqadd_v:
-  case AArch64::BI__builtin_neon_vsqaddq_v: {
+  case NEON::BI__builtin_neon_vsqadd_v:
+  case NEON::BI__builtin_neon_vsqaddq_v: {
     Int = Intrinsic::aarch64_neon_usqadd;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
   }
-  case AArch64::BI__builtin_neon_vuqadd_v:
-  case AArch64::BI__builtin_neon_vuqaddq_v: {
+  case NEON::BI__builtin_neon_vuqadd_v:
+  case NEON::BI__builtin_neon_vuqaddq_v: {
     Int = Intrinsic::aarch64_neon_suqadd;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
   }
-  case AArch64::BI__builtin_neon_vcls_v:
-  case AArch64::BI__builtin_neon_vclsq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcls_v, E);
-  case AArch64::BI__builtin_neon_vclz_v:
-  case AArch64::BI__builtin_neon_vclzq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vclz_v, E);
-  case AArch64::BI__builtin_neon_vcnt_v:
-  case AArch64::BI__builtin_neon_vcntq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcnt_v, E);
-  case AArch64::BI__builtin_neon_vrbit_v:
-  case AArch64::BI__builtin_neon_vrbitq_v:
+  case NEON::BI__builtin_neon_vrbit_v:
+  case NEON::BI__builtin_neon_vrbitq_v:
     Int = Intrinsic::aarch64_neon_rbit;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
-  case AArch64::BI__builtin_neon_vmovn_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmovn_v, E);
-  case AArch64::BI__builtin_neon_vqmovun_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqmovun_v, E);
-  case AArch64::BI__builtin_neon_vqmovn_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqmovn_v, E);
-  case AArch64::BI__builtin_neon_vcvt_f16_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_f16_v, E);
-  case AArch64::BI__builtin_neon_vcvt_f32_f16:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_f32_f16, E);
-  case AArch64::BI__builtin_neon_vcvt_f32_f64: {
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, false));
+  case NEON::BI__builtin_neon_vcvt_f32_f64: {
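+    // The operand is a quad v2f64; bitcast it before truncating to the
+    // v2f32 result.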
+    NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
+    Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
     return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
   }
-  case AArch64::BI__builtin_neon_vcvtx_f32_v: {
+  case NEON::BI__builtin_neon_vcvtx_f32_v: {
     llvm::Type *EltTy = FloatTy;
     llvm::Type *ResTy = llvm::VectorType::get(EltTy, 2);
     llvm::Type *Tys[2] = { ResTy, Ty };
-    Int = Intrinsic::aarch64_neon_fcvtxn;
+    Int = Intrinsic::aarch64_neon_vcvtxn;
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtx_f32_f64");
   }
-  case AArch64::BI__builtin_neon_vcvt_f64_f32: {
+  case NEON::BI__builtin_neon_vcvt_f64_f32: {
     llvm::Type *OpTy =
         GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, false));
     Ops[0] = Builder.CreateBitCast(Ops[0], OpTy);
     return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
   }
-  case AArch64::BI__builtin_neon_vcvt_f64_v:
-  case AArch64::BI__builtin_neon_vcvtq_f64_v: {
+  case NEON::BI__builtin_neon_vcvt_f64_v:
+  case NEON::BI__builtin_neon_vcvtq_f64_v: {
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
     return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
                 : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
   }
-  case AArch64::BI__builtin_neon_vrndn_v:
-  case AArch64::BI__builtin_neon_vrndnq_v: {
+  case NEON::BI__builtin_neon_vrndn_v:
+  case NEON::BI__builtin_neon_vrndnq_v: {
     Int = Intrinsic::aarch64_neon_frintn;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
   }
-  case AArch64::BI__builtin_neon_vrnda_v:
-  case AArch64::BI__builtin_neon_vrndaq_v: {
+  case NEON::BI__builtin_neon_vrnda_v:
+  case NEON::BI__builtin_neon_vrndaq_v: {
     Int = Intrinsic::round;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
   }
-  case AArch64::BI__builtin_neon_vrndp_v:
-  case AArch64::BI__builtin_neon_vrndpq_v: {
+  case NEON::BI__builtin_neon_vrndp_v:
+  case NEON::BI__builtin_neon_vrndpq_v: {
     Int = Intrinsic::ceil;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
   }
-  case AArch64::BI__builtin_neon_vrndm_v:
-  case AArch64::BI__builtin_neon_vrndmq_v: {
+  case NEON::BI__builtin_neon_vrndm_v:
+  case NEON::BI__builtin_neon_vrndmq_v: {
     Int = Intrinsic::floor;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
   }
-  case AArch64::BI__builtin_neon_vrndx_v:
-  case AArch64::BI__builtin_neon_vrndxq_v: {
+  case NEON::BI__builtin_neon_vrndx_v:
+  case NEON::BI__builtin_neon_vrndxq_v: {
     Int = Intrinsic::rint;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
   }
-  case AArch64::BI__builtin_neon_vrnd_v:
-  case AArch64::BI__builtin_neon_vrndq_v: {
+  case NEON::BI__builtin_neon_vrnd_v:
+  case NEON::BI__builtin_neon_vrndq_v: {
     Int = Intrinsic::trunc;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd");
   }
-  case AArch64::BI__builtin_neon_vrndi_v:
-  case AArch64::BI__builtin_neon_vrndiq_v: {
+  case NEON::BI__builtin_neon_vrndi_v:
+  case NEON::BI__builtin_neon_vrndiq_v: {
     Int = Intrinsic::nearbyint;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi");
   }
-  case AArch64::BI__builtin_neon_vcvt_s32_v:
-  case AArch64::BI__builtin_neon_vcvt_u32_v:
-  case AArch64::BI__builtin_neon_vcvtq_s32_v:
-  case AArch64::BI__builtin_neon_vcvtq_u32_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvtq_u32_v, E);
-  case AArch64::BI__builtin_neon_vcvt_s64_v:
-  case AArch64::BI__builtin_neon_vcvt_u64_v:
-  case AArch64::BI__builtin_neon_vcvtq_s64_v:
-  case AArch64::BI__builtin_neon_vcvtq_u64_v: {
-    llvm::Type *DoubleTy =
-        GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
-    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
-    return usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
-                : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
-  }
-  case AArch64::BI__builtin_neon_vcvtn_s32_v:
-  case AArch64::BI__builtin_neon_vcvtnq_s32_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtns;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtns_f32");
-  }
-  case AArch64::BI__builtin_neon_vcvtn_s64_v:
-  case AArch64::BI__builtin_neon_vcvtnq_s64_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtns;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtns_f64");
-  }
-  case AArch64::BI__builtin_neon_vcvtn_u32_v:
-  case AArch64::BI__builtin_neon_vcvtnq_u32_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtnu;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtnu_f32");
-  }
-  case AArch64::BI__builtin_neon_vcvtn_u64_v:
-  case AArch64::BI__builtin_neon_vcvtnq_u64_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtnu;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtnu_f64");
-  }
-  case AArch64::BI__builtin_neon_vcvtp_s32_v:
-  case AArch64::BI__builtin_neon_vcvtpq_s32_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtps;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtps_f32");
-  }
-  case AArch64::BI__builtin_neon_vcvtp_s64_v:
-  case AArch64::BI__builtin_neon_vcvtpq_s64_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtps;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtps_f64");
-  }
-  case AArch64::BI__builtin_neon_vcvtp_u32_v:
-  case AArch64::BI__builtin_neon_vcvtpq_u32_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtpu;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtpu_f32");
-  }
-  case AArch64::BI__builtin_neon_vcvtp_u64_v:
-  case AArch64::BI__builtin_neon_vcvtpq_u64_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtpu;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtpu_f64");
-  }
-  case AArch64::BI__builtin_neon_vcvtm_s32_v:
-  case AArch64::BI__builtin_neon_vcvtmq_s32_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtms;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtms_f32");
-  }
-  case AArch64::BI__builtin_neon_vcvtm_s64_v:
-  case AArch64::BI__builtin_neon_vcvtmq_s64_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtms;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtms_f64");
-  }
-  case AArch64::BI__builtin_neon_vcvtm_u32_v:
-  case AArch64::BI__builtin_neon_vcvtmq_u32_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtmu;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtmu_f32");
-  }
-  case AArch64::BI__builtin_neon_vcvtm_u64_v:
-  case AArch64::BI__builtin_neon_vcvtmq_u64_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtmu;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtmu_f64");
-  }
-  case AArch64::BI__builtin_neon_vcvta_s32_v:
-  case AArch64::BI__builtin_neon_vcvtaq_s32_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtas;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtas_f32");
-  }
-  case AArch64::BI__builtin_neon_vcvta_s64_v:
-  case AArch64::BI__builtin_neon_vcvtaq_s64_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtas;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtas_f64");
-  }
-  case AArch64::BI__builtin_neon_vcvta_u32_v:
-  case AArch64::BI__builtin_neon_vcvtaq_u32_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(FloatTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtau;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtau_f32");
-  }
-  case AArch64::BI__builtin_neon_vcvta_u64_v:
-  case AArch64::BI__builtin_neon_vcvtaq_u64_v: {
-    llvm::Type *OpTy = llvm::VectorType::get(DoubleTy, VTy->getNumElements());
-    llvm::Type *Tys[2] = { Ty, OpTy };
-    Int = Intrinsic::aarch64_neon_fcvtau;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtau_f64");
-  }
-  case AArch64::BI__builtin_neon_vrecpe_v:
-  case AArch64::BI__builtin_neon_vrecpeq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrecpe_v, E);
-  case AArch64::BI__builtin_neon_vrsqrte_v:
-  case AArch64::BI__builtin_neon_vrsqrteq_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrsqrte_v, E);
-  case AArch64::BI__builtin_neon_vsqrt_v:
-  case AArch64::BI__builtin_neon_vsqrtq_v: {
+  case NEON::BI__builtin_neon_vsqrt_v:
+  case NEON::BI__builtin_neon_vsqrtq_v: {
     Int = Intrinsic::sqrt;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
   }
-  case AArch64::BI__builtin_neon_vcvt_f32_v:
-  case AArch64::BI__builtin_neon_vcvtq_f32_v:
-    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vcvt_f32_v, E);
-  case AArch64::BI__builtin_neon_vceqz_v:
-  case AArch64::BI__builtin_neon_vceqzq_v:
+  case NEON::BI__builtin_neon_vceqz_v:
+  case NEON::BI__builtin_neon_vceqzq_v:
     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
                                          ICmpInst::ICMP_EQ, "vceqz");
-  case AArch64::BI__builtin_neon_vcgez_v:
-  case AArch64::BI__builtin_neon_vcgezq_v:
+  case NEON::BI__builtin_neon_vcgez_v:
+  case NEON::BI__builtin_neon_vcgezq_v:
     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
                                          ICmpInst::ICMP_SGE, "vcgez");
-  case AArch64::BI__builtin_neon_vclez_v:
-  case AArch64::BI__builtin_neon_vclezq_v:
+  case NEON::BI__builtin_neon_vclez_v:
+  case NEON::BI__builtin_neon_vclezq_v:
     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
                                          ICmpInst::ICMP_SLE, "vclez");
-  case AArch64::BI__builtin_neon_vcgtz_v:
-  case AArch64::BI__builtin_neon_vcgtzq_v:
+  case NEON::BI__builtin_neon_vcgtz_v:
+  case NEON::BI__builtin_neon_vcgtzq_v:
     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
                                          ICmpInst::ICMP_SGT, "vcgtz");
-  case AArch64::BI__builtin_neon_vcltz_v:
-  case AArch64::BI__builtin_neon_vcltzq_v:
+  case NEON::BI__builtin_neon_vcltz_v:
+  case NEON::BI__builtin_neon_vcltzq_v:
     return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
                                          ICmpInst::ICMP_SLT, "vcltz");
   }
@@ -4074,28 +4475,28 @@
   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
     if (i == 0) {
       switch (BuiltinID) {
-      case ARM::BI__builtin_neon_vld1_v:
-      case ARM::BI__builtin_neon_vld1q_v:
-      case ARM::BI__builtin_neon_vld1q_lane_v:
-      case ARM::BI__builtin_neon_vld1_lane_v:
-      case ARM::BI__builtin_neon_vld1_dup_v:
-      case ARM::BI__builtin_neon_vld1q_dup_v:
-      case ARM::BI__builtin_neon_vst1_v:
-      case ARM::BI__builtin_neon_vst1q_v:
-      case ARM::BI__builtin_neon_vst1q_lane_v:
-      case ARM::BI__builtin_neon_vst1_lane_v:
-      case ARM::BI__builtin_neon_vst2_v:
-      case ARM::BI__builtin_neon_vst2q_v:
-      case ARM::BI__builtin_neon_vst2_lane_v:
-      case ARM::BI__builtin_neon_vst2q_lane_v:
-      case ARM::BI__builtin_neon_vst3_v:
-      case ARM::BI__builtin_neon_vst3q_v:
-      case ARM::BI__builtin_neon_vst3_lane_v:
-      case ARM::BI__builtin_neon_vst3q_lane_v:
-      case ARM::BI__builtin_neon_vst4_v:
-      case ARM::BI__builtin_neon_vst4q_v:
-      case ARM::BI__builtin_neon_vst4_lane_v:
-      case ARM::BI__builtin_neon_vst4q_lane_v:
+      case NEON::BI__builtin_neon_vld1_v:
+      case NEON::BI__builtin_neon_vld1q_v:
+      case NEON::BI__builtin_neon_vld1q_lane_v:
+      case NEON::BI__builtin_neon_vld1_lane_v:
+      case NEON::BI__builtin_neon_vld1_dup_v:
+      case NEON::BI__builtin_neon_vld1q_dup_v:
+      case NEON::BI__builtin_neon_vst1_v:
+      case NEON::BI__builtin_neon_vst1q_v:
+      case NEON::BI__builtin_neon_vst1q_lane_v:
+      case NEON::BI__builtin_neon_vst1_lane_v:
+      case NEON::BI__builtin_neon_vst2_v:
+      case NEON::BI__builtin_neon_vst2q_v:
+      case NEON::BI__builtin_neon_vst2_lane_v:
+      case NEON::BI__builtin_neon_vst2q_lane_v:
+      case NEON::BI__builtin_neon_vst3_v:
+      case NEON::BI__builtin_neon_vst3q_v:
+      case NEON::BI__builtin_neon_vst3_lane_v:
+      case NEON::BI__builtin_neon_vst3q_lane_v:
+      case NEON::BI__builtin_neon_vst4_v:
+      case NEON::BI__builtin_neon_vst4q_v:
+      case NEON::BI__builtin_neon_vst4_lane_v:
+      case NEON::BI__builtin_neon_vst4q_lane_v:
         // Get the alignment for the argument in addition to the value;
         // we'll use it later.
         std::pair<llvm::Value*, unsigned> Src =
@@ -4107,21 +4508,21 @@
     }
     if (i == 1) {
       switch (BuiltinID) {
-      case ARM::BI__builtin_neon_vld2_v:
-      case ARM::BI__builtin_neon_vld2q_v:
-      case ARM::BI__builtin_neon_vld3_v:
-      case ARM::BI__builtin_neon_vld3q_v:
-      case ARM::BI__builtin_neon_vld4_v:
-      case ARM::BI__builtin_neon_vld4q_v:
-      case ARM::BI__builtin_neon_vld2_lane_v:
-      case ARM::BI__builtin_neon_vld2q_lane_v:
-      case ARM::BI__builtin_neon_vld3_lane_v:
-      case ARM::BI__builtin_neon_vld3q_lane_v:
-      case ARM::BI__builtin_neon_vld4_lane_v:
-      case ARM::BI__builtin_neon_vld4q_lane_v:
-      case ARM::BI__builtin_neon_vld2_dup_v:
-      case ARM::BI__builtin_neon_vld3_dup_v:
-      case ARM::BI__builtin_neon_vld4_dup_v:
+      case NEON::BI__builtin_neon_vld2_v:
+      case NEON::BI__builtin_neon_vld2q_v:
+      case NEON::BI__builtin_neon_vld3_v:
+      case NEON::BI__builtin_neon_vld3q_v:
+      case NEON::BI__builtin_neon_vld4_v:
+      case NEON::BI__builtin_neon_vld4q_v:
+      case NEON::BI__builtin_neon_vld2_lane_v:
+      case NEON::BI__builtin_neon_vld2q_lane_v:
+      case NEON::BI__builtin_neon_vld3_lane_v:
+      case NEON::BI__builtin_neon_vld3q_lane_v:
+      case NEON::BI__builtin_neon_vld4_lane_v:
+      case NEON::BI__builtin_neon_vld4q_lane_v:
+      case NEON::BI__builtin_neon_vld2_dup_v:
+      case NEON::BI__builtin_neon_vld3_dup_v:
+      case NEON::BI__builtin_neon_vld4_dup_v:
         // Get the alignment for the argument in addition to the value;
         // we'll use it later.
         std::pair<llvm::Value*, unsigned> Src =
@@ -4134,34 +4535,52 @@
     Ops.push_back(EmitScalarExpr(E->getArg(i)));
   }
 
-  // vget_lane and vset_lane are not overloaded and do not have an extra
-  // argument that specifies the vector type.
   switch (BuiltinID) {
   default: break;
-  case ARM::BI__builtin_neon_vget_lane_i8:
-  case ARM::BI__builtin_neon_vget_lane_i16:
-  case ARM::BI__builtin_neon_vget_lane_i32:
-  case ARM::BI__builtin_neon_vget_lane_i64:
-  case ARM::BI__builtin_neon_vget_lane_f32:
-  case ARM::BI__builtin_neon_vgetq_lane_i8:
-  case ARM::BI__builtin_neon_vgetq_lane_i16:
-  case ARM::BI__builtin_neon_vgetq_lane_i32:
-  case ARM::BI__builtin_neon_vgetq_lane_i64:
-  case ARM::BI__builtin_neon_vgetq_lane_f32:
+  // vget_lane and vset_lane are not overloaded and do not have an extra
+  // argument that specifies the vector type.
+  case NEON::BI__builtin_neon_vget_lane_i8:
+  case NEON::BI__builtin_neon_vget_lane_i16:
+  case NEON::BI__builtin_neon_vget_lane_i32:
+  case NEON::BI__builtin_neon_vget_lane_i64:
+  case NEON::BI__builtin_neon_vget_lane_f32:
+  case NEON::BI__builtin_neon_vgetq_lane_i8:
+  case NEON::BI__builtin_neon_vgetq_lane_i16:
+  case NEON::BI__builtin_neon_vgetq_lane_i32:
+  case NEON::BI__builtin_neon_vgetq_lane_i64:
+  case NEON::BI__builtin_neon_vgetq_lane_f32:
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vget_lane");
-  case ARM::BI__builtin_neon_vset_lane_i8:
-  case ARM::BI__builtin_neon_vset_lane_i16:
-  case ARM::BI__builtin_neon_vset_lane_i32:
-  case ARM::BI__builtin_neon_vset_lane_i64:
-  case ARM::BI__builtin_neon_vset_lane_f32:
-  case ARM::BI__builtin_neon_vsetq_lane_i8:
-  case ARM::BI__builtin_neon_vsetq_lane_i16:
-  case ARM::BI__builtin_neon_vsetq_lane_i32:
-  case ARM::BI__builtin_neon_vsetq_lane_i64:
-  case ARM::BI__builtin_neon_vsetq_lane_f32:
+  case NEON::BI__builtin_neon_vset_lane_i8:
+  case NEON::BI__builtin_neon_vset_lane_i16:
+  case NEON::BI__builtin_neon_vset_lane_i32:
+  case NEON::BI__builtin_neon_vset_lane_i64:
+  case NEON::BI__builtin_neon_vset_lane_f32:
+  case NEON::BI__builtin_neon_vsetq_lane_i8:
+  case NEON::BI__builtin_neon_vsetq_lane_i16:
+  case NEON::BI__builtin_neon_vsetq_lane_i32:
+  case NEON::BI__builtin_neon_vsetq_lane_i64:
+  case NEON::BI__builtin_neon_vsetq_lane_f32:
     Ops.push_back(EmitScalarExpr(E->getArg(2)));
     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
+
+  // Non-polymorphic crypto instructions are also not overloaded.
+  case NEON::BI__builtin_neon_vsha1h_u32:
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops,
+                        "vsha1h");
+  case NEON::BI__builtin_neon_vsha1cq_u32:
+    Ops.push_back(EmitScalarExpr(E->getArg(2)));
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops,
+                        "vsha1c");
+  case NEON::BI__builtin_neon_vsha1pq_u32:
+    Ops.push_back(EmitScalarExpr(E->getArg(2)));
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops,
+                        "vsha1p");
+  case NEON::BI__builtin_neon_vsha1mq_u32:
+    Ops.push_back(EmitScalarExpr(E->getArg(2)));
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
+                        "vsha1m");
   }
 
   // Get the last argument, which specifies the vector type.
@@ -4191,7 +4610,6 @@
   // Determine the type of this overloaded NEON intrinsic.
   NeonTypeFlags Type(Result.getZExtValue());
   bool usgn = Type.isUnsigned();
-  bool quad = Type.isQuad();
   bool rightShift = false;
 
   llvm::VectorType *VTy = GetNeonType(this, Type);
@@ -4199,158 +4617,20 @@
   if (!Ty)
     return 0;
 
+  // Many NEON builtins have identical semantics and uses in ARM and
+  // AArch64. Emit these in a single function.
+  llvm::ArrayRef<NeonIntrinsicInfo> IntrinsicMap(ARMSIMDIntrinsicMap);
+  const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
+      IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
+  if (Builtin)
+    return EmitCommonNeonBuiltinExpr(
+        Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
+        Builtin->NameHint, Builtin->TypeModifier, E, Ops, Align);
+
   unsigned Int;
   switch (BuiltinID) {
   default: return 0;
-  case ARM::BI__builtin_neon_vbsl_v:
-  case ARM::BI__builtin_neon_vbslq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vbsl, Ty),
-                        Ops, "vbsl");
-  case ARM::BI__builtin_neon_vabd_v:
-  case ARM::BI__builtin_neon_vabdq_v:
-    Int = usgn ? Intrinsic::arm_neon_vabdu : Intrinsic::arm_neon_vabds;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
-  case ARM::BI__builtin_neon_vabs_v:
-  case ARM::BI__builtin_neon_vabsq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vabs, Ty),
-                        Ops, "vabs");
-  case ARM::BI__builtin_neon_vaddhn_v: {
-    llvm::VectorType *SrcTy =
-        llvm::VectorType::getExtendedElementVectorType(VTy);
-
-    // %sum = add <4 x i32> %lhs, %rhs
-    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
-    Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
-    Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn");
-
-    // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
-    Constant *ShiftAmt = ConstantInt::get(SrcTy->getElementType(),
-                                       SrcTy->getScalarSizeInBits() / 2);
-    ShiftAmt = ConstantVector::getSplat(VTy->getNumElements(), ShiftAmt);
-    Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn");
-
-    // %res = trunc <4 x i32> %high to <4 x i16>
-    return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
-  }
-  case ARM::BI__builtin_neon_vcale_v:
-    std::swap(Ops[0], Ops[1]);
-  case ARM::BI__builtin_neon_vcage_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vacged);
-    return EmitNeonCall(F, Ops, "vcage");
-  }
-  case ARM::BI__builtin_neon_vcaleq_v:
-    std::swap(Ops[0], Ops[1]);
-  case ARM::BI__builtin_neon_vcageq_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgeq);
-    return EmitNeonCall(F, Ops, "vcage");
-  }
-  case ARM::BI__builtin_neon_vcalt_v:
-    std::swap(Ops[0], Ops[1]);
-  case ARM::BI__builtin_neon_vcagt_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgtd);
-    return EmitNeonCall(F, Ops, "vcagt");
-  }
-  case ARM::BI__builtin_neon_vcaltq_v:
-    std::swap(Ops[0], Ops[1]);
-  case ARM::BI__builtin_neon_vcagtq_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vacgtq);
-    return EmitNeonCall(F, Ops, "vcagt");
-  }
-  case ARM::BI__builtin_neon_vcls_v:
-  case ARM::BI__builtin_neon_vclsq_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vcls, Ty);
-    return EmitNeonCall(F, Ops, "vcls");
-  }
-  case ARM::BI__builtin_neon_vclz_v:
-  case ARM::BI__builtin_neon_vclzq_v: {
-    // Generate target-independent intrinsic; also need to add second argument
-    // for whether or not clz of zero is undefined; on ARM it isn't.
-    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ty);
-    Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
-    return EmitNeonCall(F, Ops, "vclz");
-  }
-  case ARM::BI__builtin_neon_vcnt_v:
-  case ARM::BI__builtin_neon_vcntq_v: {
-    // generate target-independent intrinsic
-    Function *F = CGM.getIntrinsic(Intrinsic::ctpop, Ty);
-    return EmitNeonCall(F, Ops, "vctpop");
-  }
-  case ARM::BI__builtin_neon_vcvt_f16_v: {
-    assert(Type.getEltType() == NeonTypeFlags::Float16 && !quad &&
-           "unexpected vcvt_f16_v builtin");
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vcvtfp2hf);
-    return EmitNeonCall(F, Ops, "vcvt");
-  }
-  case ARM::BI__builtin_neon_vcvt_f32_f16: {
-    assert(Type.getEltType() == NeonTypeFlags::Float16 && !quad &&
-           "unexpected vcvt_f32_f16 builtin");
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vcvthf2fp);
-    return EmitNeonCall(F, Ops, "vcvt");
-  }
-  case ARM::BI__builtin_neon_vcvt_f32_v:
-  case ARM::BI__builtin_neon_vcvtq_f32_v:
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, quad));
-    return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
-                : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
-  case ARM::BI__builtin_neon_vcvt_s32_v:
-  case ARM::BI__builtin_neon_vcvt_u32_v:
-  case ARM::BI__builtin_neon_vcvtq_s32_v:
-  case ARM::BI__builtin_neon_vcvtq_u32_v: {
-    llvm::Type *FloatTy =
-      GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, quad));
-    Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
-    return usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
-                : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
-  }
-  case ARM::BI__builtin_neon_vcvt_n_f32_v:
-  case ARM::BI__builtin_neon_vcvtq_n_f32_v: {
-    llvm::Type *FloatTy =
-      GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, quad));
-    llvm::Type *Tys[2] = { FloatTy, Ty };
-    Int = usgn ? Intrinsic::arm_neon_vcvtfxu2fp
-               : Intrinsic::arm_neon_vcvtfxs2fp;
-    Function *F = CGM.getIntrinsic(Int, Tys);
-    return EmitNeonCall(F, Ops, "vcvt_n");
-  }
-  case ARM::BI__builtin_neon_vcvt_n_s32_v:
-  case ARM::BI__builtin_neon_vcvt_n_u32_v:
-  case ARM::BI__builtin_neon_vcvtq_n_s32_v:
-  case ARM::BI__builtin_neon_vcvtq_n_u32_v: {
-    llvm::Type *FloatTy =
-      GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, quad));
-    llvm::Type *Tys[2] = { Ty, FloatTy };
-    Int = usgn ? Intrinsic::arm_neon_vcvtfp2fxu
-               : Intrinsic::arm_neon_vcvtfp2fxs;
-    Function *F = CGM.getIntrinsic(Int, Tys);
-    return EmitNeonCall(F, Ops, "vcvt_n");
-  }
-  case ARM::BI__builtin_neon_vext_v:
-  case ARM::BI__builtin_neon_vextq_v: {
-    int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
-    SmallVector<Constant*, 16> Indices;
-    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
-      Indices.push_back(ConstantInt::get(Int32Ty, i+CV));
-
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
-    Value *SV = llvm::ConstantVector::get(Indices);
-    return Builder.CreateShuffleVector(Ops[0], Ops[1], SV, "vext");
-  }
-  case ARM::BI__builtin_neon_vhadd_v:
-  case ARM::BI__builtin_neon_vhaddq_v:
-    Int = usgn ? Intrinsic::arm_neon_vhaddu : Intrinsic::arm_neon_vhadds;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vhadd");
-  case ARM::BI__builtin_neon_vhsub_v:
-  case ARM::BI__builtin_neon_vhsubq_v:
-    Int = usgn ? Intrinsic::arm_neon_vhsubu : Intrinsic::arm_neon_vhsubs;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vhsub");
-  case ARM::BI__builtin_neon_vld1_v:
-  case ARM::BI__builtin_neon_vld1q_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Ty),
-                        Ops, "vld1");
-  case ARM::BI__builtin_neon_vld1q_lane_v:
+  case NEON::BI__builtin_neon_vld1q_lane_v:
     // Handle 64-bit integer elements as a special case.  Use shuffles of
     // one-element vectors to avoid poor code for i64 in the backend.
     if (VTy->getElementType()->isIntegerTy(64)) {
@@ -4371,7 +4651,7 @@
       return Builder.CreateShuffleVector(Ops[1], Ld, SV, "vld1q_lane");
     }
     // fall through
-  case ARM::BI__builtin_neon_vld1_lane_v: {
+  case NEON::BI__builtin_neon_vld1_lane_v: {
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ty = llvm::PointerType::getUnqual(VTy->getElementType());
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
@@ -4379,90 +4659,19 @@
     Ld->setAlignment(cast<ConstantInt>(Align)->getZExtValue());
     return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
   }
-  case ARM::BI__builtin_neon_vld1_dup_v:
-  case ARM::BI__builtin_neon_vld1q_dup_v: {
-    Value *V = UndefValue::get(Ty);
-    Ty = llvm::PointerType::getUnqual(VTy->getElementType());
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    LoadInst *Ld = Builder.CreateLoad(Ops[0]);
-    Ld->setAlignment(cast<ConstantInt>(Align)->getZExtValue());
-    llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
-    Ops[0] = Builder.CreateInsertElement(V, Ld, CI);
-    return EmitNeonSplat(Ops[0], CI);
-  }
-  case ARM::BI__builtin_neon_vld2_v:
-  case ARM::BI__builtin_neon_vld2q_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld2, Ty);
-    Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld2");
-    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    return Builder.CreateStore(Ops[1], Ops[0]);
-  }
-  case ARM::BI__builtin_neon_vld3_v:
-  case ARM::BI__builtin_neon_vld3q_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld3, Ty);
-    Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld3");
-    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    return Builder.CreateStore(Ops[1], Ops[0]);
-  }
-  case ARM::BI__builtin_neon_vld4_v:
-  case ARM::BI__builtin_neon_vld4q_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld4, Ty);
-    Ops[1] = Builder.CreateCall2(F, Ops[1], Align, "vld4");
-    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    return Builder.CreateStore(Ops[1], Ops[0]);
-  }
-  case ARM::BI__builtin_neon_vld2_lane_v:
-  case ARM::BI__builtin_neon_vld2q_lane_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld2lane, Ty);
-    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
-    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
-    Ops.push_back(Align);
-    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld2_lane");
-    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    return Builder.CreateStore(Ops[1], Ops[0]);
-  }
-  case ARM::BI__builtin_neon_vld3_lane_v:
-  case ARM::BI__builtin_neon_vld3q_lane_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld3lane, Ty);
-    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
-    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
-    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
-    Ops.push_back(Align);
-    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld3_lane");
-    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    return Builder.CreateStore(Ops[1], Ops[0]);
-  }
-  case ARM::BI__builtin_neon_vld4_lane_v:
-  case ARM::BI__builtin_neon_vld4q_lane_v: {
-    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld4lane, Ty);
-    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
-    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
-    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
-    Ops[5] = Builder.CreateBitCast(Ops[5], Ty);
-    Ops.push_back(Align);
-    Ops[1] = Builder.CreateCall(F, makeArrayRef(Ops).slice(1), "vld3_lane");
-    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    return Builder.CreateStore(Ops[1], Ops[0]);
-  }
-  case ARM::BI__builtin_neon_vld2_dup_v:
-  case ARM::BI__builtin_neon_vld3_dup_v:
-  case ARM::BI__builtin_neon_vld4_dup_v: {
+  case NEON::BI__builtin_neon_vld2_dup_v:
+  case NEON::BI__builtin_neon_vld3_dup_v:
+  case NEON::BI__builtin_neon_vld4_dup_v: {
     // Handle 64-bit elements as a special case.  There is no "dup" needed.
     if (VTy->getElementType()->getPrimitiveSizeInBits() == 64) {
       switch (BuiltinID) {
-      case ARM::BI__builtin_neon_vld2_dup_v:
+      case NEON::BI__builtin_neon_vld2_dup_v:
         Int = Intrinsic::arm_neon_vld2;
         break;
-      case ARM::BI__builtin_neon_vld3_dup_v:
+      case NEON::BI__builtin_neon_vld3_dup_v:
         Int = Intrinsic::arm_neon_vld3;
         break;
-      case ARM::BI__builtin_neon_vld4_dup_v:
+      case NEON::BI__builtin_neon_vld4_dup_v:
         Int = Intrinsic::arm_neon_vld4;
         break;
       default: llvm_unreachable("unknown vld_dup intrinsic?");
@@ -4474,13 +4683,13 @@
       return Builder.CreateStore(Ops[1], Ops[0]);
     }
     switch (BuiltinID) {
-    case ARM::BI__builtin_neon_vld2_dup_v:
+    case NEON::BI__builtin_neon_vld2_dup_v:
       Int = Intrinsic::arm_neon_vld2lane;
       break;
-    case ARM::BI__builtin_neon_vld3_dup_v:
+    case NEON::BI__builtin_neon_vld3_dup_v:
       Int = Intrinsic::arm_neon_vld3lane;
       break;
-    case ARM::BI__builtin_neon_vld4_dup_v:
+    case NEON::BI__builtin_neon_vld4_dup_v:
       Int = Intrinsic::arm_neon_vld4lane;
       break;
     default: llvm_unreachable("unknown vld_dup intrinsic?");
@@ -4509,251 +4718,58 @@
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     return Builder.CreateStore(Ops[1], Ops[0]);
   }
-  case ARM::BI__builtin_neon_vmax_v:
-  case ARM::BI__builtin_neon_vmaxq_v:
-    Int = usgn ? Intrinsic::arm_neon_vmaxu : Intrinsic::arm_neon_vmaxs;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
-  case ARM::BI__builtin_neon_vmin_v:
-  case ARM::BI__builtin_neon_vminq_v:
-    Int = usgn ? Intrinsic::arm_neon_vminu : Intrinsic::arm_neon_vmins;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
-  case ARM::BI__builtin_neon_vmovl_v: {
-    llvm::Type *DTy =llvm::VectorType::getTruncatedElementVectorType(VTy);
-    Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
-    if (usgn)
-      return Builder.CreateZExt(Ops[0], Ty, "vmovl");
-    return Builder.CreateSExt(Ops[0], Ty, "vmovl");
-  }
-  case ARM::BI__builtin_neon_vmovn_v: {
-    llvm::Type *QTy = llvm::VectorType::getExtendedElementVectorType(VTy);
-    Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
-    return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
-  }
-  case ARM::BI__builtin_neon_vmul_v:
-  case ARM::BI__builtin_neon_vmulq_v:
-    assert(Type.isPoly() && "vmul builtin only supported for polynomial types");
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vmulp, Ty),
-                        Ops, "vmul");
-  case ARM::BI__builtin_neon_vmull_v:
-    // FIXME: the integer vmull operations could be emitted in terms of pure
-    // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
-    // hoisting the exts outside loops. Until global ISel comes along that can
-    // see through such movement this leads to bad CodeGen. So we need an
-    // intrinsic for now.
-    Int = usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
-    Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
-  case ARM::BI__builtin_neon_vfma_v:
-  case ARM::BI__builtin_neon_vfmaq_v: {
-    Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
-    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
-    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
-    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
-
-    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
-    return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
-  }
-  case ARM::BI__builtin_neon_vpadal_v:
-  case ARM::BI__builtin_neon_vpadalq_v: {
-    Int = usgn ? Intrinsic::arm_neon_vpadalu : Intrinsic::arm_neon_vpadals;
-    // The source operand type has twice as many elements of half the size.
-    unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
-    llvm::Type *EltTy =
-      llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
-    llvm::Type *NarrowTy =
-      llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
-    llvm::Type *Tys[2] = { Ty, NarrowTy };
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpadal");
-  }
-  case ARM::BI__builtin_neon_vpadd_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vpadd, Ty),
-                        Ops, "vpadd");
-  case ARM::BI__builtin_neon_vpaddl_v:
-  case ARM::BI__builtin_neon_vpaddlq_v: {
-    Int = usgn ? Intrinsic::arm_neon_vpaddlu : Intrinsic::arm_neon_vpaddls;
-    // The source operand type has twice as many elements of half the size.
-    unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
-    llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
-    llvm::Type *NarrowTy =
-      llvm::VectorType::get(EltTy, VTy->getNumElements() * 2);
-    llvm::Type *Tys[2] = { Ty, NarrowTy };
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
-  }
-  case ARM::BI__builtin_neon_vpmax_v:
-    Int = usgn ? Intrinsic::arm_neon_vpmaxu : Intrinsic::arm_neon_vpmaxs;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
-  case ARM::BI__builtin_neon_vpmin_v:
-    Int = usgn ? Intrinsic::arm_neon_vpminu : Intrinsic::arm_neon_vpmins;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
-  case ARM::BI__builtin_neon_vqabs_v:
-  case ARM::BI__builtin_neon_vqabsq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqabs, Ty),
-                        Ops, "vqabs");
-  case ARM::BI__builtin_neon_vqadd_v:
-  case ARM::BI__builtin_neon_vqaddq_v:
-    Int = usgn ? Intrinsic::arm_neon_vqaddu : Intrinsic::arm_neon_vqadds;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqadd");
-  case ARM::BI__builtin_neon_vqdmlal_v: {
-    SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
-    Value *Mul = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, Ty),
-                              MulOps, "vqdmlal");
-
-    SmallVector<Value *, 2> AddOps;
-    AddOps.push_back(Ops[0]);
-    AddOps.push_back(Mul);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqadds, Ty),
-                        AddOps, "vqdmlal");
-  }
-  case ARM::BI__builtin_neon_vqdmlsl_v: {
-    SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
-    Value *Mul = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, Ty),
-                              MulOps, "vqdmlsl");
-
-    SmallVector<Value *, 2> SubOps;
-    SubOps.push_back(Ops[0]);
-    SubOps.push_back(Mul);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqsubs, Ty),
-                        SubOps, "vqdmlsl");
-  }
-  case ARM::BI__builtin_neon_vqdmulh_v:
-  case ARM::BI__builtin_neon_vqdmulhq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmulh, Ty),
-                        Ops, "vqdmulh");
-  case ARM::BI__builtin_neon_vqdmull_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqdmull, Ty),
-                        Ops, "vqdmull");
-  case ARM::BI__builtin_neon_vqmovn_v:
-    Int = usgn ? Intrinsic::arm_neon_vqmovnu : Intrinsic::arm_neon_vqmovns;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqmovn");
-  case ARM::BI__builtin_neon_vqmovun_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqmovnsu, Ty),
-                        Ops, "vqdmull");
-  case ARM::BI__builtin_neon_vqneg_v:
-  case ARM::BI__builtin_neon_vqnegq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqneg, Ty),
-                        Ops, "vqneg");
-  case ARM::BI__builtin_neon_vqrdmulh_v:
-  case ARM::BI__builtin_neon_vqrdmulhq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrdmulh, Ty),
-                        Ops, "vqrdmulh");
-  case ARM::BI__builtin_neon_vqrshl_v:
-  case ARM::BI__builtin_neon_vqrshlq_v:
-    Int = usgn ? Intrinsic::arm_neon_vqrshiftu : Intrinsic::arm_neon_vqrshifts;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshl");
-  case ARM::BI__builtin_neon_vqrshrn_n_v:
+  case NEON::BI__builtin_neon_vqrshrn_n_v:
     Int =
       usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n",
                         1, true);
-  case ARM::BI__builtin_neon_vqrshrun_n_v:
+  case NEON::BI__builtin_neon_vqrshrun_n_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty),
                         Ops, "vqrshrun_n", 1, true);
-  case ARM::BI__builtin_neon_vqshl_v:
-  case ARM::BI__builtin_neon_vqshlq_v:
-    Int = usgn ? Intrinsic::arm_neon_vqshiftu : Intrinsic::arm_neon_vqshifts;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl");
-  case ARM::BI__builtin_neon_vqshl_n_v:
-  case ARM::BI__builtin_neon_vqshlq_n_v:
-    Int = usgn ? Intrinsic::arm_neon_vqshiftu : Intrinsic::arm_neon_vqshifts;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
-                        1, false);
-  case ARM::BI__builtin_neon_vqshlu_n_v:
-  case ARM::BI__builtin_neon_vqshluq_n_v:
+  case NEON::BI__builtin_neon_vqshlu_n_v:
+  case NEON::BI__builtin_neon_vqshluq_n_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftsu, Ty),
                         Ops, "vqshlu", 1, false);
-  case ARM::BI__builtin_neon_vqshrn_n_v:
+  case NEON::BI__builtin_neon_vqshrn_n_v:
     Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n",
                         1, true);
-  case ARM::BI__builtin_neon_vqshrun_n_v:
+  case NEON::BI__builtin_neon_vqshrun_n_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty),
                         Ops, "vqshrun_n", 1, true);
-  case ARM::BI__builtin_neon_vqsub_v:
-  case ARM::BI__builtin_neon_vqsubq_v:
-    Int = usgn ? Intrinsic::arm_neon_vqsubu : Intrinsic::arm_neon_vqsubs;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqsub");
-  case ARM::BI__builtin_neon_vraddhn_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vraddhn, Ty),
-                        Ops, "vraddhn");
-  case ARM::BI__builtin_neon_vrecpe_v:
-  case ARM::BI__builtin_neon_vrecpeq_v:
+  case NEON::BI__builtin_neon_vrecpe_v:
+  case NEON::BI__builtin_neon_vrecpeq_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty),
                         Ops, "vrecpe");
-  case ARM::BI__builtin_neon_vrecps_v:
-  case ARM::BI__builtin_neon_vrecpsq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecps, Ty),
-                        Ops, "vrecps");
-  case ARM::BI__builtin_neon_vrhadd_v:
-  case ARM::BI__builtin_neon_vrhaddq_v:
-    Int = usgn ? Intrinsic::arm_neon_vrhaddu : Intrinsic::arm_neon_vrhadds;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrhadd");
-  case ARM::BI__builtin_neon_vrshl_v:
-  case ARM::BI__builtin_neon_vrshlq_v:
-    Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshl");
-  case ARM::BI__builtin_neon_vrshrn_n_v:
+  case NEON::BI__builtin_neon_vrshrn_n_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty),
                         Ops, "vrshrn_n", 1, true);
-  case ARM::BI__builtin_neon_vrshr_n_v:
-  case ARM::BI__builtin_neon_vrshrq_n_v:
+  case NEON::BI__builtin_neon_vrshr_n_v:
+  case NEON::BI__builtin_neon_vrshrq_n_v:
     Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n", 1, true);
-  case ARM::BI__builtin_neon_vrsqrte_v:
-  case ARM::BI__builtin_neon_vrsqrteq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrsqrte, Ty),
-                        Ops, "vrsqrte");
-  case ARM::BI__builtin_neon_vrsqrts_v:
-  case ARM::BI__builtin_neon_vrsqrtsq_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrsqrts, Ty),
-                        Ops, "vrsqrts");
-  case ARM::BI__builtin_neon_vrsra_n_v:
-  case ARM::BI__builtin_neon_vrsraq_n_v:
+  case NEON::BI__builtin_neon_vrsra_n_v:
+  case NEON::BI__builtin_neon_vrsraq_n_v:
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true);
     Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
     Ops[1] = Builder.CreateCall2(CGM.getIntrinsic(Int, Ty), Ops[1], Ops[2]);
     return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
-  case ARM::BI__builtin_neon_vrsubhn_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrsubhn, Ty),
-                        Ops, "vrsubhn");
-  case ARM::BI__builtin_neon_vshl_v:
-  case ARM::BI__builtin_neon_vshlq_v:
-    Int = usgn ? Intrinsic::arm_neon_vshiftu : Intrinsic::arm_neon_vshifts;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vshl");
-  case ARM::BI__builtin_neon_vshll_n_v:
-    Int = usgn ? Intrinsic::arm_neon_vshiftlu : Intrinsic::arm_neon_vshiftls;
-    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vshll", 1);
-  case ARM::BI__builtin_neon_vshl_n_v:
-  case ARM::BI__builtin_neon_vshlq_n_v:
-    Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false);
-    return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1],
-                             "vshl_n");
-  case ARM::BI__builtin_neon_vshrn_n_v:
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftn, Ty),
-                        Ops, "vshrn_n", 1, true);
-  case ARM::BI__builtin_neon_vshr_n_v:
-  case ARM::BI__builtin_neon_vshrq_n_v:
-    return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, usgn, "vshr_n");
-  case ARM::BI__builtin_neon_vsri_n_v:
-  case ARM::BI__builtin_neon_vsriq_n_v:
+  case NEON::BI__builtin_neon_vsri_n_v:
+  case NEON::BI__builtin_neon_vsriq_n_v:
     rightShift = true;
-  case ARM::BI__builtin_neon_vsli_n_v:
-  case ARM::BI__builtin_neon_vsliq_n_v:
+  case NEON::BI__builtin_neon_vsli_n_v:
+  case NEON::BI__builtin_neon_vsliq_n_v:
     Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift);
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty),
                         Ops, "vsli_n");
-  case ARM::BI__builtin_neon_vsra_n_v:
-  case ARM::BI__builtin_neon_vsraq_n_v:
+  case NEON::BI__builtin_neon_vsra_n_v:
+  case NEON::BI__builtin_neon_vsraq_n_v:
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
     return Builder.CreateAdd(Ops[0], Ops[1]);
-  case ARM::BI__builtin_neon_vst1_v:
-  case ARM::BI__builtin_neon_vst1q_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1, Ty),
-                        Ops, "");
-  case ARM::BI__builtin_neon_vst1q_lane_v:
+  case NEON::BI__builtin_neon_vst1q_lane_v:
     // Handle 64-bit integer elements as a special case.  Use a shuffle to get
     // a one-element vector and avoid poor code for i64 in the backend.
     if (VTy->getElementType()->isIntegerTy(64)) {
@@ -4765,7 +4781,7 @@
                                                  Ops[1]->getType()), Ops);
     }
     // fall through
-  case ARM::BI__builtin_neon_vst1_lane_v: {
+  case NEON::BI__builtin_neon_vst1_lane_v: {
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
     Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
@@ -4774,89 +4790,1922 @@
     St->setAlignment(cast<ConstantInt>(Align)->getZExtValue());
     return St;
   }
-  case ARM::BI__builtin_neon_vst2_v:
-  case ARM::BI__builtin_neon_vst2q_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst2, Ty),
-                        Ops, "");
-  case ARM::BI__builtin_neon_vst2_lane_v:
-  case ARM::BI__builtin_neon_vst2q_lane_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst2lane, Ty),
-                        Ops, "");
-  case ARM::BI__builtin_neon_vst3_v:
-  case ARM::BI__builtin_neon_vst3q_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst3, Ty),
-                        Ops, "");
-  case ARM::BI__builtin_neon_vst3_lane_v:
-  case ARM::BI__builtin_neon_vst3q_lane_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst3lane, Ty),
-                        Ops, "");
-  case ARM::BI__builtin_neon_vst4_v:
-  case ARM::BI__builtin_neon_vst4q_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst4, Ty),
-                        Ops, "");
-  case ARM::BI__builtin_neon_vst4_lane_v:
-  case ARM::BI__builtin_neon_vst4q_lane_v:
-    Ops.push_back(Align);
-    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst4lane, Ty),
-                        Ops, "");
-  case ARM::BI__builtin_neon_vsubhn_v: {
-    llvm::VectorType *SrcTy =
-        llvm::VectorType::getExtendedElementVectorType(VTy);
-
-    // %sum = add <4 x i32> %lhs, %rhs
-    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
-    Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
-    Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn");
-
-    // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
-    Constant *ShiftAmt = ConstantInt::get(SrcTy->getElementType(),
-                                       SrcTy->getScalarSizeInBits() / 2);
-    ShiftAmt = ConstantVector::getSplat(VTy->getNumElements(), ShiftAmt);
-    Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn");
-
-    // %res = trunc <4 x i32> %high to <4 x i16>
-    return Builder.CreateTrunc(Ops[0], VTy, "vsubhn");
-  }
-  case ARM::BI__builtin_neon_vtbl1_v:
+  case NEON::BI__builtin_neon_vtbl1_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1),
                         Ops, "vtbl1");
-  case ARM::BI__builtin_neon_vtbl2_v:
+  case NEON::BI__builtin_neon_vtbl2_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2),
                         Ops, "vtbl2");
-  case ARM::BI__builtin_neon_vtbl3_v:
+  case NEON::BI__builtin_neon_vtbl3_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3),
                         Ops, "vtbl3");
-  case ARM::BI__builtin_neon_vtbl4_v:
+  case NEON::BI__builtin_neon_vtbl4_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4),
                         Ops, "vtbl4");
-  case ARM::BI__builtin_neon_vtbx1_v:
+  case NEON::BI__builtin_neon_vtbx1_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1),
                         Ops, "vtbx1");
-  case ARM::BI__builtin_neon_vtbx2_v:
+  case NEON::BI__builtin_neon_vtbx2_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2),
                         Ops, "vtbx2");
-  case ARM::BI__builtin_neon_vtbx3_v:
+  case NEON::BI__builtin_neon_vtbx3_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3),
                         Ops, "vtbx3");
-  case ARM::BI__builtin_neon_vtbx4_v:
+  case NEON::BI__builtin_neon_vtbx4_v:
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4),
                         Ops, "vtbx4");
-  case ARM::BI__builtin_neon_vtst_v:
-  case ARM::BI__builtin_neon_vtstq_v: {
+  }
+}
+
+static Value *EmitARM64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
+                                      const CallExpr *E,
+                                      SmallVectorImpl<Value *> &Ops) {
+  unsigned Int = 0;
+  const char *s = NULL;
+
+  unsigned TblPos;
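+  // TblPos is the index of the first table operand: the vtbl variants take
+  // their table registers starting at operand 0, while the vtbx variants
+  // put the destination vector first, so their tables start at operand 1.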
+  switch (BuiltinID) {
+  default:
+    return 0;
+  case NEON::BI__builtin_neon_vtbl1_v:
+  case NEON::BI__builtin_neon_vqtbl1_v:
+  case NEON::BI__builtin_neon_vqtbl1q_v:
+  case NEON::BI__builtin_neon_vtbl2_v:
+  case NEON::BI__builtin_neon_vqtbl2_v:
+  case NEON::BI__builtin_neon_vqtbl2q_v:
+  case NEON::BI__builtin_neon_vtbl3_v:
+  case NEON::BI__builtin_neon_vqtbl3_v:
+  case NEON::BI__builtin_neon_vqtbl3q_v:
+  case NEON::BI__builtin_neon_vtbl4_v:
+  case NEON::BI__builtin_neon_vqtbl4_v:
+  case NEON::BI__builtin_neon_vqtbl4q_v:
+    TblPos = 0;
+    break;
+  case NEON::BI__builtin_neon_vtbx1_v:
+  case NEON::BI__builtin_neon_vqtbx1_v:
+  case NEON::BI__builtin_neon_vqtbx1q_v:
+  case NEON::BI__builtin_neon_vtbx2_v:
+  case NEON::BI__builtin_neon_vqtbx2_v:
+  case NEON::BI__builtin_neon_vqtbx2q_v:
+  case NEON::BI__builtin_neon_vtbx3_v:
+  case NEON::BI__builtin_neon_vqtbx3_v:
+  case NEON::BI__builtin_neon_vqtbx3q_v:
+  case NEON::BI__builtin_neon_vtbx4_v:
+  case NEON::BI__builtin_neon_vqtbx4_v:
+  case NEON::BI__builtin_neon_vqtbx4q_v:
+    TblPos = 1;
+    break;
+  }
+
+  assert(E->getNumArgs() >= 3);
+
+  // Get the last argument, which specifies the vector type.
+  llvm::APSInt Result;
+  const Expr *Arg = E->getArg(E->getNumArgs() - 1);
+  if (!Arg->isIntegerConstantExpr(Result, CGF.getContext()))
+    return 0;
+
+  // Determine the type of this overloaded NEON intrinsic.
+  NeonTypeFlags Type(Result.getZExtValue());
+  llvm::VectorType *VTy = GetNeonType(&CGF, Type);
+  llvm::Type *Ty = VTy;
+  if (!Ty)
+    return 0;
+
+  Arg = E->getArg(TblPos);
+  unsigned nElts = VTy->getNumElements();
+
+  CodeGen::CGBuilderTy &Builder = CGF.Builder;
+
+  // AArch64 scalar builtins are not overloaded: they do not have an extra
+  // argument that specifies the vector type, so each case must be handled
+  // individually.
+  SmallVector<Value *, 2> TblOps;
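+  // Collect the individual 64-bit table registers for each builtin;
+  // packTBLDVectorList combines them into the wider table operands that
+  // the arm64 tbl/tbx intrinsics take (inferred from its call sites here).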
+  switch (BuiltinID) {
+  case NEON::BI__builtin_neon_vtbl1_v: {
+    TblOps.push_back(Ops[0]);
+    return packTBLDVectorList(CGF, TblOps, 0, Ops[1], Ty,
+                              Intrinsic::arm64_neon_tbl1, "vtbl1");
+  }
+  case NEON::BI__builtin_neon_vtbl2_v: {
+    TblOps.push_back(Ops[0]);
+    TblOps.push_back(Ops[1]);
+    return packTBLDVectorList(CGF, TblOps, 0, Ops[2], Ty,
+                              Intrinsic::arm64_neon_tbl1, "vtbl1");
+  }
+  case NEON::BI__builtin_neon_vtbl3_v: {
+    TblOps.push_back(Ops[0]);
+    TblOps.push_back(Ops[1]);
+    TblOps.push_back(Ops[2]);
+    return packTBLDVectorList(CGF, TblOps, 0, Ops[3], Ty,
+                              Intrinsic::arm64_neon_tbl2, "vtbl2");
+  }
+  case NEON::BI__builtin_neon_vtbl4_v: {
+    TblOps.push_back(Ops[0]);
+    TblOps.push_back(Ops[1]);
+    TblOps.push_back(Ops[2]);
+    TblOps.push_back(Ops[3]);
+    return packTBLDVectorList(CGF, TblOps, 0, Ops[4], Ty,
+                              Intrinsic::arm64_neon_tbl2, "vtbl2");
+  }
+  case NEON::BI__builtin_neon_vtbx1_v: {
+    TblOps.push_back(Ops[1]);
+    Value *TblRes = packTBLDVectorList(CGF, TblOps, 0, Ops[2], Ty,
+                                    Intrinsic::arm64_neon_tbl1, "vtbl1");
+
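+    // vtbx semantics: a lookup index that is out of range for the single
+    // 64-bit table (index >= 8) leaves the destination element unchanged.
+    // Select between the original elements and the table result by hand.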
+    llvm::Constant *Eight = ConstantInt::get(VTy->getElementType(), 8);
+    Value* EightV = llvm::ConstantVector::getSplat(nElts, Eight);
+    Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
+    CmpRes = Builder.CreateSExt(CmpRes, Ty);
+
+    Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
+    Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
+    return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
+  }
+  case NEON::BI__builtin_neon_vtbx2_v: {
+    TblOps.push_back(Ops[1]);
+    TblOps.push_back(Ops[2]);
+    return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[3], Ty,
+                              Intrinsic::arm64_neon_tbx1, "vtbx1");
+  }
+  case NEON::BI__builtin_neon_vtbx3_v: {
+    TblOps.push_back(Ops[1]);
+    TblOps.push_back(Ops[2]);
+    TblOps.push_back(Ops[3]);
+    Value *TblRes = packTBLDVectorList(CGF, TblOps, 0, Ops[4], Ty,
+                                       Intrinsic::arm64_neon_tbl2, "vtbl2");
+
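+    // Three 64-bit tables give 24 valid byte indices, so any index >= 24
+    // keeps the original destination element instead of the table result.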
+    llvm::Constant *TwentyFour = ConstantInt::get(VTy->getElementType(), 24);
+    Value* TwentyFourV = llvm::ConstantVector::getSplat(nElts, TwentyFour);
+    Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
+                                           TwentyFourV);
+    CmpRes = Builder.CreateSExt(CmpRes, Ty);
+
+    Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
+    Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
+    return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
+  }
+  case NEON::BI__builtin_neon_vtbx4_v: {
+    TblOps.push_back(Ops[1]);
+    TblOps.push_back(Ops[2]);
+    TblOps.push_back(Ops[3]);
+    TblOps.push_back(Ops[4]);
+    return packTBLDVectorList(CGF, TblOps, Ops[0], Ops[5], Ty,
+                              Intrinsic::arm64_neon_tbx2, "vtbx2");
+  }
+  case NEON::BI__builtin_neon_vqtbl1_v:
+  case NEON::BI__builtin_neon_vqtbl1q_v:
+    Int = Intrinsic::arm64_neon_tbl1; s = "vtbl1"; break;
+  case NEON::BI__builtin_neon_vqtbl2_v:
+  case NEON::BI__builtin_neon_vqtbl2q_v:
+    Int = Intrinsic::arm64_neon_tbl2; s = "vtbl2"; break;
+  case NEON::BI__builtin_neon_vqtbl3_v:
+  case NEON::BI__builtin_neon_vqtbl3q_v:
+    Int = Intrinsic::arm64_neon_tbl3; s = "vtbl3"; break;
+  case NEON::BI__builtin_neon_vqtbl4_v:
+  case NEON::BI__builtin_neon_vqtbl4q_v:
+    Int = Intrinsic::arm64_neon_tbl4; s = "vtbl4"; break;
+  case NEON::BI__builtin_neon_vqtbx1_v:
+  case NEON::BI__builtin_neon_vqtbx1q_v:
+    Int = Intrinsic::arm64_neon_tbx1; s = "vtbx1"; break;
+  case NEON::BI__builtin_neon_vqtbx2_v:
+  case NEON::BI__builtin_neon_vqtbx2q_v:
+    Int = Intrinsic::arm64_neon_tbx2; s = "vtbx2"; break;
+  case NEON::BI__builtin_neon_vqtbx3_v:
+  case NEON::BI__builtin_neon_vqtbx3q_v:
+    Int = Intrinsic::arm64_neon_tbx3; s = "vtbx3"; break;
+  case NEON::BI__builtin_neon_vqtbx4_v:
+  case NEON::BI__builtin_neon_vqtbx4q_v:
+    Int = Intrinsic::arm64_neon_tbx4; s = "vtbx4"; break;
+  }
+
+  if (!Int)
+    return 0;
+
+  Function *F = CGF.CGM.getIntrinsic(Int, Ty);
+  return CGF.EmitNeonCall(F, Ops, s);
+}
+
+Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
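+  // Widen an i16 scalar into a <4 x i16> vector with the value in lane 0,
+  // so that 64-bit vector intrinsics can operate on it.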
+  llvm::Type *VTy = llvm::VectorType::get(Int16Ty, 4);
+  Op = Builder.CreateBitCast(Op, Int16Ty);
+  Value *V = UndefValue::get(VTy);
+  llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
+  Op = Builder.CreateInsertElement(V, Op, CI);
+  return Op;
+}
+
+Value *CodeGenFunction::vectorWrapScalar8(Value *Op) {
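+  // Same as vectorWrapScalar16, but widening an i8 scalar into lane 0 of
+  // an <8 x i8> vector.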
+  llvm::Type *VTy = llvm::VectorType::get(Int8Ty, 8);
+  Op = Builder.CreateBitCast(Op, Int8Ty);
+  Value *V = UndefValue::get(VTy);
+  llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
+  Op = Builder.CreateInsertElement(V, Op, CI);
+  return Op;
+}
+
+Value *CodeGenFunction::
+emitVectorWrappedScalar8Intrinsic(unsigned Int, SmallVectorImpl<Value*> &Ops,
+                                  const char *Name) {
+  // i8 is not a legal type for ARM64, so we can't just use
+  // a normal overloaded intrinsic call for these scalar types. Instead
+  // we'll build 64-bit vectors w/ lane zero being our input values and
+  // perform the operation on that. The back end can pattern match directly
+  // to the scalar instruction.
+  Ops[0] = vectorWrapScalar8(Ops[0]);
+  Ops[1] = vectorWrapScalar8(Ops[1]);
+  llvm::Type *VTy = llvm::VectorType::get(Int8Ty, 8);
+  Value *V = EmitNeonCall(CGM.getIntrinsic(Int, VTy), Ops, Name);
+  Constant *CI = ConstantInt::get(Int32Ty, 0);
+  return Builder.CreateExtractElement(V, CI, "lane0");
+}
+
+Value *CodeGenFunction::
+emitVectorWrappedScalar16Intrinsic(unsigned Int, SmallVectorImpl<Value*> &Ops,
+                                   const char *Name) {
+  // i16 is not a legal type for ARM64, so we can't just use
+  // a normal overloaded intrinsic call for these scalar types. Instead
+  // we'll build 64-bit vectors w/ lane zero being our input values and
+  // perform the operation on that. The back end can pattern match directly
+  // to the scalar instruction.
+  Ops[0] = vectorWrapScalar16(Ops[0]);
+  Ops[1] = vectorWrapScalar16(Ops[1]);
+  llvm::Type *VTy = llvm::VectorType::get(Int16Ty, 4);
+  Value *V = EmitNeonCall(CGM.getIntrinsic(Int, VTy), Ops, Name);
+  Constant *CI = ConstantInt::get(Int32Ty, 0);
+  return Builder.CreateExtractElement(V, CI, "lane0");
+}
+
+Value *CodeGenFunction::EmitARM64BuiltinExpr(unsigned BuiltinID,
+                                             const CallExpr *E) {
+  if (BuiltinID == ARM64::BI__clear_cache) {
+    assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
+    const FunctionDecl *FD = E->getDirectCallee();
+    SmallVector<Value*, 2> Ops;
+    for (unsigned i = 0; i < 2; i++)
+      Ops.push_back(EmitScalarExpr(E->getArg(i)));
+    llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
+    llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
+    StringRef Name = FD->getName();
+    return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
+  }
+
+  if (BuiltinID == ARM64::BI__builtin_arm_ldrex &&
+      getContext().getTypeSize(E->getType()) == 128) {
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_ldxp);
+
+    Value *LdPtr = EmitScalarExpr(E->getArg(0));
+    Value *Val = Builder.CreateCall(F, Builder.CreateBitCast(LdPtr, Int8PtrTy),
+                                    "ldxp");
+
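+    // ldxp returns the two 64-bit halves as a struct; the code below treats
+    // element 1 as the high doubleword, widening both halves to i128 and
+    // combining them with a shift and an or.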
+    Value *Val0 = Builder.CreateExtractValue(Val, 1);
+    Value *Val1 = Builder.CreateExtractValue(Val, 0);
+    llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
+    Val0 = Builder.CreateZExt(Val0, Int128Ty);
+    Val1 = Builder.CreateZExt(Val1, Int128Ty);
+
+    Value *ShiftCst = llvm::ConstantInt::get(Int128Ty, 64);
+    Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
+    Val = Builder.CreateOr(Val, Val1);
+    return Builder.CreateBitCast(Val, ConvertType(E->getType()));
+  } else if (BuiltinID == ARM64::BI__builtin_arm_ldrex) {
+    Value *LoadAddr = EmitScalarExpr(E->getArg(0));
+
+    QualType Ty = E->getType();
+    llvm::Type *RealResTy = ConvertType(Ty);
+    llvm::Type *IntResTy = llvm::IntegerType::get(getLLVMContext(),
+                                                  getContext().getTypeSize(Ty));
+    LoadAddr = Builder.CreateBitCast(LoadAddr, IntResTy->getPointerTo());
+
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_ldxr, LoadAddr->getType());
+    Value *Val = Builder.CreateCall(F, LoadAddr, "ldxr");
+
+    if (RealResTy->isPointerTy())
+      return Builder.CreateIntToPtr(Val, RealResTy);
+
+    Val = Builder.CreateTruncOrBitCast(Val, IntResTy);
+    return Builder.CreateBitCast(Val, RealResTy);
+  }
+
+  if (BuiltinID == ARM64::BI__builtin_arm_strex &&
+      getContext().getTypeSize(E->getArg(0)->getType()) == 128) {
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_stxp);
+    llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty, NULL);
+
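+    // Spill the 128-bit value to a temporary, reload it as a pair of i64s,
+    // and pass the two halves plus the address to stxp.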
+    Value *One = llvm::ConstantInt::get(Int32Ty, 1);
+    Value *Tmp = Builder.CreateAlloca(ConvertType(E->getArg(0)->getType()),
+                                      One);
+    Value *Val = EmitScalarExpr(E->getArg(0));
+    Builder.CreateStore(Val, Tmp);
+
+    Value *LdPtr = Builder.CreateBitCast(Tmp,llvm::PointerType::getUnqual(STy));
+    Val = Builder.CreateLoad(LdPtr);
+
+    Value *Arg0 = Builder.CreateExtractValue(Val, 0);
+    Value *Arg1 = Builder.CreateExtractValue(Val, 1);
+    Value *StPtr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)),
+                                         Int8PtrTy);
+    return Builder.CreateCall3(F, Arg0, Arg1, StPtr, "stxp");
+  } else if (BuiltinID == ARM64::BI__builtin_arm_strex) {
+    Value *StoreVal = EmitScalarExpr(E->getArg(0));
+    Value *StoreAddr = EmitScalarExpr(E->getArg(1));
+
+    QualType Ty = E->getArg(0)->getType();
+    llvm::Type *StoreTy = llvm::IntegerType::get(getLLVMContext(),
+                                                 getContext().getTypeSize(Ty));
+    StoreAddr = Builder.CreateBitCast(StoreAddr, StoreTy->getPointerTo());
+
+    if (StoreVal->getType()->isPointerTy())
+      StoreVal = Builder.CreatePtrToInt(StoreVal, Int64Ty);
+    else {
+      StoreVal = Builder.CreateBitCast(StoreVal, StoreTy);
+      StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int64Ty);
+    }
+
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_stxr, StoreAddr->getType());
+    return Builder.CreateCall2(F, StoreVal, StoreAddr, "stxr");
+  }
+
+  if (BuiltinID == ARM64::BI__builtin_arm_clrex) {
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_clrex);
+    return Builder.CreateCall(F);
+  }
+
+  // CRC32
+  Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
+  switch (BuiltinID) {
+  case ARM64::BI__builtin_arm_crc32b:
+    CRCIntrinsicID = Intrinsic::arm64_crc32b; break;
+  case ARM64::BI__builtin_arm_crc32cb:
+    CRCIntrinsicID = Intrinsic::arm64_crc32cb; break;
+  case ARM64::BI__builtin_arm_crc32h:
+    CRCIntrinsicID = Intrinsic::arm64_crc32h; break;
+  case ARM64::BI__builtin_arm_crc32ch:
+    CRCIntrinsicID = Intrinsic::arm64_crc32ch; break;
+  case ARM64::BI__builtin_arm_crc32w:
+    CRCIntrinsicID = Intrinsic::arm64_crc32w; break;
+  case ARM64::BI__builtin_arm_crc32cw:
+    CRCIntrinsicID = Intrinsic::arm64_crc32cw; break;
+  case ARM64::BI__builtin_arm_crc32d:
+    CRCIntrinsicID = Intrinsic::arm64_crc32x; break;
+  case ARM64::BI__builtin_arm_crc32cd:
+    CRCIntrinsicID = Intrinsic::arm64_crc32cx; break;
+  }
+
+  if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
+    Value *Arg0 = EmitScalarExpr(E->getArg(0));
+    Value *Arg1 = EmitScalarExpr(E->getArg(1));
+    Function *F = CGM.getIntrinsic(CRCIntrinsicID);
+
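+    // The builtin's data argument may be narrower than the intrinsic's
+    // parameter (e.g. a byte for crc32b), so zero-extend it to match.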
+    llvm::Type *DataTy = F->getFunctionType()->getParamType(1);
+    Arg1 = Builder.CreateZExtOrBitCast(Arg1, DataTy);
+
+    return Builder.CreateCall2(F, Arg0, Arg1);
+  }
+
+  llvm::SmallVector<Value*, 4> Ops;
+  for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++)
+    Ops.push_back(EmitScalarExpr(E->getArg(i)));
+
+  llvm::ArrayRef<NeonIntrinsicInfo> SISDMap(ARM64SISDIntrinsicMap);
+  const NeonIntrinsicInfo *Builtin = findNeonIntrinsicInMap(
+      SISDMap, BuiltinID, ARM64SISDIntrinsicsProvenSorted);
+
+  if (Builtin) {
+    Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1)));
+    Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E);
+    assert(Result && "SISD intrinsic should have been handled");
+    return Result;
+  }
+
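+  // Overloaded NEON builtins carry their type information out-of-band: the
+  // last argument is an i32 constant encoding the NeonTypeFlags.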
+  llvm::APSInt Result;
+  const Expr *Arg = E->getArg(E->getNumArgs()-1);
+  NeonTypeFlags Type(0);
+  if (Arg->isIntegerConstantExpr(Result, getContext()))
+    // Determine the type of this overloaded NEON intrinsic.
+    Type = NeonTypeFlags(Result.getZExtValue());
+
+  bool usgn = Type.isUnsigned();
+  bool quad = Type.isQuad();
+
+  // Handle non-overloaded intrinsics first.
+  switch (BuiltinID) {
+  default: break;
+  case NEON::BI__builtin_neon_vldrq_p128: {
+    llvm::Type *Int128PTy = llvm::Type::getIntNPtrTy(getLLVMContext(), 128);
+    Value *Ptr = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int128PTy);
+    return Builder.CreateLoad(Ptr);
+  }
+  case NEON::BI__builtin_neon_vstrq_p128: {
+    llvm::Type *Int128PTy = llvm::Type::getIntNPtrTy(getLLVMContext(), 128);
+    Value *Ptr = Builder.CreateBitCast(Ops[0], Int128PTy);
+    return Builder.CreateStore(EmitScalarExpr(E->getArg(1)), Ptr);
+  }
+  case NEON::BI__builtin_neon_vcvts_u32_f32:
+  case NEON::BI__builtin_neon_vcvtd_u64_f64:
+    usgn = true;
+    // FALL THROUGH
+  case NEON::BI__builtin_neon_vcvts_s32_f32:
+  case NEON::BI__builtin_neon_vcvtd_s64_f64: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
+    llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
+    llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
+    Ops[0] = Builder.CreateBitCast(Ops[0], FTy);
+    if (usgn)
+      return Builder.CreateFPToUI(Ops[0], InTy);
+    return Builder.CreateFPToSI(Ops[0], InTy);
+  }
+  case NEON::BI__builtin_neon_vcvts_f32_u32:
+  case NEON::BI__builtin_neon_vcvtd_f64_u64:
+    usgn = true;
+    // FALL THROUGH
+  case NEON::BI__builtin_neon_vcvts_f32_s32:
+  case NEON::BI__builtin_neon_vcvtd_f64_s64: {
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
+    llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
+    llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
+    Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
+    if (usgn)
+      return Builder.CreateUIToFP(Ops[0], FTy);
+    return Builder.CreateSIToFP(Ops[0], FTy);
+  }
+  case NEON::BI__builtin_neon_vpaddd_s64: {
+    llvm::Type *Ty =
+      llvm::VectorType::get(llvm::Type::getInt64Ty(getLLVMContext()), 2);
+    Value *Vec = EmitScalarExpr(E->getArg(0));
+    // The vector is v2i64, so make sure it's bitcast to that.
+    Vec = Builder.CreateBitCast(Vec, Ty, "v2i64");
+    llvm::Value *Idx0 = llvm::ConstantInt::get(Int32Ty, 0);
+    llvm::Value *Idx1 = llvm::ConstantInt::get(Int32Ty, 1);
+    Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
+    Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
+    // Pairwise addition of a v2i64 into a scalar i64.
+    return Builder.CreateAdd(Op0, Op1, "vpaddd");
+  }
+  case NEON::BI__builtin_neon_vpaddd_f64: {
+    llvm::Type *Ty =
+      llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 2);
+    Value *Vec = EmitScalarExpr(E->getArg(0));
+    // The vector is v2f64, so make sure it's bitcast to that.
+    Vec = Builder.CreateBitCast(Vec, Ty, "v2f64");
+    llvm::Value *Idx0 = llvm::ConstantInt::get(Int32Ty, 0);
+    llvm::Value *Idx1 = llvm::ConstantInt::get(Int32Ty, 1);
+    Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
+    Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
+    // Pairwise addition of a v2f64 into a scalar f64.
+    return Builder.CreateFAdd(Op0, Op1, "vpaddd");
+  }
+  case NEON::BI__builtin_neon_vpadds_f32: {
+    llvm::Type *Ty =
+      llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 2);
+    Value *Vec = EmitScalarExpr(E->getArg(0));
+    // The vector is v2f32, so make sure it's bitcast to that.
+    Vec = Builder.CreateBitCast(Vec, Ty, "v2f32");
+    llvm::Value *Idx0 = llvm::ConstantInt::get(Int32Ty, 0);
+    llvm::Value *Idx1 = llvm::ConstantInt::get(Int32Ty, 1);
+    Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
+    Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
+    // Pairwise addition of a v2f32 into a scalar f32.
+    return Builder.CreateFAdd(Op0, Op1, "vpadds");
+  }
+  case NEON::BI__builtin_neon_vceqzd_s64:
+  case NEON::BI__builtin_neon_vceqzd_f64:
+  case NEON::BI__builtin_neon_vceqzs_f32:
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    return EmitAArch64CompareBuiltinExpr(
+        Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OEQ,
+        ICmpInst::ICMP_EQ, "vceqz");
+  case NEON::BI__builtin_neon_vcgezd_s64:
+  case NEON::BI__builtin_neon_vcgezd_f64:
+  case NEON::BI__builtin_neon_vcgezs_f32:
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    return EmitAArch64CompareBuiltinExpr(
+        Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OGE,
+        ICmpInst::ICMP_SGE, "vcgez");
+  case NEON::BI__builtin_neon_vclezd_s64:
+  case NEON::BI__builtin_neon_vclezd_f64:
+  case NEON::BI__builtin_neon_vclezs_f32:
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    return EmitAArch64CompareBuiltinExpr(
+        Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OLE,
+        ICmpInst::ICMP_SLE, "vclez");
+  case NEON::BI__builtin_neon_vcgtzd_s64:
+  case NEON::BI__builtin_neon_vcgtzd_f64:
+  case NEON::BI__builtin_neon_vcgtzs_f32:
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    return EmitAArch64CompareBuiltinExpr(
+        Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OGT,
+        ICmpInst::ICMP_SGT, "vcgtz");
+  case NEON::BI__builtin_neon_vcltzd_s64:
+  case NEON::BI__builtin_neon_vcltzd_f64:
+  case NEON::BI__builtin_neon_vcltzs_f32:
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    return EmitAArch64CompareBuiltinExpr(
+        Ops[0], ConvertType(E->getCallReturnType()), ICmpInst::FCMP_OLT,
+        ICmpInst::ICMP_SLT, "vcltz");
+
+  case NEON::BI__builtin_neon_vceqzd_u64: {
+    llvm::Type *Ty = llvm::Type::getInt64Ty(getLLVMContext());
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[0] = Builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, Ops[0],
+                                llvm::Constant::getNullValue(Ty));
+    return Builder.CreateSExt(Ops[0], Ty, "vceqzd");
+  }
+  case NEON::BI__builtin_neon_vceqd_f64:
+  case NEON::BI__builtin_neon_vcled_f64:
+  case NEON::BI__builtin_neon_vcltd_f64:
+  case NEON::BI__builtin_neon_vcged_f64:
+  case NEON::BI__builtin_neon_vcgtd_f64: {
+    llvm::CmpInst::Predicate P;
+    switch (BuiltinID) {
+    default: llvm_unreachable("missing builtin ID in switch!");
+    case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break;
+    case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break;
+    case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break;
+    case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
+    case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
+    }
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
+    Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
+    Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
+    return Builder.CreateSExt(Ops[0], Int64Ty, "vcmpd");
+  }
+  case NEON::BI__builtin_neon_vceqs_f32:
+  case NEON::BI__builtin_neon_vcles_f32:
+  case NEON::BI__builtin_neon_vclts_f32:
+  case NEON::BI__builtin_neon_vcges_f32:
+  case NEON::BI__builtin_neon_vcgts_f32: {
+    llvm::CmpInst::Predicate P;
+    switch (BuiltinID) {
+    default: llvm_unreachable("missing builtin ID in switch!");
+    case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break;
+    case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break;
+    case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break;
+    case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break;
+    case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break;
+    }
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
+    Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy);
+    Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
+    return Builder.CreateSExt(Ops[0], Int32Ty, "vcmps");
+  }
+  case NEON::BI__builtin_neon_vceqd_s64:
+  case NEON::BI__builtin_neon_vceqd_u64:
+  case NEON::BI__builtin_neon_vcgtd_s64:
+  case NEON::BI__builtin_neon_vcgtd_u64:
+  case NEON::BI__builtin_neon_vcltd_s64:
+  case NEON::BI__builtin_neon_vcltd_u64:
+  case NEON::BI__builtin_neon_vcged_u64:
+  case NEON::BI__builtin_neon_vcged_s64:
+  case NEON::BI__builtin_neon_vcled_u64:
+  case NEON::BI__builtin_neon_vcled_s64: {
+    llvm::CmpInst::Predicate P;
+    switch (BuiltinID) {
+    default: llvm_unreachable("missing builtin ID in switch!");
+    case NEON::BI__builtin_neon_vceqd_s64:
+    case NEON::BI__builtin_neon_vceqd_u64: P = llvm::ICmpInst::ICMP_EQ; break;
+    case NEON::BI__builtin_neon_vcgtd_s64: P = llvm::ICmpInst::ICMP_SGT; break;
+    case NEON::BI__builtin_neon_vcgtd_u64: P = llvm::ICmpInst::ICMP_UGT; break;
+    case NEON::BI__builtin_neon_vcltd_s64: P = llvm::ICmpInst::ICMP_SLT; break;
+    case NEON::BI__builtin_neon_vcltd_u64: P = llvm::ICmpInst::ICMP_ULT; break;
+    case NEON::BI__builtin_neon_vcged_u64: P = llvm::ICmpInst::ICMP_UGE; break;
+    case NEON::BI__builtin_neon_vcged_s64: P = llvm::ICmpInst::ICMP_SGE; break;
+    case NEON::BI__builtin_neon_vcled_u64: P = llvm::ICmpInst::ICMP_ULE; break;
+    case NEON::BI__builtin_neon_vcled_s64: P = llvm::ICmpInst::ICMP_SLE; break;
+    }
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
+    Ops[0] = Builder.CreateICmp(P, Ops[0], Ops[1]);
+    return Builder.CreateSExt(Ops[0], Int64Ty, "vceqd");
+  }
+  case NEON::BI__builtin_neon_vtstd_s64:
+  case NEON::BI__builtin_neon_vtstd_u64: {
+    llvm::Type *Ty = llvm::Type::getInt64Ty(getLLVMContext());
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
     Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
-                                ConstantAggregateZero::get(Ty));
-    return Builder.CreateSExt(Ops[0], Ty, "vtst");
+                                llvm::Constant::getNullValue(Ty));
+    return Builder.CreateSExt(Ops[0], Ty, "vtstd");
   }
-  case ARM::BI__builtin_neon_vtrn_v:
-  case ARM::BI__builtin_neon_vtrnq_v: {
+  case NEON::BI__builtin_neon_vset_lane_i8:
+  case NEON::BI__builtin_neon_vset_lane_i16:
+  case NEON::BI__builtin_neon_vset_lane_i32:
+  case NEON::BI__builtin_neon_vset_lane_i64:
+  case NEON::BI__builtin_neon_vset_lane_f32:
+  case NEON::BI__builtin_neon_vsetq_lane_i8:
+  case NEON::BI__builtin_neon_vsetq_lane_i16:
+  case NEON::BI__builtin_neon_vsetq_lane_i32:
+  case NEON::BI__builtin_neon_vsetq_lane_i64:
+  case NEON::BI__builtin_neon_vsetq_lane_f32:
+    Ops.push_back(EmitScalarExpr(E->getArg(2)));
+    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
+  case NEON::BI__builtin_neon_vset_lane_f64:
+    // The vector type needs a cast for the v1f64 variant.
+    Ops[1] = Builder.CreateBitCast(Ops[1],
+                                   llvm::VectorType::get(DoubleTy, 1));
+    Ops.push_back(EmitScalarExpr(E->getArg(2)));
+    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
+  case NEON::BI__builtin_neon_vsetq_lane_f64:
+    // The vector type needs a cast for the v2f64 variant.
+    Ops[1] = Builder.CreateBitCast(Ops[1],
+        llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 2));
+    Ops.push_back(EmitScalarExpr(E->getArg(2)));
+    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
+
+  case NEON::BI__builtin_neon_vget_lane_i8:
+  case NEON::BI__builtin_neon_vdupb_lane_i8:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vget_lane");
+  case NEON::BI__builtin_neon_vgetq_lane_i8:
+  case NEON::BI__builtin_neon_vdupb_laneq_i8:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vgetq_lane");
+  case NEON::BI__builtin_neon_vget_lane_i16:
+  case NEON::BI__builtin_neon_vduph_lane_i16:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vget_lane");
+  case NEON::BI__builtin_neon_vgetq_lane_i16:
+  case NEON::BI__builtin_neon_vduph_laneq_i16:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vgetq_lane");
+  case NEON::BI__builtin_neon_vget_lane_i32:
+  case NEON::BI__builtin_neon_vdups_lane_i32:
+    Ops[0] = Builder.CreateBitCast(
+        Ops[0],
+        llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 32), 2));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vget_lane");
+  case NEON::BI__builtin_neon_vdups_lane_f32:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 2));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vdups_lane");
+  case NEON::BI__builtin_neon_vgetq_lane_i32:
+  case NEON::BI__builtin_neon_vdups_laneq_i32:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 32), 4));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vgetq_lane");
+  case NEON::BI__builtin_neon_vget_lane_i64:
+  case NEON::BI__builtin_neon_vdupd_lane_i64:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 64), 1));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vget_lane");
+  case NEON::BI__builtin_neon_vdupd_lane_f64:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 1));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vdupd_lane");
+  case NEON::BI__builtin_neon_vgetq_lane_i64:
+  case NEON::BI__builtin_neon_vdupd_laneq_i64:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 64), 2));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vgetq_lane");
+  case NEON::BI__builtin_neon_vget_lane_f32:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 2));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vget_lane");
+  case NEON::BI__builtin_neon_vget_lane_f64:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 1));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vget_lane");
+  case NEON::BI__builtin_neon_vgetq_lane_f32:
+  case NEON::BI__builtin_neon_vdups_laneq_f32:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 4));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vgetq_lane");
+  case NEON::BI__builtin_neon_vgetq_lane_f64:
+  case NEON::BI__builtin_neon_vdupd_laneq_f64:
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+        llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 2));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vgetq_lane");
+  case NEON::BI__builtin_neon_vaddd_s64:
+  case NEON::BI__builtin_neon_vaddd_u64:
+    return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
+  case NEON::BI__builtin_neon_vsubd_s64:
+  case NEON::BI__builtin_neon_vsubd_u64:
+    return Builder.CreateSub(Ops[0], EmitScalarExpr(E->getArg(1)), "vsubd");
+  case NEON::BI__builtin_neon_vqdmlalh_s16:
+  case NEON::BI__builtin_neon_vqdmlslh_s16: {
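+    // There is no scalar 16-bit saturating-doubling multiply. Wrap the i16
+    // operands in v4i16 vectors, use sqdmull, and pull lane 0 of the v4i32
+    // result back out for the scalar saturating accumulate.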
+    SmallVector<Value *, 2> ProductOps;
+    ProductOps.push_back(vectorWrapScalar16(Ops[1]));
+    ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2))));
+    llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4);
+    Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_sqdmull, VTy),
+                          ProductOps, "vqdmlXl");
+    Constant *CI = ConstantInt::get(Int32Ty, 0);
+    Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
+
+    unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
+                                        ? Intrinsic::arm64_neon_sqadd
+                                        : Intrinsic::arm64_neon_sqsub;
+    return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl");
+  }
+  case NEON::BI__builtin_neon_vqshlud_n_s64: {
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
+    llvm::Type *VTy = llvm::VectorType::get(Int64Ty, 1);
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_sqshlu, VTy),
+                          Ops, "vqshlu_n");
+    return Builder.CreateBitCast(Ops[0], Int64Ty);
+  }
+  case NEON::BI__builtin_neon_vqshld_n_u64:
+  case NEON::BI__builtin_neon_vqshld_n_s64: {
+    unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
+                                   ? Intrinsic::arm64_neon_uqshl
+                                   : Intrinsic::arm64_neon_sqshl;
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
+    llvm::Type *VTy = llvm::VectorType::get(Int64Ty, 1);
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, VTy), Ops, "vqshl_n");
+    return Builder.CreateBitCast(Ops[0], Int64Ty);
+  }
+  case NEON::BI__builtin_neon_vrshrd_n_u64:
+  case NEON::BI__builtin_neon_vrshrd_n_s64: {
+    unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
+                                   ? Intrinsic::arm64_neon_urshl
+                                   : Intrinsic::arm64_neon_srshl;
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    llvm::Type *VTy = llvm::VectorType::get(Int64Ty, 1);
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, VTy), Ops, "vrshr_n", 1, true);
+    return Builder.CreateBitCast(Ops[0], Int64Ty);
+  }
+  case NEON::BI__builtin_neon_vrsrad_n_u64:
+  case NEON::BI__builtin_neon_vrsrad_n_s64: {
+    unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64
+                                   ? Intrinsic::arm64_neon_urshl
+                                   : Intrinsic::arm64_neon_srshl;
+    Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
+    Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2))));
+    Ops[1] = Builder.CreateCall2(CGM.getIntrinsic(Int, Int64Ty), Ops[1],
+                                 Builder.CreateSExt(Ops[2], Int64Ty));
+    return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty));
+  }
+  case NEON::BI__builtin_neon_vshld_n_s64:
+  case NEON::BI__builtin_neon_vshld_n_u64: {
+    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
+    return Builder.CreateShl(
+        Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
+                                                   Amt->getZExtValue())),
+        "vshr_n");
+  }
+  case NEON::BI__builtin_neon_vshrd_n_s64: {
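+    // The builtin accepts a shift amount of 64, which is undefined in IR;
+    // clamping to 63 gives the same result for an arithmetic shift.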
+    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
+    return Builder.CreateAShr(
+        Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
+                                                   Amt->getZExtValue())),
+        "vshr_n");
+  }
+  case NEON::BI__builtin_neon_vshrd_n_u64: {
+    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
+    return Builder.CreateLShr(
+        Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
+                                                   Amt->getZExtValue())),
+        "vshr_n");
+  }
+  case NEON::BI__builtin_neon_vsrad_n_s64: {
+    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
+    Ops[1] = Builder.CreateAShr(
+        Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
+                                                   Amt->getZExtValue())),
+        "vshr_n");
+    return Builder.CreateAdd(Ops[0], Ops[1]);
+  }
+  case NEON::BI__builtin_neon_vsrad_n_u64: {
+    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
+    Ops[1] = Builder.CreateLShr(
+        Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
+                                                   Amt->getZExtValue())),
+        "vshr_n");
+    return Builder.CreateAdd(Ops[0], Ops[1]);
+  }
+  case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
+  case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
+  case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
+  case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
+    Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
+                                          "lane");
+    SmallVector<Value *, 2> ProductOps;
+    ProductOps.push_back(vectorWrapScalar16(Ops[1]));
+    ProductOps.push_back(vectorWrapScalar16(Ops[2]));
+    llvm::Type *VTy = llvm::VectorType::get(Int32Ty, 4);
+    Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_sqdmull, VTy),
+                          ProductOps, "vqdmlXl");
+    Constant *CI = ConstantInt::get(Int32Ty, 0);
+    Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
+    Ops.pop_back();
+
+    unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
+                       BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
+                          ? Intrinsic::arm64_neon_sqadd
+                          : Intrinsic::arm64_neon_sqsub;
+    return EmitNeonCall(CGM.getIntrinsic(AccInt, Int32Ty), Ops, "vqdmlXl");
+  }
+  case NEON::BI__builtin_neon_vqdmlals_s32:
+  case NEON::BI__builtin_neon_vqdmlsls_s32: {
+    SmallVector<Value *, 2> ProductOps;
+    ProductOps.push_back(Ops[1]);
+    ProductOps.push_back(EmitScalarExpr(E->getArg(2)));
+    Ops[1] =
+        EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_sqdmulls_scalar),
+                     ProductOps, "vqdmlXl");
+
+    unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
+                                        ? Intrinsic::arm64_neon_sqadd
+                                        : Intrinsic::arm64_neon_sqsub;
+    return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl");
+  }
+  case NEON::BI__builtin_neon_vqdmlals_lane_s32:
+  case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
+  case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
+  case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
+    Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
+                                          "lane");
+    SmallVector<Value *, 2> ProductOps;
+    ProductOps.push_back(Ops[1]);
+    ProductOps.push_back(Ops[2]);
+    Ops[1] =
+        EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_sqdmulls_scalar),
+                     ProductOps, "vqdmlXl");
+    Ops.pop_back();
+
+    unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
+                       BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
+                          ? Intrinsic::arm64_neon_sqadd
+                          : Intrinsic::arm64_neon_sqsub;
+    return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl");
+  }
+  }
+
+  llvm::VectorType *VTy = GetNeonType(this, Type);
+  llvm::Type *Ty = VTy;
+  if (!Ty)
+    return 0;
+
+  // Not all intrinsics handled by the common case work for ARM64 yet, so only
+  // defer to common code if it's been added to our special map.
+  Builtin = findNeonIntrinsicInMap(ARM64SIMDIntrinsicMap, BuiltinID,
+                                   ARM64SIMDIntrinsicsProvenSorted);
+
+  if (Builtin)
+    return EmitCommonNeonBuiltinExpr(
+        Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
+        Builtin->NameHint, Builtin->TypeModifier, E, Ops, 0);
+
+  if (Value *V = EmitARM64TblBuiltinExpr(*this, BuiltinID, E, Ops))
+    return V;
+
+  unsigned Int;
+  switch (BuiltinID) {
+  default: return 0;
+  case NEON::BI__builtin_neon_vbsl_v:
+  case NEON::BI__builtin_neon_vbslq_v: {
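+    // There is no single bitselect instruction in IR, so expand vbsl as
+    // (mask & a) | (~mask & b) on the equivalent integer vector type.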
+    llvm::Type *BitTy = llvm::VectorType::getInteger(VTy);
+    Ops[0] = Builder.CreateBitCast(Ops[0], BitTy, "vbsl");
+    Ops[1] = Builder.CreateBitCast(Ops[1], BitTy, "vbsl");
+    Ops[2] = Builder.CreateBitCast(Ops[2], BitTy, "vbsl");
+
+    Ops[1] = Builder.CreateAnd(Ops[0], Ops[1], "vbsl");
+    Ops[2] = Builder.CreateAnd(Builder.CreateNot(Ops[0]), Ops[2], "vbsl");
+    Ops[0] = Builder.CreateOr(Ops[1], Ops[2], "vbsl");
+    return Builder.CreateBitCast(Ops[0], Ty);
+  }
+  case NEON::BI__builtin_neon_vfma_lane_v:
+  case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
+    // The ARM builtins (and instructions) have the addend as the first
+    // operand, but the 'fma' intrinsics have it last. Swap it around here.
+    Value *Addend = Ops[0];
+    Value *Multiplicand = Ops[1];
+    Value *LaneSource = Ops[2];
+    Ops[0] = Multiplicand;
+    Ops[1] = LaneSource;
+    Ops[2] = Addend;
+
+    // Now adjust things to handle the lane access.
+    llvm::Type *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v ?
+      llvm::VectorType::get(VTy->getElementType(), VTy->getNumElements() / 2) :
+      VTy;
+    llvm::Constant *cst = cast<Constant>(Ops[3]);
+    Value *SV = llvm::ConstantVector::getSplat(VTy->getNumElements(), cst);
+    Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy);
+    Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");
+
+    Ops.pop_back();
+    Int = Intrinsic::fma;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
+  }
+  case NEON::BI__builtin_neon_vfma_laneq_v: {
+    llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
+    // v1f64 fma should be mapped to Neon scalar f64 fma
+    if (VTy->getElementType() == DoubleTy) {
+      Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
+      Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
+      llvm::Type *VTy = GetNeonType(this,
+        NeonTypeFlags(NeonTypeFlags::Float64, false, true));
+      Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
+      Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
+      Value *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy);
+      Value *Result = Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
+      return Builder.CreateBitCast(Result, Ty);
+    }
+    Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+
+    llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(),
+                                            VTy->getNumElements() * 2);
+    Ops[2] = Builder.CreateBitCast(Ops[2], STy);
+    Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(),
+                                               cast<ConstantInt>(Ops[3]));
+    Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
+
+    return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vfmaq_laneq_v: {
+    Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
+    return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vfmas_lane_f32:
+  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+  case NEON::BI__builtin_neon_vfmad_lane_f64:
+  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
+    Ops.push_back(EmitScalarExpr(E->getArg(3)));
+    llvm::Type *Ty = ConvertType(E->getCallReturnType());
+    Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
+    Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
+    return Builder.CreateCall3(F, Ops[1], Ops[2], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vfms_v:
+  case NEON::BI__builtin_neon_vfmsq_v: {  // Only used for FP types
+    // FIXME: probably remove when we no longer support aarch64_simd.h
+    // (arm_neon.h delegates to vfma).
+
+    // The ARM builtins (and instructions) have the addend as the first
+    // operand, but the 'fma' intrinsics have it last. Swap it around here.
+    Value *Subtrahend = Ops[0];
+    Value *Multiplicand = Ops[2];
+    Ops[0] = Multiplicand;
+    Ops[2] = Subtrahend;
+    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
+    Ops[1] = Builder.CreateFNeg(Ops[1]);
+    Int = Intrinsic::fma;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmls");
+  }
+  case NEON::BI__builtin_neon_vmull_v:
+    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
+    Int = usgn ? Intrinsic::arm64_neon_umull : Intrinsic::arm64_neon_smull;
+    if (Type.isPoly()) Int = Intrinsic::arm64_neon_pmull;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
+  case NEON::BI__builtin_neon_vmax_v:
+  case NEON::BI__builtin_neon_vmaxq_v:
+    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
+    Int = usgn ? Intrinsic::arm64_neon_umax : Intrinsic::arm64_neon_smax;
+    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::arm64_neon_fmax;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
+  case NEON::BI__builtin_neon_vmin_v:
+  case NEON::BI__builtin_neon_vminq_v:
+    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
+    Int = usgn ? Intrinsic::arm64_neon_umin : Intrinsic::arm64_neon_smin;
+    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::arm64_neon_fmin;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
+  case NEON::BI__builtin_neon_vabd_v:
+  case NEON::BI__builtin_neon_vabdq_v:
+    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
+    Int = usgn ? Intrinsic::arm64_neon_uabd : Intrinsic::arm64_neon_sabd;
+    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::arm64_neon_fabd;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
+  case NEON::BI__builtin_neon_vpadal_v:
+  case NEON::BI__builtin_neon_vpadalq_v: {
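+    // Pairwise long add-and-accumulate: emit [su]addlp on the source vector,
+    // then add the accumulator with an ordinary vector add.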
+    unsigned ArgElts = VTy->getNumElements();
+    llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType());
+    unsigned BitWidth = EltTy->getBitWidth();
+    llvm::Type *ArgTy = llvm::VectorType::get(
+        llvm::IntegerType::get(getLLVMContext(), BitWidth/2), 2*ArgElts);
+    llvm::Type* Tys[2] = { VTy, ArgTy };
+    Int = usgn ? Intrinsic::arm64_neon_uaddlp : Intrinsic::arm64_neon_saddlp;
+    SmallVector<llvm::Value*, 1> TmpOps;
+    TmpOps.push_back(Ops[1]);
+    Function *F = CGM.getIntrinsic(Int, Tys);
+    llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vpadal");
+    llvm::Value *addend = Builder.CreateBitCast(Ops[0], tmp->getType());
+    return Builder.CreateAdd(tmp, addend);
+  }
+  case NEON::BI__builtin_neon_vpmin_v:
+  case NEON::BI__builtin_neon_vpminq_v:
+    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
+    Int = usgn ? Intrinsic::arm64_neon_uminp : Intrinsic::arm64_neon_sminp;
+    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::arm64_neon_fminp;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
+  case NEON::BI__builtin_neon_vpmax_v:
+  case NEON::BI__builtin_neon_vpmaxq_v:
+    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
+    Int = usgn ? Intrinsic::arm64_neon_umaxp : Intrinsic::arm64_neon_smaxp;
+    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::arm64_neon_fmaxp;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
+  case NEON::BI__builtin_neon_vminnm_v:
+  case NEON::BI__builtin_neon_vminnmq_v:
+    Int = Intrinsic::arm64_neon_fminnm;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
+  case NEON::BI__builtin_neon_vmaxnm_v:
+  case NEON::BI__builtin_neon_vmaxnmq_v:
+    Int = Intrinsic::arm64_neon_fmaxnm;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
+  case NEON::BI__builtin_neon_vrecpss_f32: {
+    llvm::Type *f32Type = llvm::Type::getFloatTy(getLLVMContext());
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_frecps, f32Type),
+                        Ops, "vrecps");
+  }
+  case NEON::BI__builtin_neon_vrecpsd_f64: {
+    llvm::Type *f64Type = llvm::Type::getDoubleTy(getLLVMContext());
+    Ops.push_back(EmitScalarExpr(E->getArg(1)));
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_frecps, f64Type),
+                        Ops, "vrecps");
+  }
+  case NEON::BI__builtin_neon_vrshr_n_v:
+  case NEON::BI__builtin_neon_vrshrq_n_v:
+    // FIXME: this can be shared with 32-bit ARM, but not AArch64 at the
+    // moment. After the final merge it should be added to
+    // EmitCommonNeonBuiltinExpr.
+    Int = usgn ? Intrinsic::arm64_neon_urshl : Intrinsic::arm64_neon_srshl;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n", 1, true);
+  case NEON::BI__builtin_neon_vqshlu_n_v:
+  case NEON::BI__builtin_neon_vqshluq_n_v:
+    // FIXME: AArch64 and ARM use different intrinsics for this, but are
+    // essentially compatible. It should be in EmitCommonNeonBuiltinExpr after
+    // the final merge.
+    Int = Intrinsic::arm64_neon_sqshlu;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n", 1, false);
+  case NEON::BI__builtin_neon_vqshrun_n_v:
+    // FIXME: as above
+    Int = Intrinsic::arm64_neon_sqshrun;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
+  case NEON::BI__builtin_neon_vqrshrun_n_v:
+    // FIXME: and again.
+    Int = Intrinsic::arm64_neon_sqrshrun;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
+  case NEON::BI__builtin_neon_vqshrn_n_v:
+    // FIXME: as above.
+    Int = usgn ? Intrinsic::arm64_neon_uqshrn : Intrinsic::arm64_neon_sqshrn;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
+  case NEON::BI__builtin_neon_vrshrn_n_v:
+    // FIXME: there might be a pattern here.
+    Int = Intrinsic::arm64_neon_rshrn;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
+  case NEON::BI__builtin_neon_vqrshrn_n_v:
+    // FIXME: another one
+    Int = usgn ? Intrinsic::arm64_neon_uqrshrn : Intrinsic::arm64_neon_sqrshrn;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
+  case NEON::BI__builtin_neon_vrnda_v:
+  case NEON::BI__builtin_neon_vrndaq_v: {
+    Int = Intrinsic::round;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
+  }
+  case NEON::BI__builtin_neon_vrndi_v:
+  case NEON::BI__builtin_neon_vrndiq_v: {
+    Int = Intrinsic::nearbyint;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi");
+  }
+  case NEON::BI__builtin_neon_vrndm_v:
+  case NEON::BI__builtin_neon_vrndmq_v: {
+    Int = Intrinsic::floor;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
+  }
+  case NEON::BI__builtin_neon_vrndn_v:
+  case NEON::BI__builtin_neon_vrndnq_v: {
+    Int = Intrinsic::arm64_neon_frintn;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
+  }
+  case NEON::BI__builtin_neon_vrndp_v:
+  case NEON::BI__builtin_neon_vrndpq_v: {
+    Int = Intrinsic::ceil;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
+  }
+  case NEON::BI__builtin_neon_vrndx_v:
+  case NEON::BI__builtin_neon_vrndxq_v: {
+    Int = Intrinsic::rint;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
+  }
+  case NEON::BI__builtin_neon_vrnd_v:
+  case NEON::BI__builtin_neon_vrndq_v: {
+    Int = Intrinsic::trunc;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
+  }
+  case NEON::BI__builtin_neon_vceqz_v:
+  case NEON::BI__builtin_neon_vceqzq_v:
+    return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
+                                         ICmpInst::ICMP_EQ, "vceqz");
+  case NEON::BI__builtin_neon_vcgez_v:
+  case NEON::BI__builtin_neon_vcgezq_v:
+    return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
+                                         ICmpInst::ICMP_SGE, "vcgez");
+  case NEON::BI__builtin_neon_vclez_v:
+  case NEON::BI__builtin_neon_vclezq_v:
+    return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
+                                         ICmpInst::ICMP_SLE, "vclez");
+  case NEON::BI__builtin_neon_vcgtz_v:
+  case NEON::BI__builtin_neon_vcgtzq_v:
+    return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
+                                         ICmpInst::ICMP_SGT, "vcgtz");
+  case NEON::BI__builtin_neon_vcltz_v:
+  case NEON::BI__builtin_neon_vcltzq_v:
+    return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
+                                         ICmpInst::ICMP_SLT, "vcltz");
+  case NEON::BI__builtin_neon_vcvt_f64_v:
+  case NEON::BI__builtin_neon_vcvtq_f64_v:
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
+    return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
+                : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
+  case NEON::BI__builtin_neon_vcvt_f64_f32: {
+    assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
+           "unexpected vcvt_f64_f32 builtin");
+    NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
+    Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
+
+    return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
+  }
+  case NEON::BI__builtin_neon_vcvt_f32_f64: {
+    assert(Type.getEltType() == NeonTypeFlags::Float32 &&
+           "unexpected vcvt_f32_f64 builtin");
+    NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
+    Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
+
+    return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
+  }
+  case NEON::BI__builtin_neon_vcvt_s32_v:
+  case NEON::BI__builtin_neon_vcvt_u32_v:
+  case NEON::BI__builtin_neon_vcvt_s64_v:
+  case NEON::BI__builtin_neon_vcvt_u64_v:
+  case NEON::BI__builtin_neon_vcvtq_s32_v:
+  case NEON::BI__builtin_neon_vcvtq_u32_v:
+  case NEON::BI__builtin_neon_vcvtq_s64_v:
+  case NEON::BI__builtin_neon_vcvtq_u64_v: {
+    bool Double =
+      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
+    llvm::Type *InTy =
+      GetNeonType(this,
+                  NeonTypeFlags(Double ? NeonTypeFlags::Float64
+                                : NeonTypeFlags::Float32, false, quad));
+    Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
+    if (usgn)
+      return Builder.CreateFPToUI(Ops[0], Ty);
+    return Builder.CreateFPToSI(Ops[0], Ty);
+  }
+  case NEON::BI__builtin_neon_vcvta_s32_v:
+  case NEON::BI__builtin_neon_vcvtaq_s32_v:
+  case NEON::BI__builtin_neon_vcvta_u32_v:
+  case NEON::BI__builtin_neon_vcvtaq_u32_v:
+  case NEON::BI__builtin_neon_vcvta_s64_v:
+  case NEON::BI__builtin_neon_vcvtaq_s64_v:
+  case NEON::BI__builtin_neon_vcvta_u64_v:
+  case NEON::BI__builtin_neon_vcvtaq_u64_v: {
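+    // fcvta* converts with rounding to nearest, ties away from zero. The
+    // intrinsic is overloaded on both the integer result and the FP source
+    // vector type.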
+    Int = usgn ? Intrinsic::arm64_neon_fcvtau : Intrinsic::arm64_neon_fcvtas;
+    bool Double =
+      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
+    llvm::Type *InTy =
+      GetNeonType(this,
+                  NeonTypeFlags(Double ? NeonTypeFlags::Float64
+                                : NeonTypeFlags::Float32, false, quad));
+    llvm::Type *Tys[2] = { Ty, InTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
+  }
+  case NEON::BI__builtin_neon_vcvtm_s32_v:
+  case NEON::BI__builtin_neon_vcvtmq_s32_v:
+  case NEON::BI__builtin_neon_vcvtm_u32_v:
+  case NEON::BI__builtin_neon_vcvtmq_u32_v:
+  case NEON::BI__builtin_neon_vcvtm_s64_v:
+  case NEON::BI__builtin_neon_vcvtmq_s64_v:
+  case NEON::BI__builtin_neon_vcvtm_u64_v:
+  case NEON::BI__builtin_neon_vcvtmq_u64_v: {
+    Int = usgn ? Intrinsic::arm64_neon_fcvtmu : Intrinsic::arm64_neon_fcvtms;
+    bool Double =
+      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
+    llvm::Type *InTy =
+      GetNeonType(this,
+                  NeonTypeFlags(Double ? NeonTypeFlags::Float64
+                                : NeonTypeFlags::Float32, false, quad));
+    llvm::Type *Tys[2] = { Ty, InTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
+  }
+  case NEON::BI__builtin_neon_vcvtn_s32_v:
+  case NEON::BI__builtin_neon_vcvtnq_s32_v:
+  case NEON::BI__builtin_neon_vcvtn_u32_v:
+  case NEON::BI__builtin_neon_vcvtnq_u32_v:
+  case NEON::BI__builtin_neon_vcvtn_s64_v:
+  case NEON::BI__builtin_neon_vcvtnq_s64_v:
+  case NEON::BI__builtin_neon_vcvtn_u64_v:
+  case NEON::BI__builtin_neon_vcvtnq_u64_v: {
+    Int = usgn ? Intrinsic::arm64_neon_fcvtnu : Intrinsic::arm64_neon_fcvtns;
+    bool Double =
+      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
+    llvm::Type *InTy =
+      GetNeonType(this,
+                  NeonTypeFlags(Double ? NeonTypeFlags::Float64
+                                : NeonTypeFlags::Float32, false, quad));
+    llvm::Type *Tys[2] = { Ty, InTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
+  }
+  case NEON::BI__builtin_neon_vcvtp_s32_v:
+  case NEON::BI__builtin_neon_vcvtpq_s32_v:
+  case NEON::BI__builtin_neon_vcvtp_u32_v:
+  case NEON::BI__builtin_neon_vcvtpq_u32_v:
+  case NEON::BI__builtin_neon_vcvtp_s64_v:
+  case NEON::BI__builtin_neon_vcvtpq_s64_v:
+  case NEON::BI__builtin_neon_vcvtp_u64_v:
+  case NEON::BI__builtin_neon_vcvtpq_u64_v: {
+    Int = usgn ? Intrinsic::arm64_neon_fcvtpu : Intrinsic::arm64_neon_fcvtps;
+    bool Double =
+      (cast<llvm::IntegerType>(VTy->getElementType())->getBitWidth() == 64);
+    llvm::Type *InTy =
+      GetNeonType(this,
+                  NeonTypeFlags(Double ? NeonTypeFlags::Float64
+                                : NeonTypeFlags::Float32, false, quad));
+    llvm::Type *Tys[2] = { Ty, InTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
+  }
+  case NEON::BI__builtin_neon_vmulx_v:
+  case NEON::BI__builtin_neon_vmulxq_v: {
+    Int = Intrinsic::arm64_neon_fmulx;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
+  }
+  case NEON::BI__builtin_neon_vmul_lane_v:
+  case NEON::BI__builtin_neon_vmul_laneq_v: {
+    // v1f64 vmul_lane should be mapped to Neon scalar mul lane
+    bool Quad = false;
+    if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
+      Quad = true;
+    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
+    llvm::Type *VTy = GetNeonType(this,
+      NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
+    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
+    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
+    Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
+    return Builder.CreateBitCast(Result, Ty);
+  }
+  case NEON::BI__builtin_neon_vnegd_s64:
+    return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
+  case NEON::BI__builtin_neon_vpmaxnm_v:
+  case NEON::BI__builtin_neon_vpmaxnmq_v: {
+    Int = Intrinsic::arm64_neon_fmaxnmp;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
+  }
+  case NEON::BI__builtin_neon_vpminnm_v:
+  case NEON::BI__builtin_neon_vpminnmq_v: {
+    Int = Intrinsic::arm64_neon_fminnmp;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
+  }
+  case NEON::BI__builtin_neon_vsqrt_v:
+  case NEON::BI__builtin_neon_vsqrtq_v: {
+    Int = Intrinsic::sqrt;
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
+  }
+  case NEON::BI__builtin_neon_vrbit_v:
+  case NEON::BI__builtin_neon_vrbitq_v: {
+    Int = Intrinsic::arm64_neon_rbit;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
+  }
+  case NEON::BI__builtin_neon_vaddv_u8:
+    // FIXME: These are handled by the AArch64 scalar code.
+    usgn = true;
+    // FALLTHROUGH
+  case NEON::BI__builtin_neon_vaddv_s8: {
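+    // The across-lanes intrinsics always produce an i32 result, so the sum
+    // is truncated back down to the 8-bit element type.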
+    Int = usgn ? Intrinsic::arm64_neon_uaddv : Intrinsic::arm64_neon_saddv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 8));
+  }
+  case NEON::BI__builtin_neon_vaddv_u16:
+    usgn = true;
+    // FALLTHROUGH
+  case NEON::BI__builtin_neon_vaddv_s16: {
+    Int = usgn ? Intrinsic::arm64_neon_uaddv : Intrinsic::arm64_neon_saddv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vaddvq_u8:
+    usgn = true;
+    // FALLTHROUGH
+  case NEON::BI__builtin_neon_vaddvq_s8: {
+    Int = usgn ? Intrinsic::arm64_neon_uaddv : Intrinsic::arm64_neon_saddv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 8));
+  }
+  case NEON::BI__builtin_neon_vaddvq_u16:
+    usgn = true;
+    // FALLTHROUGH
+  case NEON::BI__builtin_neon_vaddvq_s16: {
+    Int = usgn ? Intrinsic::arm64_neon_uaddv : Intrinsic::arm64_neon_saddv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vmaxv_u8: {
+    Int = Intrinsic::arm64_neon_umaxv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 8));
+  }
+  case NEON::BI__builtin_neon_vmaxv_u16: {
+    Int = Intrinsic::arm64_neon_umaxv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vmaxvq_u8: {
+    Int = Intrinsic::arm64_neon_umaxv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 8));
+  }
+  case NEON::BI__builtin_neon_vmaxvq_u16: {
+    Int = Intrinsic::arm64_neon_umaxv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vmaxv_s8: {
+    Int = Intrinsic::arm64_neon_smaxv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 8));
+  }
+  case NEON::BI__builtin_neon_vmaxv_s16: {
+    Int = Intrinsic::arm64_neon_smaxv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vmaxvq_s8: {
+    Int = Intrinsic::arm64_neon_smaxv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 8));
+  }
+  case NEON::BI__builtin_neon_vmaxvq_s16: {
+    Int = Intrinsic::arm64_neon_smaxv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vminv_u8: {
+    Int = Intrinsic::arm64_neon_uminv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 8));
+  }
+  case NEON::BI__builtin_neon_vminv_u16: {
+    Int = Intrinsic::arm64_neon_uminv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vminvq_u8: {
+    Int = Intrinsic::arm64_neon_uminv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 8));
+  }
+  case NEON::BI__builtin_neon_vminvq_u16: {
+    Int = Intrinsic::arm64_neon_uminv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vminv_s8: {
+    Int = Intrinsic::arm64_neon_sminv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 8));
+  }
+  case NEON::BI__builtin_neon_vminv_s16: {
+    Int = Intrinsic::arm64_neon_sminv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vminvq_s8: {
+    Int = Intrinsic::arm64_neon_sminv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 8));
+  }
+  case NEON::BI__builtin_neon_vminvq_s16: {
+    Int = Intrinsic::arm64_neon_sminv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vmul_n_f64: {
+    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
+    Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy);
+    return Builder.CreateFMul(Ops[0], RHS);
+  }
+  case NEON::BI__builtin_neon_vaddlv_u8: {
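+    // vaddlv widens before accumulating: the v8i8 input reduces to a 16-bit
+    // sum, returned in the low bits of the intrinsic's i32 result.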
+    Int = Intrinsic::arm64_neon_uaddlv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vaddlv_u16: {
+    Int = Intrinsic::arm64_neon_uaddlv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
+  }
+  case NEON::BI__builtin_neon_vaddlvq_u8: {
+    Int = Intrinsic::arm64_neon_uaddlv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vaddlvq_u16: {
+    Int = Intrinsic::arm64_neon_uaddlv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
+  }
+  case NEON::BI__builtin_neon_vaddlv_s8: {
+    Int = Intrinsic::arm64_neon_saddlv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vaddlv_s16: {
+    Int = Intrinsic::arm64_neon_saddlv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 4);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
+  }
+  case NEON::BI__builtin_neon_vaddlvq_s8: {
+    Int = Intrinsic::arm64_neon_saddlv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 8), 16);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
+    return Builder.CreateTrunc(Ops[0],
+             llvm::IntegerType::get(getLLVMContext(), 16));
+  }
+  case NEON::BI__builtin_neon_vaddlvq_s16: {
+    Int = Intrinsic::arm64_neon_saddlv;
+    Ty = llvm::IntegerType::get(getLLVMContext(), 32);
+    VTy =
+      llvm::VectorType::get(llvm::IntegerType::get(getLLVMContext(), 16), 8);
+    llvm::Type *Tys[2] = { Ty, VTy };
+    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
+  }
+  case NEON::BI__builtin_neon_vsri_n_v:
+  case NEON::BI__builtin_neon_vsriq_n_v: {
+    Int = Intrinsic::arm64_neon_vsri;
+    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
+    return EmitNeonCall(Intrin, Ops, "vsri_n");
+  }
+  case NEON::BI__builtin_neon_vsli_n_v:
+  case NEON::BI__builtin_neon_vsliq_n_v: {
+    Int = Intrinsic::arm64_neon_vsli;
+    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
+    return EmitNeonCall(Intrin, Ops, "vsli_n");
+  }
+  case NEON::BI__builtin_neon_vsra_n_v:
+  case NEON::BI__builtin_neon_vsraq_n_v:
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
+    return Builder.CreateAdd(Ops[0], Ops[1]);
+  case NEON::BI__builtin_neon_vrsra_n_v:
+  case NEON::BI__builtin_neon_vrsraq_n_v: {
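+    // vrsra is a rounding shift-right plus accumulate: emit the shift via
+    // the rshl intrinsic (the trailing 'true' below negates the shift amount
+    // so rshl performs a right shift), then add the result to Ops[0].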
+    Int = usgn ? Intrinsic::arm64_neon_urshl : Intrinsic::arm64_neon_srshl;
+    SmallVector<llvm::Value*,2> TmpOps;
+    TmpOps.push_back(Ops[1]);
+    TmpOps.push_back(Ops[2]);
+    Function* F = CGM.getIntrinsic(Int, Ty);
+    llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true);
+    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
+    return Builder.CreateAdd(Ops[0], tmp);
+  }
+    // FIXME: Sharing loads & stores with 32-bit is complicated by the absence
+    // of an Align parameter here.
+  case NEON::BI__builtin_neon_vld1_x2_v:
+  case NEON::BI__builtin_neon_vld1q_x2_v:
+  case NEON::BI__builtin_neon_vld1_x3_v:
+  case NEON::BI__builtin_neon_vld1q_x3_v:
+  case NEON::BI__builtin_neon_vld1_x4_v:
+  case NEON::BI__builtin_neon_vld1q_x4_v: {
+    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
+    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
+    llvm::Type *Tys[2] = { VTy, PTy };
+    unsigned Int;
+    switch (BuiltinID) {
+    case NEON::BI__builtin_neon_vld1_x2_v:
+    case NEON::BI__builtin_neon_vld1q_x2_v:
+      Int = Intrinsic::arm64_neon_ld1x2;
+      break;
+    case NEON::BI__builtin_neon_vld1_x3_v:
+    case NEON::BI__builtin_neon_vld1q_x3_v:
+      Int = Intrinsic::arm64_neon_ld1x3;
+      break;
+    case NEON::BI__builtin_neon_vld1_x4_v:
+    case NEON::BI__builtin_neon_vld1q_x4_v:
+      Int = Intrinsic::arm64_neon_ld1x4;
+      break;
+    }
+    Function *F = CGM.getIntrinsic(Int, Tys);
+    Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
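+  // User-level sketch of the x2/x3/x4 forms (assuming the AArch64
+  // <arm_neon.h> declarations): uint8x16x2_t p = vld1q_u8_x2(ptr) performs
+  // two consecutive 16-byte loads into a pair of vectors.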
+  case NEON::BI__builtin_neon_vst1_x2_v:
+  case NEON::BI__builtin_neon_vst1q_x2_v:
+  case NEON::BI__builtin_neon_vst1_x3_v:
+  case NEON::BI__builtin_neon_vst1q_x3_v:
+  case NEON::BI__builtin_neon_vst1_x4_v:
+  case NEON::BI__builtin_neon_vst1q_x4_v: {
+    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
+    llvm::Type *Tys[2] = { VTy, PTy };
+    unsigned Int;
+    switch (BuiltinID) {
+    case NEON::BI__builtin_neon_vst1_x2_v:
+    case NEON::BI__builtin_neon_vst1q_x2_v:
+      Int = Intrinsic::arm64_neon_st1x2;
+      break;
+    case NEON::BI__builtin_neon_vst1_x3_v:
+    case NEON::BI__builtin_neon_vst1q_x3_v:
+      Int = Intrinsic::arm64_neon_st1x3;
+      break;
+    case NEON::BI__builtin_neon_vst1_x4_v:
+    case NEON::BI__builtin_neon_vst1q_x4_v:
+      Int = Intrinsic::arm64_neon_st1x4;
+      break;
+    }
+    SmallVector<Value *, 4> IntOps(Ops.begin()+1, Ops.end());
+    IntOps.push_back(Ops[0]);
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), IntOps, "");
+  }
+  case NEON::BI__builtin_neon_vld1_v:
+  case NEON::BI__builtin_neon_vld1q_v:
+    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
+    return Builder.CreateLoad(Ops[0]);
+  case NEON::BI__builtin_neon_vst1_v:
+  case NEON::BI__builtin_neon_vst1q_v:
+    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
+    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  case NEON::BI__builtin_neon_vld1_lane_v:
+  case NEON::BI__builtin_neon_vld1q_lane_v:
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ty = llvm::PointerType::getUnqual(VTy->getElementType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[0] = Builder.CreateLoad(Ops[0]);
+    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
+  case NEON::BI__builtin_neon_vld1_dup_v:
+  case NEON::BI__builtin_neon_vld1q_dup_v: {
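+    // Load the scalar once, insert it into lane 0 of an undef vector, and
+    // splat it across the vector with EmitNeonSplat (a zero shuffle mask).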
+    Value *V = UndefValue::get(Ty);
+    Ty = llvm::PointerType::getUnqual(VTy->getElementType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[0] = Builder.CreateLoad(Ops[0]);
+    llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
+    Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
+    return EmitNeonSplat(Ops[0], CI);
+  }
+  case NEON::BI__builtin_neon_vst1_lane_v:
+  case NEON::BI__builtin_neon_vst1q_lane_v:
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    return Builder.CreateStore(Ops[1], Builder.CreateBitCast(Ops[0], Ty));
+  case NEON::BI__builtin_neon_vld2_v:
+  case NEON::BI__builtin_neon_vld2q_v: {
+    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
+    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
+    llvm::Type *Tys[2] = { VTy, PTy };
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld2, Tys);
+    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+                llvm::PointerType::getUnqual(Ops[1]->getType()));
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vld3_v:
+  case NEON::BI__builtin_neon_vld3q_v: {
+    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
+    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
+    llvm::Type *Tys[2] = { VTy, PTy };
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld3, Tys);
+    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+                llvm::PointerType::getUnqual(Ops[1]->getType()));
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vld4_v:
+  case NEON::BI__builtin_neon_vld4q_v: {
+    llvm::Type *PTy = llvm::PointerType::getUnqual(VTy);
+    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
+    llvm::Type *Tys[2] = { VTy, PTy };
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld4, Tys);
+    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+                llvm::PointerType::getUnqual(Ops[1]->getType()));
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vld2_dup_v:
+  case NEON::BI__builtin_neon_vld2q_dup_v: {
+    llvm::Type *PTy =
+      llvm::PointerType::getUnqual(VTy->getElementType());
+    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
+    llvm::Type *Tys[2] = { VTy, PTy };
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld2r, Tys);
+    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+                llvm::PointerType::getUnqual(Ops[1]->getType()));
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vld3_dup_v:
+  case NEON::BI__builtin_neon_vld3q_dup_v: {
+    llvm::Type *PTy =
+      llvm::PointerType::getUnqual(VTy->getElementType());
+    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
+    llvm::Type *Tys[2] = { VTy, PTy };
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld3r, Tys);
+    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+                llvm::PointerType::getUnqual(Ops[1]->getType()));
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vld4_dup_v:
+  case NEON::BI__builtin_neon_vld4q_dup_v: {
+    llvm::Type *PTy =
+      llvm::PointerType::getUnqual(VTy->getElementType());
+    Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
+    llvm::Type *Tys[2] = { VTy, PTy };
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld4r, Tys);
+    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
+    Ops[0] = Builder.CreateBitCast(Ops[0],
+                llvm::PointerType::getUnqual(Ops[1]->getType()));
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vld2_lane_v:
+  case NEON::BI__builtin_neon_vld2q_lane_v: {
+    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld2lane, Tys);
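+    // The arm64 lane intrinsics take the pointer operand last, so rotate
+    // Ops[1] (the pointer) to the back before emitting the call.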
+    Ops.push_back(Ops[1]);
+    Ops.erase(Ops.begin()+1);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Ops[3] = Builder.CreateZExt(Ops[3],
+                llvm::IntegerType::get(getLLVMContext(), 64));
+    Ops[1] = Builder.CreateCall(F,
+                ArrayRef<Value*>(Ops).slice(1), "vld2_lane");
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vld3_lane_v:
+  case NEON::BI__builtin_neon_vld3q_lane_v: {
+    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld3lane, Tys);
+    Ops.push_back(Ops[1]);
+    Ops.erase(Ops.begin()+1);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
+    Ops[4] = Builder.CreateZExt(Ops[4],
+                llvm::IntegerType::get(getLLVMContext(), 64));
+    Ops[1] = Builder.CreateCall(F,
+                ArrayRef<Value*>(Ops).slice(1), "vld3_lane");
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vld4_lane_v:
+  case NEON::BI__builtin_neon_vld4q_lane_v: {
+    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
+    Function *F = CGM.getIntrinsic(Intrinsic::arm64_neon_ld4lane, Tys);
+    Ops.push_back(Ops[1]);
+    Ops.erase(Ops.begin()+1);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
+    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
+    Ops[5] = Builder.CreateZExt(Ops[5],
+                llvm::IntegerType::get(getLLVMContext(), 64));
+    Ops[1] = Builder.CreateCall(F,
+                ArrayRef<Value*>(Ops).slice(1), "vld4_lane");
+    Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    return Builder.CreateStore(Ops[1], Ops[0]);
+  }
+  case NEON::BI__builtin_neon_vst2_v:
+  case NEON::BI__builtin_neon_vst2q_v: {
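+    // The arm64 store intrinsics likewise take the pointer last; move
+    // Ops[0] (the pointer) to the end of the operand list.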
+    Ops.push_back(Ops[0]);
+    Ops.erase(Ops.begin());
+    llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st2, Tys),
+                        Ops, "");
+  }
+  case NEON::BI__builtin_neon_vst2_lane_v:
+  case NEON::BI__builtin_neon_vst2q_lane_v: {
+    Ops.push_back(Ops[0]);
+    Ops.erase(Ops.begin());
+    Ops[2] = Builder.CreateZExt(Ops[2],
+                llvm::IntegerType::get(getLLVMContext(), 64));
+    llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st2lane, Tys),
+                        Ops, "");
+  }
+  case NEON::BI__builtin_neon_vst3_v:
+  case NEON::BI__builtin_neon_vst3q_v: {
+    Ops.push_back(Ops[0]);
+    Ops.erase(Ops.begin());
+    llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st3, Tys),
+                        Ops, "");
+  }
+  case NEON::BI__builtin_neon_vst3_lane_v:
+  case NEON::BI__builtin_neon_vst3q_lane_v: {
+    Ops.push_back(Ops[0]);
+    Ops.erase(Ops.begin());
+    Ops[3] = Builder.CreateZExt(Ops[3],
+                llvm::IntegerType::get(getLLVMContext(), 64));
+    llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st3lane, Tys),
+                        Ops, "");
+  }
+  case NEON::BI__builtin_neon_vst4_v:
+  case NEON::BI__builtin_neon_vst4q_v: {
+    Ops.push_back(Ops[0]);
+    Ops.erase(Ops.begin());
+    llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st4, Tys),
+                        Ops, "");
+  }
+  case NEON::BI__builtin_neon_vst4_lane_v:
+  case NEON::BI__builtin_neon_vst4q_lane_v: {
+    Ops.push_back(Ops[0]);
+    Ops.erase(Ops.begin());
+    Ops[4] = Builder.CreateZExt(Ops[4],
+                llvm::IntegerType::get(getLLVMContext(), 64));
+    llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_st4lane, Tys),
+                        Ops, "");
+  }
+  case NEON::BI__builtin_neon_vtrn_v:
+  case NEON::BI__builtin_neon_vtrnq_v: {
     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
@@ -4865,8 +6714,8 @@
     for (unsigned vi = 0; vi != 2; ++vi) {
       SmallVector<Constant*, 16> Indices;
       for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
-        Indices.push_back(Builder.getInt32(i+vi));
-        Indices.push_back(Builder.getInt32(i+e+vi));
+        Indices.push_back(ConstantInt::get(Int32Ty, i+vi));
+        Indices.push_back(ConstantInt::get(Int32Ty, i+e+vi));
       }
       Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ops[0], vi);
       SV = llvm::ConstantVector::get(Indices);
@@ -4875,8 +6724,8 @@
     }
     return SV;
   }
-  case ARM::BI__builtin_neon_vuzp_v:
-  case ARM::BI__builtin_neon_vuzpq_v: {
+  case NEON::BI__builtin_neon_vuzp_v:
+  case NEON::BI__builtin_neon_vuzpq_v: {
     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
@@ -4894,8 +6743,8 @@
     }
     return SV;
   }
-  case ARM::BI__builtin_neon_vzip_v:
-  case ARM::BI__builtin_neon_vzipq_v: {
+  case NEON::BI__builtin_neon_vzip_v:
+  case NEON::BI__builtin_neon_vzipq_v: {
     Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
     Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
     Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
@@ -4914,6 +6763,48 @@
     }
     return SV;
   }
+  case NEON::BI__builtin_neon_vqtbl1q_v: {
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbl1, Ty),
+                        Ops, "vtbl1");
+  }
+  case NEON::BI__builtin_neon_vqtbl2q_v: {
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbl2, Ty),
+                        Ops, "vtbl2");
+  }
+  case NEON::BI__builtin_neon_vqtbl3q_v: {
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbl3, Ty),
+                        Ops, "vtbl3");
+  }
+  case NEON::BI__builtin_neon_vqtbl4q_v: {
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbl4, Ty),
+                        Ops, "vtbl4");
+  }
+  case NEON::BI__builtin_neon_vqtbx1q_v: {
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbx1, Ty),
+                        Ops, "vtbx1");
+  }
+  case NEON::BI__builtin_neon_vqtbx2q_v: {
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbx2, Ty),
+                        Ops, "vtbx2");
+  }
+  case NEON::BI__builtin_neon_vqtbx3q_v: {
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbx3, Ty),
+                        Ops, "vtbx3");
+  }
+  case NEON::BI__builtin_neon_vqtbx4q_v: {
+    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm64_neon_tbx4, Ty),
+                        Ops, "vtbx4");
+  }
+  case NEON::BI__builtin_neon_vsqadd_v:
+  case NEON::BI__builtin_neon_vsqaddq_v: {
+    Int = Intrinsic::arm64_neon_usqadd;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
+  }
+  case NEON::BI__builtin_neon_vuqadd_v:
+  case NEON::BI__builtin_neon_vuqaddq_v: {
+    Int = Intrinsic::arm64_neon_suqadd;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
+  }
   }
 }
 
@@ -4970,6 +6861,14 @@
 
   switch (BuiltinID) {
   default: return 0;
+  case X86::BI_mm_prefetch: {
+    Value *Address = EmitScalarExpr(E->getArg(0));
+    Value *RW = ConstantInt::get(Int32Ty, 0);
+    Value *Locality = EmitScalarExpr(E->getArg(1));
+    Value *Data = ConstantInt::get(Int32Ty, 1);
+    Value *F = CGM.getIntrinsic(Intrinsic::prefetch);
+    return Builder.CreateCall4(F, Address, RW, Locality, Data);
+  }
   case X86::BI__builtin_ia32_vec_init_v8qi:
   case X86::BI__builtin_ia32_vec_init_v4hi:
   case X86::BI__builtin_ia32_vec_init_v2si:
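
The _mm_prefetch case above hard-codes a read (RW=0) of the data cache
(Data=1) and passes the user's hint through as the locality argument. A
minimal usage sketch, assuming the usual <xmmintrin.h> definitions (where
_MM_HINT_T0 == 3):

  #include <xmmintrin.h>

  void warm(const float *p) {
    // Lowers to: call void @llvm.prefetch(i8* %p, i32 0, i32 3, i32 1)
    _mm_prefetch(reinterpret_cast<const char *>(p), _MM_HINT_T0);
  }
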
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp
index 0ebf1aa..fb11751 100644
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -17,9 +17,9 @@
 #include "CodeGenModule.h"
 #include "clang/AST/Decl.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
-#include "llvm/Support/CallSite.h"
 #include <vector>
 
 using namespace clang;
@@ -39,7 +39,7 @@
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
 
-  void EmitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
+  void EmitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args) override;
 };
 
 }
diff --git a/lib/CodeGen/CGCUDARuntime.cpp b/lib/CodeGen/CGCUDARuntime.cpp
index eaf31bb..54a28f5 100644
--- a/lib/CodeGen/CGCUDARuntime.cpp
+++ b/lib/CodeGen/CGCUDARuntime.cpp
@@ -31,7 +31,8 @@
   llvm::BasicBlock *ContBlock = CGF.createBasicBlock("kcall.end");
 
   CodeGenFunction::ConditionalEvaluation eval(CGF);
-  CGF.EmitBranchOnBoolExpr(E->getConfig(), ContBlock, ConfigOKBlock);
+  CGF.EmitBranchOnBoolExpr(E->getConfig(), ContBlock, ConfigOKBlock,
+                           /*TrueCount=*/0);
 
   eval.begin(CGF);
   CGF.EmitBlock(ConfigOKBlock);
diff --git a/lib/CodeGen/CGCXX.cpp b/lib/CodeGen/CGCXX.cpp
index cfb2d62..ef29af7 100644
--- a/lib/CodeGen/CGCXX.cpp
+++ b/lib/CodeGen/CGCXX.cpp
@@ -35,7 +35,7 @@
     return true;
 
   // Producing an alias to a base class ctor/dtor can degrade debug quality
-  // as the debugger cannot tell them appart.
+  // as the debugger cannot tell them apart.
   if (getCodeGenOpts().OptimizationLevel == 0)
     return true;
 
@@ -56,22 +56,20 @@
 
   // If any field has a non-trivial destructor, we have to emit the
   // destructor separately.
-  for (CXXRecordDecl::field_iterator I = Class->field_begin(),
-         E = Class->field_end(); I != E; ++I)
+  for (const auto *I : Class->fields())
     if (I->getType().isDestructedType())
       return true;
 
   // Try to find a unique base class with a non-trivial destructor.
   const CXXRecordDecl *UniqueBase = 0;
-  for (CXXRecordDecl::base_class_const_iterator I = Class->bases_begin(),
-         E = Class->bases_end(); I != E; ++I) {
+  for (const auto &I : Class->bases()) {
 
     // We're in the base destructor, so skip virtual bases.
-    if (I->isVirtual()) continue;
+    if (I.isVirtual()) continue;
 
     // Skip base classes with trivial destructors.
     const CXXRecordDecl *Base
-      = cast<CXXRecordDecl>(I->getType()->getAs<RecordType>()->getDecl());
+      = cast<CXXRecordDecl>(I.getType()->getAs<RecordType>()->getDecl());
     if (Base->hasTrivialDestructor()) continue;
 
     // If we've already found a base class with a non-trivial
@@ -92,7 +90,13 @@
   if (!ClassLayout.getBaseClassOffset(UniqueBase).isZero())
     return true;
 
+  // Give up if the calling conventions don't match. We could update the call,
+  // but it is probably not worth it.
   const CXXDestructorDecl *BaseD = UniqueBase->getDestructor();
+  if (BaseD->getType()->getAs<FunctionType>()->getCallConv() !=
+      D->getType()->getAs<FunctionType>()->getCallConv())
+    return true;
+
   return TryEmitDefinitionAsAlias(GlobalDecl(D, Dtor_Base),
                                   GlobalDecl(BaseD, Dtor_Base),
                                   false);
@@ -107,7 +111,7 @@
   if (!getCodeGenOpts().CXXCtorDtorAliases)
     return true;
 
-  // The alias will use the linkage of the referrent.  If we can't
+  // The alias will use the linkage of the referent.  If we can't
   // support aliases with that linkage, fail.
   llvm::GlobalValue::LinkageTypes Linkage = getFunctionLinkage(AliasDecl);
 
@@ -130,7 +134,7 @@
   llvm::PointerType *AliasType
     = getTypes().GetFunctionType(AliasDecl)->getPointerTo();
 
-  // Find the referrent.  Some aliases might require a bitcast, in
+  // Find the referent.  Some aliases might require a bitcast, in
   // which case the caller is responsible for ensuring the soundness
   // of these semantics.
   llvm::GlobalValue *Ref = cast<llvm::GlobalValue>(GetAddrOfGlobal(TargetDecl));
@@ -143,7 +147,7 @@
   if (llvm::GlobalValue::isDiscardableIfUnused(Linkage) &&
      (TargetLinkage != llvm::GlobalValue::AvailableExternallyLinkage ||
       !TargetDecl.getDecl()->hasAttr<AlwaysInlineAttr>())) {
-    // FIXME: An extern template instanciation will create functions with
+    // FIXME: An extern template instantiation will create functions with
     // linkage "AvailableExternally". In libc++, some classes also define
     // members with attribute "AlwaysInline" and expect no reference to
     // be generated. It is desirable to reenable this optimisation after
@@ -190,11 +194,13 @@
 
 void CodeGenModule::EmitCXXConstructor(const CXXConstructorDecl *ctor,
                                        CXXCtorType ctorType) {
-  // The complete constructor is equivalent to the base constructor
-  // for classes with no virtual bases.  Try to emit it as an alias.
-  if (getTarget().getCXXABI().hasConstructorVariants() &&
-      !ctor->getParent()->getNumVBases() &&
-      (ctorType == Ctor_Complete || ctorType == Ctor_Base)) {
+  if (!getTarget().getCXXABI().hasConstructorVariants()) {
+    // If there are no constructor variants, always emit the complete
+    // constructor.
+    ctorType = Ctor_Complete;
+  } else if (!ctor->getParent()->getNumVBases() &&
+             (ctorType == Ctor_Complete || ctorType == Ctor_Base)) {
+    // The complete constructor is equivalent to the base constructor
+    // for classes with no virtual bases.  Try to emit it as an alias.
     bool ProducedAlias =
         !TryEmitDefinitionAsAlias(GlobalDecl(ctor, Ctor_Complete),
                                   GlobalDecl(ctor, Ctor_Base), true);
@@ -205,8 +211,8 @@
   const CGFunctionInfo &fnInfo =
     getTypes().arrangeCXXConstructorDeclaration(ctor, ctorType);
 
-  llvm::Function *fn =
-    cast<llvm::Function>(GetAddrOfCXXConstructor(ctor, ctorType, &fnInfo));
+  llvm::Function *fn = cast<llvm::Function>(
+      GetAddrOfCXXConstructor(ctor, ctorType, &fnInfo, /*DontDefer=*/true));
   setFunctionLinkage(GlobalDecl(ctor, ctorType), fn);
 
   CodeGenFunction(*this).GenerateCode(GlobalDecl(ctor, ctorType), fn, fnInfo);
@@ -218,7 +224,8 @@
 llvm::GlobalValue *
 CodeGenModule::GetAddrOfCXXConstructor(const CXXConstructorDecl *ctor,
                                        CXXCtorType ctorType,
-                                       const CGFunctionInfo *fnInfo) {
+                                       const CGFunctionInfo *fnInfo,
+                                       bool DontDefer) {
   GlobalDecl GD(ctor, ctorType);
   
   StringRef name = getMangledName(GD);
@@ -230,7 +237,8 @@
 
   llvm::FunctionType *fnType = getTypes().GetFunctionType(*fnInfo);
   return cast<llvm::Function>(GetOrCreateLLVMFunction(name, fnType, GD,
-                                                      /*ForVTable=*/false));
+                                                      /*ForVTable=*/false,
+                                                      DontDefer));
 }
 
 void CodeGenModule::EmitCXXDestructor(const CXXDestructorDecl *dtor,
@@ -260,8 +268,8 @@
   const CGFunctionInfo &fnInfo =
     getTypes().arrangeCXXDestructor(dtor, dtorType);
 
-  llvm::Function *fn =
-    cast<llvm::Function>(GetAddrOfCXXDestructor(dtor, dtorType, &fnInfo));
+  llvm::Function *fn = cast<llvm::Function>(
+      GetAddrOfCXXDestructor(dtor, dtorType, &fnInfo, /*FnType=*/0,
+                             /*DontDefer=*/true));
   setFunctionLinkage(GlobalDecl(dtor, dtorType), fn);
 
   CodeGenFunction(*this).GenerateCode(GlobalDecl(dtor, dtorType), fn, fnInfo);
@@ -274,7 +282,8 @@
 CodeGenModule::GetAddrOfCXXDestructor(const CXXDestructorDecl *dtor,
                                       CXXDtorType dtorType,
                                       const CGFunctionInfo *fnInfo,
-                                      llvm::FunctionType *fnType) {
+                                      llvm::FunctionType *fnType,
+                                      bool DontDefer) {
   GlobalDecl GD(dtor, dtorType);
 
   StringRef name = getMangledName(GD);
@@ -286,7 +295,8 @@
     fnType = getTypes().GetFunctionType(*fnInfo);
   }
   return cast<llvm::Function>(GetOrCreateLLVMFunction(name, fnType, GD,
-                                                      /*ForVTable=*/false));
+                                                      /*ForVTable=*/false,
+                                                      DontDefer));
 }
 
 static llvm::Value *BuildAppleKextVirtualCall(CodeGenFunction &CGF,
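
The alias emission in this file targets destructors whose base variant does
nothing beyond invoking a unique, offset-zero base-class destructor, e.g. (a
sketch):

  struct A { ~A(); };
  struct B : A { };   // ~B (base variant) can alias ~A directly

The calling-convention check added above bails out of the optimization when
the two destructors would disagree on convention, rather than rewriting the
call.
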
diff --git a/lib/CodeGen/CGCXXABI.cpp b/lib/CodeGen/CGCXXABI.cpp
index 412b278..2bb3907 100644
--- a/lib/CodeGen/CGCXXABI.cpp
+++ b/lib/CodeGen/CGCXXABI.cpp
@@ -37,10 +37,9 @@
   return CGM.getTypes().ConvertType(CGM.getContext().getPointerDiffType());
 }
 
-llvm::Value *CGCXXABI::EmitLoadOfMemberFunctionPointer(CodeGenFunction &CGF,
-                                                       llvm::Value *&This,
-                                                       llvm::Value *MemPtr,
-                                                 const MemberPointerType *MPT) {
+llvm::Value *CGCXXABI::EmitLoadOfMemberFunctionPointer(
+    CodeGenFunction &CGF, const Expr *E, llvm::Value *&This,
+    llvm::Value *MemPtr, const MemberPointerType *MPT) {
   ErrorUnsupportedABI(CGF, "calls through member pointers");
 
   const FunctionProtoType *FPT = 
@@ -52,10 +51,10 @@
   return llvm::Constant::getNullValue(FTy->getPointerTo());
 }
 
-llvm::Value *CGCXXABI::EmitMemberDataPointerAddress(CodeGenFunction &CGF,
-                                                    llvm::Value *Base,
-                                                    llvm::Value *MemPtr,
-                                              const MemberPointerType *MPT) {
+llvm::Value *
+CGCXXABI::EmitMemberDataPointerAddress(CodeGenFunction &CGF, const Expr *E,
+                                       llvm::Value *Base, llvm::Value *MemPtr,
+                                       const MemberPointerType *MPT) {
   ErrorUnsupportedABI(CGF, "loads of member pointers");
   llvm::Type *Ty = CGF.ConvertType(MPT->getPointeeType())->getPointerTo();
   return llvm::Constant::getNullValue(Ty);
@@ -116,7 +115,7 @@
   return true;
 }
 
-void CGCXXABI::BuildThisParam(CodeGenFunction &CGF, FunctionArgList &params) {
+void CGCXXABI::buildThisParam(CodeGenFunction &CGF, FunctionArgList &params) {
   const CXXMethodDecl *MD = cast<CXXMethodDecl>(CGF.CurGD.getDecl());
 
   // FIXME: I'm not entirely sure I like using a fake decl just for code
@@ -281,8 +280,9 @@
     llvm::Function *InitFunc) {
 }
 
-LValue CGCXXABI::EmitThreadLocalDeclRefExpr(CodeGenFunction &CGF,
-                                          const DeclRefExpr *DRE) {
+LValue CGCXXABI::EmitThreadLocalVarDeclLValue(CodeGenFunction &CGF,
+                                              const VarDecl *VD,
+                                              QualType LValType) {
   ErrorUnsupportedABI(CGF, "odr-use of thread_local global");
   return LValue();
 }
@@ -290,3 +290,31 @@
 bool CGCXXABI::NeedsVTTParameter(GlobalDecl GD) {
   return false;
 }
+
+/// What sort of uniqueness rules should we use for the given type's RTTI?
+CGCXXABI::RTTIUniquenessKind
+CGCXXABI::classifyRTTIUniqueness(QualType CanTy,
+                                 llvm::GlobalValue::LinkageTypes Linkage) {
+  if (shouldRTTIBeUnique())
+    return RUK_Unique;
+
+  // It's only necessary for linkonce_odr or weak_odr linkage.
+  if (Linkage != llvm::GlobalValue::LinkOnceODRLinkage &&
+      Linkage != llvm::GlobalValue::WeakODRLinkage)
+    return RUK_Unique;
+
+  // It's only necessary with default visibility.
+  if (CanTy->getVisibility() != DefaultVisibility)
+    return RUK_Unique;
+
+  // If we're not required to publish this symbol, hide it.
+  if (Linkage == llvm::GlobalValue::LinkOnceODRLinkage)
+    return RUK_NonUniqueHidden;
+
+  // If we're required to publish this symbol, as we might be under an
+  // explicit instantiation, leave it with default visibility but
+  // enable string-comparisons.
+  assert(Linkage == llvm::GlobalValue::WeakODRLinkage);
+  return RUK_NonUniqueVisible;
+}
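
On ABIs where shouldRTTIBeUnique() is false, the non-unique classifications
arise only for ODR-linkage type_info objects, e.g. (a sketch):

  template <typename T> struct S { virtual ~S() {} };
  S<int> s;  // type_info for S<int> is linkonce_odr -> RUK_NonUniqueHidden

An explicit instantiation would force weak_odr linkage instead, yielding
RUK_NonUniqueVisible so the symbol stays published.
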
diff --git a/lib/CodeGen/CGCXXABI.h b/lib/CodeGen/CGCXXABI.h
index 9e9a2a7..beaec2c 100644
--- a/lib/CodeGen/CGCXXABI.h
+++ b/lib/CodeGen/CGCXXABI.h
@@ -41,7 +41,7 @@
 class CGCXXABI {
 protected:
   CodeGenModule &CGM;
-  OwningPtr<MangleContext> MangleCtx;
+  std::unique_ptr<MangleContext> MangleCtx;
 
   CGCXXABI(CodeGenModule &CGM)
     : CGM(CGM), MangleCtx(CGM.getContext().createMangleContext()) {}
@@ -60,15 +60,6 @@
   /// Get a null value for unsupported member pointers.
   llvm::Constant *GetBogusMemberPointer(QualType T);
 
-  // FIXME: Every place that calls getVTT{Decl,Value} is something
-  // that needs to be abstracted properly.
-  ImplicitParamDecl *&getVTTDecl(CodeGenFunction &CGF) {
-    return CGF.CXXStructorImplicitParamDecl;
-  }
-  llvm::Value *&getVTTValue(CodeGenFunction &CGF) {
-    return CGF.CXXStructorImplicitParamValue;
-  }
-
   ImplicitParamDecl *&getStructorImplicitParamDecl(CodeGenFunction &CGF) {
     return CGF.CXXStructorImplicitParamDecl;
   }
@@ -76,11 +67,8 @@
     return CGF.CXXStructorImplicitParamValue;
   }
 
-  /// Build a parameter variable suitable for 'this'.
-  void BuildThisParam(CodeGenFunction &CGF, FunctionArgList &Params);
-
   /// Perform prolog initialization of the parameter variable suitable
-  /// for 'this' emitted by BuildThisParam.
+  /// for 'this' emitted by buildThisParam.
   void EmitThisParam(CodeGenFunction &CGF);
 
   ASTContext &getContext() const { return CGM.getContext(); }
@@ -134,17 +122,15 @@
   /// Load a member function from an object and a member function
   /// pointer.  Apply the this-adjustment and set 'This' to the
   /// adjusted value.
-  virtual llvm::Value *
-  EmitLoadOfMemberFunctionPointer(CodeGenFunction &CGF,
-                                  llvm::Value *&This,
-                                  llvm::Value *MemPtr,
-                                  const MemberPointerType *MPT);
+  virtual llvm::Value *EmitLoadOfMemberFunctionPointer(
+      CodeGenFunction &CGF, const Expr *E, llvm::Value *&This,
+      llvm::Value *MemPtr, const MemberPointerType *MPT);
 
   /// Calculate an l-value from an object and a data member pointer.
-  virtual llvm::Value *EmitMemberDataPointerAddress(CodeGenFunction &CGF,
-                                                    llvm::Value *Base,
-                                                    llvm::Value *MemPtr,
-                                            const MemberPointerType *MPT);
+  virtual llvm::Value *
+  EmitMemberDataPointerAddress(CodeGenFunction &CGF, const Expr *E,
+                               llvm::Value *Base, llvm::Value *MemPtr,
+                               const MemberPointerType *MPT);
 
   /// Perform a derived-to-base, base-to-derived, or bitcast member
   /// pointer conversion.
@@ -272,23 +258,27 @@
   }
 
   /// Perform ABI-specific "this" argument adjustment required prior to
-  /// a virtual function call.
-  virtual llvm::Value *adjustThisArgumentForVirtualCall(CodeGenFunction &CGF,
-                                                        GlobalDecl GD,
-                                                        llvm::Value *This) {
+  /// a call of a virtual function.
+  /// The "VirtualCall" argument is true iff the call itself is virtual.
+  virtual llvm::Value *
+  adjustThisArgumentForVirtualFunctionCall(CodeGenFunction &CGF, GlobalDecl GD,
+                                           llvm::Value *This,
+                                           bool VirtualCall) {
     return This;
   }
 
-  /// Build the ABI-specific portion of the parameter list for a
-  /// function.  This generally involves a 'this' parameter and
-  /// possibly some extra data for constructors and destructors.
+  /// Build a parameter variable suitable for 'this'.
+  void buildThisParam(CodeGenFunction &CGF, FunctionArgList &Params);
+
+  /// Insert any ABI-specific implicit parameters into the parameter list for a
+  /// function.  This generally involves extra data for constructors and
+  /// destructors.
   ///
   /// ABIs may also choose to override the return type, which has been
   /// initialized with the type of 'this' if HasThisReturn(CGF.CurGD) is true or
   /// the formal return type of the function otherwise.
-  virtual void BuildInstanceFunctionParams(CodeGenFunction &CGF,
-                                           QualType &ResTy,
-                                           FunctionArgList &Params) = 0;
+  virtual void addImplicitStructorParams(CodeGenFunction &CGF, QualType &ResTy,
+                                         FunctionArgList &Params) = 0;
 
   /// Perform ABI-specific "this" parameter adjustment in a virtual function
   /// prologue.
@@ -300,14 +290,20 @@
   /// Emit the ABI-specific prolog for the function.
   virtual void EmitInstanceFunctionProlog(CodeGenFunction &CGF) = 0;
 
-  /// Emit the constructor call. Return the function that is called.
-  virtual void EmitConstructorCall(CodeGenFunction &CGF,
-                                   const CXXConstructorDecl *D,
-                                   CXXCtorType Type,
-                                   bool ForVirtualBase, bool Delegating,
-                                   llvm::Value *This,
-                                   CallExpr::const_arg_iterator ArgBeg,
-                                   CallExpr::const_arg_iterator ArgEnd) = 0;
+  /// Add any ABI-specific implicit arguments needed to call a constructor.
+  ///
+  /// \return The number of args added to the call, which is typically zero or
+  /// one.
+  virtual unsigned
+  addImplicitConstructorArgs(CodeGenFunction &CGF, const CXXConstructorDecl *D,
+                             CXXCtorType Type, bool ForVirtualBase,
+                             bool Delegating, CallArgList &Args) = 0;
+
+  /// Emit the destructor call.
+  virtual void EmitDestructorCall(CodeGenFunction &CGF,
+                                  const CXXDestructorDecl *DD, CXXDtorType Type,
+                                  bool ForVirtualBase, bool Delegating,
+                                  llvm::Value *This) = 0;
 
   /// Emits the VTable definitions required for the given record type.
   virtual void emitVTableDefinitions(CodeGenVTables &CGVT,
@@ -485,8 +481,39 @@
   /// Emit a reference to a non-local thread_local variable (including
   /// triggering the initialization of all thread_local variables in its
   /// translation unit).
-  virtual LValue EmitThreadLocalDeclRefExpr(CodeGenFunction &CGF,
-                                            const DeclRefExpr *DRE);
+  virtual LValue EmitThreadLocalVarDeclLValue(CodeGenFunction &CGF,
+                                              const VarDecl *VD,
+                                              QualType LValType);
+
+  /**************************** RTTI Uniqueness ******************************/
+
+protected:
+  /// Returns true if the ABI requires RTTI type_info objects to be unique
+  /// across a program.
+  virtual bool shouldRTTIBeUnique() { return true; }
+
+public:
+  /// What sort of unique-RTTI behavior should we use?
+  enum RTTIUniquenessKind {
+    /// We are guaranteeing, or need to guarantee, that the RTTI string
+    /// is unique.
+    RUK_Unique,
+
+    /// We are not guaranteeing uniqueness for the RTTI string, so we
+    /// can demote to hidden visibility but must use string comparisons.
+    RUK_NonUniqueHidden,
+
+    /// We are not guaranteeing uniqueness for the RTTI string, so we
+    /// have to use string comparisons, but we also have to emit it with
+    /// non-hidden visibility.
+    RUK_NonUniqueVisible
+  };
+
+  /// Return the RTTI uniqueness classification to use for the given type
+  /// and linkage in the current ABI.
+  RTTIUniquenessKind
+  classifyRTTIUniqueness(QualType CanTy,
+                         llvm::GlobalValue::LinkageTypes Linkage);
 };
 
 // Create an instance of a C++ ABI class:
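
For reference, both member-pointer hooks now also receive the originating
expression. They lower accesses of the following shape (illustrative only):

  struct X { int n; void f(); };
  int X::*pd = &X::n;       // EmitMemberDataPointerAddress: x.*pd
  void (X::*pf)() = &X::f;  // EmitLoadOfMemberFunctionPointer: (x.*pf)()
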
diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp
index 22f2467..e26d6b2d 100644
--- a/lib/CodeGen/CGCall.cpp
+++ b/lib/CodeGen/CGCall.cpp
@@ -26,10 +26,10 @@
 #include "clang/Frontend/CodeGenOptions.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/InlineAsm.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Support/CallSite.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace clang;
 using namespace CodeGen;
@@ -79,23 +79,26 @@
 CodeGenTypes::arrangeFreeFunctionType(CanQual<FunctionNoProtoType> FTNP) {
   // When translating an unprototyped function type, always use a
   // variadic type.
-  return arrangeLLVMFunctionInfo(FTNP->getResultType().getUnqualifiedType(),
-                                 None, FTNP->getExtInfo(), RequiredArgs(0));
+  return arrangeLLVMFunctionInfo(FTNP->getReturnType().getUnqualifiedType(),
+                                 false, None, FTNP->getExtInfo(),
+                                 RequiredArgs(0));
 }
 
 /// Arrange the LLVM function layout for a value of the given function
 /// type, on top of any implicit parameters already stored.  Use the
 /// given ExtInfo instead of the ExtInfo from the function type.
 static const CGFunctionInfo &arrangeLLVMFunctionInfo(CodeGenTypes &CGT,
+                                                     bool IsInstanceMethod,
                                        SmallVectorImpl<CanQualType> &prefix,
                                              CanQual<FunctionProtoType> FTP,
                                               FunctionType::ExtInfo extInfo) {
   RequiredArgs required = RequiredArgs::forPrototypePlus(FTP, prefix.size());
   // FIXME: Kill copy.
-  for (unsigned i = 0, e = FTP->getNumArgs(); i != e; ++i)
-    prefix.push_back(FTP->getArgType(i));
-  CanQualType resultType = FTP->getResultType().getUnqualifiedType();
-  return CGT.arrangeLLVMFunctionInfo(resultType, prefix, extInfo, required);
+  for (unsigned i = 0, e = FTP->getNumParams(); i != e; ++i)
+    prefix.push_back(FTP->getParamType(i));
+  CanQualType resultType = FTP->getReturnType().getUnqualifiedType();
+  return CGT.arrangeLLVMFunctionInfo(resultType, IsInstanceMethod, prefix,
+                                     extInfo, required);
 }
 
 /// Arrange the argument and result information for a free function (i.e.
@@ -103,7 +106,7 @@
 static const CGFunctionInfo &arrangeFreeFunctionType(CodeGenTypes &CGT,
                                       SmallVectorImpl<CanQualType> &prefix,
                                             CanQual<FunctionProtoType> FTP) {
-  return arrangeLLVMFunctionInfo(CGT, prefix, FTP, FTP->getExtInfo());
+  return arrangeLLVMFunctionInfo(CGT, false, prefix, FTP, FTP->getExtInfo());
 }
 
 /// Arrange the argument and result information for a free function (i.e.
@@ -112,7 +115,7 @@
                                       SmallVectorImpl<CanQualType> &prefix,
                                             CanQual<FunctionProtoType> FTP) {
   FunctionType::ExtInfo extInfo = FTP->getExtInfo();
-  return arrangeLLVMFunctionInfo(CGT, prefix, FTP, extInfo);
+  return arrangeLLVMFunctionInfo(CGT, true, prefix, FTP, extInfo);
 }
 
 /// Arrange the argument and result information for a value of the
@@ -123,7 +126,7 @@
   return ::arrangeFreeFunctionType(*this, argTypes, FTP);
 }
 
-static CallingConv getCallingConventionForDecl(const Decl *D) {
+static CallingConv getCallingConventionForDecl(const Decl *D, bool IsWindows) {
   // Set the appropriate calling convention for the Function.
   if (D->hasAttr<StdCallAttr>())
     return CC_X86StdCall;
@@ -146,6 +149,12 @@
   if (D->hasAttr<IntelOclBiccAttr>())
     return CC_IntelOclBicc;
 
+  if (D->hasAttr<MSABIAttr>())
+    return IsWindows ? CC_C : CC_X86_64Win64;
+
+  if (D->hasAttr<SysVABIAttr>())
+    return IsWindows ? CC_X86_64SysV : CC_C;
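+
+  // Each attribute is a no-op on its native platform: ms_abi only changes
+  // the convention off Windows, and sysv_abi only changes it on Windows.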
+
   return CC_C;
 }
 
@@ -202,18 +211,41 @@
   CanQualType resultType =
     TheCXXABI.HasThisReturn(GD) ? argTypes.front() : Context.VoidTy;
 
-  TheCXXABI.BuildConstructorSignature(D, ctorKind, resultType, argTypes);
-
   CanQual<FunctionProtoType> FTP = GetFormalType(D);
 
-  RequiredArgs required = RequiredArgs::forPrototypePlus(FTP, argTypes.size());
-
   // Add the formal parameters.
-  for (unsigned i = 0, e = FTP->getNumArgs(); i != e; ++i)
-    argTypes.push_back(FTP->getArgType(i));
+  for (unsigned i = 0, e = FTP->getNumParams(); i != e; ++i)
+    argTypes.push_back(FTP->getParamType(i));
+
+  TheCXXABI.BuildConstructorSignature(D, ctorKind, resultType, argTypes);
+
+  RequiredArgs required =
+      (D->isVariadic() ? RequiredArgs(argTypes.size()) : RequiredArgs::All);
 
   FunctionType::ExtInfo extInfo = FTP->getExtInfo();
-  return arrangeLLVMFunctionInfo(resultType, argTypes, extInfo, required);
+  return arrangeLLVMFunctionInfo(resultType, true, argTypes, extInfo, required);
+}
+
+/// Arrange a call to a C++ constructor, passing the given arguments.
+const CGFunctionInfo &
+CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args,
+                                        const CXXConstructorDecl *D,
+                                        CXXCtorType CtorKind,
+                                        unsigned ExtraArgs) {
+  // FIXME: Kill copy.
+  SmallVector<CanQualType, 16> ArgTypes;
+  for (CallArgList::const_iterator i = args.begin(), e = args.end(); i != e;
+       ++i)
+    ArgTypes.push_back(Context.getCanonicalParamType(i->Ty));
+
+  CanQual<FunctionProtoType> FPT = GetFormalType(D);
+  RequiredArgs Required = RequiredArgs::forPrototypePlus(FPT, 1 + ExtraArgs);
+  GlobalDecl GD(D, CtorKind);
+  CanQualType ResultType =
+      TheCXXABI.HasThisReturn(GD) ? ArgTypes.front() : Context.VoidTy;
+
+  FunctionType::ExtInfo Info = FPT->getExtInfo();
+  return arrangeLLVMFunctionInfo(ResultType, true, ArgTypes, Info, Required);
 }
 
 /// Arrange the argument and result information for a declaration,
@@ -232,11 +264,11 @@
   TheCXXABI.BuildDestructorSignature(D, dtorKind, resultType, argTypes);
 
   CanQual<FunctionProtoType> FTP = GetFormalType(D);
-  assert(FTP->getNumArgs() == 0 && "dtor with formal parameters");
+  assert(FTP->getNumParams() == 0 && "dtor with formal parameters");
   assert(FTP->isVariadic() == 0 && "dtor with formal parameters");
 
   FunctionType::ExtInfo extInfo = FTP->getExtInfo();
-  return arrangeLLVMFunctionInfo(resultType, argTypes, extInfo,
+  return arrangeLLVMFunctionInfo(resultType, true, argTypes, extInfo,
                                  RequiredArgs::All);
 }
 
@@ -256,7 +288,7 @@
   // non-variadic type.
   if (isa<FunctionNoProtoType>(FTy)) {
     CanQual<FunctionNoProtoType> noProto = FTy.getAs<FunctionNoProtoType>();
-    return arrangeLLVMFunctionInfo(noProto->getResultType(), None,
+    return arrangeLLVMFunctionInfo(noProto->getReturnType(), false, None,
                                    noProto->getExtInfo(), RequiredArgs::All);
   }
 
@@ -286,13 +318,13 @@
   argTys.push_back(Context.getCanonicalParamType(receiverType));
   argTys.push_back(Context.getCanonicalParamType(Context.getObjCSelType()));
   // FIXME: Kill copy?
-  for (ObjCMethodDecl::param_const_iterator i = MD->param_begin(),
-         e = MD->param_end(); i != e; ++i) {
-    argTys.push_back(Context.getCanonicalParamType((*i)->getType()));
+  for (const auto *I : MD->params()) {
+    argTys.push_back(Context.getCanonicalParamType(I->getType()));
   }
 
   FunctionType::ExtInfo einfo;
-  einfo = einfo.withCallingConv(getCallingConventionForDecl(MD));
+  bool IsWindows = getContext().getTargetInfo().getTriple().isOSWindows();
+  einfo = einfo.withCallingConv(getCallingConventionForDecl(MD, IsWindows));
 
   if (getContext().getLangOpts().ObjCAutoRefCount &&
       MD->hasAttr<NSReturnsRetainedAttr>())
@@ -301,8 +333,8 @@
   RequiredArgs required =
     (MD->isVariadic() ? RequiredArgs(argTys.size()) : RequiredArgs::All);
 
-  return arrangeLLVMFunctionInfo(GetReturnType(MD->getResultType()), argTys,
-                                 einfo, required);
+  return arrangeLLVMFunctionInfo(GetReturnType(MD->getReturnType()), false,
+                                 argTys, einfo, required);
 }
 
 const CGFunctionInfo &
@@ -336,7 +368,7 @@
   // extra prefix plus the arguments in the prototype.
   if (const FunctionProtoType *proto = dyn_cast<FunctionProtoType>(fnType)) {
     if (proto->isVariadic())
-      required = RequiredArgs(proto->getNumArgs() + numExtraRequiredArgs);
+      required = RequiredArgs(proto->getNumParams() + numExtraRequiredArgs);
 
   // If we don't have a prototype at all, but we're supposed to
   // explicitly use the variadic convention for unprototyped calls,
@@ -348,7 +380,7 @@
     required = RequiredArgs(args.size());
   }
 
-  return CGT.arrangeFreeFunctionCall(fnType->getResultType(), args,
+  return CGT.arrangeFreeFunctionCall(fnType->getReturnType(), args,
                                      fnType->getExtInfo(), required);
 }
 
@@ -380,8 +412,8 @@
   for (CallArgList::const_iterator i = args.begin(), e = args.end();
        i != e; ++i)
     argTypes.push_back(Context.getCanonicalParamType(i->Ty));
-  return arrangeLLVMFunctionInfo(GetReturnType(resultType), argTypes, info,
-                                 required);
+  return arrangeLLVMFunctionInfo(GetReturnType(resultType), false, argTypes,
+                                 info, required);
 }
 
 /// Arrange a call to a C++ method, passing the given arguments.
@@ -396,15 +428,13 @@
     argTypes.push_back(Context.getCanonicalParamType(i->Ty));
 
   FunctionType::ExtInfo info = FPT->getExtInfo();
-  return arrangeLLVMFunctionInfo(GetReturnType(FPT->getResultType()),
+  return arrangeLLVMFunctionInfo(GetReturnType(FPT->getReturnType()), true,
                                  argTypes, info, required);
 }
 
-const CGFunctionInfo &
-CodeGenTypes::arrangeFunctionDeclaration(QualType resultType,
-                                         const FunctionArgList &args,
-                                         const FunctionType::ExtInfo &info,
-                                         bool isVariadic) {
+const CGFunctionInfo &CodeGenTypes::arrangeFreeFunctionDeclaration(
+    QualType resultType, const FunctionArgList &args,
+    const FunctionType::ExtInfo &info, bool isVariadic) {
   // FIXME: Kill copy.
   SmallVector<CanQualType, 16> argTypes;
   for (FunctionArgList::const_iterator i = args.begin(), e = args.end();
@@ -413,12 +443,12 @@
 
   RequiredArgs required =
     (isVariadic ? RequiredArgs(args.size()) : RequiredArgs::All);
-  return arrangeLLVMFunctionInfo(GetReturnType(resultType), argTypes, info,
-                                 required);
+  return arrangeLLVMFunctionInfo(GetReturnType(resultType), false, argTypes,
+                                 info, required);
 }
 
 const CGFunctionInfo &CodeGenTypes::arrangeNullaryFunction() {
-  return arrangeLLVMFunctionInfo(getContext().VoidTy, None,
+  return arrangeLLVMFunctionInfo(getContext().VoidTy, false, None,
                                  FunctionType::ExtInfo(), RequiredArgs::All);
 }
 
@@ -427,6 +457,7 @@
 /// above functions ultimately defer to.
 const CGFunctionInfo &
 CodeGenTypes::arrangeLLVMFunctionInfo(CanQualType resultType,
+                                      bool IsInstanceMethod,
                                       ArrayRef<CanQualType> argTypes,
                                       FunctionType::ExtInfo info,
                                       RequiredArgs required) {
@@ -440,7 +471,8 @@
 
   // Lookup or create unique function info.
   llvm::FoldingSetNodeID ID;
-  CGFunctionInfo::Profile(ID, info, required, resultType, argTypes);
+  CGFunctionInfo::Profile(ID, IsInstanceMethod, info, required, resultType,
+                          argTypes);
 
   void *insertPos = 0;
   CGFunctionInfo *FI = FunctionInfos.FindNodeOrInsertPos(ID, insertPos);
@@ -448,7 +480,8 @@
     return *FI;
 
   // Construct the function info.  We co-allocate the ArgInfos.
-  FI = CGFunctionInfo::create(CC, info, resultType, argTypes, required);
+  FI = CGFunctionInfo::create(CC, IsInstanceMethod, info, resultType, argTypes,
+                              required);
   FunctionInfos.InsertNode(FI, insertPos);
 
   bool inserted = FunctionsBeingProcessed.insert(FI); (void)inserted;
@@ -464,10 +497,9 @@
   if (retInfo.canHaveCoerceToType() && retInfo.getCoerceToType() == 0)
     retInfo.setCoerceToType(ConvertType(FI->getReturnType()));
 
-  for (CGFunctionInfo::arg_iterator I = FI->arg_begin(), E = FI->arg_end();
-       I != E; ++I)
-    if (I->info.canHaveCoerceToType() && I->info.getCoerceToType() == 0)
-      I->info.setCoerceToType(ConvertType(I->type));
+  for (auto &I : FI->arguments())
+    if (I.info.canHaveCoerceToType() && I.info.getCoerceToType() == 0)
+      I.info.setCoerceToType(ConvertType(I.type));
 
   bool erased = FunctionsBeingProcessed.erase(FI); (void)erased;
   assert(erased && "Not in set?");
@@ -476,6 +508,7 @@
 }
 
 CGFunctionInfo *CGFunctionInfo::create(unsigned llvmCC,
+                                       bool IsInstanceMethod,
                                        const FunctionType::ExtInfo &info,
                                        CanQualType resultType,
                                        ArrayRef<CanQualType> argTypes,
@@ -486,11 +519,13 @@
   FI->CallingConvention = llvmCC;
   FI->EffectiveCallingConvention = llvmCC;
   FI->ASTCallingConvention = info.getCC();
+  FI->InstanceMethod = IsInstanceMethod;
   FI->NoReturn = info.getNoReturn();
   FI->ReturnsRetained = info.getProducesResult();
   FI->Required = required;
   FI->HasRegParm = info.getHasRegParm();
   FI->RegParm = info.getRegParm();
+  FI->ArgStruct = 0;
   FI->NumArgs = argTypes.size();
   FI->getArgsBuffer()[0].type = resultType;
   for (unsigned i = 0, e = argTypes.size(); i != e; ++i)
@@ -516,9 +551,7 @@
       const FieldDecl *LargestFD = 0;
       CharUnits UnionSize = CharUnits::Zero();
 
-      for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
-           i != e; ++i) {
-        const FieldDecl *FD = *i;
+      for (const auto *FD : RD->fields()) {
         assert(!FD->isBitField() &&
                "Cannot expand structure with bit-field members.");
         CharUnits FieldSize = getContext().getTypeSizeInChars(FD->getType());
@@ -530,11 +563,10 @@
       if (LargestFD)
         GetExpandedTypes(LargestFD->getType(), expandedTypes);
     } else {
-      for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
-           i != e; ++i) {
-        assert(!i->isBitField() &&
+      for (const auto *I : RD->fields()) {
+        assert(!I->isBitField() &&
                "Cannot expand structure with bit-field members.");
-        GetExpandedTypes(i->getType(), expandedTypes);
+        GetExpandedTypes(I->getType(), expandedTypes);
       }
     }
   } else if (const ComplexType *CT = type->getAs<ComplexType>()) {
@@ -567,9 +599,7 @@
       const FieldDecl *LargestFD = 0;
       CharUnits UnionSize = CharUnits::Zero();
 
-      for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
-           i != e; ++i) {
-        const FieldDecl *FD = *i;
+      for (const auto *FD : RD->fields()) {
         assert(!FD->isBitField() &&
                "Cannot expand structure with bit-field members.");
         CharUnits FieldSize = getContext().getTypeSizeInChars(FD->getType());
@@ -584,9 +614,7 @@
         AI = ExpandTypeFromArgs(LargestFD->getType(), SubLV, AI);
       }
     } else {
-      for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
-           i != e; ++i) {
-        FieldDecl *FD = *i;
+      for (const auto *FD : RD->fields()) {
         QualType FT = FD->getType();
 
         // FIXME: What are the right qualifiers here?
@@ -850,6 +878,11 @@
   return FI.getReturnInfo().isIndirect();
 }
 
+bool CodeGenModule::ReturnSlotInterferesWithArgs(const CGFunctionInfo &FI) {
+  return ReturnTypeUsesSRet(FI) &&
+         getTargetCodeGenInfo().doesReturnSlotInterfereWithArgs();
+}
+
 bool CodeGenModule::ReturnTypeUsesFPRet(QualType ResultType) {
   if (const BuiltinType *BT = ResultType->getAs<BuiltinType>()) {
     switch (BT->getKind()) {
@@ -902,6 +935,18 @@
     resultType = retAI.getCoerceToType();
     break;
 
+  case ABIArgInfo::InAlloca:
+    if (retAI.getInAllocaSRet()) {
+      // On win32, sret functions aren't void; they return the sret pointer.
+      QualType ret = FI.getReturnType();
+      llvm::Type *ty = ConvertType(ret);
+      unsigned addressSpace = Context.getTargetAddressSpace(ret);
+      resultType = llvm::PointerType::get(ty, addressSpace);
+    } else {
+      resultType = llvm::Type::getVoidTy(getLLVMContext());
+    }
+    break;
+
   case ABIArgInfo::Indirect: {
     assert(!retAI.getIndirectAlign() && "Align unused on indirect return.");
     resultType = llvm::Type::getVoidTy(getLLVMContext());
@@ -934,6 +979,7 @@
 
     switch (argAI.getKind()) {
     case ABIArgInfo::Ignore:
+    case ABIArgInfo::InAlloca:
       break;
 
     case ABIArgInfo::Indirect: {
@@ -964,6 +1010,10 @@
     }
   }
 
+  // Add the inalloca struct as the last parameter type.
+  if (llvm::StructType *ArgStruct = FI.getArgStruct())
+    argTypes.push_back(ArgStruct->getPointerTo());
+
   bool Erased = FunctionsBeingProcessed.erase(&FI); (void)Erased;
   assert(Erased && "Not in set?");
   
@@ -1006,6 +1056,8 @@
       FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
     if (TargetDecl->hasAttr<NoReturnAttr>())
       FuncAttrs.addAttribute(llvm::Attribute::NoReturn);
+    if (TargetDecl->hasAttr<NoDuplicateAttr>())
+      FuncAttrs.addAttribute(llvm::Attribute::NoDuplicate);
 
     if (const FunctionDecl *Fn = dyn_cast<FunctionDecl>(TargetDecl)) {
       const FunctionProtoType *FPT = Fn->getType()->getAs<FunctionProtoType>();
@@ -1089,6 +1141,13 @@
   case ABIArgInfo::Ignore:
     break;
 
+  case ABIArgInfo::InAlloca: {
+    // inalloca disables readnone and readonly.
+    FuncAttrs.removeAttribute(llvm::Attribute::ReadOnly)
+      .removeAttribute(llvm::Attribute::ReadNone);
+    break;
+  }
+
   case ABIArgInfo::Indirect: {
     llvm::AttrBuilder SRETAttrs;
     SRETAttrs.addAttribute(llvm::Attribute::StructRet);
@@ -1114,10 +1173,9 @@
                                     llvm::AttributeSet::ReturnIndex,
                                     RetAttrs));
 
-  for (CGFunctionInfo::const_arg_iterator it = FI.arg_begin(),
-         ie = FI.arg_end(); it != ie; ++it) {
-    QualType ParamType = it->type;
-    const ABIArgInfo &AI = it->info;
+  for (const auto &I : FI.arguments()) {
+    QualType ParamType = I.type;
+    const ABIArgInfo &AI = I.info;
     llvm::AttrBuilder Attrs;
 
     if (AI.getPaddingType()) {
@@ -1173,6 +1231,13 @@
       // Skip increment, no matching LLVM parameter.
       continue;
 
+    case ABIArgInfo::InAlloca:
+      // inalloca disables readnone and readonly.
+      FuncAttrs.removeAttribute(llvm::Attribute::ReadOnly)
+          .removeAttribute(llvm::Attribute::ReadNone);
+      // Skip increment, no matching LLVM parameter.
+      continue;
+
     case ABIArgInfo::Expand: {
       SmallVector<llvm::Type*, 8> types;
       // FIXME: This is rather inefficient. Do we ever actually need to do
@@ -1188,6 +1253,14 @@
       PAL.push_back(llvm::AttributeSet::get(getLLVMContext(), Index, Attrs));
     ++Index;
   }
+
+  // Add the inalloca attribute to the trailing inalloca parameter if present.
+  if (FI.usesInAlloca()) {
+    llvm::AttrBuilder Attrs;
+    Attrs.addAttribute(llvm::Attribute::InAlloca);
+    PAL.push_back(llvm::AttributeSet::get(getLLVMContext(), Index, Attrs));
+  }
+
   if (FuncAttrs.hasAttributes())
     PAL.push_back(llvm::
                   AttributeSet::get(getLLVMContext(),
@@ -1224,7 +1297,7 @@
   // return statements.
   if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(CurCodeDecl)) {
     if (FD->hasImplicitReturnZero()) {
-      QualType RetTy = FD->getResultType().getUnqualifiedType();
+      QualType RetTy = FD->getReturnType().getUnqualifiedType();
       llvm::Type* LLVMTy = CGM.getTypes().ConvertType(RetTy);
       llvm::Constant* Zero = llvm::Constant::getNullValue(LLVMTy);
       Builder.CreateStore(Zero, ReturnValue);
@@ -1237,6 +1310,16 @@
   // Emit allocs for param decls.  Give the LLVM Argument nodes names.
   llvm::Function::arg_iterator AI = Fn->arg_begin();
 
+  // If we're using inalloca, all the memory arguments are GEPs off of the last
+  // parameter, which is a pointer to the complete memory area.
+  llvm::Value *ArgStruct = 0;
+  if (FI.usesInAlloca()) {
+    llvm::Function::arg_iterator EI = Fn->arg_end();
+    --EI;
+    ArgStruct = EI;
+    assert(ArgStruct->getType() == FI.getArgStruct()->getPointerTo());
+  }
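+  // An illustrative sketch only (names assumed, not emitted verbatim): on
+  // i686-pc-win32 such a function might be declared as
+  //   define void @f(<{ %struct.A }>* inalloca %argmem)
+  // and each memory argument below becomes a struct GEP off of %argmem.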
+
   // Name the struct return argument.
   if (CGM.ReturnTypeUsesSRet(FI)) {
     AI->setName("agg.result");
@@ -1246,6 +1329,18 @@
     ++AI;
   }
 
+  // Track if we received the parameter as a pointer (indirect, byval, or
+  // inalloca).  If we already have a pointer, EmitParmDecl doesn't need to
+  // copy it into a local alloca for us.
+  enum ValOrPointer { HaveValue = 0, HavePointer = 1 };
+  typedef llvm::PointerIntPair<llvm::Value *, 1> ValueAndIsPtr;
+  SmallVector<ValueAndIsPtr, 16> ArgVals;
+  ArgVals.reserve(Args.size());
+
+  // Create a pointer value for every parameter declaration.  This usually
+  // entails copying one or more LLVM IR arguments into an alloca.  Don't push
+  // any cleanups or do anything that might unwind.  We do that separately, so
+  // we can push the cleanups in the correct order for the ABI.
   assert(FI.arg_size() == Args.size() &&
          "Mismatch between function signature & arguments.");
   unsigned ArgNo = 1;
@@ -1264,6 +1359,13 @@
       ++AI;
 
     switch (ArgI.getKind()) {
+    case ABIArgInfo::InAlloca: {
+      llvm::Value *V = Builder.CreateStructGEP(
+          ArgStruct, ArgI.getInAllocaFieldIndex(), Arg->getName());
+      ArgVals.push_back(ValueAndIsPtr(V, HavePointer));
+      continue;  // Don't increment AI!
+    }
+
     case ABIArgInfo::Indirect: {
       llvm::Value *V = AI;
 
@@ -1290,6 +1392,7 @@
                                false);
           V = AlignedTemp;
         }
+        ArgVals.push_back(ValueAndIsPtr(V, HavePointer));
       } else {
         // Load scalar value from indirect argument.
         CharUnits Alignment = getContext().getTypeAlignInChars(Ty);
@@ -1298,8 +1401,8 @@
 
         if (isPromoted)
           V = emitArgumentDemotion(*this, Arg, V);
+        ArgVals.push_back(ValueAndIsPtr(V, HaveValue));
       }
-      EmitParmDecl(*Arg, V, ArgNo);
       break;
     }
 
@@ -1340,7 +1443,7 @@
         if (V->getType() != LTy)
           V = Builder.CreateBitCast(V, LTy);
 
-        EmitParmDecl(*Arg, V, ArgNo);
+        ArgVals.push_back(ValueAndIsPtr(V, HaveValue));
         break;
       }
 
@@ -1412,8 +1515,10 @@
         V = EmitLoadOfScalar(V, false, AlignmentToUse, Ty, Arg->getLocStart());
         if (isPromoted)
           V = emitArgumentDemotion(*this, Arg, V);
+        ArgVals.push_back(ValueAndIsPtr(V, HaveValue));
+      } else {
+        ArgVals.push_back(ValueAndIsPtr(V, HavePointer));
       }
-      EmitParmDecl(*Arg, V, ArgNo);
       continue;  // Skip ++AI increment, already done.
     }
 
@@ -1426,7 +1531,7 @@
       Alloca->setAlignment(Align.getQuantity());
       LValue LV = MakeAddrLValue(Alloca, Ty, Align);
       llvm::Function::arg_iterator End = ExpandTypeFromArgs(Ty, LV, AI);
-      EmitParmDecl(*Arg, Alloca, ArgNo);
+      ArgVals.push_back(ValueAndIsPtr(Alloca, HavePointer));
 
       // Name the arguments used in expansion and increment AI.
       unsigned Index = 0;
@@ -1437,11 +1542,12 @@
 
     case ABIArgInfo::Ignore:
       // Initialize the local variable appropriately.
-      if (!hasScalarEvaluationKind(Ty))
-        EmitParmDecl(*Arg, CreateMemTemp(Ty), ArgNo);
-      else
-        EmitParmDecl(*Arg, llvm::UndefValue::get(ConvertType(Arg->getType())),
-                     ArgNo);
+      if (!hasScalarEvaluationKind(Ty)) {
+        ArgVals.push_back(ValueAndIsPtr(CreateMemTemp(Ty), HavePointer));
+      } else {
+        llvm::Value *U = llvm::UndefValue::get(ConvertType(Arg->getType()));
+        ArgVals.push_back(ValueAndIsPtr(U, HaveValue));
+      }
 
       // Skip increment, no matching LLVM parameter.
       continue;
@@ -1449,7 +1555,20 @@
 
     ++AI;
   }
+
+  if (FI.usesInAlloca())
+    ++AI;
   assert(AI == Fn->arg_end() && "Argument mismatch!");
+
+  if (getTarget().getCXXABI().areArgsDestroyedLeftToRightInCallee()) {
+    for (int I = Args.size() - 1; I >= 0; --I)
+      EmitParmDecl(*Args[I], ArgVals[I].getPointer(), ArgVals[I].getInt(),
+                   I + 1);
+  } else {
+    for (unsigned I = 0, E = Args.size(); I != E; ++I)
+      EmitParmDecl(*Args[I], ArgVals[I].getPointer(), ArgVals[I].getInt(),
+                   I + 1);
+  }
 }
 
 static void eraseUnusedBitCasts(llvm::Instruction *insn) {
@@ -1626,7 +1745,7 @@
   }
 
   llvm::StoreInst *store =
-    dyn_cast<llvm::StoreInst>(CGF.ReturnValue->use_back());
+    dyn_cast<llvm::StoreInst>(CGF.ReturnValue->user_back());
   if (!store) return 0;
 
   // These aren't actually possible for non-coerced returns, and we
@@ -1662,6 +1781,20 @@
   const ABIArgInfo &RetAI = FI.getReturnInfo();
 
   switch (RetAI.getKind()) {
+  case ABIArgInfo::InAlloca:
+    // Aggregates get evaluated directly into the destination.  Sometimes we
+    // need to return the sret value in a register, though.
+    assert(hasAggregateEvaluationKind(RetTy));
+    if (RetAI.getInAllocaSRet()) {
+      llvm::Function::arg_iterator EI = CurFn->arg_end();
+      --EI;
+      llvm::Value *ArgStruct = EI;
+      llvm::Value *SRet =
+          Builder.CreateStructGEP(ArgStruct, RetAI.getInAllocaFieldIndex());
+      RV = Builder.CreateLoad(SRet, "sret");
+    }
+    break;
+
   case ABIArgInfo::Indirect: {
     switch (getEvaluationKind(RetTy)) {
     case TEK_Complex: {
@@ -1750,6 +1883,25 @@
     Ret->setDebugLoc(RetDbgLoc);
 }
 
+static bool isInAllocaArgument(CGCXXABI &ABI, QualType type) {
+  const CXXRecordDecl *RD = type->getAsCXXRecordDecl();
+  return RD && ABI.getRecordArgABI(RD) == CGCXXABI::RAA_DirectInMemory;
+}
+
+static AggValueSlot createPlaceholderSlot(CodeGenFunction &CGF, QualType Ty) {
+  // FIXME: Generate IR in one pass, rather than going back and fixing up these
+  // placeholders.
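+  // The placeholder is a load of an undef pointer-to-pointer; once the actual
+  // inalloca slot is known, EmitCall rewrites it into a GEP off the argument
+  // block via deferPlaceholderReplacement.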
+  llvm::Type *IRTy = CGF.ConvertTypeForMem(Ty);
+  llvm::Value *Placeholder =
+      llvm::UndefValue::get(IRTy->getPointerTo()->getPointerTo());
+  Placeholder = CGF.Builder.CreateLoad(Placeholder);
+  return AggValueSlot::forAddr(Placeholder, CharUnits::Zero(),
+                               Ty.getQualifiers(),
+                               AggValueSlot::IsNotDestructed,
+                               AggValueSlot::DoesNotNeedGCBarriers,
+                               AggValueSlot::IsNotAliased);
+}
+
 void CodeGenFunction::EmitDelegateCallArg(CallArgList &args,
                                           const VarDecl *param,
                                           SourceLocation loc) {
@@ -1773,6 +1925,20 @@
     return args.add(RValue::get(Builder.CreateLoad(local)), type);
   }
 
+  if (isInAllocaArgument(CGM.getCXXABI(), type)) {
+    AggValueSlot Slot = createPlaceholderSlot(*this, type);
+    Slot.setExternallyDestructed();
+
+    // FIXME: Either emit a copy constructor call, or figure out how to do
+    // guaranteed tail calls with perfect forwarding in LLVM.
+    CGM.ErrorUnsupported(param, "non-trivial argument copy for thunk");
+    EmitNullInitialization(Slot.getAddr(), type);
+
+    RValue RV = Slot.asRValue();
+    args.add(RV, type);
+    return;
+  }
+
   args.add(convertTempToRValue(local, type, loc), type);
 }
 
@@ -1852,14 +2018,13 @@
 
 static void emitWritebacks(CodeGenFunction &CGF,
                            const CallArgList &args) {
-  for (CallArgList::writeback_iterator
-         i = args.writeback_begin(), e = args.writeback_end(); i != e; ++i)
-    emitWriteback(CGF, *i);
+  for (const auto &I : args.writebacks())
+    emitWriteback(CGF, I);
 }
 
 static void deactivateArgCleanupsBeforeCall(CodeGenFunction &CGF,
                                             const CallArgList &CallArgs) {
-  assert(CGF.getTarget().getCXXABI().isArgumentDestroyedByCallee());
+  assert(CGF.getTarget().getCXXABI().areArgsDestroyedLeftToRightInCallee());
   ArrayRef<CallArgList::CallArgCleanup> Cleanups =
     CallArgs.getCleanupsToDeactivate();
   // Iterate in reverse to increase the likelihood of popping the cleanup.
@@ -2004,6 +2169,99 @@
   args.add(RValue::get(finalArgument), CRE->getType());
 }
 
+void CallArgList::allocateArgumentMemory(CodeGenFunction &CGF) {
+  assert(!StackBase && !StackCleanup.isValid());
+
+  // Save the stack.
+  llvm::Function *F = CGF.CGM.getIntrinsic(llvm::Intrinsic::stacksave);
+  StackBase = CGF.Builder.CreateCall(F, "inalloca.save");
+
+  // Control gets really tied up in landing pads, so we have to spill the
+  // stacksave to an alloca to avoid violating SSA form.
+  // TODO: This is dead if we never emit the cleanup.  We should create the
+  // alloca and store lazily on the first cleanup emission.
+  StackBaseMem = CGF.CreateTempAlloca(CGF.Int8PtrTy, "inalloca.spmem");
+  CGF.Builder.CreateStore(StackBase, StackBaseMem);
+  CGF.pushStackRestore(EHCleanup, StackBaseMem);
+  StackCleanup = CGF.EHStack.getInnermostEHScope();
+  assert(StackCleanup.isValid());
+}
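+
+// A rough sketch of the IR around an inalloca call site (assumed shape, for
+// illustration; unwinding paths omitted):
+//   %inalloca.save = call i8* @llvm.stacksave()
+//   %argmem = alloca inalloca <{ ... }>
+//   ... evaluate arguments into %argmem ...
+//   call void @f(<{ ... }>* inalloca %argmem)
+//   call void @llvm.stackrestore(i8* %inalloca.save)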
+
+void CallArgList::freeArgumentMemory(CodeGenFunction &CGF) const {
+  if (StackBase) {
+    CGF.DeactivateCleanupBlock(StackCleanup, StackBase);
+    llvm::Value *F = CGF.CGM.getIntrinsic(llvm::Intrinsic::stackrestore);
+    // We could load StackBase from StackBaseMem, but in the non-exceptional
+    // case we can skip it.
+    CGF.Builder.CreateCall(F, StackBase);
+  }
+}
+
+void CodeGenFunction::EmitCallArgs(CallArgList &Args,
+                                   ArrayRef<QualType> ArgTypes,
+                                   CallExpr::const_arg_iterator ArgBeg,
+                                   CallExpr::const_arg_iterator ArgEnd,
+                                   bool ForceColumnInfo) {
+  CGDebugInfo *DI = getDebugInfo();
+  SourceLocation CallLoc;
+  if (DI) CallLoc = DI->getLocation();
+
+  // We *have* to evaluate arguments from right to left in the MS C++ ABI,
+  // because arguments are destroyed left to right in the callee.
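+  // For example, for 'f(A(), B())' the callee destroys the A argument before
+  // the B argument, so the caller must construct B first and A second to keep
+  // destruction in reverse order of construction (f, A, B are assumed names).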
+  if (CGM.getTarget().getCXXABI().areArgsDestroyedLeftToRightInCallee()) {
+    // Insert a stack save if we're going to need any inalloca args.
+    bool HasInAllocaArgs = false;
+    for (ArrayRef<QualType>::iterator I = ArgTypes.begin(), E = ArgTypes.end();
+         I != E && !HasInAllocaArgs; ++I)
+      HasInAllocaArgs = isInAllocaArgument(CGM.getCXXABI(), *I);
+    if (HasInAllocaArgs) {
+      assert(getTarget().getTriple().getArch() == llvm::Triple::x86);
+      Args.allocateArgumentMemory(*this);
+    }
+
+    // Evaluate each argument.
+    size_t CallArgsStart = Args.size();
+    for (int I = ArgTypes.size() - 1; I >= 0; --I) {
+      CallExpr::const_arg_iterator Arg = ArgBeg + I;
+      EmitCallArg(Args, *Arg, ArgTypes[I]);
+      // Restore the debug location.
+      if (DI) DI->EmitLocation(Builder, CallLoc, ForceColumnInfo);
+    }
+
+    // Un-reverse the arguments we just evaluated so they match up with the LLVM
+    // IR function.
+    std::reverse(Args.begin() + CallArgsStart, Args.end());
+    return;
+  }
+
+  for (unsigned I = 0, E = ArgTypes.size(); I != E; ++I) {
+    CallExpr::const_arg_iterator Arg = ArgBeg + I;
+    assert(Arg != ArgEnd);
+    EmitCallArg(Args, *Arg, ArgTypes[I]);
+    // Restore the debug location.
+    if (DI) DI->EmitLocation(Builder, CallLoc, ForceColumnInfo);
+  }
+}
+
+namespace {
+
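+/// \brief EH-only cleanup for an aggregate argument that was evaluated into
+/// the callee-cleanup buffer but never handed off to the callee because we
+/// unwound before reaching the call.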
+struct DestroyUnpassedArg : EHScopeStack::Cleanup {
+  DestroyUnpassedArg(llvm::Value *Addr, QualType Ty)
+      : Addr(Addr), Ty(Ty) {}
+
+  llvm::Value *Addr;
+  QualType Ty;
+
+  void Emit(CodeGenFunction &CGF, Flags flags) override {
+    const CXXDestructorDecl *Dtor = Ty->getAsCXXRecordDecl()->getDestructor();
+    assert(!Dtor->isTrivial());
+    CGF.EmitCXXDestructorCall(Dtor, Dtor_Complete, /*for vbase*/ false,
+                              /*Delegating=*/false, Addr);
+  }
+};
+
+}
+
 void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
                                   QualType type) {
   if (const ObjCIndirectCopyRestoreExpr *CRE
@@ -2026,23 +2284,25 @@
   // In the Microsoft C++ ABI, aggregate arguments are destructed by the callee.
   // However, we still have to push an EH-only cleanup in case we unwind before
   // we make it to the call.
-  if (HasAggregateEvalKind &&
-      CGM.getTarget().getCXXABI().isArgumentDestroyedByCallee()) {
-    const CXXRecordDecl *RD = type->getAsCXXRecordDecl();
-    if (RD && RD->hasNonTrivialDestructor()) {
-      AggValueSlot Slot = CreateAggTemp(type, "agg.arg.tmp");
-      Slot.setExternallyDestructed();
-      EmitAggExpr(E, Slot);
-      RValue RV = Slot.asRValue();
-      args.add(RV, type);
+  if (HasAggregateEvalKind && args.isUsingInAlloca()) {
+    assert(getTarget().getTriple().getArch() == llvm::Triple::x86);
+    AggValueSlot Slot = createPlaceholderSlot(*this, type);
+    Slot.setExternallyDestructed();
+    EmitAggExpr(E, Slot);
+    RValue RV = Slot.asRValue();
+    args.add(RV, type);
 
-      pushDestroy(EHCleanup, RV.getAggregateAddr(), type, destroyCXXObject,
-                  /*useEHCleanupForArray*/ true);
+    const CXXRecordDecl *RD = type->getAsCXXRecordDecl();
+    if (RD->hasNonTrivialDestructor()) {
+      // Create a no-op GEP between the placeholder and the cleanup so we can
+      // RAUW it successfully.  It also serves as a marker of the first
+      // instruction where the cleanup is active.
+      pushFullExprCleanup<DestroyUnpassedArg>(EHCleanup, Slot.getAddr(), type);
       // This unreachable is a temporary marker which will be removed later.
       llvm::Instruction *IsActive = Builder.CreateUnreachable();
       args.addArgCleanupDeactivation(EHStack.getInnermostEHScope(), IsActive);
-      return;
     }
+    return;
   }
 
   if (HasAggregateEvalKind && isa<ImplicitCastExpr>(E) &&
@@ -2128,6 +2388,7 @@
     call->setCallingConv(getRuntimeCC());
     Builder.CreateUnreachable();
   }
+  PGO.setCurrentRegionUnreachable();
 }
 
 /// Emits a call or invoke instruction to the given nullary runtime
@@ -2209,9 +2470,7 @@
       const FieldDecl *LargestFD = 0;
       CharUnits UnionSize = CharUnits::Zero();
 
-      for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
-           i != e; ++i) {
-        const FieldDecl *FD = *i;
+      for (const auto *FD : RD->fields()) {
         assert(!FD->isBitField() &&
                "Cannot expand structure with bit-field members.");
         CharUnits FieldSize = getContext().getTypeSizeInChars(FD->getType());
@@ -2225,10 +2484,7 @@
         ExpandTypeToArgs(LargestFD->getType(), FldRV, Args, IRFuncTy);
       }
     } else {
-      for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
-           i != e; ++i) {
-        FieldDecl *FD = *i;
-
+      for (const auto *FD : RD->fields()) {
         RValue FldRV = EmitRValueForField(LV, FD, SourceLocation());
         ExpandTypeToArgs(FD->getType(), FldRV, Args, IRFuncTy);
       }
@@ -2251,6 +2507,20 @@
   }
 }
 
+/// \brief Store a non-aggregate value to an address to initialize it.  For
+/// initialization, a non-atomic store will be used.
+static void EmitInitStoreOfNonAggregate(CodeGenFunction &CGF, RValue Src,
+                                        LValue Dst) {
+  if (Src.isScalar())
+    CGF.EmitStoreOfScalar(Src.getScalarVal(), Dst, /*init=*/true);
+  else
+    CGF.EmitStoreOfComplex(Src.getComplexVal(), Dst, /*init=*/true);
+}
+
+void CodeGenFunction::deferPlaceholderReplacement(llvm::Instruction *Old,
+                                                  llvm::Value *New) {
+  DeferredReplacements.push_back(std::make_pair(Old, New));
+}
 
 RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
                                  llvm::Value *Callee,
@@ -2272,14 +2542,32 @@
     cast<llvm::FunctionType>(
                   cast<llvm::PointerType>(Callee->getType())->getElementType());
 
+  // If we're using inalloca, insert the allocation after the stack save.
+  // FIXME: Do this earlier rather than hacking it in here!
+  llvm::Value *ArgMemory = 0;
+  if (llvm::StructType *ArgStruct = CallInfo.getArgStruct()) {
+    llvm::AllocaInst *AI = new llvm::AllocaInst(
+        ArgStruct, "argmem", CallArgs.getStackBase()->getNextNode());
+    AI->setUsedWithInAlloca(true);
+    assert(AI->isUsedWithInAlloca() && !AI->isStaticAlloca());
+    ArgMemory = AI;
+  }
+
   // If the call returns a temporary with struct return, create a temporary
   // alloca to hold the result, unless one is given to us.
-  if (CGM.ReturnTypeUsesSRet(CallInfo)) {
-    llvm::Value *Value = ReturnValue.getValue();
-    if (!Value)
-      Value = CreateMemTemp(RetTy);
-    Args.push_back(Value);
-    checkArgMatches(Value, IRArgNo, IRFuncTy);
+  llvm::Value *SRetPtr = 0;
+  if (CGM.ReturnTypeUsesSRet(CallInfo) || RetAI.isInAlloca()) {
+    SRetPtr = ReturnValue.getValue();
+    if (!SRetPtr)
+      SRetPtr = CreateMemTemp(RetTy);
+    if (CGM.ReturnTypeUsesSRet(CallInfo)) {
+      Args.push_back(SRetPtr);
+      checkArgMatches(SRetPtr, IRArgNo, IRFuncTy);
+    } else {
+      llvm::Value *Addr =
+          Builder.CreateStructGEP(ArgMemory, RetAI.getInAllocaFieldIndex());
+      Builder.CreateStore(SRetPtr, Addr);
+    }
   }
 
   assert(CallInfo.arg_size() == CallArgs.size() &&
@@ -2299,6 +2587,35 @@
     }
 
     switch (ArgInfo.getKind()) {
+    case ABIArgInfo::InAlloca: {
+      assert(getTarget().getTriple().getArch() == llvm::Triple::x86);
+      if (RV.isAggregate()) {
+        // Replace the placeholder with the appropriate argument slot GEP.
+        llvm::Instruction *Placeholder =
+            cast<llvm::Instruction>(RV.getAggregateAddr());
+        CGBuilderTy::InsertPoint IP = Builder.saveIP();
+        Builder.SetInsertPoint(Placeholder);
+        llvm::Value *Addr = Builder.CreateStructGEP(
+            ArgMemory, ArgInfo.getInAllocaFieldIndex());
+        Builder.restoreIP(IP);
+        deferPlaceholderReplacement(Placeholder, Addr);
+      } else {
+        // Store the RValue into the argument struct.
+        llvm::Value *Addr =
+            Builder.CreateStructGEP(ArgMemory, ArgInfo.getInAllocaFieldIndex());
+        unsigned AS = Addr->getType()->getPointerAddressSpace();
+        llvm::Type *MemType = ConvertTypeForMem(I->Ty)->getPointerTo(AS);
+        // There are some cases where a trivial bitcast is not avoidable.  The
+        // definition of a type later in a translation unit may change its type
+        // from {}* to (%struct.foo*)*.
+        if (Addr->getType() != MemType)
+          Addr = Builder.CreateBitCast(Addr, MemType);
+        LValue argLV = MakeAddrLValue(Addr, I->Ty, TypeAlign);
+        EmitInitStoreOfNonAggregate(*this, RV, argLV);
+      }
+      break; // Don't increment IRArgNo!
+    }
+
     case ABIArgInfo::Indirect: {
       if (RV.isScalar() || RV.isComplex()) {
         // Make a temporary alloca to pass the argument.
@@ -2307,13 +2624,8 @@
           AI->setAlignment(ArgInfo.getIndirectAlign());
         Args.push_back(AI);
 
-        LValue argLV =
-          MakeAddrLValue(Args.back(), I->Ty, TypeAlign);
-        
-        if (RV.isScalar())
-          EmitStoreOfScalar(RV.getScalarVal(), argLV, /*init*/ true);
-        else
-          EmitStoreOfComplex(RV.getComplexVal(), argLV, /*init*/ true);
+        LValue argLV = MakeAddrLValue(Args.back(), I->Ty, TypeAlign);
+        EmitInitStoreOfNonAggregate(*this, RV, argLV);
         
         // Validate argument match.
         checkArgMatches(AI, IRArgNo, IRFuncTy);
@@ -2386,11 +2698,7 @@
       if (RV.isScalar() || RV.isComplex()) {
         SrcPtr = CreateMemTemp(I->Ty, "coerce");
         LValue SrcLV = MakeAddrLValue(SrcPtr, I->Ty, TypeAlign);
-        if (RV.isScalar()) {
-          EmitStoreOfScalar(RV.getScalarVal(), SrcLV, /*init*/ true);
-        } else {
-          EmitStoreOfComplex(RV.getComplexVal(), SrcLV, /*init*/ true);
-        }
+        EmitInitStoreOfNonAggregate(*this, RV, SrcLV);
       } else
         SrcPtr = RV.getAggregateAddr();
 
@@ -2456,6 +2764,34 @@
     }
   }
 
+  if (ArgMemory) {
+    llvm::Value *Arg = ArgMemory;
+    llvm::Type *LastParamTy =
+        IRFuncTy->getParamType(IRFuncTy->getNumParams() - 1);
+    if (Arg->getType() != LastParamTy) {
+#ifndef NDEBUG
+      // Assert that these structs have equivalent element types.
+      llvm::StructType *FullTy = CallInfo.getArgStruct();
+      llvm::StructType *Prefix = cast<llvm::StructType>(
+          cast<llvm::PointerType>(LastParamTy)->getElementType());
+
+      // For variadic functions, the caller might supply a larger struct than
+      // the callee expects, and that's OK.
+      assert(Prefix->getNumElements() == FullTy->getNumElements() ||
+             (CallInfo.isVariadic() &&
+              Prefix->getNumElements() <= FullTy->getNumElements()));
+
+      for (llvm::StructType::element_iterator PI = Prefix->element_begin(),
+                                              PE = Prefix->element_end(),
+                                              FI = FullTy->element_begin();
+           PI != PE; ++PI, ++FI)
+        assert(*PI == *FI);
+#endif
+      Arg = Builder.CreateBitCast(Arg, LastParamTy);
+    }
+    Args.push_back(Arg);
+  }
+
   if (!CallArgs.getCleanupsToDeactivate().empty())
     deactivateArgCleanupsBeforeCall(*this, CallArgs);
 
@@ -2545,9 +2881,14 @@
   if (CallArgs.hasWritebacks())
     emitWritebacks(*this, CallArgs);
 
+  // The stack cleanup for inalloca arguments has to run outside the normal
+  // lexical order, so deactivate it and run it manually here.
+  CallArgs.freeArgumentMemory(*this);
+
   switch (RetAI.getKind()) {
+  case ABIArgInfo::InAlloca:
   case ABIArgInfo::Indirect:
-    return convertTempToRValue(Args[0], RetTy, SourceLocation());
+    return convertTempToRValue(SRetPtr, RetTy, SourceLocation());
 
   case ABIArgInfo::Ignore:
     // If we are ignoring an argument that had a result, make sure to
diff --git a/lib/CodeGen/CGCall.h b/lib/CodeGen/CGCall.h
index 532cb59..2e43d1d 100644
--- a/lib/CodeGen/CGCall.h
+++ b/lib/CodeGen/CGCall.h
@@ -56,6 +56,8 @@
   class CallArgList :
     public SmallVector<CallArg, 16> {
   public:
+    CallArgList() : StackBase(0), StackBaseMem(0) {}
+
     struct Writeback {
       /// The original argument.  Note that the argument l-value
       /// is potentially null.
@@ -97,9 +99,12 @@
 
     bool hasWritebacks() const { return !Writebacks.empty(); }
 
-    typedef SmallVectorImpl<Writeback>::const_iterator writeback_iterator;
-    writeback_iterator writeback_begin() const { return Writebacks.begin(); }
-    writeback_iterator writeback_end() const { return Writebacks.end(); }
+    typedef llvm::iterator_range<SmallVectorImpl<Writeback>::const_iterator>
+      writeback_const_range;
+
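+    /// A usage sketch, mirroring emitWritebacks in CGCall.cpp:
+    ///   for (const auto &I : args.writebacks())
+    ///     emitWriteback(CGF, I);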
+    writeback_const_range writebacks() const {
+      return writeback_const_range(Writebacks.begin(), Writebacks.end());
+    }
 
     void addArgCleanupDeactivation(EHScopeStack::stable_iterator Cleanup,
                                    llvm::Instruction *IsActiveIP) {
@@ -113,6 +118,14 @@
       return CleanupsToDeactivate;
     }
 
+    void allocateArgumentMemory(CodeGenFunction &CGF);
+    llvm::Instruction *getStackBase() const { return StackBase; }
+    void freeArgumentMemory(CodeGenFunction &CGF) const;
+
+    /// \brief Returns whether we're using an inalloca struct to pass
+    /// arguments in memory.
+    bool isUsingInAlloca() const { return StackBase; }
+
   private:
     SmallVector<Writeback, 1> Writebacks;
 
@@ -120,6 +133,17 @@
     /// is used to cleanup objects that are owned by the callee once the call
     /// occurs.
     SmallVector<CallArgCleanup, 1> CleanupsToDeactivate;
+
+    /// The stacksave call.  It dominates all of the argument evaluation.
+    llvm::CallInst *StackBase;
+
+    /// The alloca holding the stackbase.  We need it to maintain SSA form.
+    llvm::AllocaInst *StackBaseMem;
+
+    /// The iterator pointing to the stack restore cleanup.  We manually run and
+    /// deactivate this cleanup after the call in the unexceptional case because
+    /// it doesn't run in the normal order.
+    EHScopeStack::stable_iterator StackCleanup;
   };
 
   /// FunctionArgList - Type for representing both the decl and type
diff --git a/lib/CodeGen/CGClass.cpp b/lib/CodeGen/CGClass.cpp
index 4848d75..071f8b3 100644
--- a/lib/CodeGen/CGClass.cpp
+++ b/lib/CodeGen/CGClass.cpp
@@ -12,10 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGBlocks.h"
+#include "CGCXXABI.h"
 #include "CGDebugInfo.h"
 #include "CGRecordLayout.h"
 #include "CodeGenFunction.h"
-#include "CGCXXABI.h"
 #include "clang/AST/CXXInheritance.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/EvaluatedExprVisitor.h"
@@ -342,7 +342,7 @@
     CallBaseDtor(const CXXRecordDecl *Base, bool BaseIsVirtual)
       : BaseClass(Base), BaseIsVirtual(BaseIsVirtual) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       const CXXRecordDecl *DerivedClass =
         cast<CXXMethodDecl>(CGF.CurCodeDecl)->getParent();
 
@@ -549,10 +549,8 @@
     // If we are initializing an anonymous union field, drill down to
     // the field.
     IndirectFieldDecl *IndirectField = MemberInit->getIndirectMember();
-    IndirectFieldDecl::chain_iterator I = IndirectField->chain_begin(),
-      IEnd = IndirectField->chain_end();
-    for ( ; I != IEnd; ++I)
-      LHS = CGF.EmitLValueForFieldInitialization(LHS, cast<FieldDecl>(*I));
+    for (const auto *I : IndirectField->chain())
+      LHS = CGF.EmitLValueForFieldInitialization(LHS, cast<FieldDecl>(I));
     FieldType = MemberInit->getIndirectMember()->getAnonField()->getType();
   } else {
     LHS = CGF.EmitLValueForFieldInitialization(LHS, Field);
@@ -699,6 +697,10 @@
   const CXXConstructorDecl *Ctor = cast<CXXConstructorDecl>(CurGD.getDecl());
   CXXCtorType CtorType = CurGD.getCtorType();
 
+  assert((CGM.getTarget().getCXXABI().hasConstructorVariants() ||
+          CtorType == Ctor_Complete) &&
+         "can only generate complete ctor for this ABI");
+
   // Before we go any further, try the complete->base constructor
   // delegation optimization.
   if (CtorType == Ctor_Complete && IsConstructorDelegationValid(Ctor) &&
@@ -717,6 +719,9 @@
   if (IsTryBody)
     EnterCXXTryStmt(*cast<CXXTryStmt>(Body), true);
 
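+  // Count the entry into the constructor body for PGO instrumentation.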
+  RegionCounter Cnt = getPGORegionCounter(Body);
+  Cnt.beginRegion(Builder);
+
   RunCleanupsScope RunCleanups(*this);
 
   // TODO: in restricted cases, we can emit the vbase initializers of
@@ -1190,23 +1195,17 @@
     return false;
 
   // Check fields.
-  for (CXXRecordDecl::field_iterator I = BaseClassDecl->field_begin(),
-       E = BaseClassDecl->field_end(); I != E; ++I) {
-    const FieldDecl *Field = *I;
-    
+  for (const auto *Field : BaseClassDecl->fields())
     if (!FieldHasTrivialDestructorBody(Context, Field))
       return false;
-  }
 
   // Check non-virtual bases.
-  for (CXXRecordDecl::base_class_const_iterator I = 
-       BaseClassDecl->bases_begin(), E = BaseClassDecl->bases_end();
-       I != E; ++I) {
-    if (I->isVirtual())
+  for (const auto &I : BaseClassDecl->bases()) {
+    if (I.isVirtual())
       continue;
 
     const CXXRecordDecl *NonVirtualBase =
-      cast<CXXRecordDecl>(I->getType()->castAs<RecordType>()->getDecl());
+      cast<CXXRecordDecl>(I.getType()->castAs<RecordType>()->getDecl());
     if (!HasTrivialDestructorBody(Context, NonVirtualBase,
                                   MostDerivedClassDecl))
       return false;
@@ -1214,11 +1213,9 @@
 
   if (BaseClassDecl == MostDerivedClassDecl) {
     // Check virtual bases.
-    for (CXXRecordDecl::base_class_const_iterator I = 
-         BaseClassDecl->vbases_begin(), E = BaseClassDecl->vbases_end();
-         I != E; ++I) {
+    for (const auto &I : BaseClassDecl->vbases()) {
       const CXXRecordDecl *VirtualBase =
-        cast<CXXRecordDecl>(I->getType()->castAs<RecordType>()->getDecl());
+        cast<CXXRecordDecl>(I.getType()->castAs<RecordType>()->getDecl());
       if (!HasTrivialDestructorBody(Context, VirtualBase,
                                     MostDerivedClassDecl))
         return false;      
@@ -1251,13 +1248,9 @@
 
   // Check the fields.
   const CXXRecordDecl *ClassDecl = Dtor->getParent();
-  for (CXXRecordDecl::field_iterator I = ClassDecl->field_begin(),
-       E = ClassDecl->field_end(); I != E; ++I) {
-    const FieldDecl *Field = *I;
-
+  for (const auto *Field : ClassDecl->fields())
     if (!FieldHasTrivialDestructorBody(Context, Field))
       return false;
-  }
 
   return true;
 }
@@ -1315,6 +1308,9 @@
   case Dtor_Base:
     assert(Body);
 
+    RegionCounter Cnt = getPGORegionCounter(Body);
+    Cnt.beginRegion(Builder);
+
     // Enter the cleanup scopes for fields and non-virtual bases.
     EnterDtorCleanups(Dtor, Dtor_Base);
 
@@ -1355,11 +1351,8 @@
   LexicalScope Scope(*this, RootCS->getSourceRange());
 
   AssignmentMemcpyizer AM(*this, AssignOp, Args);
-  for (CompoundStmt::const_body_iterator I = RootCS->body_begin(),
-                                         E = RootCS->body_end();
-       I != E; ++I) {
-    AM.emitAssignment(*I);  
-  }
+  for (auto *I : RootCS->body())
+    AM.emitAssignment(I);  
   AM.finish();
 }
 
@@ -1368,7 +1361,7 @@
   struct CallDtorDelete : EHScopeStack::Cleanup {
     CallDtorDelete() {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       const CXXDestructorDecl *Dtor = cast<CXXDestructorDecl>(CGF.CurCodeDecl);
       const CXXRecordDecl *ClassDecl = Dtor->getParent();
       CGF.EmitDeleteCall(Dtor->getOperatorDelete(), CGF.LoadCXXThis(),
@@ -1384,7 +1377,7 @@
       assert(ShouldDeleteCondition != NULL);
     }
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       llvm::BasicBlock *callDeleteBB = CGF.createBasicBlock("dtor.call_delete");
       llvm::BasicBlock *continueBB = CGF.createBasicBlock("dtor.continue");
       llvm::Value *ShouldCallDelete
@@ -1413,7 +1406,7 @@
       : field(field), destroyer(destroyer),
         useEHCleanupForArray(useEHCleanupForArray) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       // Find the address of the field.
       llvm::Value *thisValue = CGF.LoadCXXThis();
       QualType RecordTy = CGF.getContext().getTagDeclType(field->getParent());
@@ -1427,7 +1420,7 @@
   };
 }
 
-/// EmitDtorEpilogue - Emit all code that comes at the end of class's
+/// \brief Emit all code that comes at the end of the class's
 /// destructor. This is to call destructors on members and base classes
 /// in reverse order of their construction.
 void CodeGenFunction::EnterDtorCleanups(const CXXDestructorDecl *DD,
@@ -1439,7 +1432,7 @@
   // operator delete that Sema picked up.
   if (DtorType == Dtor_Deleting) {
     assert(DD->getOperatorDelete() && 
-           "operator delete missing - EmitDtorEpilogue");
+           "operator delete missing - EnterDtorCleanups");
     if (CXXStructorImplicitParamValue) {
       // If there is an implicit param to the deleting dtor, it's a boolean
       // telling whether we should call delete at the end of the dtor.
@@ -1462,10 +1455,7 @@
 
     // We push them in the forward order so that they'll be popped in
     // the reverse order.
-    for (CXXRecordDecl::base_class_const_iterator I = 
-           ClassDecl->vbases_begin(), E = ClassDecl->vbases_end();
-              I != E; ++I) {
-      const CXXBaseSpecifier &Base = *I;
+    for (const auto &Base : ClassDecl->vbases()) {
       CXXRecordDecl *BaseClassDecl
         = cast<CXXRecordDecl>(Base.getType()->getAs<RecordType>()->getDecl());
     
@@ -1484,10 +1474,7 @@
   assert(DtorType == Dtor_Base);
   
   // Destroy non-virtual bases.
-  for (CXXRecordDecl::base_class_const_iterator I = 
-        ClassDecl->bases_begin(), E = ClassDecl->bases_end(); I != E; ++I) {
-    const CXXBaseSpecifier &Base = *I;
-    
+  for (const auto &Base : ClassDecl->bases()) {
     // Ignore virtual bases.
     if (Base.isVirtual())
       continue;
@@ -1504,11 +1491,8 @@
   }
 
   // Destroy direct fields.
-  SmallVector<const FieldDecl *, 16> FieldDecls;
-  for (CXXRecordDecl::field_iterator I = ClassDecl->field_begin(),
-       E = ClassDecl->field_end(); I != E; ++I) {
-    const FieldDecl *field = *I;
-    QualType type = field->getType();
+  for (const auto *Field : ClassDecl->fields()) {
+    QualType type = Field->getType();
     QualType::DestructionKind dtorKind = type.isDestructedType();
     if (!dtorKind) continue;
 
@@ -1517,7 +1501,7 @@
     if (RT && RT->getDecl()->isAnonymousStructOrUnion()) continue;
 
     CleanupKind cleanupKind = getCleanupKind(dtorKind);
-    EHStack.pushCleanup<DestroyField>(cleanupKind, field,
+    EHStack.pushCleanup<DestroyField>(cleanupKind, Field,
                                       getDestroyer(dtorKind),
                                       cleanupKind & EHCleanup);
   }
@@ -1683,9 +1667,31 @@
     return;
   }
 
-  // Non-trivial constructors are handled in an ABI-specific manner.
-  CGM.getCXXABI().EmitConstructorCall(*this, D, Type, ForVirtualBase,
-                                      Delegating, This, ArgBeg, ArgEnd);
+  // C++11 [class.mfct.non-static]p2:
+  //   If a non-static member function of a class X is called for an object that
+  //   is not of type X, or of a type derived from X, the behavior is undefined.
+  // FIXME: Provide a source location here.
+  EmitTypeCheck(CodeGenFunction::TCK_ConstructorCall, SourceLocation(), This,
+                getContext().getRecordType(D->getParent()));
+
+  CallArgList Args;
+
+  // Push the this ptr.
+  Args.add(RValue::get(This), D->getThisType(getContext()));
+
+  // Add the rest of the user-supplied arguments.
+  const FunctionProtoType *FPT = D->getType()->castAs<FunctionProtoType>();
+  EmitCallArgs(Args, FPT, ArgBeg, ArgEnd);
+
+  // Insert any ABI-specific implicit constructor arguments.
+  unsigned ExtraArgs = CGM.getCXXABI().addImplicitConstructorArgs(
+      *this, D, Type, ForVirtualBase, Delegating, Args);
+
+  // Emit the call.
+  llvm::Value *Callee = CGM.GetAddrOfCXXConstructor(D, Type);
+  const CGFunctionInfo &Info =
+      CGM.getTypes().arrangeCXXConstructorCall(Args, D, Type, ExtraArgs);
+  EmitCall(Info, Callee, ReturnValueSlot(), Args, D);
 }
 
 void
@@ -1704,38 +1710,23 @@
   assert(D->isInstance() &&
          "Trying to emit a member call expr on a static method!");
   
-  const FunctionProtoType *FPT = D->getType()->getAs<FunctionProtoType>();
+  const FunctionProtoType *FPT = D->getType()->castAs<FunctionProtoType>();
   
   CallArgList Args;
   
   // Push the this ptr.
   Args.add(RValue::get(This), D->getThisType(getContext()));
   
-  
   // Push the src ptr.
-  QualType QT = *(FPT->arg_type_begin());
+  QualType QT = *(FPT->param_type_begin());
   llvm::Type *t = CGM.getTypes().ConvertType(QT);
   Src = Builder.CreateBitCast(Src, t);
   Args.add(RValue::get(Src), QT);
-  
+
   // Skip over first argument (Src).
-  ++ArgBeg;
-  CallExpr::const_arg_iterator Arg = ArgBeg;
-  for (FunctionProtoType::arg_type_iterator I = FPT->arg_type_begin()+1,
-       E = FPT->arg_type_end(); I != E; ++I, ++Arg) {
-    assert(Arg != ArgEnd && "Running over edge of argument list!");
-    EmitCallArg(Args, *Arg, *I);
-  }
-  // Either we've emitted all the call args, or we have a call to a
-  // variadic function.
-  assert((Arg == ArgEnd || FPT->isVariadic()) &&
-         "Extra arguments in non-variadic function!");
-  // If we still have any arguments, emit them using the type of the argument.
-  for (; Arg != ArgEnd; ++Arg) {
-    QualType ArgType = Arg->getType();
-    EmitCallArg(Args, *Arg, ArgType);
-  }
-  
+  EmitCallArgs(Args, FPT->isVariadic(), FPT->param_type_begin() + 1,
+               FPT->param_type_end(), ArgBeg + 1, ArgEnd);
+
   EmitCall(CGM.getTypes().arrangeCXXMethodCall(Args, FPT, RequiredArgs::All),
            Callee, ReturnValueSlot(), Args, D);
 }
@@ -1790,7 +1781,7 @@
                            CXXDtorType Type)
       : Dtor(D), Addr(Addr), Type(Type) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       CGF.EmitCXXDestructorCall(Dtor, Type, /*ForVirtualBase=*/false,
                                 /*Delegating=*/true, Addr);
     }
@@ -1830,23 +1821,8 @@
                                             bool ForVirtualBase,
                                             bool Delegating,
                                             llvm::Value *This) {
-  GlobalDecl GD(DD, Type);
-  llvm::Value *VTT = GetVTTParameter(GD, ForVirtualBase, Delegating);
-  llvm::Value *Callee = 0;
-  if (getLangOpts().AppleKext)
-    Callee = BuildAppleKextVirtualDestructorCall(DD, Type, 
-                                                 DD->getParent());
-    
-  if (!Callee)
-    Callee = CGM.GetAddrOfCXXDestructor(DD, Type);
-
-  if (DD->isVirtual())
-    This = CGM.getCXXABI().adjustThisArgumentForVirtualCall(*this, GD, This);
-
-  // FIXME: Provide a source location here.
-  EmitCXXMemberCall(DD, SourceLocation(), Callee, ReturnValueSlot(), This,
-                    VTT, getContext().getPointerType(getContext().VoidPtrTy),
-                    0, 0);
+  CGM.getCXXABI().EmitDestructorCall(*this, DD, Type, ForVirtualBase,
+                                     Delegating, This);
 }
 
 namespace {
@@ -1857,7 +1833,7 @@
     CallLocalDtor(const CXXDestructorDecl *D, llvm::Value *Addr)
       : Dtor(D), Addr(Addr) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       CGF.EmitCXXDestructorCall(Dtor, Dtor_Complete,
                                 /*ForVirtualBase=*/false,
                                 /*Delegating=*/false, Addr);
@@ -1944,10 +1920,9 @@
   const CXXRecordDecl *RD = Base.getBase();
 
   // Traverse bases.
-  for (CXXRecordDecl::base_class_const_iterator I = RD->bases_begin(), 
-       E = RD->bases_end(); I != E; ++I) {
+  for (const auto &I : RD->bases()) {
     CXXRecordDecl *BaseDecl
-      = cast<CXXRecordDecl>(I->getType()->getAs<RecordType>()->getDecl());
+      = cast<CXXRecordDecl>(I.getType()->getAs<RecordType>()->getDecl());
 
     // Ignore classes without a vtable.
     if (!BaseDecl->isDynamicClass())
@@ -1957,7 +1932,7 @@
     CharUnits BaseOffsetFromNearestVBase;
     bool BaseDeclIsNonVirtualPrimaryBase;
 
-    if (I->isVirtual()) {
+    if (I.isVirtual()) {
       // Check if we've visited this virtual base before.
       if (!VBases.insert(BaseDecl))
         continue;
@@ -1978,7 +1953,7 @@
     }
     
     InitializeVTablePointers(BaseSubobject(BaseDecl, BaseOffset), 
-                             I->isVirtual() ? BaseDecl : NearestVBase,
+                             I.isVirtual() ? BaseDecl : NearestVBase,
                              BaseOffsetFromNearestVBase,
                              BaseDeclIsNonVirtualPrimaryBase, 
                              VTableClass, VBases);
@@ -2127,7 +2102,7 @@
   // Prepare the return slot.
   const FunctionProtoType *FPT =
     callOperator->getType()->castAs<FunctionProtoType>();
-  QualType resultType = FPT->getResultType();
+  QualType resultType = FPT->getReturnType();
   ReturnValueSlot returnSlot;
   if (!resultType->isVoidType() &&
       calleeFnInfo.getReturnInfo().getKind() == ABIArgInfo::Indirect &&
@@ -2162,11 +2137,9 @@
   CallArgs.add(RValue::get(ThisPtr), ThisType);
 
   // Add the rest of the parameters.
-  for (BlockDecl::param_const_iterator I = BD->param_begin(),
-       E = BD->param_end(); I != E; ++I) {
-    ParmVarDecl *param = *I;
+  for (auto param : BD->params())
     EmitDelegateCallArg(CallArgs, param, param->getLocStart());
-  }
+
   assert(!Lambda->isGenericLambda() && 
             "generic lambda interconversion to block not implemented");
   EmitForwardingCallToLambda(Lambda->getLambdaCallOperator(), CallArgs);
@@ -2194,11 +2167,9 @@
   CallArgs.add(RValue::get(ThisPtr), ThisType);
 
   // Add the rest of the parameters.
-  for (FunctionDecl::param_const_iterator I = MD->param_begin(),
-       E = MD->param_end(); I != E; ++I) {
-    ParmVarDecl *param = *I;
-    EmitDelegateCallArg(CallArgs, param, param->getLocStart());
-  }
+  for (auto Param : MD->params())
+    EmitDelegateCallArg(CallArgs, Param, Param->getLocStart());
+
   const CXXMethodDecl *CallOp = Lambda->getLambdaCallOperator();
   // For a generic lambda, find the corresponding call operator specialization
   // to which the call to the static-invoker shall be forwarded.
diff --git a/lib/CodeGen/CGCleanup.cpp b/lib/CodeGen/CGCleanup.cpp
index 65de4d4..8748224 100644
--- a/lib/CodeGen/CGCleanup.cpp
+++ b/lib/CodeGen/CGCleanup.cpp
@@ -528,7 +528,7 @@
   llvm::BasicBlock *unreachableBB = CGF.getUnreachableBlock();
   for (llvm::BasicBlock::use_iterator
          i = entry->use_begin(), e = entry->use_end(); i != e; ) {
-    llvm::Use &use = i.getUse();
+    llvm::Use &use = *i;
     ++i;
 
     use.set(unreachableBB);
@@ -860,7 +860,9 @@
 
   // Emit the EH cleanup if required.
   if (RequiresEHCleanup) {
-    if (CGDebugInfo *DI = getDebugInfo())
+    CGDebugInfo *DI = getDebugInfo();
+    SaveAndRestoreLocation AutoRestoreLocation(*this, Builder);
+    if (DI)
       DI->EmitLocation(Builder, CurEHLocation);
 
     CGBuilderTy::InsertPoint SavedIP = Builder.saveAndClearIP();
diff --git a/lib/CodeGen/CGCleanup.h b/lib/CodeGen/CGCleanup.h
index 1bd6bba..fed75bc 100644
--- a/lib/CodeGen/CGCleanup.h
+++ b/lib/CodeGen/CGCleanup.h
@@ -194,6 +194,15 @@
     return getHandlers()[I];
   }
 
+  // Clear all handler blocks.
+  // FIXME: it's better to always call clearHandlerBlocks in the destructor
+  // and have a 'takeHandler' or some such function which removes ownership
+  // from the EHCatchScope object if handlers should outlive the EHCatchScope.
+  void clearHandlerBlocks() {
+    for (unsigned I = 0, N = getNumHandlers(); I != N; ++I)
+      delete getHandler(I).Block;
+  }
+
   typedef const Handler *iterator;
   iterator begin() const { return getHandlers(); }
   iterator end() const { return getHandlers() + getNumHandlers(); }
diff --git a/lib/CodeGen/CGDebugInfo.cpp b/lib/CodeGen/CGDebugInfo.cpp
index fcb26f0..0e94b51 100644
--- a/lib/CodeGen/CGDebugInfo.cpp
+++ b/lib/CodeGen/CGDebugInfo.cpp
@@ -37,7 +37,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"

+#include "llvm/Support/Path.h"
 using namespace clang;
 using namespace clang::CodeGen;
 
@@ -52,30 +52,35 @@
          "Region stack mismatch, stack not empty!");
 }
 
-
-NoLocation::NoLocation(CodeGenFunction &CGF, CGBuilderTy &B)
-  : DI(CGF.getDebugInfo()), Builder(B) {
+SaveAndRestoreLocation::SaveAndRestoreLocation(CodeGenFunction &CGF,
+                                               CGBuilderTy &B)
+    : DI(CGF.getDebugInfo()), Builder(B) {
   if (DI) {
     SavedLoc = DI->getLocation();
     DI->CurLoc = SourceLocation();
-    Builder.SetCurrentDebugLocation(llvm::DebugLoc());
   }
 }
 
+SaveAndRestoreLocation::~SaveAndRestoreLocation() {
+  if (DI)
+    DI->EmitLocation(Builder, SavedLoc);
+}
+
+NoLocation::NoLocation(CodeGenFunction &CGF, CGBuilderTy &B)
+  : SaveAndRestoreLocation(CGF, B) {
+  if (DI)
+    Builder.SetCurrentDebugLocation(llvm::DebugLoc());
+}
+
 NoLocation::~NoLocation() {
-  if (DI) {
+  if (DI)
     assert(Builder.getCurrentDebugLocation().isUnknown());
-    DI->CurLoc = SavedLoc;
-  }
 }
 
 ArtificialLocation::ArtificialLocation(CodeGenFunction &CGF, CGBuilderTy &B)
-  : DI(CGF.getDebugInfo()), Builder(B) {
-  if (DI) {
-    SavedLoc = DI->getLocation();
-    DI->CurLoc = SourceLocation();
+  : SaveAndRestoreLocation(CGF, B) {
+  if (DI)
     Builder.SetCurrentDebugLocation(llvm::DebugLoc());
-  }
 }
 
 void ArtificialLocation::Emit() {
@@ -91,10 +96,8 @@
 }
 
 ArtificialLocation::~ArtificialLocation() {
-  if (DI) {
+  if (DI)
     assert(Builder.getCurrentDebugLocation().getLine() == 0);
-    DI->CurLoc = SavedLoc;
-  }
 }
 
 void CGDebugInfo::setLocation(SourceLocation Loc) {
@@ -225,34 +228,20 @@
 /// getClassName - Get class name including template argument list.
 StringRef
 CGDebugInfo::getClassName(const RecordDecl *RD) {
-  const ClassTemplateSpecializationDecl *Spec
-    = dyn_cast<ClassTemplateSpecializationDecl>(RD);
-  if (!Spec)
+  // Quick optimization to avoid having to intern strings that are already
+  // stored reliably elsewhere.
+  if (!isa<ClassTemplateSpecializationDecl>(RD))
     return RD->getName();
 
-  const TemplateArgument *Args;
-  unsigned NumArgs;
-  if (TypeSourceInfo *TAW = Spec->getTypeAsWritten()) {
-    const TemplateSpecializationType *TST =
-      cast<TemplateSpecializationType>(TAW->getType());
-    Args = TST->getArgs();
-    NumArgs = TST->getNumArgs();
-  } else {
-    const TemplateArgumentList &TemplateArgs = Spec->getTemplateArgs();
-    Args = TemplateArgs.data();
-    NumArgs = TemplateArgs.size();
-  }
-  StringRef Name = RD->getIdentifier()->getName();
-  PrintingPolicy Policy(CGM.getLangOpts());
-  SmallString<128> TemplateArgList;
+  SmallString<128> Name;
   {
-    llvm::raw_svector_ostream OS(TemplateArgList);
-    TemplateSpecializationType::PrintTemplateArgumentList(OS, Args, NumArgs,
-                                                          Policy);
+    llvm::raw_svector_ostream OS(Name);
+    RD->getNameForDiagnostic(OS, CGM.getContext().getPrintingPolicy(),
+                             /*Qualified*/ false);
   }
 
   // Copy this name on the side and use its reference.
-  return internString(Name, TemplateArgList);
+  return internString(Name);
 }
 
 /// getOrCreateFile - Get the file debug info descriptor for the input location.
@@ -342,9 +331,9 @@
   if (const FileEntry *MainFile = SM.getFileEntryForID(SM.getMainFileID())) {
     MainFileDir = MainFile->getDir()->getName();
     if (MainFileDir != ".") {
-      llvm::SmallString<1024> MainFileDirSS(MainFileDir);
-      llvm::sys::path::append(MainFileDirSS, MainFileName);
-      MainFileName = MainFileDirSS.str();
+      llvm::SmallString<1024> MainFileDirSS(MainFileDir);
+      llvm::sys::path::append(MainFileDirSS, MainFileName);
+      MainFileName = MainFileDirSS.str();
     }
   }
 
@@ -379,10 +368,12 @@
 
   // Create new compile unit.
   // FIXME - Eliminate TheCU.
-  TheCU = DBuilder.createCompileUnit(LangTag, Filename, getCurrentDirname(),
-                                     Producer, LO.Optimize,
-                                     CGM.getCodeGenOpts().DwarfDebugFlags,
-                                     RuntimeVers, SplitDwarfFilename);
+  TheCU = DBuilder.createCompileUnit(
+      LangTag, Filename, getCurrentDirname(), Producer, LO.Optimize,
+      CGM.getCodeGenOpts().DwarfDebugFlags, RuntimeVers, SplitDwarfFilename,
+      DebugKind == CodeGenOptions::DebugLineTablesOnly
+          ? llvm::DIBuilder::LineTablesOnly
+          : llvm::DIBuilder::FullDebug);
 }
 
 /// CreateType - Get the Basic type from the cache or create a new
@@ -736,14 +727,16 @@
     return llvm::DIType();
   // We don't set size information, but do specify where the typedef was
   // declared.
-  unsigned Line = getLineNumber(Ty->getDecl()->getLocation());
+  SourceLocation Loc = Ty->getDecl()->getLocation();
+  llvm::DIFile File = getOrCreateFile(Loc);
+  unsigned Line = getLineNumber(Loc);
   const TypedefNameDecl *TyDecl = Ty->getDecl();
 
   llvm::DIDescriptor TypedefContext =
     getContextDescriptor(cast<Decl>(Ty->getDecl()->getDeclContext()));
 
   return
-    DBuilder.createTypedef(Src, TyDecl->getName(), Unit, Line, TypedefContext);
+    DBuilder.createTypedef(Src, TyDecl->getName(), File, Line, TypedefContext);
 }
 
 llvm::DIType CGDebugInfo::CreateType(const FunctionType *Ty,
@@ -751,15 +744,17 @@
   SmallVector<llvm::Value *, 16> EltTys;
 
   // Add the result type at least.
-  EltTys.push_back(getOrCreateType(Ty->getResultType(), Unit));
+  EltTys.push_back(getOrCreateType(Ty->getReturnType(), Unit));
 
   // Set up remainder of arguments if there is a prototype.
-  // FIXME: IF NOT, HOW IS THIS REPRESENTED?  llvm-gcc doesn't represent '...'!
+  // Otherwise, emit it as a variadic function.
   if (isa<FunctionNoProtoType>(Ty))
     EltTys.push_back(DBuilder.createUnspecifiedParameter());
   else if (const FunctionProtoType *FPT = dyn_cast<FunctionProtoType>(Ty)) {
-    for (unsigned i = 0, e = FPT->getNumArgs(); i != e; ++i)
-      EltTys.push_back(getOrCreateType(FPT->getArgType(i), Unit));
+    for (unsigned i = 0, e = FPT->getNumParams(); i != e; ++i)
+      EltTys.push_back(getOrCreateType(FPT->getParamType(i), Unit));
+    if (FPT->isVariadic())
+      EltTys.push_back(DBuilder.createUnspecifiedParameter());
   }
 
   llvm::DIArray EltTypeArray = DBuilder.getOrCreateArray(EltTys);
@@ -784,7 +779,7 @@
   uint64_t sizeInBits = 0;
   unsigned alignInBits = 0;
   if (!type->isIncompleteArrayType()) {
-    llvm::tie(sizeInBits, alignInBits) = CGM.getContext().getTypeInfo(type);
+    std::tie(sizeInBits, alignInBits) = CGM.getContext().getTypeInfo(type);
 
     if (sizeInBitsOverride)
       sizeInBits = sizeInBitsOverride;
@@ -926,9 +921,8 @@
 
     // Static and non-static members should appear in the same order as
     // the corresponding declarations in the source program.
-    for (RecordDecl::decl_iterator I = record->decls_begin(),
-           E = record->decls_end(); I != E; ++I)
-      if (const VarDecl *V = dyn_cast<VarDecl>(*I)) {
+    for (const auto *I : record->decls())
+      if (const auto *V = dyn_cast<VarDecl>(I)) {
         // Reuse the existing static member declaration if one exists
         llvm::DenseMap<const Decl *, llvm::WeakVH>::iterator MI =
             StaticDataMemberCache.find(V->getCanonicalDecl());
@@ -939,7 +933,7 @@
               llvm::DIDerivedType(cast<llvm::MDNode>(MI->second)));
         } else
           elements.push_back(CreateRecordStaticField(V, RecordTy));
-      } else if (FieldDecl *field = dyn_cast<FieldDecl>(*I)) {
+      } else if (const auto *field = dyn_cast<FieldDecl>(I)) {
         CollectRecordNormalField(field, layout.getFieldOffset(fieldNo),
                                  tunit, elements, RecordTy);
 
@@ -1005,7 +999,13 @@
 
   llvm::DIArray EltTypeArray = DBuilder.getOrCreateArray(Elts);
 
-  return DBuilder.createSubroutineType(Unit, EltTypeArray);
+  unsigned Flags = 0;
+  if (Func->getExtProtoInfo().RefQualifier == RQ_LValue)
+    Flags |= llvm::DIDescriptor::FlagLValueReference;
+  if (Func->getExtProtoInfo().RefQualifier == RQ_RValue)
+    Flags |= llvm::DIDescriptor::FlagRValueReference;
+
+  return DBuilder.createSubroutineType(Unit, EltTypeArray, Flags);
 }
 
 /// isFunctionLocalClass - Return true if CXXRecordDecl is defined
@@ -1084,6 +1084,10 @@
   }
   if (Method->hasPrototype())
     Flags |= llvm::DIDescriptor::FlagPrototyped;
+  if (Method->getRefQualifier() == RQ_LValue)
+    Flags |= llvm::DIDescriptor::FlagLValueReference;
+  if (Method->getRefQualifier() == RQ_RValue)
+    Flags |= llvm::DIDescriptor::FlagRValueReference;
 
   llvm::DIArray TParamsArray = CollectFunctionTemplateParams(Method, Unit);
   llvm::DISubprogram SP =
@@ -1111,9 +1115,8 @@
   // Since we want more than just the individual member decls if we
   // have templated functions iterate over every declaration to gather
   // the functions.
-  for(DeclContext::decl_iterator I = RD->decls_begin(),
-        E = RD->decls_end(); I != E; ++I) {
-    if (const CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(*I)) {
+  for(const auto *I : RD->decls()) {
+    if (const auto *Method = dyn_cast<CXXMethodDecl>(I)) {
       // Reuse the existing member function declaration if it exists.
       // It may be associated with the declaration of the type & should be
       // reused as we're building the definition.
@@ -1130,16 +1133,13 @@
           EltTys.push_back(CreateCXXMemberFunction(Method, Unit, RecordTy));
       } else
         EltTys.push_back(MI->second);
-    } else if (const FunctionTemplateDecl *FTD =
-                   dyn_cast<FunctionTemplateDecl>(*I)) {
+    } else if (const auto *FTD = dyn_cast<FunctionTemplateDecl>(I)) {
       // Add any template specializations that have already been seen. Like
       // implicit member functions, these may have been added to a declaration
       // in the case of vtable-based debug info reduction.
-      for (FunctionTemplateDecl::spec_iterator SI = FTD->spec_begin(),
-                                               SE = FTD->spec_end();
-           SI != SE; ++SI) {
+      for (const auto *SI : FTD->specializations()) {
         llvm::DenseMap<const FunctionDecl *, llvm::WeakVH>::iterator MI =
-            SPCache.find(cast<CXXMethodDecl>(*SI)->getCanonicalDecl());
+            SPCache.find(cast<CXXMethodDecl>(SI)->getCanonicalDecl());
         if (MI != SPCache.end())
           EltTys.push_back(MI->second);
       }
@@ -1156,15 +1156,14 @@
                 llvm::DIType RecordTy) {
 
   const ASTRecordLayout &RL = CGM.getContext().getASTRecordLayout(RD);
-  for (CXXRecordDecl::base_class_const_iterator BI = RD->bases_begin(),
-         BE = RD->bases_end(); BI != BE; ++BI) {
+  for (const auto &BI : RD->bases()) {
     unsigned BFlags = 0;
     uint64_t BaseOffset;
 
     const CXXRecordDecl *Base =
-      cast<CXXRecordDecl>(BI->getType()->getAs<RecordType>()->getDecl());
+      cast<CXXRecordDecl>(BI.getType()->getAs<RecordType>()->getDecl());
 
-    if (BI->isVirtual()) {
+    if (BI.isVirtual()) {
       // The virtual base offset offset is negative. The code generator
       // emits a DWARF expression where it expects a positive number.
       BaseOffset =
@@ -1176,7 +1175,7 @@
     // FIXME: Inconsistent units for BaseOffset. It is in bytes when
     // BI.isVirtual() and bits when not.
 
-    AccessSpecifier Access = BI->getAccessSpecifier();
+    AccessSpecifier Access = BI.getAccessSpecifier();
     if (Access == clang::AS_private)
       BFlags |= llvm::DIDescriptor::FlagPrivate;
     else if (Access == clang::AS_protected)
@@ -1184,7 +1183,7 @@
 
     llvm::DIType DTy =
       DBuilder.createInheritance(RecordTy,
-                                 getOrCreateType(BI->getType(), Unit),
+                                 getOrCreateType(BI.getType(), Unit),
                                  BaseOffset, BFlags);
     EltTys.push_back(DTy);
   }
@@ -1239,7 +1238,7 @@
         V = CGM.GetAddrOfFunction(FD);
       // Member data pointers have special handling too to compute the fixed
       // offset within the object.
-      if (isa<FieldDecl>(D)) {
+      if (isa<FieldDecl>(D) || isa<IndirectFieldDecl>(D)) {
         // These five lines (& possibly the above member function pointer
         // handling) might be able to be refactored to use similar code in
         // CodeGenModule::getMemberPointerConstant
@@ -1416,6 +1415,9 @@
 }
 
 void CGDebugInfo::completeRequiredType(const RecordDecl *RD) {
+  if (DebugKind <= CodeGenOptions::DebugLineTablesOnly)
+    return;
+
   if (const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(RD))
     if (CXXDecl->isDynamicClass())
       return;
@@ -1439,32 +1441,57 @@
   TypeCache[TyPtr] = Res;
 }
 
+static bool hasExplicitMemberDefinition(CXXRecordDecl::method_iterator I,
+                                        CXXRecordDecl::method_iterator End) {
+  for (; I != End; ++I)
+    if (FunctionDecl *Tmpl = I->getInstantiatedFromMemberFunction())
+      if (!Tmpl->isImplicit() && Tmpl->isThisDeclarationADefinition() &&
+          !I->getMemberSpecializationInfo()->isExplicitSpecialization())
+        return true;
+  return false;
+}
+
+static bool shouldOmitDefinition(CodeGenOptions::DebugInfoKind DebugKind,
+                                 const RecordDecl *RD,
+                                 const LangOptions &LangOpts) {
+  if (DebugKind > CodeGenOptions::LimitedDebugInfo)
+    return false;
+
+  if (!LangOpts.CPlusPlus)
+    return false;
+
+  if (!RD->isCompleteDefinitionRequired())
+    return true;
+
+  const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(RD);
+
+  if (!CXXDecl)
+    return false;
+
+  if (CXXDecl->hasDefinition() && CXXDecl->isDynamicClass())
+    return true;
+
+  TemplateSpecializationKind Spec = TSK_Undeclared;
+  if (const ClassTemplateSpecializationDecl *SD =
+          dyn_cast<ClassTemplateSpecializationDecl>(RD))
+    Spec = SD->getSpecializationKind();
+
+  if (Spec == TSK_ExplicitInstantiationDeclaration &&
+      hasExplicitMemberDefinition(CXXDecl->method_begin(),
+                                  CXXDecl->method_end()))
+    return true;
+
+  return false;
+}
+
 /// CreateType - get structure or union type.
 llvm::DIType CGDebugInfo::CreateType(const RecordType *Ty) {
   RecordDecl *RD = Ty->getDecl();
-  const CXXRecordDecl *CXXDecl = dyn_cast<CXXRecordDecl>(RD);
-  // Always emit declarations for types that aren't required to be complete when
-  // in limit-debug-info mode. If the type is later found to be required to be
-  // complete this declaration will be upgraded to a definition by
-  // `completeRequiredType`.
-  // If the type is dynamic, only emit the definition in TUs that require class
-  // data. This is handled by `completeClassData`.
   llvm::DICompositeType T(getTypeOrNull(QualType(Ty, 0)));
-  // If we've already emitted the type, just use that, even if it's only a
-  // declaration. The completeType, completeRequiredType, and completeClassData
-  // callbacks will handle promoting the declaration to a definition.
-  if (T ||
-      (DebugKind <= CodeGenOptions::LimitedDebugInfo &&
-       // Under -flimit-debug-info, emit only a declaration unless the type is
-       // required to be complete.
-       !RD->isCompleteDefinitionRequired() && CGM.getLangOpts().CPlusPlus) ||
-      // If the class is dynamic, only emit a declaration. A definition will be
-      // emitted whenever the vtable is emitted.
-      (CXXDecl && CXXDecl->hasDefinition() && CXXDecl->isDynamicClass()) || T) {
-    llvm::DIDescriptor FDContext =
-      getContextDescriptor(cast<Decl>(RD->getDeclContext()));
+  if (T || shouldOmitDefinition(DebugKind, RD, CGM.getLangOpts())) {
     if (!T)
-      T = getOrCreateRecordFwdDecl(Ty, FDContext);
+      T = getOrCreateRecordFwdDecl(
+          Ty, getContextDescriptor(cast<Decl>(RD->getDeclContext())));
     return T;
   }
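
Taken together, the hunks above fold the ad-hoc conditions that used to live in CreateType into the shouldOmitDefinition helper. A hedged sketch of the user-visible policy (the class below is illustrative, not from this patch):

// Illustrative only: under -flimit-debug-info, a dynamic class that this
// TU never requires to be complete gets only a forward declaration; the
// full DWARF definition is emitted alongside the vtable elsewhere.
struct Dyn {
  virtual ~Dyn();    // dynamic class: definition omitted here
};
void use(Dyn *d);     // pointer use never forces a complete type
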
 
@@ -1625,9 +1652,7 @@
   }
 
   // Create entries for all of the properties.
-  for (ObjCContainerDecl::prop_iterator I = ID->prop_begin(),
-         E = ID->prop_end(); I != E; ++I) {
-    const ObjCPropertyDecl *PD = *I;
+  for (const auto *PD : ID->properties()) {
     SourceLocation Loc = PD->getLocation();
     llvm::DIFile PUnit = getOrCreateFile(Loc);
     unsigned PLine = getLineNumber(Loc);
@@ -1829,11 +1854,13 @@
   if (!Ty->getPointeeType()->isFunctionType())
     return DBuilder.createMemberPointerType(
         getOrCreateType(Ty->getPointeeType(), U), ClassType);
+
+  const FunctionProtoType *FPT =
+    Ty->getPointeeType()->getAs<FunctionProtoType>();
   return DBuilder.createMemberPointerType(getOrCreateInstanceMethodType(
-      CGM.getContext().getPointerType(
-          QualType(Ty->getClass(), Ty->getPointeeType().getCVRQualifiers())),
-      Ty->getPointeeType()->getAs<FunctionProtoType>(), U),
-                                          ClassType);
+      CGM.getContext().getPointerType(QualType(Ty->getClass(),
+                                               FPT->getTypeQuals())),
+      FPT, U), ClassType);
 }
 
 llvm::DIType CGDebugInfo::CreateType(const AtomicType *Ty,
@@ -1871,9 +1898,7 @@
   // Create DIEnumerator elements for each enumerator.
   SmallVector<llvm::Value *, 16> Enumerators;
   ED = ED->getDefinition();
-  for (EnumDecl::enumerator_iterator
-         Enum = ED->enumerator_begin(), EnumEnd = ED->enumerator_end();
-       Enum != EnumEnd; ++Enum) {
+  for (const auto *Enum : ED->enumerators()) {
     Enumerators.push_back(
       DBuilder.createEnumerator(Enum->getName(),
                                 Enum->getInitVal().getSExtValue()));
@@ -1994,6 +2019,17 @@
   return llvm::DIType(cast_or_null<llvm::MDNode>(V));
 }
 
+void CGDebugInfo::completeTemplateDefinition(
+    const ClassTemplateSpecializationDecl &SD) {
+  if (DebugKind <= CodeGenOptions::DebugLineTablesOnly)
+    return;
+
+  completeClassData(&SD);
+  // In case this type has no member function definitions being emitted, ensure
+  // it is retained.
+  RetainedTypes.push_back(CGM.getContext().getRecordType(&SD).getAsOpaquePtr());
+}
+
 /// getCachedInterfaceTypeOrNull - Get the type from the interface
 /// cache, unless it needs to be regenerated. Otherwise return null.
 llvm::Value *CGDebugInfo::getCachedInterfaceTypeOrNull(QualType Ty) {
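
The new completeTemplateDefinition hook covers explicit instantiations whose members may never be emitted in the current TU. A hedged example of the case it exists for (the template below is illustrative):

// Illustrative only: an explicit instantiation must keep its complete
// debug type alive even when no member of the specialization is emitted
// in this TU, hence the RetainedTypes push above.
template <typename T> struct Box { T value; T get() const { return value; } };
template struct Box<int>;   // reaches completeTemplateDefinition(Box<int>)
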
@@ -2123,10 +2159,11 @@
     return CreateType(cast<ComplexType>(Ty));
   case Type::Pointer:
     return CreateType(cast<PointerType>(Ty), Unit);
+  case Type::Adjusted:
   case Type::Decayed:
-    // Decayed types are just pointers in LLVM and DWARF.
+    // Decayed and adjusted types use the adjusted type in LLVM and DWARF.
     return CreateType(
-        cast<PointerType>(cast<DecayedType>(Ty)->getDecayedType()), Unit);
+        cast<PointerType>(cast<AdjustedType>(Ty)->getAdjustedType()), Unit);
   case Type::BlockPointer:
     return CreateType(cast<BlockPointerType>(Ty), Unit);
   case Type::Typedef:
@@ -2233,9 +2270,10 @@
   if (T && (!T.isForwardDecl() || !RD->getDefinition()))
       return T;
 
-  // If this is just a forward declaration, construct an appropriately
-  // marked node and just return it.
-  if (!RD->getDefinition())
+  // If this is just a forward or incomplete declaration, construct an
+  // appropriately marked node and just return it.
+  const RecordDecl *D = RD->getDefinition();
+  if (!D || !D->isCompleteDefinition())
     return getOrCreateRecordFwdDecl(Ty, RDContext);
 
   uint64_t Size = CGM.getContext().getTypeSize(Ty);
@@ -2277,7 +2315,7 @@
   llvm::DICompositeType ContainingType;
   const ASTRecordLayout &RL = CGM.getContext().getASTRecordLayout(RD);
   if (const CXXRecordDecl *PBase = RL.getPrimaryBase()) {
-    // Seek non virtual primary base root.
+    // Seek non-virtual primary base root.
     while (1) {
       const ASTRecordLayout &BRL = CGM.getContext().getASTRecordLayout(PBase);
       const CXXRecordDecl *PBT = BRL.getPrimaryBase();
@@ -2309,7 +2347,7 @@
   return Ty;
 }
 
-llvm::DIDescriptor CGDebugInfo::getDeclarationOrDefinition(const Decl *D) {
+llvm::DIScope CGDebugInfo::getDeclarationOrDefinition(const Decl *D) {
   // We only need a declaration (not a definition) of the type - so use whatever
   // we would otherwise do to get a type for a pointee. (forward declarations in
   // limited debug info, full definitions (if the type definition is available)
@@ -2327,9 +2365,9 @@
   llvm::DenseMap<const Decl *, llvm::WeakVH>::iterator I =
       DeclCache.find(D->getCanonicalDecl());
   if (I == DeclCache.end())
-    return llvm::DIDescriptor();
+    return llvm::DIScope();
   llvm::Value *V = I->second;
-  return llvm::DIDescriptor(dyn_cast_or_null<llvm::MDNode>(V));
+  return llvm::DIScope(dyn_cast_or_null<llvm::MDNode>(V));
 }
 
 /// getFunctionDeclaration - Return debug info descriptor to describe method
@@ -2352,7 +2390,6 @@
       llvm::DICompositeType T(S);
       llvm::DISubprogram SP =
           CreateCXXMemberFunction(MD, getOrCreateFile(MD->getLocation()), T);
-      T.addMember(SP);
       return SP;
     }
   }
@@ -2363,9 +2400,7 @@
       return SP;
   }
 
-  for (FunctionDecl::redecl_iterator I = FD->redecls_begin(),
-         E = FD->redecls_end(); I != E; ++I) {
-    const FunctionDecl *NextFD = *I;
+  for (auto NextFD : FD->redecls()) {
     llvm::DenseMap<const FunctionDecl *, llvm::WeakVH>::iterator
       MI = SPCache.find(NextFD->getCanonicalDecl());
     if (MI != SPCache.end()) {
@@ -2397,7 +2432,7 @@
     SmallVector<llvm::Value *, 16> Elts;
 
     // First element is always return type. For 'void' functions it is NULL.
-    QualType ResultTy = OMethod->getResultType();
+    QualType ResultTy = OMethod->getReturnType();
 
     // Replace the instancetype keyword with the actual type.
     if (ResultTy == CGM.getContext().getObjCInstanceType())
@@ -2413,13 +2448,27 @@
     llvm::DIType CmdTy = getOrCreateType(OMethod->getCmdDecl()->getType(), F);
     Elts.push_back(DBuilder.createArtificialType(CmdTy));
     // Get rest of the arguments.
-    for (ObjCMethodDecl::param_const_iterator PI = OMethod->param_begin(),
-           PE = OMethod->param_end(); PI != PE; ++PI)
-      Elts.push_back(getOrCreateType((*PI)->getType(), F));
+    for (const auto *PI : OMethod->params())
+      Elts.push_back(getOrCreateType(PI->getType(), F));
 
     llvm::DIArray EltTypeArray = DBuilder.getOrCreateArray(Elts);
     return DBuilder.createSubroutineType(F, EltTypeArray);
   }
+
+  // Handle variadic function types; they need an additional
+  // unspecified parameter.
+  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D))
+    if (FD->isVariadic()) {
+      SmallVector<llvm::Value *, 16> EltTys;
+      EltTys.push_back(getOrCreateType(FD->getReturnType(), F));
+      if (const FunctionProtoType *FPT = dyn_cast<FunctionProtoType>(FnType))
+        for (unsigned i = 0, e = FPT->getNumParams(); i != e; ++i)
+          EltTys.push_back(getOrCreateType(FPT->getParamType(i), F));
+      EltTys.push_back(DBuilder.createUnspecifiedParameter());
+      llvm::DIArray EltTypeArray = DBuilder.getOrCreateArray(EltTys);
+      return DBuilder.createSubroutineType(F, EltTypeArray);
+    }
+
   return llvm::DICompositeType(getOrCreateType(FnType, F));
 }
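
The variadic branch above terminates the parameter list with an unspecified parameter so debuggers know the signature is open-ended. A hedged sketch of the element list it builds for a declaration like int sum(int count, ...), reusing the same DIBuilder calls as the hunk (IntDITy stands in for an already-built basic type):

// Sketch only: the resulting element array is { int, int, <unspecified> }.
SmallVector<llvm::Value *, 4> EltTys;
EltTys.push_back(IntDITy);                                // return type
EltTys.push_back(IntDITy);                                // 'count'
EltTys.push_back(DBuilder.createUnspecifiedParameter());  // the '...'
llvm::DIArray Elts = DBuilder.getOrCreateArray(EltTys);
return DBuilder.createSubroutineType(F, Elts);
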
 
@@ -2434,14 +2483,25 @@
   FnBeginRegionCount.push_back(LexicalBlockStack.size());
 
   const Decl *D = GD.getDecl();
-  // Function may lack declaration in source code if it is created by Clang
-  // CodeGen (examples: _GLOBAL__I_a, __cxx_global_array_dtor, thunk).
+
+  // Use the location of the start of the function to determine where
+  // the function definition is located. By default, use the location
+  // of the declaration as the location for the subprogram. A function
+  // may lack a declaration in the source code if it is created by
+  // CodeGen (examples: _GLOBAL__I_a, __cxx_global_array_dtor, thunk).
   bool HasDecl = (D != 0);
-  // Use the location of the declaration.
   SourceLocation Loc;
-  if (HasDecl)
+  if (HasDecl) {
     Loc = D->getLocation();
 
+    // If this is a function specialization then use the pattern body
+    // as the location for the function.
+    if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D))
+      if (const FunctionDecl *SpecDecl = FD->getTemplateInstantiationPattern())
+        if (SpecDecl->hasBody(SpecDecl))
+          Loc = SpecDecl->getLocation();
+  }
+
   unsigned Flags = 0;
   llvm::DIFile Unit = getOrCreateFile(Loc);
   llvm::DIDescriptor FDContext(Unit);
@@ -2504,6 +2564,11 @@
   if (!HasDecl || D->isImplicit())
     Flags |= llvm::DIDescriptor::FlagArtificial;
 
+  // FIXME: The function declaration we're constructing here is mostly reusing
+  // declarations from CXXMethodDecl and not constructing new ones for arbitrary
+  // FunctionDecls. When/if we fix this we can have FDContext be TheCU/null for
+  // all subprograms instead of the actual context since subprogram definitions
+  // are emitted as CU level entities by the backend.
   llvm::DISubprogram SP =
       DBuilder.createFunction(FDContext, Name, LinkageName, Unit, LineNo,
                               getOrCreateFunctionType(D, FnType, Unit),
@@ -2514,9 +2579,10 @@
   if (HasDecl)
     DeclCache.insert(std::make_pair(D->getCanonicalDecl(), llvm::WeakVH(SP)));
 
-  // Push function on region stack.
+  // Push the function onto the lexical block stack.
   llvm::MDNode *SPN = SP;
   LexicalBlockStack.push_back(SPN);
+
   if (HasDecl)
     RegionMap[D] = llvm::WeakVH(SP);
 }
@@ -2560,7 +2626,8 @@
                                 llvm::DIDescriptor(LexicalBlockStack.back()),
                                 getOrCreateFile(CurLoc),
                                 getLineNumber(CurLoc),
-                                getColumnNumber(CurLoc));
+                                getColumnNumber(CurLoc),
+                                0);
   llvm::MDNode *DN = D;
   LexicalBlockStack.push_back(DN);
 }
@@ -2768,10 +2835,7 @@
     // all union fields.
     const RecordDecl *RD = cast<RecordDecl>(RT->getDecl());
     if (RD->isUnion() && RD->isAnonymousStructOrUnion()) {
-      for (RecordDecl::field_iterator I = RD->field_begin(),
-             E = RD->field_end();
-           I != E; ++I) {
-        FieldDecl *Field = *I;
+      for (const auto *Field : RD->fields()) {
         llvm::DIType FieldTy = getOrCreateType(Field->getType(), Unit);
         StringRef FieldName = Field->getName();
 
@@ -2970,10 +3034,7 @@
   }
 
   // Variable captures.
-  for (BlockDecl::capture_const_iterator
-         i = blockDecl->capture_begin(), e = blockDecl->capture_end();
-       i != e; ++i) {
-    const BlockDecl::Capture &capture = *i;
+  for (const auto &capture : blockDecl->captures()) {
     const VarDecl *variable = capture.getVariable();
     const CGBlockInfo::Capture &captureInfo = block.getCapture(variable);
 
@@ -3085,7 +3146,6 @@
   llvm::DICompositeType Ctxt(
       getContextDescriptor(cast<Decl>(D->getDeclContext())));
   llvm::DIDerivedType T = CreateRecordStaticField(D, Ctxt);
-  Ctxt.addMember(T);
   return T;
 }
 
@@ -3196,7 +3256,7 @@
   // Emitting one decl is sufficient - debuggers can detect that this is an
   // overloaded name & provide lookup for all the overloads.
   const UsingShadowDecl &USD = **UD.shadow_begin();
-  if (llvm::DIDescriptor Target =
+  if (llvm::DIScope Target =
           getDeclarationOrDefinition(USD.getUnderlyingDecl()))
     DBuilder.createImportedDeclaration(
         getCurrentContextDescriptor(cast<Decl>(USD.getDeclContext())), Target,
diff --git a/lib/CodeGen/CGDebugInfo.h b/lib/CodeGen/CGDebugInfo.h
index 0ca274f..52784da 100644
--- a/lib/CodeGen/CGDebugInfo.h
+++ b/lib/CodeGen/CGDebugInfo.h
@@ -20,10 +20,10 @@
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Frontend/CodeGenOptions.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/DIBuilder.h"
-#include "llvm/DebugInfo.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/Allocator.h"
-#include "llvm/Support/ValueHandle.h"
 
 namespace llvm {
   class MDNode;
@@ -47,8 +47,8 @@
 /// and is responsible for emitting to llvm globals or pass directly to
 /// the backend.
 class CGDebugInfo {
-  friend class NoLocation;
   friend class ArtificialLocation;
+  friend class SaveAndRestoreLocation;
   CodeGenModule &CGM;
   const CodeGenOptions::DebugInfoKind DebugKind;
   llvm::DIBuilder DBuilder;
@@ -288,6 +288,8 @@
   void completeRequiredType(const RecordDecl *RD);
   void completeClassData(const RecordDecl *RD);
 
+  void completeTemplateDefinition(const ClassTemplateSpecializationDecl &SD);
+
 private:
   /// EmitDeclare - Emit call to llvm.dbg.declare for a variable declaration.
   void EmitDeclare(const VarDecl *decl, unsigned Tag, llvm::Value *AI,
@@ -342,9 +344,9 @@
   llvm::DIType CreateMemberType(llvm::DIFile Unit, QualType FType,
                                 StringRef Name, uint64_t *Offset);
 
-  /// \brief Retrieve the DIDescriptor, if any, for the canonical form of this
+  /// \brief Retrieve the DIScope, if any, for the canonical form of this
   /// declaration.
-  llvm::DIDescriptor getDeclarationOrDefinition(const Decl *D);
+  llvm::DIScope getDeclarationOrDefinition(const Decl *D);
 
   /// getFunctionDeclaration - Return debug info descriptor to describe method
   /// declaration for the given method definition.
@@ -394,16 +396,26 @@
   }
 };
 
-/// NoLocation - An RAII object that temporarily disables debug
-/// locations. This is useful for emitting instructions that should be
-/// counted towards the function prologue.
-class NoLocation {
+/// SaveAndRestoreLocation - An RAII object that saves the current location
+/// and automatically restores it to the original value.
+class SaveAndRestoreLocation {
+protected:
   SourceLocation SavedLoc;
   CGDebugInfo *DI;
   CGBuilderTy &Builder;
 public:
+  SaveAndRestoreLocation(CodeGenFunction &CGF, CGBuilderTy &B);
+  /// Autorestore everything back to normal.
+  ~SaveAndRestoreLocation();
+};
+
+/// NoLocation - An RAII object that temporarily disables debug
+/// locations. This is useful for emitting instructions that should be
+/// counted towards the function prologue.
+class NoLocation : public SaveAndRestoreLocation {
+public:
   NoLocation(CodeGenFunction &CGF, CGBuilderTy &B);
-  /// ~NoLocation - Autorestore everything back to normal.
+  /// Autorestore everything back to normal.
   ~NoLocation();
 };
 
@@ -418,10 +430,7 @@
 /// This is necessary because passing an empty SourceLocation to
 /// CGDebugInfo::setLocation() will result in the last valid location
 /// being reused.
-class ArtificialLocation {
-  SourceLocation SavedLoc;
-  CGDebugInfo *DI;
-  CGBuilderTy &Builder;
+class ArtificialLocation : public SaveAndRestoreLocation {
 public:
   ArtificialLocation(CodeGenFunction &CGF, CGBuilderTy &B);
 
@@ -429,7 +438,7 @@
   /// (= the top of the LexicalBlockStack).
   void Emit();
 
-  /// ~ArtificialLocation - Autorestore everything back to normal.
+  /// Autorestore everything back to normal.
   ~ArtificialLocation();
 };
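
With this refactoring, saving and restoring the debug location becomes a base-class service shared by both RAII helpers. A hedged usage sketch (assumes a CodeGenFunction context named CGF; the two Emit* helpers are hypothetical placeholders, not real API):

{
  NoLocation NL(CGF, Builder);   // instructions emitted here carry no
  EmitPrologueStores();          // location (hypothetical helper), so
}                                // they count toward the prologue; the
                                 // old location is restored at '}'
{
  ArtificialLocation AL(CGF, Builder);
  AL.Emit();                     // switch to an artificial location
  EmitImplicitChecks();          // (hypothetical helper) for
}                                // compiler-generated checks
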
 
diff --git a/lib/CodeGen/CGDecl.cpp b/lib/CodeGen/CGDecl.cpp
index 66d6b33..9db3c91 100644
--- a/lib/CodeGen/CGDecl.cpp
+++ b/lib/CodeGen/CGDecl.cpp
@@ -183,7 +183,7 @@
   return ContextName + Separator + D.getNameAsString();
 }
 
-llvm::GlobalVariable *
+llvm::Constant *
 CodeGenFunction::CreateStaticVarDecl(const VarDecl &D,
                                      const char *Separator,
                                      llvm::GlobalValue::LinkageTypes Linkage) {
@@ -212,6 +212,13 @@
   if (D.getTLSKind())
     CGM.setTLSMode(GV, D);
 
+  // Make sure the result is of the correct type.
+  unsigned ExpectedAddrSpace = CGM.getContext().getTargetAddressSpace(Ty);
+  if (AddrSpace != ExpectedAddrSpace) {
+    llvm::PointerType *PTy = llvm::PointerType::get(LTy, ExpectedAddrSpace);
+    return llvm::ConstantExpr::getAddrSpaceCast(GV, PTy);
+  }
+
   return GV;
 }
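
This hunk is why the return type widened from llvm::GlobalVariable* to llvm::Constant* in the signature change above: the value handed back may now be an address-space cast of the global rather than the global itself. A hedged illustration of when that happens:

// Illustrative only: on a target that places function-local statics in a
// non-default address space (some GPU targets do), '&x' below is an
// addrspacecast of the global, which only fits the llvm::Constant* type.
int *get() {
  static int x = 0;   // may be emitted as a global in addrspace(1)
  return &x;          // cast back to the default address space
}
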
 
@@ -298,12 +305,8 @@
   llvm::Constant *addr =
     CGM.getStaticLocalDeclAddress(&D);
 
-  llvm::GlobalVariable *var;
-  if (addr) {
-    var = cast<llvm::GlobalVariable>(addr->stripPointerCasts());
-  } else {
-    addr = var = CreateStaticVarDecl(D, ".", Linkage);
-  }
+  if (!addr)
+    addr = CreateStaticVarDecl(D, ".", Linkage);
 
   // Store into LocalDeclMap before generating initializer to handle
   // circular references.
@@ -319,6 +322,8 @@
   // Save the type in case adding the initializer forces a type change.
   llvm::Type *expectedType = addr->getType();
 
+  llvm::GlobalVariable *var =
+    cast<llvm::GlobalVariable>(addr->stripPointerCasts());
   // If this value has an initializer, emit it.
   if (D.getInit())
     var = AddInitializerToStaticVarDecl(D, var);
@@ -332,14 +337,15 @@
     var->setSection(SA->getName());
 
   if (D.hasAttr<UsedAttr>())
-    CGM.AddUsedGlobal(var);
+    CGM.addUsedGlobal(var);
 
   // We may have to cast the constant because of the initializer
   // mismatch above.
   //
   // FIXME: It is really dangerous to store this in the map; if anyone
   // RAUW's the GV uses of this constant will be invalid.
-  llvm::Constant *castedAddr = llvm::ConstantExpr::getBitCast(var, expectedType);
+  llvm::Constant *castedAddr =
+    llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(var, expectedType);
   DMEntry = castedAddr;
   CGM.setStaticLocalDeclAddress(&D, castedAddr);
 
@@ -365,7 +371,7 @@
     CodeGenFunction::Destroyer *destroyer;
     bool useEHCleanupForArray;
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       // Don't use an EH cleanup recursively from an EH cleanup.
       bool useEHCleanupForArray =
         flags.isForNormalCleanup() && this->useEHCleanupForArray;
@@ -384,7 +390,7 @@
     llvm::Value *NRVOFlag;
     llvm::Value *Loc;
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       // Along the exceptions path we always execute the dtor.
       bool NRVO = flags.isForNormalCleanup() && NRVOFlag;
 
@@ -410,7 +416,7 @@
   struct CallStackRestore : EHScopeStack::Cleanup {
     llvm::Value *Stack;
     CallStackRestore(llvm::Value *Stack) : Stack(Stack) {}
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       llvm::Value *V = CGF.Builder.CreateLoad(Stack);
       llvm::Value *F = CGF.CGM.getIntrinsic(llvm::Intrinsic::stackrestore);
       CGF.Builder.CreateCall(F, V);
@@ -421,7 +427,7 @@
     const VarDecl &Var;
     ExtendGCLifetime(const VarDecl *var) : Var(*var) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       // Compute the address of the local variable, in case it's a
       // byref or something.
       DeclRefExpr DRE(const_cast<VarDecl*>(&Var), false,
@@ -441,7 +447,7 @@
                         const VarDecl *Var)
       : CleanupFn(CleanupFn), FnInfo(*Info), Var(*Var) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       DeclRefExpr DRE(const_cast<VarDecl*>(&Var), false,
                       Var.getType(), VK_LValue, SourceLocation());
       // Compute the address of the local variable, in case it's a byref
@@ -473,7 +479,7 @@
     CallLifetimeEnd(llvm::Value *addr, llvm::Value *size)
       : Addr(addr), Size(size) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       llvm::Value *castAddr = CGF.Builder.CreateBitCast(Addr, CGF.Int8PtrTy);
       CGF.Builder.CreateCall2(CGF.CGM.getLLVMLifetimeEndFn(),
                               Size, castAddr)
@@ -530,9 +536,8 @@
       return (ref->getDecl() == &var);
     if (const BlockExpr *be = dyn_cast<BlockExpr>(e)) {
       const BlockDecl *block = be->getBlockDecl();
-      for (BlockDecl::capture_const_iterator i = block->capture_begin(),
-           e = block->capture_end(); i != e; ++i) {
-        if (i->getVariable() == &var)
+      for (const auto &I : block->captures()) {
+        if (I.getVariable() == &var)
           return true;
       }
     }
@@ -571,7 +576,10 @@
     EmitStoreThroughLValue(RValue::get(value), lvalue, true);
     return;
   }
-
+  
+  if (const CXXDefaultInitExpr *DIE = dyn_cast<CXXDefaultInitExpr>(init))
+    init = DIE->getExpr();
+    
   // If we're emitting a value with lifetime, we have to do the
   // initialization *before* we leave the cleanup scopes.
   if (const ExprWithCleanups *ewc = dyn_cast<ExprWithCleanups>(init)) {
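
The unwrap added above handles initializers that arrive wrapped in a CXXDefaultInitExpr. A hedged illustration of the AST shape involved:

// Illustrative only: inside S's constructor, the initializer for 'p' is a
// CXXDefaultInitExpr node wrapping the written expression, which codegen
// peels off with DIE->getExpr() before checking for cleanups.
struct S {
  int *p = new int(42);  // default member initializer
  S() {}                 // constructor that implicitly uses it
};
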
@@ -823,7 +831,7 @@
 }
 
 /// EmitAutoVarAlloca - Emit the alloca and debug information for a
-/// local variable.  Does not emit initalization or destruction.
+/// local variable.  Does not emit initialization or destruction.
 CodeGenFunction::AutoVarEmission
 CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
   QualType Ty = D.getType();
@@ -944,12 +952,12 @@
 
       // Push a cleanup block and restore the stack there.
       // FIXME: in general circumstances, this should be an EH cleanup.
-      EHStack.pushCleanup<CallStackRestore>(NormalCleanup, Stack);
+      pushStackRestore(NormalCleanup, Stack);
     }
 
     llvm::Value *elementCount;
     QualType elementType;
-    llvm::tie(elementCount, elementType) = getVLASize(Ty);
+    std::tie(elementCount, elementType) = getVLASize(Ty);
 
     llvm::Type *llvmTy = ConvertTypeForMem(elementType);
 
@@ -990,9 +998,8 @@
 
   if (const BlockExpr *be = dyn_cast<BlockExpr>(e)) {
     const BlockDecl *block = be->getBlockDecl();
-    for (BlockDecl::capture_const_iterator i = block->capture_begin(),
-           e = block->capture_end(); i != e; ++i) {
-      if (i->getVariable() == &var)
+    for (const auto &I : block->captures()) {
+      if (I.getVariable() == &var)
         return true;
     }
 
@@ -1002,18 +1009,16 @@
 
   if (const StmtExpr *SE = dyn_cast<StmtExpr>(e)) {
     const CompoundStmt *CS = SE->getSubStmt();
-    for (CompoundStmt::const_body_iterator BI = CS->body_begin(),
-	   BE = CS->body_end(); BI != BE; ++BI)
-      if (Expr *E = dyn_cast<Expr>((*BI))) {
+    for (const auto *BI : CS->body())
+      if (const auto *E = dyn_cast<Expr>(BI)) {
         if (isCapturedBy(var, E))
             return true;
       }
-      else if (DeclStmt *DS = dyn_cast<DeclStmt>((*BI))) {
+      else if (const auto *DS = dyn_cast<DeclStmt>(BI)) {
           // special case declarations
-          for (DeclStmt::decl_iterator I = DS->decl_begin(), E = DS->decl_end();
-               I != E; ++I) {
-              if (VarDecl *VD = dyn_cast<VarDecl>((*I))) {
-                Expr *Init = VD->getInit();
+          for (const auto *I : DS->decls()) {
+              if (const auto *VD = dyn_cast<VarDecl>((I))) {
+                const Expr *Init = VD->getInit();
                 if (Init && isCapturedBy(var, Init))
                   return true;
               }
@@ -1344,6 +1349,10 @@
                                      destroyer, useEHCleanupForArray);
 }
 
+void CodeGenFunction::pushStackRestore(CleanupKind Kind, llvm::Value *SPMem) {
+  EHStack.pushCleanup<CallStackRestore>(Kind, SPMem);
+}
+
 void CodeGenFunction::pushLifetimeExtendedDestroy(
     CleanupKind cleanupKind, llvm::Value *addr, QualType type,
     Destroyer *destroyer, bool useEHCleanupForArray) {
@@ -1505,7 +1514,7 @@
       : ArrayBegin(arrayBegin), ArrayEnd(arrayEnd),
         ElementType(elementType), Destroyer(destroyer) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       emitPartialArrayDestroy(CGF, ArrayBegin, ArrayEnd,
                               ElementType, Destroyer);
     }
@@ -1527,7 +1536,7 @@
       : ArrayBegin(arrayBegin), ArrayEndPointer(arrayEndPointer),
         ElementType(elementType), Destroyer(destroyer) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       llvm::Value *arrayEnd = CGF.Builder.CreateLoad(ArrayEndPointer);
       emitPartialArrayDestroy(CGF, ArrayBegin, arrayEnd,
                               ElementType, Destroyer);
@@ -1594,7 +1603,7 @@
     llvm::Value *Param;
     ARCPreciseLifetime_t Precise;
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       CGF.EmitARCRelease(Param, Precise);
     }
   };
@@ -1603,7 +1612,7 @@
 /// Emit an alloca (or GlobalValue depending on target)
 /// for the specified parameter and set up LocalDeclMap.
 void CodeGenFunction::EmitParmDecl(const VarDecl &D, llvm::Value *Arg,
-                                   unsigned ArgNo) {
+                                   bool ArgIsPointer, unsigned ArgNo) {
   // FIXME: Why isn't ImplicitParamDecl a ParmVarDecl?
   assert((isa<ParmVarDecl>(D) || isa<ImplicitParamDecl>(D)) &&
          "Invalid argument to EmitParmDecl");
@@ -1641,30 +1650,35 @@
   }
 
   llvm::Value *DeclPtr;
-  bool HasNonScalarEvalKind = !CodeGenFunction::hasScalarEvaluationKind(Ty);
-  // If this is an aggregate or variable sized value, reuse the input pointer.
-  if (HasNonScalarEvalKind || !Ty->isConstantSizeType()) {
-    DeclPtr = Arg;
+  bool DoStore = false;
+  bool IsScalar = hasScalarEvaluationKind(Ty);
+  CharUnits Align = getContext().getDeclAlign(&D);
+  // If we already have a pointer to the argument, reuse the input pointer.
+  if (ArgIsPointer) {
+    // If we have a prettier pointer type at this point, bitcast to that.
+    unsigned AS = cast<llvm::PointerType>(Arg->getType())->getAddressSpace();
+    llvm::Type *IRTy = ConvertTypeForMem(Ty)->getPointerTo(AS);
+    DeclPtr = Arg->getType() == IRTy ? Arg : Builder.CreateBitCast(Arg, IRTy,
+                                                                   D.getName());
     // Push a destructor cleanup for this parameter if the ABI requires it.
-    if (HasNonScalarEvalKind &&
-        getTarget().getCXXABI().isArgumentDestroyedByCallee()) {
-      if (const CXXRecordDecl *RD = Ty->getAsCXXRecordDecl()) {
-        if (RD->hasNonTrivialDestructor())
-          pushDestroy(QualType::DK_cxx_destructor, DeclPtr, Ty);
-      }
+    if (!IsScalar &&
+        getTarget().getCXXABI().areArgsDestroyedLeftToRightInCallee()) {
+      const CXXRecordDecl *RD = Ty->getAsCXXRecordDecl();
+      if (RD && RD->hasNonTrivialDestructor())
+        pushDestroy(QualType::DK_cxx_destructor, DeclPtr, Ty);
     }
   } else {
     // Otherwise, create a temporary to hold the value.
     llvm::AllocaInst *Alloc = CreateTempAlloca(ConvertTypeForMem(Ty),
                                                D.getName() + ".addr");
-    CharUnits Align = getContext().getDeclAlign(&D);
     Alloc->setAlignment(Align.getQuantity());
     DeclPtr = Alloc;
+    DoStore = true;
+  }
 
-    bool doStore = true;
-
+  LValue lv = MakeAddrLValue(DeclPtr, Ty, Align);
+  if (IsScalar) {
     Qualifiers qs = Ty.getQualifiers();
-    LValue lv = MakeAddrLValue(DeclPtr, Ty, Align);
     if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
       // We honor __attribute__((ns_consumed)) for types with lifetime.
       // For __strong, it's handled by just skipping the initial retain;
@@ -1693,7 +1707,7 @@
             llvm::Value *Null = CGM.EmitNullConstant(D.getType());
             EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
             EmitARCStoreStrongCall(lv.getAddress(), Arg, true);
-            doStore = false;
+            DoStore = false;
           }
           else
           // Don't use objc_retainBlock for block pointers, because we
@@ -1712,19 +1726,19 @@
 
         if (lt == Qualifiers::OCL_Weak) {
           EmitARCInitWeak(DeclPtr, Arg);
-          doStore = false; // The weak init is a store, no need to do two.
+          DoStore = false; // The weak init is a store, no need to do two.
         }
       }
 
       // Enter the cleanup scope.
       EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
     }
-
-    // Store the initial value into the alloca.
-    if (doStore)
-      EmitStoreOfScalar(Arg, lv, /* isInitialization */ true);
   }
 
+  // Store the initial value into the alloca.
+  if (DoStore)
+    EmitStoreOfScalar(Arg, lv, /* isInitialization */ true);
+
   llvm::Value *&DMEntry = LocalDeclMap[&D];
   assert(DMEntry == 0 && "Decl already exists in localdeclmap!");
   DMEntry = DeclPtr;
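
The EmitParmDecl rework above pivots on the new ArgIsPointer flag: when the ABI already passes the argument indirectly, the incoming pointer is reused (after a bitcast to the preferred pointee type) instead of being spilled into a fresh alloca, and the store is skipped. A hedged source-level illustration of the two paths on a typical C++ ABI:

// Illustrative only.
struct Big { int data[32]; };
int scalar_path(int x) {     // 'x' arrives by value: alloca + store
  return x;                  // (the DoStore path)
}
int pointer_path(Big b) {    // 'b' arrives as a pointer (indirect/byval):
  return b.data[0];          // reused directly as DeclPtr
}
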
diff --git a/lib/CodeGen/CGDeclCXX.cpp b/lib/CodeGen/CGDeclCXX.cpp
index 7bdb9eb..bc8620d 100644
--- a/lib/CodeGen/CGDeclCXX.cpp
+++ b/lib/CodeGen/CGDeclCXX.cpp
@@ -281,9 +281,8 @@
     // FIXME: We only need to register one __cxa_thread_atexit function for the
     // entire TU.
     CXXThreadLocalInits.push_back(Fn);
-  } else if (D->hasAttr<InitPriorityAttr>()) {
-    unsigned int order = D->getAttr<InitPriorityAttr>()->getPriority();
-    OrderGlobalInits Key(order, PrioritizedCXXGlobalInits.size());
+  } else if (const InitPriorityAttr *IPA = D->getAttr<InitPriorityAttr>()) {
+    OrderGlobalInits Key(IPA->getPriority(), PrioritizedCXXGlobalInits.size());
     PrioritizedCXXGlobalInits.push_back(std::make_pair(Key, Fn));
     DelayedCXXInitPosition.erase(D);
   } else if (D->getTemplateSpecializationKind() != TSK_ExplicitSpecialization &&
@@ -503,11 +502,9 @@
   FunctionArgList args;
   ImplicitParamDecl dst(0, SourceLocation(), 0, getContext().VoidPtrTy);
   args.push_back(&dst);
-  
-  const CGFunctionInfo &FI = 
-    CGM.getTypes().arrangeFunctionDeclaration(getContext().VoidTy, args,
-                                              FunctionType::ExtInfo(),
-                                              /*variadic*/ false);
+
+  const CGFunctionInfo &FI = CGM.getTypes().arrangeFreeFunctionDeclaration(
+      getContext().VoidTy, args, FunctionType::ExtInfo(), /*variadic=*/false);
   llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FI);
   llvm::Function *fn = 
     CreateGlobalInitOrDestructFunction(CGM, FTy, "__cxx_global_array_dtor");
diff --git a/lib/CodeGen/CGException.cpp b/lib/CodeGen/CGException.cpp
index 39a992a..ca31717 100644
--- a/lib/CodeGen/CGException.cpp
+++ b/lib/CodeGen/CGException.cpp
@@ -17,8 +17,8 @@
 #include "TargetInfo.h"
 #include "clang/AST/StmtCXX.h"
 #include "clang/AST/StmtObjC.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/CallSite.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -263,12 +263,9 @@
 /// Check whether a personality function could reasonably be swapped
 /// for a C++ personality function.
 static bool PersonalityHasOnlyCXXUses(llvm::Constant *Fn) {
-  for (llvm::Constant::use_iterator
-         I = Fn->use_begin(), E = Fn->use_end(); I != E; ++I) {
-    llvm::User *User = *I;
-
+  for (llvm::User *U : Fn->users()) {
     // Conditionally white-list bitcasts.
-    if (llvm::ConstantExpr *CE = dyn_cast<llvm::ConstantExpr>(User)) {
+    if (llvm::ConstantExpr *CE = dyn_cast<llvm::ConstantExpr>(U)) {
       if (CE->getOpcode() != llvm::Instruction::BitCast) return false;
       if (!PersonalityHasOnlyCXXUses(CE))
         return false;
@@ -276,7 +273,7 @@
     }
 
     // Otherwise, it has to be a landingpad instruction.
-    llvm::LandingPadInst *LPI = dyn_cast<llvm::LandingPadInst>(User);
+    llvm::LandingPadInst *LPI = dyn_cast<llvm::LandingPadInst>(U);
     if (!LPI) return false;
 
     for (unsigned I = 0, E = LPI->getNumClauses(); I != E; ++I) {
@@ -363,7 +360,7 @@
   struct FreeException : EHScopeStack::Cleanup {
     llvm::Value *exn;
     FreeException(llvm::Value *exn) : exn(exn) {}
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       CGF.EmitNounwindRuntimeCall(getFreeExceptionFn(CGF.CGM), exn);
     }
   };
@@ -766,11 +763,9 @@
 
   // Save the current IR generation state.
   CGBuilderTy::InsertPoint savedIP = Builder.saveAndClearIP();
-  SourceLocation SavedLocation;
-  if (CGDebugInfo *DI = getDebugInfo()) {
-    SavedLocation = DI->getLocation();
+  SaveAndRestoreLocation AutoRestoreLocation(*this, Builder);
+  if (CGDebugInfo *DI = getDebugInfo())
     DI->EmitLocation(Builder, CurEHLocation);
-  }
 
   const EHPersonality &personality = EHPersonality::get(getLangOpts());
 
@@ -892,8 +887,6 @@
 
   // Restore the old IR generation state.
   Builder.restoreIP(savedIP);
-  if (CGDebugInfo *DI = getDebugInfo())
-    DI->EmitLocation(Builder, SavedLocation);
 
   return lpad;
 }
@@ -915,7 +908,7 @@
     CallEndCatch(bool MightThrow) : MightThrow(MightThrow) {}
     bool MightThrow;
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       if (!MightThrow) {
         CGF.EmitNounwindRuntimeCall(getEndCatchFn(CGF.CGM));
         return;
@@ -1244,6 +1237,7 @@
 
   // If the catch was not required, bail out now.
   if (!CatchScope.hasEHBranches()) {
+    CatchScope.clearHandlerBlocks();
     EHStack.popCatch();
     return;
   }
@@ -1294,6 +1288,10 @@
     // Initialize the catch variable and set up the cleanups.
     BeginCatch(*this, C);
 
+    // Emit the PGO counter increment.
+    RegionCounter CatchCnt = getPGORegionCounter(C);
+    CatchCnt.beginRegion(Builder);
+
     // Perform the body of the catch.
     EmitStmt(C->getHandlerBlock());
 
@@ -1320,7 +1318,9 @@
       Builder.CreateBr(ContBB);
   }
 
+  RegionCounter ContCnt = getPGORegionCounter(&S);
   EmitBlock(ContBB);
+  ContCnt.beginRegion(Builder);
 }
 
 namespace {
@@ -1330,7 +1330,7 @@
     CallEndCatchForFinally(llvm::Value *ForEHVar, llvm::Value *EndCatchFn)
       : ForEHVar(ForEHVar), EndCatchFn(EndCatchFn) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       llvm::BasicBlock *EndCatchBB = CGF.createBasicBlock("finally.endcatch");
       llvm::BasicBlock *CleanupContBB =
         CGF.createBasicBlock("finally.cleanup.cont");
@@ -1357,7 +1357,7 @@
       : Body(Body), ForEHVar(ForEHVar), EndCatchFn(EndCatchFn),
         RethrowFn(RethrowFn), SavedExnVar(SavedExnVar) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       // Enter a cleanup to call the end-catch function if one was provided.
       if (EndCatchFn)
         CGF.EHStack.pushCleanup<CallEndCatchForFinally>(NormalAndEHCleanup,
diff --git a/lib/CodeGen/CGExpr.cpp b/lib/CodeGen/CGExpr.cpp
index cb990b2..1bdd094 100644
--- a/lib/CodeGen/CGExpr.cpp
+++ b/lib/CodeGen/CGExpr.cpp
@@ -85,6 +85,7 @@
 /// EvaluateExprAsBool - Perform the usual unary conversions on the specified
 /// expression and compare the result against zero, returning an Int1Ty value.
 llvm::Value *CodeGenFunction::EvaluateExprAsBool(const Expr *E) {
+  PGO.setCurrentStmt(E);
   if (const MemberPointerType *MPT = E->getType()->getAs<MemberPointerType>()) {
     llvm::Value *MemPtr = EmitScalarExpr(E);
     return CGM.getCXXABI().EmitMemberPointerIsNotNull(*this, MemPtr, MPT);
@@ -389,7 +390,7 @@
     case SubobjectAdjustment::MemberPointerAdjustment: {
       llvm::Value *Ptr = EmitScalarExpr(Adjustment.Ptr.RHS);
       Object = CGM.getCXXABI().EmitMemberDataPointerAddress(
-                    *this, Object, Ptr, Adjustment.Ptr.MPT);
+          *this, E, Object, Ptr, Adjustment.Ptr.MPT);
       break;
     }
     }
@@ -1690,11 +1691,16 @@
 
 static LValue EmitGlobalVarDeclLValue(CodeGenFunction &CGF,
                                       const Expr *E, const VarDecl *VD) {
+  QualType T = E->getType();
+
+  // If it's thread_local, emit a call to its wrapper function instead.
+  if (VD->getTLSKind() == VarDecl::TLS_Dynamic)
+    return CGF.CGM.getCXXABI().EmitThreadLocalVarDeclLValue(CGF, VD, T);
+
   llvm::Value *V = CGF.CGM.GetAddrOfGlobalVar(VD);
   llvm::Type *RealVarTy = CGF.getTypes().ConvertTypeForMem(VD->getType());
   V = EmitBitCastOfLValueToProperType(CGF, V, RealVarTy);
   CharUnits Alignment = CGF.getContext().getDeclAlign(VD);
-  QualType T = E->getType();
   LValue LV;
   if (VD->getType()->isReferenceType()) {
     llvm::LoadInst *LI = CGF.Builder.CreateLoad(V);
@@ -1702,7 +1708,7 @@
     V = LI;
     LV = CGF.MakeNaturalAlignAddrLValue(V, T);
   } else {
-    LV = CGF.MakeAddrLValue(V, E->getType(), Alignment);
+    LV = CGF.MakeAddrLValue(V, T, Alignment);
   }
   setObjCGCLValueClass(CGF.getContext(), E, LV);
   return LV;
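
Hoisting the TLS check to the top of EmitGlobalVarDeclLValue (it previously sat in EmitDeclRefLValue, removed below) routes every reference to a dynamically initialized thread_local through its wrapper function. A hedged example of such a variable:

// Illustrative only: 'tls' needs dynamic initialization, so a reference
// cannot load the global directly; it calls a wrapper that runs the
// per-thread initializer on first use, which is what
// EmitThreadLocalVarDeclLValue arranges.
int compute();
thread_local int tls = compute();
int read() { return tls; }   // lowered through the TLS wrapper
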
@@ -1718,7 +1724,7 @@
       // isn't the same as the type of a use.  Correct for this with a
       // bitcast.
       QualType NoProtoType =
-          CGF.getContext().getFunctionNoProtoType(Proto->getResultType());
+          CGF.getContext().getFunctionNoProtoType(Proto->getReturnType());
       NoProtoType = CGF.getContext().getPointerType(NoProtoType);
       V = CGF.Builder.CreateBitCast(V, CGF.ConvertType(NoProtoType));
     }
@@ -1769,12 +1775,8 @@
 
   if (const VarDecl *VD = dyn_cast<VarDecl>(ND)) {
     // Check if this is a global variable.
-    if (VD->hasLinkage() || VD->isStaticDataMember()) {
-      // If it's thread_local, emit a call to its wrapper function instead.
-      if (VD->getTLSKind() == VarDecl::TLS_Dynamic)
-        return CGM.getCXXABI().EmitThreadLocalDeclRefExpr(*this, E);
+    if (VD->hasLinkage() || VD->isStaticDataMember())
       return EmitGlobalVarDeclLValue(*this, E, VD);
-    }
 
     bool isBlockVariable = VD->hasAttr<BlocksAttr>();
 
@@ -2651,6 +2653,7 @@
   }
 
   OpaqueValueMapping binding(*this, expr);
+  RegionCounter Cnt = getPGORegionCounter(expr);
 
   const Expr *condExpr = expr->getCond();
   bool CondExprBool;
@@ -2658,8 +2661,12 @@
     const Expr *live = expr->getTrueExpr(), *dead = expr->getFalseExpr();
     if (!CondExprBool) std::swap(live, dead);
 
-    if (!ContainsLabel(dead))
+    if (!ContainsLabel(dead)) {
+      // If the true case is live, we need to track its region.
+      if (CondExprBool)
+        Cnt.beginRegion(Builder);
       return EmitLValue(live);
+    }
   }
 
   llvm::BasicBlock *lhsBlock = createBasicBlock("cond.true");
@@ -2667,10 +2674,11 @@
   llvm::BasicBlock *contBlock = createBasicBlock("cond.end");
 
   ConditionalEvaluation eval(*this);
-  EmitBranchOnBoolExpr(condExpr, lhsBlock, rhsBlock);
+  EmitBranchOnBoolExpr(condExpr, lhsBlock, rhsBlock, Cnt.getCount());
 
   // Any temporaries created here are conditional.
   EmitBlock(lhsBlock);
+  Cnt.beginRegion(Builder);
   eval.begin(*this);
   LValue lhs = EmitLValue(expr->getTrueExpr());
   eval.end(*this);
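
The PGO changes in this file all follow the pattern visible above: fetch a RegionCounter for the AST node, seed the branch with its count, and bump the counter at the head of the counted block. A hedged distillation using the same API as the hunks:

// Sketch only: the recurring instrumentation pattern for a conditional,
// where E is the AST node whose regions are being counted.
RegionCounter Cnt = getPGORegionCounter(E);  // one counter per region
EmitBranchOnBoolExpr(E->getCond(), lhsBlock, rhsBlock,
                     Cnt.getCount());        // profile count -> weights
EmitBlock(lhsBlock);
Cnt.beginRegion(Builder);                    // increment when taken
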
@@ -2744,6 +2752,7 @@
   case CK_ARCReclaimReturnedObject:
   case CK_ARCExtendBlockObject:
   case CK_CopyAndAutoreleaseBlockObject:
+  case CK_AddressSpaceConversion:
     return EmitUnsupportedLValue(E, "unexpected cast lvalue");
 
   case CK_Dependent:
@@ -3061,7 +3070,7 @@
   if (!RV.isScalar())
     return MakeAddrLValue(RV.getAggregateAddr(), E->getType());
 
-  assert(E->getMethodDecl()->getResultType()->isReferenceType() &&
+  assert(E->getMethodDecl()->getReturnType()->isReferenceType() &&
          "Can't have a scalar return unless the return type is a "
          "reference type!");
 
@@ -3231,8 +3240,8 @@
   const MemberPointerType *MPT
     = E->getRHS()->getType()->getAs<MemberPointerType>();
 
-  llvm::Value *AddV =
-    CGM.getCXXABI().EmitMemberDataPointerAddress(*this, BaseV, OffsetV, MPT);
+  llvm::Value *AddV = CGM.getCXXABI().EmitMemberDataPointerAddress(
+      *this, E, BaseV, OffsetV, MPT);
 
   return MakeAddrLValue(AddV, MPT->getPointeeType());
 }
diff --git a/lib/CodeGen/CGExprAgg.cpp b/lib/CodeGen/CGExprAgg.cpp
index 9d0f3a9..6c50521 100644
--- a/lib/CodeGen/CGExprAgg.cpp
+++ b/lib/CodeGen/CGExprAgg.cpp
@@ -713,6 +713,7 @@
   case CK_CopyAndAutoreleaseBlockObject:
   case CK_BuiltinFnToFnPtr:
   case CK_ZeroToOCLEvent:
+  case CK_AddressSpaceConversion:
     llvm_unreachable("cast kind invalid for aggregate types");
   }
 }
@@ -891,14 +892,16 @@
   // Bind the common expression if necessary.
   CodeGenFunction::OpaqueValueMapping binding(CGF, E);
 
+  RegionCounter Cnt = CGF.getPGORegionCounter(E);
   CodeGenFunction::ConditionalEvaluation eval(CGF);
-  CGF.EmitBranchOnBoolExpr(E->getCond(), LHSBlock, RHSBlock);
+  CGF.EmitBranchOnBoolExpr(E->getCond(), LHSBlock, RHSBlock, Cnt.getCount());
 
   // Save whether the destination's lifetime is externally managed.
   bool isExternallyDestructed = Dest.isExternallyDestructed();
 
   eval.begin(CGF);
   CGF.EmitBlock(LHSBlock);
+  Cnt.beginRegion(Builder);
   Visit(E->getTrueExpr());
   eval.end(CGF);
 
@@ -928,7 +931,11 @@
   llvm::Value *ArgPtr = CGF.EmitVAArg(ArgValue, VE->getType());
 
   if (!ArgPtr) {
-    CGF.ErrorUnsupported(VE, "aggregate va_arg expression");
+    // If EmitVAArg fails, we fall back to the LLVM instruction.
+    llvm::Value *Val =
+        Builder.CreateVAArg(ArgValue, CGF.ConvertType(VE->getType()));
+    if (!Dest.isIgnored())
+      Builder.CreateStore(Val, Dest.getAddr());
     return;
   }
 
@@ -1134,9 +1141,7 @@
 #ifndef NDEBUG
       // Make sure that it's really an empty and not a failure of
       // semantic analysis.
-      for (RecordDecl::field_iterator Field = record->field_begin(),
-                                   FieldEnd = record->field_end();
-           Field != FieldEnd; ++Field)
+      for (const auto *Field : record->fields())
         assert(Field->isUnnamedBitfield() && "Only unnamed bitfields allowed");
 #endif
       return;
@@ -1165,9 +1170,7 @@
   // Here we iterate over the fields; this makes it simpler to both
   // default-initialize fields and skip over unnamed fields.
   unsigned curInitIndex = 0;
-  for (RecordDecl::field_iterator field = record->field_begin(),
-                               fieldEnd = record->field_end();
-       field != fieldEnd; ++field) {
+  for (const auto *field : record->fields()) {
     // We're done once we hit the flexible array member.
     if (field->getType()->isIncompleteArrayType())
       break;
@@ -1184,7 +1187,7 @@
       break;
     
 
-    LValue LV = CGF.EmitLValueForFieldInitialization(DestLV, *field);
+    LValue LV = CGF.EmitLValueForFieldInitialization(DestLV, field);
     // We never generate write-barriers for initialized fields.
     LV.setNonGC(true);
     
@@ -1261,8 +1264,7 @@
       CharUnits NumNonZeroBytes = CharUnits::Zero();
       
       unsigned ILEElement = 0;
-      for (RecordDecl::field_iterator Field = SD->field_begin(),
-           FieldEnd = SD->field_end(); Field != FieldEnd; ++Field) {
+      for (const auto *Field : SD->fields()) {
         // We're done once we hit the flexible array member or run out of
         // InitListExpr elements.
         if (Field->getType()->isIncompleteArrayType() ||
diff --git a/lib/CodeGen/CGExprCXX.cpp b/lib/CodeGen/CGExprCXX.cpp
index cc7b24d..f71a3de 100644
--- a/lib/CodeGen/CGExprCXX.cpp
+++ b/lib/CodeGen/CGExprCXX.cpp
@@ -18,8 +18,8 @@
 #include "CGObjCRuntime.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "clang/Frontend/CodeGenOptions.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/CallSite.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -119,8 +119,8 @@
     // type of MD and has a prefix.
     // For now we just avoid devirtualizing these covariant cases.
     if (DevirtualizedMethod &&
-        DevirtualizedMethod->getResultType().getCanonicalType() !=
-        MD->getResultType().getCanonicalType())
+        DevirtualizedMethod->getReturnType().getCanonicalType() !=
+            MD->getReturnType().getCanonicalType())
       DevirtualizedMethod = NULL;
   }
 
@@ -220,8 +220,10 @@
     }
   }
 
-  if (MD->isVirtual())
-    This = CGM.getCXXABI().adjustThisArgumentForVirtualCall(*this, MD, This);
+  if (MD->isVirtual()) {
+    This = CGM.getCXXABI().adjustThisArgumentForVirtualFunctionCall(
+        *this, MD, This, UseVirtualCall);
+  }
 
   return EmitCXXMemberCall(MD, CE->getExprLoc(), Callee, ReturnValue, This,
                            /*ImplicitParam=*/0, QualType(),
@@ -260,7 +262,7 @@
 
   // Ask the ABI to load the callee.  Note that This is modified.
   llvm::Value *Callee =
-    CGM.getCXXABI().EmitLoadOfMemberFunctionPointer(*this, This, MemFnPtr, MPT);
+    CGM.getCXXABI().EmitLoadOfMemberFunctionPointer(*this, BO, This, MemFnPtr, MPT);
   
   CallArgList Args;
 
@@ -316,7 +318,7 @@
 
   const ASTRecordLayout &Layout = CGF.getContext().getASTRecordLayout(Base);
   CharUnits Size = Layout.getNonVirtualSize();
-  CharUnits Align = Layout.getNonVirtualAlign();
+  CharUnits Align = Layout.getNonVirtualAlignment();
 
   llvm::Value *SizeVal = CGF.CGM.getSize(Size);
 
@@ -864,9 +866,29 @@
     cleanupDominator->eraseFromParent();
   }
 
-  // Advance to the next element.
-  llvm::Value *nextPtr = Builder.CreateConstGEP1_32(curPtr, 1, "array.next");
+  // FIXME: The code below intends to initialize the individual array base
+  // elements one at a time, but for multi-dimensional arrays the pointer
+  // arithmetic can get confused, so the fix below casts to the allocated
+  // type to keep the arithmetic right. The right approach is still to
+  // initialize the individual base elements one at a time, since that
+  // generates less code. The problem is that the wrong type is being
+  // passed into StoreAnyExprIntoOneUnit; directly fixing that doesn't
+  // work because the Init expression has the wrong type at this point.
+  // So this is OK as a quick fix, but we can and should do better here
+  // long-term.
 
+  // Advance to the next element by adjusting the pointer type as necessary.
+  // For new int[10][20][30], alloc type is int[20][30], base type is 'int'.
+  QualType AllocType = E->getAllocatedType();
+  llvm::Type *AllocPtrTy = ConvertTypeForMem(AllocType)->getPointerTo(
+      curPtr->getType()->getPointerAddressSpace());
+  llvm::Value *curPtrAllocTy = Builder.CreateBitCast(curPtr, AllocPtrTy);
+  llvm::Value *nextPtrAllocTy =
+      Builder.CreateConstGEP1_32(curPtrAllocTy, 1, "array.next");
+  // Cast it back to the base type so that we can compare it to the endPtr.
+  llvm::Value *nextPtr =
+      Builder.CreateBitCast(nextPtrAllocTy, endPtr->getType());
   // Check whether we've gotten to the end of the array and, if so,
   // exit the loop.
   llvm::Value *isEnd = Builder.CreateICmpEQ(nextPtr, endPtr, "array.atend");
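
The FIXME above spells out the multi-dimensional case this cast-forward, cast-back sequence fixes. A hedged example of the allocation shape involved:

// Illustrative only: here the allocated type is int[20][30] while
// initialization walks plain 'int' elements, so advancing by one element
// must GEP through a pointer to the allocated type and cast back before
// the comparison against endPtr.
int (*p)[20][30] = new int[10][20][30]();
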
@@ -991,20 +1013,20 @@
       getPlacementArgs()[I] = Arg;
     }
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       const FunctionProtoType *FPT
         = OperatorDelete->getType()->getAs<FunctionProtoType>();
-      assert(FPT->getNumArgs() == NumPlacementArgs + 1 ||
-             (FPT->getNumArgs() == 2 && NumPlacementArgs == 0));
+      assert(FPT->getNumParams() == NumPlacementArgs + 1 ||
+             (FPT->getNumParams() == 2 && NumPlacementArgs == 0));
 
       CallArgList DeleteArgs;
 
       // The first argument is always a void*.
-      FunctionProtoType::arg_type_iterator AI = FPT->arg_type_begin();
+      FunctionProtoType::param_type_iterator AI = FPT->param_type_begin();
       DeleteArgs.add(RValue::get(Ptr), *AI++);
 
       // A member 'operator delete' can take an extra 'size_t' argument.
-      if (FPT->getNumArgs() == NumPlacementArgs + 2)
+      if (FPT->getNumParams() == NumPlacementArgs + 2)
         DeleteArgs.add(RValue::get(AllocSize), *AI++);
 
       // Pass the rest of the arguments, which must match exactly.
@@ -1046,20 +1068,20 @@
       getPlacementArgs()[I] = Arg;
     }
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       const FunctionProtoType *FPT
         = OperatorDelete->getType()->getAs<FunctionProtoType>();
-      assert(FPT->getNumArgs() == NumPlacementArgs + 1 ||
-             (FPT->getNumArgs() == 2 && NumPlacementArgs == 0));
+      assert(FPT->getNumParams() == NumPlacementArgs + 1 ||
+             (FPT->getNumParams() == 2 && NumPlacementArgs == 0));
 
       CallArgList DeleteArgs;
 
       // The first argument is always a void*.
-      FunctionProtoType::arg_type_iterator AI = FPT->arg_type_begin();
+      FunctionProtoType::param_type_iterator AI = FPT->param_type_begin();
       DeleteArgs.add(Ptr.restore(CGF), *AI++);
 
       // A member 'operator delete' can take an extra 'size_t' argument.
-      if (FPT->getNumArgs() == NumPlacementArgs + 2) {
+      if (FPT->getNumParams() == NumPlacementArgs + 2) {
         RValue RV = AllocSize.restore(CGF);
         DeleteArgs.add(RV, *AI++);
       }
@@ -1145,35 +1167,12 @@
   
   allocatorArgs.add(RValue::get(allocSize), sizeType);
 
-  // Emit the rest of the arguments.
-  // FIXME: Ideally, this should just use EmitCallArgs.
-  CXXNewExpr::const_arg_iterator placementArg = E->placement_arg_begin();
-
-  // First, use the types from the function type.
   // We start at 1 here because the first argument (the allocation size)
   // has already been emitted.
-  for (unsigned i = 1, e = allocatorType->getNumArgs(); i != e;
-       ++i, ++placementArg) {
-    QualType argType = allocatorType->getArgType(i);
-
-    assert(getContext().hasSameUnqualifiedType(argType.getNonReferenceType(),
-                                               placementArg->getType()) &&
-           "type mismatch in call argument!");
-
-    EmitCallArg(allocatorArgs, *placementArg, argType);
-  }
-
-  // Either we've emitted all the call args, or we have a call to a
-  // variadic function.
-  assert((placementArg == E->placement_arg_end() ||
-          allocatorType->isVariadic()) &&
-         "Extra arguments to non-variadic function!");
-
-  // If we still have any arguments, emit them using the type of the argument.
-  for (CXXNewExpr::const_arg_iterator placementArgsEnd = E->placement_arg_end();
-       placementArg != placementArgsEnd; ++placementArg) {
-    EmitCallArg(allocatorArgs, *placementArg, placementArg->getType());
-  }
+  EmitCallArgs(allocatorArgs, allocatorType->isVariadic(),
+               allocatorType->param_type_begin() + 1,
+               allocatorType->param_type_end(), E->placement_arg_begin(),
+               E->placement_arg_end());
 
   // Emit the allocation call.  If the allocator is a global placement
   // operator, just "inline" it directly.
@@ -1289,14 +1288,14 @@
   // Check if we need to pass the size to the delete operator.
   llvm::Value *Size = 0;
   QualType SizeTy;
-  if (DeleteFTy->getNumArgs() == 2) {
-    SizeTy = DeleteFTy->getArgType(1);
+  if (DeleteFTy->getNumParams() == 2) {
+    SizeTy = DeleteFTy->getParamType(1);
     CharUnits DeleteTypeSize = getContext().getTypeSizeInChars(DeleteTy);
     Size = llvm::ConstantInt::get(ConvertType(SizeTy), 
                                   DeleteTypeSize.getQuantity());
   }
-  
-  QualType ArgTy = DeleteFTy->getArgType(0);
+
+  QualType ArgTy = DeleteFTy->getParamType(0);
   llvm::Value *DeletePtr = Builder.CreateBitCast(Ptr, ConvertType(ArgTy));
   DeleteArgs.add(RValue::get(DeletePtr), ArgTy);
 
@@ -1319,7 +1318,7 @@
                      QualType ElementType)
       : Ptr(Ptr), OperatorDelete(OperatorDelete), ElementType(ElementType) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       CGF.EmitDeleteCall(OperatorDelete, Ptr, ElementType);
     }
   };
@@ -1422,22 +1421,22 @@
       : Ptr(Ptr), OperatorDelete(OperatorDelete), NumElements(NumElements),
         ElementType(ElementType), CookieSize(CookieSize) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       const FunctionProtoType *DeleteFTy =
         OperatorDelete->getType()->getAs<FunctionProtoType>();
-      assert(DeleteFTy->getNumArgs() == 1 || DeleteFTy->getNumArgs() == 2);
+      assert(DeleteFTy->getNumParams() == 1 || DeleteFTy->getNumParams() == 2);
 
       CallArgList Args;
       
       // Pass the pointer as the first argument.
-      QualType VoidPtrTy = DeleteFTy->getArgType(0);
+      QualType VoidPtrTy = DeleteFTy->getParamType(0);
       llvm::Value *DeletePtr
         = CGF.Builder.CreateBitCast(Ptr, CGF.ConvertType(VoidPtrTy));
       Args.add(RValue::get(DeletePtr), VoidPtrTy);
 
       // Pass the original requested size as the second argument.
-      if (DeleteFTy->getNumArgs() == 2) {
-        QualType size_t = DeleteFTy->getArgType(1);
+      if (DeleteFTy->getNumParams() == 2) {
+        QualType size_t = DeleteFTy->getParamType(1);
         llvm::IntegerType *SizeTy
           = cast<llvm::IntegerType>(CGF.ConvertType(size_t));
         
diff --git a/lib/CodeGen/CGExprComplex.cpp b/lib/CodeGen/CGExprComplex.cpp
index 73d5bcb..1f84c86 100644
--- a/lib/CodeGen/CGExprComplex.cpp
+++ b/lib/CodeGen/CGExprComplex.cpp
@@ -93,7 +93,7 @@
   ComplexPairTy Visit(Expr *E) {
     return StmtVisitor<ComplexExprEmitter, ComplexPairTy>::Visit(E);
   }
-    
+
   ComplexPairTy VisitStmt(Stmt *S) {
     S->dump(CGF.getContext().getSourceManager());
     llvm_unreachable("Stmt can't have complex result type!");
@@ -410,7 +410,7 @@
   return ComplexPairTy(Val, llvm::Constant::getNullValue(Val->getType()));
 }
 
-ComplexPairTy ComplexExprEmitter::EmitCast(CastExpr::CastKind CK, Expr *Op, 
+ComplexPairTy ComplexExprEmitter::EmitCast(CastExpr::CastKind CK, Expr *Op,
                                            QualType DestTy) {
   switch (CK) {
   case CK_Dependent: llvm_unreachable("dependent cast kind in IR gen!");
@@ -427,7 +427,7 @@
   case CK_LValueBitCast: {
     LValue origLV = CGF.EmitLValue(Op);
     llvm::Value *V = origLV.getAddress();
-    V = Builder.CreateBitCast(V, 
+    V = Builder.CreateBitCast(V,
                     CGF.ConvertType(CGF.getContext().getPointerType(DestTy)));
     return EmitLoadOfLValue(CGF.MakeAddrLValue(V, DestTy,
                                                origLV.getAlignment()),
@@ -475,6 +475,7 @@
   case CK_CopyAndAutoreleaseBlockObject:
   case CK_BuiltinFnToFnPtr:
   case CK_ZeroToOCLEvent:
+  case CK_AddressSpaceConversion:
     llvm_unreachable("invalid cast kind for complex value");
 
   case CK_FloatingRealToComplex:
@@ -652,7 +653,7 @@
   assert(CGF.getContext().hasSameUnqualifiedType(OpInfo.Ty,
                                                  E->getRHS()->getType()));
   OpInfo.RHS = Visit(E->getRHS());
-  
+
   LValue LHS = CGF.EmitLValue(E->getLHS());
 
   // Load from the l-value and convert it.
@@ -702,7 +703,7 @@
 
 LValue ComplexExprEmitter::EmitBinAssignLValue(const BinaryOperator *E,
                                                ComplexPairTy &Val) {
-  assert(CGF.getContext().hasSameUnqualifiedType(E->getLHS()->getType(), 
+  assert(CGF.getContext().hasSameUnqualifiedType(E->getLHS()->getType(),
                                                  E->getRHS()->getType()) &&
          "Invalid assignment");
   TestAndClearIgnoreReal();
@@ -751,11 +752,13 @@
   // Bind the common expression if necessary.
   CodeGenFunction::OpaqueValueMapping binding(CGF, E);
 
+  RegionCounter Cnt = CGF.getPGORegionCounter(E);
   CodeGenFunction::ConditionalEvaluation eval(CGF);
-  CGF.EmitBranchOnBoolExpr(E->getCond(), LHSBlock, RHSBlock);
+  CGF.EmitBranchOnBoolExpr(E->getCond(), LHSBlock, RHSBlock, Cnt.getCount());
 
   eval.begin(CGF);
   CGF.EmitBlock(LHSBlock);
+  Cnt.beginRegion(Builder);
   ComplexPairTy LHS = Visit(E->getTrueExpr());
   LHSBlock = Builder.GetInsertBlock();
   CGF.EmitBranch(ContBlock);
diff --git a/lib/CodeGen/CGExprConstant.cpp b/lib/CodeGen/CGExprConstant.cpp
index f4d6861..82382dd 100644
--- a/lib/CodeGen/CGExprConstant.cpp
+++ b/lib/CodeGen/CGExprConstant.cpp
@@ -633,6 +633,9 @@
       return llvm::ConstantStruct::get(STy, Elts);
     }
 
+    case CK_AddressSpaceConversion:
+      return llvm::ConstantExpr::getAddrSpaceCast(C, destType);
+
     case CK_LValueToRValue:
     case CK_AtomicToNonAtomic:
     case CK_NonAtomicToAtomic:
@@ -917,7 +920,7 @@
     }
     case Expr::CallExprClass: {
       CallExpr* CE = cast<CallExpr>(E);
-      unsigned builtin = CE->isBuiltinCall();
+      unsigned builtin = CE->getBuiltinCallee();
       if (builtin !=
             Builtin::BI__builtin___CFStringMakeConstantString &&
           builtin !=
@@ -1062,13 +1065,13 @@
       if (!Offset->isNullValue()) {
         llvm::Constant *Casted = llvm::ConstantExpr::getBitCast(C, Int8PtrTy);
         Casted = llvm::ConstantExpr::getGetElementPtr(Casted, Offset);
-        C = llvm::ConstantExpr::getBitCast(Casted, C->getType());
+        C = llvm::ConstantExpr::getPointerCast(Casted, C->getType());
       }
 
       // Convert to the appropriate type; this could be an lvalue for
       // an integer.
       if (isa<llvm::PointerType>(DestTy))
-        return llvm::ConstantExpr::getBitCast(C, DestTy);
+        return llvm::ConstantExpr::getPointerCast(C, DestTy);
 
       return llvm::ConstantExpr::getPtrToInt(C, DestTy);
     } else {
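
The getBitCast-to-getPointerCast switch above is the address-space-aware choice: getPointerCast emits a plain bitcast when source and destination share an address space and an addrspacecast when they do not, where a forced bitcast would be invalid IR. A sketch, assuming an llvm::LLVMContext Ctx is in scope:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"

void pointerCastDemo(llvm::LLVMContext &Ctx) {
  llvm::Type *I8 = llvm::Type::getInt8Ty(Ctx);
  llvm::Constant *P = llvm::ConstantPointerNull::get(I8->getPointerTo(0));

  // Same address space: folds to a bitcast (a no-op here).
  llvm::Constant *A = llvm::ConstantExpr::getPointerCast(P, I8->getPointerTo(0));
  // Different address space: becomes an addrspacecast instead.
  llvm::Constant *B = llvm::ConstantExpr::getPointerCast(P, I8->getPointerTo(1));
  (void)A; (void)B;
}
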
@@ -1265,15 +1268,14 @@
     const ASTRecordLayout &Layout = CGM.getContext().getASTRecordLayout(RD);
 
     // Go through all bases and fill in any null pointer to data members.
-    for (CXXRecordDecl::base_class_const_iterator I = RD->bases_begin(),
-         E = RD->bases_end(); I != E; ++I) {
-      if (I->isVirtual()) {
+    for (const auto &I : RD->bases()) {
+      if (I.isVirtual()) {
         // Ignore virtual bases.
         continue;
       }
       
       const CXXRecordDecl *BaseDecl = 
-      cast<CXXRecordDecl>(I->getType()->getAs<RecordType>()->getDecl());
+      cast<CXXRecordDecl>(I.getType()->getAs<RecordType>()->getDecl());
       
       // Ignore empty bases.
       if (BaseDecl->isEmpty())
@@ -1285,7 +1287,7 @@
 
       uint64_t BaseOffset =
         CGM.getContext().toBits(Layout.getBaseClassOffset(BaseDecl));
-      FillInNullDataMemberPointers(CGM, I->getType(),
+      FillInNullDataMemberPointers(CGM, I.getType(),
                                    Elements, StartOffset + BaseOffset);
     }
     
@@ -1335,16 +1337,15 @@
   std::vector<llvm::Constant *> elements(numElements);
 
   // Fill in all the bases.
-  for (CXXRecordDecl::base_class_const_iterator
-         I = record->bases_begin(), E = record->bases_end(); I != E; ++I) {
-    if (I->isVirtual()) {
+  for (const auto &I : record->bases()) {
+    if (I.isVirtual()) {
       // Ignore virtual bases; if we're laying out for a complete
       // object, we'll lay these out later.
       continue;
     }
 
     const CXXRecordDecl *base = 
-      cast<CXXRecordDecl>(I->getType()->castAs<RecordType>()->getDecl());
+      cast<CXXRecordDecl>(I.getType()->castAs<RecordType>()->getDecl());
 
     // Ignore empty bases.
     if (base->isEmpty())
@@ -1356,28 +1357,24 @@
   }
 
   // Fill in all the fields.
-  for (RecordDecl::field_iterator I = record->field_begin(),
-         E = record->field_end(); I != E; ++I) {
-    const FieldDecl *field = *I;
-
+  for (const auto *Field : record->fields()) {
     // Fill in non-bitfields. (Bitfields always use a zero pattern, which we
     // will fill in later.)
-    if (!field->isBitField()) {
-      unsigned fieldIndex = layout.getLLVMFieldNo(field);
-      elements[fieldIndex] = CGM.EmitNullConstant(field->getType());
+    if (!Field->isBitField()) {
+      unsigned fieldIndex = layout.getLLVMFieldNo(Field);
+      elements[fieldIndex] = CGM.EmitNullConstant(Field->getType());
     }
 
     // For unions, stop after the first named field.
-    if (record->isUnion() && field->getDeclName())
+    if (record->isUnion() && Field->getDeclName())
       break;
   }
 
   // Fill in the virtual bases, if we're working with the complete object.
   if (asCompleteObject) {
-    for (CXXRecordDecl::base_class_const_iterator
-           I = record->vbases_begin(), E = record->vbases_end(); I != E; ++I) {
+    for (const auto &I : record->vbases()) {
       const CXXRecordDecl *base = 
-        cast<CXXRecordDecl>(I->getType()->castAs<RecordType>()->getDecl());
+        cast<CXXRecordDecl>(I.getType()->castAs<RecordType>()->getDecl());
 
       // Ignore empty bases.
       if (base->isEmpty())
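
The loops above are representative of the C++11 modernization running through this patch: explicit iterator pairs over bases, fields, and virtual bases become range-based for over the new range accessors, and the element is a reference rather than an iterator (hence I-> turning into I.). The new shape, assuming a clang::CXXRecordDecl *RD:

#include "clang/AST/DeclCXX.h"

void visitBases(const clang::CXXRecordDecl *RD) {
  for (const clang::CXXBaseSpecifier &Base : RD->bases()) {
    if (Base.isVirtual())
      continue;                      // handled separately via RD->vbases()
    clang::QualType T = Base.getType();
    (void)T;
  }
}
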
diff --git a/lib/CodeGen/CGExprScalar.cpp b/lib/CodeGen/CGExprScalar.cpp
index f3a5387..5f932b0 100644
--- a/lib/CodeGen/CGExprScalar.cpp
+++ b/lib/CodeGen/CGExprScalar.cpp
@@ -22,13 +22,13 @@
 #include "clang/AST/StmtVisitor.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Frontend/CodeGenOptions.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Support/CFG.h"
 #include <cstdarg>
 
 using namespace clang;
@@ -246,7 +246,7 @@
   }
   Value *VisitObjCMessageExpr(ObjCMessageExpr *E) {
     if (E->getMethodDecl() &&
-        E->getMethodDecl()->getResultType()->isReferenceType())
+        E->getMethodDecl()->getReturnType()->isReferenceType())
       return EmitLoadOfLValue(E);
     return CGF.EmitObjCMessageExpr(E).getScalarVal();
   }
@@ -367,11 +367,8 @@
     CGF.EmitCXXDeleteExpr(E);
     return 0;
   }
-  Value *VisitUnaryTypeTraitExpr(const UnaryTypeTraitExpr *E) {
-    return Builder.getInt1(E->getValue());
-  }
 
-  Value *VisitBinaryTypeTraitExpr(const BinaryTypeTraitExpr *E) {
+  Value *VisitTypeTraitExpr(const TypeTraitExpr *E) {
     return llvm::ConstantInt::get(ConvertType(E->getType()), E->getValue());
   }
 
@@ -1299,7 +1296,18 @@
   case CK_AnyPointerToBlockPointerCast:
   case CK_BitCast: {
     Value *Src = Visit(const_cast<Expr*>(E));
-    return Builder.CreateBitCast(Src, ConvertType(DestTy));
+    llvm::Type *SrcTy = Src->getType();
+    llvm::Type *DstTy = ConvertType(DestTy);
+    if (SrcTy->isPtrOrPtrVectorTy() && DstTy->isPtrOrPtrVectorTy() &&
+        SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace()) {
+      llvm::Type *MidTy = CGF.CGM.getDataLayout().getIntPtrType(SrcTy);
+      return Builder.CreateIntToPtr(Builder.CreatePtrToInt(Src, MidTy), DstTy);
+    }
+    return Builder.CreateBitCast(Src, DstTy);
+  }
+  case CK_AddressSpaceConversion: {
+    Value *Src = Visit(const_cast<Expr*>(E));
+    return Builder.CreateAddrSpaceCast(Src, ConvertType(DestTy));
   }
   case CK_AtomicToNonAtomic:
   case CK_NonAtomicToAtomic:
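
Two distinct lowerings appear above. CK_AddressSpaceConversion, the cast kind new in this rebase, maps directly onto addrspacecast; a CK_BitCast whose operands land in different LLVM address spaces, however, cannot legally be a bitcast, so it is routed through a pointer-sized integer. A sketch of both paths with IRBuilder (helper name invented):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"

llvm::Value *castPtr(llvm::IRBuilder<> &B, const llvm::DataLayout &DL,
                     llvm::Value *Src, llvm::Type *DstTy, bool ASConversion) {
  if (ASConversion)
    return B.CreateAddrSpaceCast(Src, DstTy);
  llvm::Type *SrcTy = Src->getType();
  if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace()) {
    // A bitcast would be invalid IR here; go through an integer instead.
    llvm::Type *MidTy = DL.getIntPtrType(SrcTy);
    return B.CreateIntToPtr(B.CreatePtrToInt(Src, MidTy), DstTy);
  }
  return B.CreateBitCast(Src, DstTy);
}
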
@@ -1360,7 +1368,7 @@
 
     // Make sure the array decay ends up being the right type.  This matters if
     // the array type was of an incomplete type.
-    return CGF.Builder.CreateBitCast(V, ConvertType(CE->getType()));
+    return CGF.Builder.CreatePointerCast(V, ConvertType(CE->getType()));
   }
   case CK_FunctionToPointerDecay:
     return EmitLValue(E).getAddress();
@@ -1485,7 +1493,7 @@
   }
 
   case CK_ZeroToOCLEvent: {
-    assert(DestTy->isEventT() && "CK_ZeroToOCLEvent cast on non event type");
+    assert(DestTy->isEventT() && "CK_ZeroToOCLEvent cast on non-event type");
     return llvm::Constant::getNullValue(ConvertType(DestTy));
   }
 
@@ -1727,8 +1735,9 @@
   if (atomicPHI) {
     llvm::BasicBlock *opBB = Builder.GetInsertBlock();
     llvm::BasicBlock *contBB = CGF.createBasicBlock("atomic_cont", CGF.CurFn);
-    llvm::Value *old = Builder.CreateAtomicCmpXchg(LV.getAddress(), atomicPHI,
-        CGF.EmitToMemory(value, type), llvm::SequentiallyConsistent);
+    llvm::Value *old = Builder.CreateAtomicCmpXchg(
+        LV.getAddress(), atomicPHI, CGF.EmitToMemory(value, type),
+        llvm::SequentiallyConsistent, llvm::SequentiallyConsistent);
     atomicPHI->addIncoming(old, opBB);
     llvm::Value *success = Builder.CreateICmpEQ(old, atomicPHI);
     Builder.CreateCondBr(success, contBB, opBB);
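
The reformatted call above also picks up an LLVM API change: CreateAtomicCmpXchg now takes separate success and failure orderings. The surrounding compare-exchange loop, sketched in isolation (Addr, Expected, Desired, ContBB and RetryBB assumed in scope; at this snapshot the builder still returns the old value directly):

llvm::Value *Old = Builder.CreateAtomicCmpXchg(
    Addr, Expected, Desired,
    llvm::SequentiallyConsistent,   // ordering if the exchange succeeds
    llvm::SequentiallyConsistent);  // ordering if it fails (must not be
                                    // Release or AcquireRelease)
llvm::Value *Ok = Builder.CreateICmpEQ(Old, Expected);
Builder.CreateCondBr(Ok, ContBB, RetryBB);
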
@@ -1906,7 +1915,7 @@
 
       QualType eltType;
       llvm::Value *numElts;
-      llvm::tie(numElts, eltType) = CGF.getVLASize(VAT);
+      std::tie(numElts, eltType) = CGF.getVLASize(VAT);
 
       llvm::Value *size = numElts;
 
@@ -2069,8 +2078,9 @@
   if (atomicPHI) {
     llvm::BasicBlock *opBB = Builder.GetInsertBlock();
     llvm::BasicBlock *contBB = CGF.createBasicBlock("atomic_cont", CGF.CurFn);
-    llvm::Value *old = Builder.CreateAtomicCmpXchg(LHSLV.getAddress(), atomicPHI,
-        CGF.EmitToMemory(Result, LHSTy), llvm::SequentiallyConsistent);
+    llvm::Value *old = Builder.CreateAtomicCmpXchg(
+        LHSLV.getAddress(), atomicPHI, CGF.EmitToMemory(Result, LHSTy),
+        llvm::SequentiallyConsistent, llvm::SequentiallyConsistent);
     atomicPHI->addIncoming(old, opBB);
     llvm::Value *success = Builder.CreateICmpEQ(old, atomicPHI);
     Builder.CreateCondBr(success, contBB, opBB);
@@ -2238,7 +2248,7 @@
   llvm::BasicBlock *initialBB = Builder.GetInsertBlock();
   llvm::Function::iterator insertPt = initialBB;
   llvm::BasicBlock *continueBB = CGF.createBasicBlock("nooverflow", CGF.CurFn,
-                                                      llvm::next(insertPt));
+                                                      std::next(insertPt));
   llvm::BasicBlock *overflowBB = CGF.createBasicBlock("overflow", CGF.CurFn);
 
   Builder.CreateCondBr(overflow, overflowBB, continueBB);
@@ -2529,7 +2539,7 @@
   if (const VariableArrayType *vla
         = CGF.getContext().getAsVariableArrayType(elementType)) {
     llvm::Value *numElements;
-    llvm::tie(numElements, elementType) = CGF.getVLASize(vla);
+    std::tie(numElements, elementType) = CGF.getVLASize(vla);
 
     divisor = numElements;
 
@@ -2818,11 +2828,11 @@
 
   switch (E->getLHS()->getType().getObjCLifetime()) {
   case Qualifiers::OCL_Strong:
-    llvm::tie(LHS, RHS) = CGF.EmitARCStoreStrong(E, Ignore);
+    std::tie(LHS, RHS) = CGF.EmitARCStoreStrong(E, Ignore);
     break;
 
   case Qualifiers::OCL_Autoreleasing:
-    llvm::tie(LHS,RHS) = CGF.EmitARCStoreAutoreleasing(E);
+    std::tie(LHS, RHS) = CGF.EmitARCStoreAutoreleasing(E);
     break;
 
   case Qualifiers::OCL_Weak:
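
llvm::tie giving way to std::tie here (like llvm::next to std::next earlier) is part of retiring LLVM's pre-C++11 shims. std::tie binds existing variables to the elements of a returned pair or tuple:

#include <tuple>
#include <utility>

std::pair<unsigned, bool> lookup();

void demo() {
  unsigned Count;
  bool Found;
  std::tie(Count, Found) = lookup();  // Count = .first, Found = .second
}
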
@@ -2866,8 +2876,12 @@
 }
 
 Value *ScalarExprEmitter::VisitBinLAnd(const BinaryOperator *E) {
+  RegionCounter Cnt = CGF.getPGORegionCounter(E);
+
   // Perform vector logical and on comparisons with zero vectors.
   if (E->getType()->isVectorType()) {
+    Cnt.beginRegion(Builder);
+
     Value *LHS = Visit(E->getLHS());
     Value *RHS = Visit(E->getRHS());
     Value *Zero = llvm::ConstantAggregateZero::get(LHS->getType());
@@ -2889,6 +2903,8 @@
   bool LHSCondVal;
   if (CGF.ConstantFoldsToSimpleInteger(E->getLHS(), LHSCondVal)) {
     if (LHSCondVal) { // If we have 1 && X, just emit X.
+      Cnt.beginRegion(Builder);
+
       Value *RHSCond = CGF.EvaluateExprAsBool(E->getRHS());
       // ZExt result to int or bool.
       return Builder.CreateZExtOrBitCast(RHSCond, ResTy, "land.ext");
@@ -2905,7 +2921,7 @@
   CodeGenFunction::ConditionalEvaluation eval(CGF);
 
   // Branch on the LHS first.  If it is false, go to the failure (cont) block.
-  CGF.EmitBranchOnBoolExpr(E->getLHS(), RHSBlock, ContBlock);
+  CGF.EmitBranchOnBoolExpr(E->getLHS(), RHSBlock, ContBlock, Cnt.getCount());
 
   // Any edges into the ContBlock now come from an (indeterminate number of)
   // edges out of this first condition.  All of these values will be false.  Start
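
Where the extra Cnt.getCount() argument ends up: the count feeds the branch-weight metadata attached to the emitted conditional branch, the same mechanism used explicitly via PGO.createBranchWeights in the Objective-C fast-enumeration hunks later in this patch. A sketch of that mechanism with LLVM's MDBuilder (counts invented; Builder, CondV, RHSBlock and ContBlock assumed in scope):

#include "llvm/IR/MDBuilder.h"

llvm::MDBuilder MDHelper(Builder.getContext());
// Say the profile shows the LHS of '&&' was true 90 times out of 100:
llvm::MDNode *Weights =
    MDHelper.createBranchWeights(/*TrueWeight=*/90, /*FalseWeight=*/10);
Builder.CreateCondBr(CondV, RHSBlock, ContBlock, Weights);
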
@@ -2918,6 +2934,7 @@
 
   eval.begin(CGF);
   CGF.EmitBlock(RHSBlock);
+  Cnt.beginRegion(Builder);
   Value *RHSCond = CGF.EvaluateExprAsBool(E->getRHS());
   eval.end(CGF);
 
@@ -2937,8 +2954,12 @@
 }
 
 Value *ScalarExprEmitter::VisitBinLOr(const BinaryOperator *E) {
+  RegionCounter Cnt = CGF.getPGORegionCounter(E);
+
   // Perform vector logical or on comparisons with zero vectors.
   if (E->getType()->isVectorType()) {
+    Cnt.beginRegion(Builder);
+
     Value *LHS = Visit(E->getLHS());
     Value *RHS = Visit(E->getRHS());
     Value *Zero = llvm::ConstantAggregateZero::get(LHS->getType());
@@ -2960,6 +2981,8 @@
   bool LHSCondVal;
   if (CGF.ConstantFoldsToSimpleInteger(E->getLHS(), LHSCondVal)) {
     if (!LHSCondVal) { // If we have 0 || X, just emit X.
+      Cnt.beginRegion(Builder);
+
       Value *RHSCond = CGF.EvaluateExprAsBool(E->getRHS());
       // ZExt result to int or bool.
       return Builder.CreateZExtOrBitCast(RHSCond, ResTy, "lor.ext");
@@ -2976,7 +2999,8 @@
   CodeGenFunction::ConditionalEvaluation eval(CGF);
 
   // Branch on the LHS first.  If it is true, go to the success (cont) block.
-  CGF.EmitBranchOnBoolExpr(E->getLHS(), ContBlock, RHSBlock);
+  CGF.EmitBranchOnBoolExpr(E->getLHS(), ContBlock, RHSBlock,
+                           Cnt.getParentCount() - Cnt.getCount());
 
   // Any edges into the ContBlock now come from an (indeterminate number of)
   // edges out of this first condition.  All of these values will be true.  Start
@@ -2991,6 +3015,7 @@
 
   // Emit the RHS condition as a bool value.
   CGF.EmitBlock(RHSBlock);
+  Cnt.beginRegion(Builder);
   Value *RHSCond = CGF.EvaluateExprAsBool(E->getRHS());
 
   eval.end(CGF);
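
The weight expression for '||' above is worth spelling out: the counter region is attached to the RHS, which only runs when the LHS is false, so the count for the LHS-true edge is the parent count minus the region count. With invented numbers:

#include <cstdint>

// Hypothetical PGO counts, for illustration only.
uint64_t ParentCount = 100;                     // '||' reached 100 times
uint64_t RHSCount    = 30;                      // RHS evaluated 30 times
uint64_t TrueWeight  = ParentCount - RHSCount;  // LHS short-circuited: 70
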
@@ -3041,6 +3066,7 @@
 
   // Bind the common expression if necessary.
   CodeGenFunction::OpaqueValueMapping binding(CGF, E);
+  RegionCounter Cnt = CGF.getPGORegionCounter(E);
 
   Expr *condExpr = E->getCond();
   Expr *lhsExpr = E->getTrueExpr();
@@ -3055,6 +3081,8 @@
 
     // If the dead side doesn't have labels we need, just emit the Live part.
     if (!CGF.ContainsLabel(dead)) {
+      if (CondExprBool)
+        Cnt.beginRegion(Builder);
       Value *Result = Visit(live);
 
       // If the live part is a throw expression, it acts like it has a void
@@ -3071,6 +3099,8 @@
   // the select function.
   if (CGF.getLangOpts().OpenCL
       && condExpr->getType()->isVectorType()) {
+    Cnt.beginRegion(Builder);
+
     llvm::Value *CondV = CGF.EmitScalarExpr(condExpr);
     llvm::Value *LHS = Visit(lhsExpr);
     llvm::Value *RHS = Visit(rhsExpr);
@@ -3114,6 +3144,8 @@
   // safe to evaluate the LHS and RHS unconditionally.
   if (isCheapEnoughToEvaluateUnconditionally(lhsExpr, CGF) &&
       isCheapEnoughToEvaluateUnconditionally(rhsExpr, CGF)) {
+    Cnt.beginRegion(Builder);
+
     llvm::Value *CondV = CGF.EvaluateExprAsBool(condExpr);
     llvm::Value *LHS = Visit(lhsExpr);
     llvm::Value *RHS = Visit(rhsExpr);
@@ -3130,9 +3162,10 @@
   llvm::BasicBlock *ContBlock = CGF.createBasicBlock("cond.end");
 
   CodeGenFunction::ConditionalEvaluation eval(CGF);
-  CGF.EmitBranchOnBoolExpr(condExpr, LHSBlock, RHSBlock);
+  CGF.EmitBranchOnBoolExpr(condExpr, LHSBlock, RHSBlock, Cnt.getCount());
 
   CGF.EmitBlock(LHSBlock);
+  Cnt.beginRegion(Builder);
   eval.begin(CGF);
   Value *LHS = Visit(lhsExpr);
   eval.end(CGF);
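
The conditional-operator visitor picks between two shapes: when both arms are cheap and safe to evaluate, 'cond ? a : b' collapses to a single select with no control flow (which is why only a counter region begins on that path), and otherwise it builds cond.true/cond.false/cond.end blocks around a weighted branch, as above. The select form, assuming Builder, CondV, LHS and RHS in scope:

llvm::Value *Result = Builder.CreateSelect(CondV, LHS, RHS, "cond");
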
diff --git a/lib/CodeGen/CGObjC.cpp b/lib/CodeGen/CGObjC.cpp
index 0bda053..f78bb0b 100644
--- a/lib/CodeGen/CGObjC.cpp
+++ b/lib/CodeGen/CGObjC.cpp
@@ -22,7 +22,7 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/CallSite.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/InlineAsm.h"
 using namespace clang;
@@ -79,10 +79,10 @@
   RValue RV = EmitAnyExpr(SubExpr);
   CallArgList Args;
   Args.add(RV, ArgQT);
-  
-  RValue result = Runtime.GenerateMessageSend(*this, ReturnValueSlot(), 
-                                              BoxingMethod->getResultType(), Sel, Receiver, Args, 
-                                              ClassDecl, BoxingMethod);
+
+  RValue result = Runtime.GenerateMessageSend(
+      *this, ReturnValueSlot(), BoxingMethod->getReturnType(), Sel, Receiver,
+      Args, ClassDecl, BoxingMethod);
   return Builder.CreateBitCast(result.getScalarVal(), 
                                ConvertType(E->getType()));
 }
@@ -186,12 +186,9 @@
   llvm::Value *Receiver = Runtime.GetClass(*this, Class);
 
   // Generate the message send.
-  RValue result
-    = Runtime.GenerateMessageSend(*this, ReturnValueSlot(), 
-                                  MethodWithObjects->getResultType(),
-                                  Sel,
-                                  Receiver, Args, Class,
-                                  MethodWithObjects);
+  RValue result = Runtime.GenerateMessageSend(
+      *this, ReturnValueSlot(), MethodWithObjects->getReturnType(), Sel,
+      Receiver, Args, Class, MethodWithObjects);
 
   // The above message send needs these objects, but in ARC they are
   // passed in a buffer that is essentially __unsafe_unretained.
@@ -238,7 +235,7 @@
     return Result;
 
   if (!Method->hasRelatedResultType() ||
-      CGF.getContext().hasSameType(ExpT, Method->getResultType()) ||
+      CGF.getContext().hasSameType(ExpT, Method->getReturnType()) ||
       !Result.isScalar())
     return Result;
   
@@ -369,8 +366,7 @@
       shouldExtendReceiverForInnerPointerMessage(E))
     Receiver = EmitARCRetainAutorelease(ReceiverType, Receiver);
 
-  QualType ResultType =
-    method ? method->getResultType() : E->getType();
+  QualType ResultType = method ? method->getReturnType() : E->getType();
 
   CallArgList Args;
   EmitCallArgs(Args, method, E->arg_begin(), E->arg_end());
@@ -435,7 +431,7 @@
 
 namespace {
 struct FinishARCDealloc : EHScopeStack::Cleanup {
-  void Emit(CodeGenFunction &CGF, Flags flags) {
+  void Emit(CodeGenFunction &CGF, Flags flags) override {
     const ObjCMethodDecl *method = cast<ObjCMethodDecl>(CGF.CurCodeDecl);
 
     const ObjCImplDecl *impl = cast<ObjCImplDecl>(method->getDeclContext());
@@ -480,13 +476,12 @@
   args.push_back(OMD->getSelfDecl());
   args.push_back(OMD->getCmdDecl());
 
-  for (ObjCMethodDecl::param_const_iterator PI = OMD->param_begin(),
-         E = OMD->param_end(); PI != E; ++PI)
-    args.push_back(*PI);
+  for (const auto *PI : OMD->params())
+    args.push_back(PI);
 
   CurGD = OMD;
 
-  StartFunction(OMD, OMD->getResultType(), Fn, FI, args, StartLoc);
+  StartFunction(OMD, OMD->getReturnType(), Fn, FI, args, StartLoc);
 
   // In ARC, certain methods get an extra cleanup.
   if (CGM.getLangOpts().ObjCAutoRefCount &&
@@ -506,8 +501,14 @@
 /// its pointer, name, and types registered in the class structure.
 void CodeGenFunction::GenerateObjCMethod(const ObjCMethodDecl *OMD) {
   StartObjCMethod(OMD, OMD->getClassInterface(), OMD->getLocStart());
-  EmitStmt(OMD->getBody());
+  PGO.assignRegionCounters(OMD, CurFn);
+  assert(isa<CompoundStmt>(OMD->getBody()));
+  RegionCounter Cnt = getPGORegionCounter(OMD->getBody());
+  Cnt.beginRegion(Builder);
+  EmitCompoundStmtWithoutScope(*cast<CompoundStmt>(OMD->getBody()));
   FinishFunction(OMD->getBodyRBrace());
+  PGO.emitInstrumentationData();
+  PGO.destroyRegionCounters();
 }
 
 /// emitStructGetterCall - Call the runtime function to load a property
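
The new GenerateObjCMethod body gives Objective-C methods the same per-function instrumentation lifecycle as C and C++ functions. Restated as a commented outline of the Clang-internal calls used above:

// 1. PGO.assignRegionCounters(OMD, CurFn)   map counters onto this body
// 2. Cnt.beginRegion(Builder)               count entry into the body
// 3. EmitCompoundStmtWithoutScope(...)      emit the body; inner regions
//                                           begin as statements are visited
// 4. FinishFunction(...)                    usual epilogue
// 5. PGO.emitInstrumentationData()          emit the counter variables
// 6. PGO.destroyRegionCounters()            drop per-function PGO state
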
@@ -622,8 +623,8 @@
   // Evaluate the ivar's size and alignment.
   ObjCIvarDecl *ivar = propImpl->getPropertyIvarDecl();
   QualType ivarType = ivar->getType();
-  llvm::tie(IvarSize, IvarAlignment)
-    = CGM.getContext().getTypeInfoInChars(ivarType);
+  std::tie(IvarSize, IvarAlignment) =
+      CGM.getContext().getTypeInfoInChars(ivarType);
 
   // If we have a copy property, we always have to use getProperty/setProperty.
   // TODO: we could actually use setProperty and an expression for non-atomics.
@@ -895,16 +896,21 @@
 
     // FIXME: We shouldn't need to get the function info here, the
     // runtime already should have computed it to build the function.
+    llvm::Instruction *CallInstruction;
     RValue RV = EmitCall(getTypes().arrangeFreeFunctionCall(propType, args,
                                                        FunctionType::ExtInfo(),
                                                             RequiredArgs::All),
-                         getPropertyFn, ReturnValueSlot(), args);
+                         getPropertyFn, ReturnValueSlot(), args, 0,
+                         &CallInstruction);
+    if (llvm::CallInst *call = dyn_cast<llvm::CallInst>(CallInstruction))
+      call->setTailCall();
 
     // We need to fix the type here. Ivars with copy & retain are
     // always objects so we don't need to worry about complex or
     // aggregates.
-    RV = RValue::get(Builder.CreateBitCast(RV.getScalarVal(),
-           getTypes().ConvertType(getterMethod->getResultType())));
+    RV = RValue::get(Builder.CreateBitCast(
+        RV.getScalarVal(),
+        getTypes().ConvertType(getterMethod->getReturnType())));
 
     EmitReturnOfRValue(RV, propType);
 
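
The new out-parameter threaded through EmitCall exists here so the objc_getProperty call can be marked as a tail call. The marking itself is a one-liner on the instruction (Inst assumed to be the returned llvm::Instruction*):

if (llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(Inst))
  Call->setTailCall();  // permits the backend to reuse the caller's frame
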
@@ -955,8 +961,8 @@
         }
 
         value = Builder.CreateBitCast(value, ConvertType(propType));
-        value = Builder.CreateBitCast(value, 
-                  ConvertType(GetterMethodDecl->getResultType()));
+        value = Builder.CreateBitCast(
+            value, ConvertType(GetterMethodDecl->getReturnType()));
       }
       
       EmitReturnOfRValue(RValue::get(value), propType);
@@ -1292,7 +1298,7 @@
       : addr(addr), ivar(ivar), destroyer(destroyer),
         useEHCleanupForArray(useEHCleanupForArray) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       LValue lvalue
         = CGF.EmitLValueForIvar(CGF.TypeOfSelfObject(), addr, ivar, /*CVR*/ 0);
       CGF.emitDestroy(lvalue.getAddress(), ivar->getType(), destroyer,
@@ -1356,12 +1362,9 @@
     // Suppress the final autorelease in ARC.
     AutoreleaseResult = false;
 
-    SmallVector<CXXCtorInitializer *, 8> IvarInitializers;
-    for (ObjCImplementationDecl::init_const_iterator B = IMP->init_begin(),
-           E = IMP->init_end(); B != E; ++B) {
-      CXXCtorInitializer *IvarInit = (*B);
+    for (const auto *IvarInit : IMP->inits()) {
       FieldDecl *Field = IvarInit->getAnyMember();
-      ObjCIvarDecl  *Ivar = cast<ObjCIvarDecl>(Field);
+      ObjCIvarDecl *Ivar = cast<ObjCIvarDecl>(Field);
       LValue LV = EmitLValueForIvar(TypeOfSelfObject(), 
                                     LoadObjCSelf(), Ivar, 0);
       EmitAggExpr(IvarInit->getInit(),
@@ -1506,9 +1509,13 @@
   llvm::Value *zero = llvm::Constant::getNullValue(UnsignedLongLTy);
 
   // If the limit pointer was zero to begin with, the collection is
-  // empty; skip all this.
+  // empty; skip all this. Set the branch weight assuming this has the same
+  // probability of exiting the loop as any other loop exit.
+  uint64_t EntryCount = PGO.getCurrentRegionCount();
+  RegionCounter Cnt = getPGORegionCounter(&S);
   Builder.CreateCondBr(Builder.CreateICmpEQ(initialBufferLimit, zero, "iszero"),
-                       EmptyBB, LoopInitBB);
+                       EmptyBB, LoopInitBB,
+                       PGO.createBranchWeights(EntryCount, Cnt.getCount()));
 
   // Otherwise, initialize the loop.
   EmitBlock(LoopInitBB);
@@ -1537,6 +1544,8 @@
   llvm::PHINode *count = Builder.CreatePHI(UnsignedLongLTy, 3, "forcoll.count");
   count->addIncoming(initialBufferLimit, LoopInitBB);
 
+  Cnt.beginRegion(Builder);
+
   // Check whether the mutations value has changed from where it was
   // at start.  StateMutationsPtr should actually be invariant between
   // refreshes.
@@ -1644,8 +1653,12 @@
     = Builder.CreateAdd(index, llvm::ConstantInt::get(UnsignedLongLTy, 1));
 
   // If we haven't overrun the buffer yet, we can continue.
+  // Set the branch weights based on the simplifying assumption that this is
+  // like a while-loop, i.e., ignoring that the false branch fetches more
+  // elements and then returns to the loop.
   Builder.CreateCondBr(Builder.CreateICmpULT(indexPlusOne, count),
-                       LoopBodyBB, FetchMoreBB);
+                       LoopBodyBB, FetchMoreBB,
+                       PGO.createBranchWeights(Cnt.getCount(), EntryCount));
 
   index->addIncoming(indexPlusOne, AfterBody.getBlock());
   count->addIncoming(count, AfterBody.getBlock());
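
Both weighted branches in this loop lean on the approximation the comments describe: treat the buffer-refetch exit as if it were a plain while-loop exit, so the loop-body count and the statement-entry count serve as the taken/not-taken weights. With invented counts:

#include <cstdint>

uint64_t EntryCount = 10;   // for-in statement entered 10 times
uint64_t BodyCount  = 500;  // loop body ran 500 times in total
// First branch:  weights (EntryCount, BodyCount) favor entering the loop.
// Second branch: weights (BodyCount, EntryCount) favor staying in the buffer.
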
@@ -1715,7 +1728,7 @@
     CallObjCRelease(llvm::Value *object) : object(object) {}
     llvm::Value *object;
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       // Releases at the end of the full-expression are imprecise.
       CGF.EmitARCRelease(object, ARCImpreciseLifetime);
     }
@@ -2324,7 +2337,7 @@
 
     CallObjCAutoreleasePoolObject(llvm::Value *token) : Token(token) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       CGF.EmitObjCAutoreleasePoolPop(Token);
     }
   };
@@ -2333,7 +2346,7 @@
 
     CallObjCMRRAutoreleasePoolObject(llvm::Value *token) : Token(token) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       CGF.EmitObjCMRRAutoreleasePoolPop(Token);
     }
   };
@@ -2824,9 +2837,8 @@
     EHStack.pushCleanup<CallObjCMRRAutoreleasePoolObject>(NormalCleanup, token);
   }
 
-  for (CompoundStmt::const_body_iterator I = S.body_begin(),
-       E = S.body_end(); I != E; ++I)
-    EmitStmt(*I);
+  for (const auto *I : S.body())
+    EmitStmt(I);
 
   if (DI)
     DI->EmitLexicalBlockEnd(Builder, S.getRBracLoc());
@@ -2892,12 +2904,10 @@
   args.push_back(&dstDecl);
   ImplicitParamDecl srcDecl(FD, SourceLocation(), 0, SrcTy);
   args.push_back(&srcDecl);
-  
-  const CGFunctionInfo &FI =
-    CGM.getTypes().arrangeFunctionDeclaration(C.VoidTy, args,
-                                              FunctionType::ExtInfo(),
-                                              RequiredArgs::All);
-  
+
+  const CGFunctionInfo &FI = CGM.getTypes().arrangeFreeFunctionDeclaration(
+      C.VoidTy, args, FunctionType::ExtInfo(), RequiredArgs::All);
+
   llvm::FunctionType *LTy = CGM.getTypes().GetFunctionType(FI);
   
   llvm::Function *Fn =
@@ -2973,12 +2983,10 @@
   args.push_back(&dstDecl);
   ImplicitParamDecl srcDecl(FD, SourceLocation(), 0, SrcTy);
   args.push_back(&srcDecl);
-  
-  const CGFunctionInfo &FI =
-  CGM.getTypes().arrangeFunctionDeclaration(C.VoidTy, args,
-                                            FunctionType::ExtInfo(),
-                                            RequiredArgs::All);
-  
+
+  const CGFunctionInfo &FI = CGM.getTypes().arrangeFreeFunctionDeclaration(
+      C.VoidTy, args, FunctionType::ExtInfo(), RequiredArgs::All);
+
   llvm::FunctionType *LTy = CGM.getTypes().GetFunctionType(FI);
   
   llvm::Function *Fn =
diff --git a/lib/CodeGen/CGObjCGNU.cpp b/lib/CodeGen/CGObjCGNU.cpp
index a7ab850..2689d7b 100644
--- a/lib/CodeGen/CGObjCGNU.cpp
+++ b/lib/CodeGen/CGObjCGNU.cpp
@@ -27,11 +27,11 @@
 #include "clang/Basic/SourceManager.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/Compiler.h"
 #include <cstdarg>
 
@@ -479,102 +479,91 @@
   CGObjCGNU(CodeGenModule &cgm, unsigned runtimeABIVersion,
       unsigned protocolClassVersion);
 
-  virtual llvm::Constant *GenerateConstantString(const StringLiteral *);
+  llvm::Constant *GenerateConstantString(const StringLiteral *) override;
 
-  virtual RValue
-  GenerateMessageSend(CodeGenFunction &CGF,
-                      ReturnValueSlot Return,
-                      QualType ResultType,
-                      Selector Sel,
-                      llvm::Value *Receiver,
-                      const CallArgList &CallArgs,
+  RValue
+  GenerateMessageSend(CodeGenFunction &CGF, ReturnValueSlot Return,
+                      QualType ResultType, Selector Sel,
+                      llvm::Value *Receiver, const CallArgList &CallArgs,
                       const ObjCInterfaceDecl *Class,
-                      const ObjCMethodDecl *Method);
-  virtual RValue
-  GenerateMessageSendSuper(CodeGenFunction &CGF,
-                           ReturnValueSlot Return,
-                           QualType ResultType,
-                           Selector Sel,
+                      const ObjCMethodDecl *Method) override;
+  RValue
+  GenerateMessageSendSuper(CodeGenFunction &CGF, ReturnValueSlot Return,
+                           QualType ResultType, Selector Sel,
                            const ObjCInterfaceDecl *Class,
-                           bool isCategoryImpl,
-                           llvm::Value *Receiver,
-                           bool IsClassMessage,
-                           const CallArgList &CallArgs,
-                           const ObjCMethodDecl *Method);
-  virtual llvm::Value *GetClass(CodeGenFunction &CGF,
-                                const ObjCInterfaceDecl *OID);
-  virtual llvm::Value *GetSelector(CodeGenFunction &CGF, Selector Sel,
-                                   bool lval = false);
-  virtual llvm::Value *GetSelector(CodeGenFunction &CGF, const ObjCMethodDecl
-      *Method);
-  virtual llvm::Constant *GetEHType(QualType T);
+                           bool isCategoryImpl, llvm::Value *Receiver,
+                           bool IsClassMessage, const CallArgList &CallArgs,
+                           const ObjCMethodDecl *Method) override;
+  llvm::Value *GetClass(CodeGenFunction &CGF,
+                        const ObjCInterfaceDecl *OID) override;
+  llvm::Value *GetSelector(CodeGenFunction &CGF, Selector Sel,
+                           bool lval = false) override;
+  llvm::Value *GetSelector(CodeGenFunction &CGF,
+                           const ObjCMethodDecl *Method) override;
+  llvm::Constant *GetEHType(QualType T) override;
 
-  virtual llvm::Function *GenerateMethod(const ObjCMethodDecl *OMD,
-                                         const ObjCContainerDecl *CD);
-  virtual void GenerateCategory(const ObjCCategoryImplDecl *CMD);
-  virtual void GenerateClass(const ObjCImplementationDecl *ClassDecl);
-  virtual void RegisterAlias(const ObjCCompatibleAliasDecl *OAD);
-  virtual llvm::Value *GenerateProtocolRef(CodeGenFunction &CGF,
-                                           const ObjCProtocolDecl *PD);
-  virtual void GenerateProtocol(const ObjCProtocolDecl *PD);
-  virtual llvm::Function *ModuleInitFunction();
-  virtual llvm::Constant *GetPropertyGetFunction();
-  virtual llvm::Constant *GetPropertySetFunction();
-  virtual llvm::Constant *GetOptimizedPropertySetFunction(bool atomic, 
-                                                          bool copy);
-  virtual llvm::Constant *GetSetStructFunction();
-  virtual llvm::Constant *GetGetStructFunction();
-  virtual llvm::Constant *GetCppAtomicObjectGetFunction();
-  virtual llvm::Constant *GetCppAtomicObjectSetFunction();
-  virtual llvm::Constant *EnumerationMutationFunction();
+  llvm::Function *GenerateMethod(const ObjCMethodDecl *OMD,
+                                 const ObjCContainerDecl *CD) override;
+  void GenerateCategory(const ObjCCategoryImplDecl *CMD) override;
+  void GenerateClass(const ObjCImplementationDecl *ClassDecl) override;
+  void RegisterAlias(const ObjCCompatibleAliasDecl *OAD) override;
+  llvm::Value *GenerateProtocolRef(CodeGenFunction &CGF,
+                                   const ObjCProtocolDecl *PD) override;
+  void GenerateProtocol(const ObjCProtocolDecl *PD) override;
+  llvm::Function *ModuleInitFunction() override;
+  llvm::Constant *GetPropertyGetFunction() override;
+  llvm::Constant *GetPropertySetFunction() override;
+  llvm::Constant *GetOptimizedPropertySetFunction(bool atomic,
+                                                  bool copy) override;
+  llvm::Constant *GetSetStructFunction() override;
+  llvm::Constant *GetGetStructFunction() override;
+  llvm::Constant *GetCppAtomicObjectGetFunction() override;
+  llvm::Constant *GetCppAtomicObjectSetFunction() override;
+  llvm::Constant *EnumerationMutationFunction() override;
 
-  virtual void EmitTryStmt(CodeGenFunction &CGF,
-                           const ObjCAtTryStmt &S);
-  virtual void EmitSynchronizedStmt(CodeGenFunction &CGF,
-                                    const ObjCAtSynchronizedStmt &S);
-  virtual void EmitThrowStmt(CodeGenFunction &CGF,
-                             const ObjCAtThrowStmt &S,
-                             bool ClearInsertionPoint=true);
-  virtual llvm::Value * EmitObjCWeakRead(CodeGenFunction &CGF,
-                                         llvm::Value *AddrWeakObj);
-  virtual void EmitObjCWeakAssign(CodeGenFunction &CGF,
-                                  llvm::Value *src, llvm::Value *dst);
-  virtual void EmitObjCGlobalAssign(CodeGenFunction &CGF,
-                                    llvm::Value *src, llvm::Value *dest,
-                                    bool threadlocal=false);
-  virtual void EmitObjCIvarAssign(CodeGenFunction &CGF,
-                                    llvm::Value *src, llvm::Value *dest,
-                                    llvm::Value *ivarOffset);
-  virtual void EmitObjCStrongCastAssign(CodeGenFunction &CGF,
-                                        llvm::Value *src, llvm::Value *dest);
-  virtual void EmitGCMemmoveCollectable(CodeGenFunction &CGF,
-                                        llvm::Value *DestPtr,
-                                        llvm::Value *SrcPtr,
-                                        llvm::Value *Size);
-  virtual LValue EmitObjCValueForIvar(CodeGenFunction &CGF,
-                                      QualType ObjectTy,
-                                      llvm::Value *BaseValue,
-                                      const ObjCIvarDecl *Ivar,
-                                      unsigned CVRQualifiers);
-  virtual llvm::Value *EmitIvarOffset(CodeGenFunction &CGF,
-                                      const ObjCInterfaceDecl *Interface,
-                                      const ObjCIvarDecl *Ivar);
-  virtual llvm::Value *EmitNSAutoreleasePoolClassRef(CodeGenFunction &CGF);
-  virtual llvm::Constant *BuildGCBlockLayout(CodeGenModule &CGM,
-                                             const CGBlockInfo &blockInfo) {
+  void EmitTryStmt(CodeGenFunction &CGF,
+                   const ObjCAtTryStmt &S) override;
+  void EmitSynchronizedStmt(CodeGenFunction &CGF,
+                            const ObjCAtSynchronizedStmt &S) override;
+  void EmitThrowStmt(CodeGenFunction &CGF,
+                     const ObjCAtThrowStmt &S,
+                     bool ClearInsertionPoint=true) override;
+  llvm::Value *EmitObjCWeakRead(CodeGenFunction &CGF,
+                                llvm::Value *AddrWeakObj) override;
+  void EmitObjCWeakAssign(CodeGenFunction &CGF,
+                          llvm::Value *src, llvm::Value *dst) override;
+  void EmitObjCGlobalAssign(CodeGenFunction &CGF,
+                            llvm::Value *src, llvm::Value *dest,
+                            bool threadlocal=false) override;
+  void EmitObjCIvarAssign(CodeGenFunction &CGF, llvm::Value *src,
+                          llvm::Value *dest, llvm::Value *ivarOffset) override;
+  void EmitObjCStrongCastAssign(CodeGenFunction &CGF,
+                                llvm::Value *src, llvm::Value *dest) override;
+  void EmitGCMemmoveCollectable(CodeGenFunction &CGF, llvm::Value *DestPtr,
+                                llvm::Value *SrcPtr,
+                                llvm::Value *Size) override;
+  LValue EmitObjCValueForIvar(CodeGenFunction &CGF, QualType ObjectTy,
+                              llvm::Value *BaseValue, const ObjCIvarDecl *Ivar,
+                              unsigned CVRQualifiers) override;
+  llvm::Value *EmitIvarOffset(CodeGenFunction &CGF,
+                              const ObjCInterfaceDecl *Interface,
+                              const ObjCIvarDecl *Ivar) override;
+  llvm::Value *EmitNSAutoreleasePoolClassRef(CodeGenFunction &CGF) override;
+  llvm::Constant *BuildGCBlockLayout(CodeGenModule &CGM,
+                                     const CGBlockInfo &blockInfo) override {
     return NULLPtr;
   }
-  virtual llvm::Constant *BuildRCBlockLayout(CodeGenModule &CGM,
-                                             const CGBlockInfo &blockInfo) {
+  llvm::Constant *BuildRCBlockLayout(CodeGenModule &CGM,
+                                     const CGBlockInfo &blockInfo) override {
     return NULLPtr;
   }
-  
-  virtual llvm::Constant *BuildByrefLayout(CodeGenModule &CGM,
-                                           QualType T) {
+
+  llvm::Constant *BuildByrefLayout(CodeGenModule &CGM, QualType T) override {
     return NULLPtr;
   }
-  
-  virtual llvm::GlobalVariable *GetClassGlobal(const std::string &Name) {
+
+  llvm::GlobalVariable *GetClassGlobal(const std::string &Name,
+                                       bool Weak = false) override {
     return 0;
   }
 };
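
The blanket virtual-to-override conversion in this class declaration is not just style: with C++11's override the compiler verifies each method actually matches a virtual in the base class, so a base-class signature change (such as the new Weak parameter on GetClassGlobal above) breaks the build instead of silently leaving a non-overriding method behind. A self-contained illustration:

struct Base {
  virtual int get(bool flag) { return 0; }
  virtual ~Base() {}
};

struct Derived : Base {
  int get(bool flag) override { return 1; }    // OK: overrides Base::get
  // int get(int flag) override { return 2; }  // error: overrides nothing
};
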
@@ -595,11 +584,9 @@
   /// arguments.  Returns the IMP for the corresponding method.
   LazyRuntimeFunction MsgLookupSuperFn;
 protected:
-  virtual llvm::Value *LookupIMP(CodeGenFunction &CGF,
-                                 llvm::Value *&Receiver,
-                                 llvm::Value *cmd,
-                                 llvm::MDNode *node,
-                                 MessageSendInfo &MSI) {
+  llvm::Value *LookupIMP(CodeGenFunction &CGF, llvm::Value *&Receiver,
+                         llvm::Value *cmd, llvm::MDNode *node,
+                         MessageSendInfo &MSI) override {
     CGBuilderTy &Builder = CGF.Builder;
     llvm::Value *args[] = {
             EnforceType(Builder, Receiver, IdTy),
@@ -608,10 +595,8 @@
     imp->setMetadata(msgSendMDKind, node);
     return imp.getInstruction();
   }
-  virtual llvm::Value *LookupIMPSuper(CodeGenFunction &CGF,
-                                      llvm::Value *ObjCSuper,
-                                      llvm::Value *cmd,
-                                      MessageSendInfo &MSI) {
+  llvm::Value *LookupIMPSuper(CodeGenFunction &CGF, llvm::Value *ObjCSuper,
+                              llvm::Value *cmd, MessageSendInfo &MSI) override {
       CGBuilderTy &Builder = CGF.Builder;
       llvm::Value *lookupArgs[] = {EnforceType(Builder, ObjCSuper,
           PtrToObjCSuperTy), cmd};
@@ -654,13 +639,11 @@
     /// lookup functions.
     llvm::Type *SlotTy;
   public:
-    virtual llvm::Constant *GetEHType(QualType T);
+    llvm::Constant *GetEHType(QualType T) override;
   protected:
-    virtual llvm::Value *LookupIMP(CodeGenFunction &CGF,
-                                   llvm::Value *&Receiver,
-                                   llvm::Value *cmd,
-                                   llvm::MDNode *node,
-                                   MessageSendInfo &MSI) {
+    llvm::Value *LookupIMP(CodeGenFunction &CGF, llvm::Value *&Receiver,
+                           llvm::Value *cmd, llvm::MDNode *node,
+                           MessageSendInfo &MSI) override {
       CGBuilderTy &Builder = CGF.Builder;
       llvm::Function *LookupFn = SlotLookupFn;
 
@@ -696,10 +679,9 @@
       Receiver = Builder.CreateLoad(ReceiverPtr, true);
       return imp;
     }
-    virtual llvm::Value *LookupIMPSuper(CodeGenFunction &CGF,
-                                        llvm::Value *ObjCSuper,
-                                        llvm::Value *cmd,
-                                        MessageSendInfo &MSI) {
+    llvm::Value *LookupIMPSuper(CodeGenFunction &CGF, llvm::Value *ObjCSuper,
+                                llvm::Value *cmd,
+                                MessageSendInfo &MSI) override {
       CGBuilderTy &Builder = CGF.Builder;
       llvm::Value *lookupArgs[] = {ObjCSuper, cmd};
 
@@ -760,22 +742,22 @@
       CxxAtomicObjectGetFn.init(&CGM, "objc_getCppObjectAtomic", VoidTy, PtrTy,
           PtrTy, PtrTy, NULL);
     }
-    virtual llvm::Constant *GetCppAtomicObjectGetFunction() {
+    llvm::Constant *GetCppAtomicObjectGetFunction() override {
       // The optimised functions were added in version 1.7 of the GNUstep
       // runtime.
       assert (CGM.getLangOpts().ObjCRuntime.getVersion() >=
           VersionTuple(1, 7));
       return CxxAtomicObjectGetFn;
     }
-    virtual llvm::Constant *GetCppAtomicObjectSetFunction() {
+    llvm::Constant *GetCppAtomicObjectSetFunction() override {
       // The optimised functions were added in version 1.7 of the GNUstep
       // runtime.
       assert (CGM.getLangOpts().ObjCRuntime.getVersion() >=
           VersionTuple(1, 7));
       return CxxAtomicObjectSetFn;
     }
-    virtual llvm::Constant *GetOptimizedPropertySetFunction(bool atomic,
-                                                            bool copy) {
+    llvm::Constant *GetOptimizedPropertySetFunction(bool atomic,
+                                                    bool copy) override {
       // The optimised property functions omit the GC check, and so are not
       // safe to use in GC mode.  The standard functions are fast in GC mode,
       // so there is less advantage in using them.
@@ -789,10 +771,8 @@
         if (copy) return SetPropertyAtomicCopy;
         return SetPropertyAtomic;
       }
-      if (copy) return SetPropertyNonAtomicCopy;
-      return SetPropertyNonAtomic;
 
-      return 0;
+      return copy ? SetPropertyNonAtomicCopy : SetPropertyNonAtomic;
     }
 };
 
@@ -810,11 +790,9 @@
   /// arguments.  Returns the IMP for the corresponding method.
   LazyRuntimeFunction MsgLookupSuperFn, MsgLookupSuperFnSRet;
 
-  virtual llvm::Value *LookupIMP(CodeGenFunction &CGF,
-                                 llvm::Value *&Receiver,
-                                 llvm::Value *cmd,
-                                 llvm::MDNode *node,
-                                 MessageSendInfo &MSI) {
+  llvm::Value *LookupIMP(CodeGenFunction &CGF, llvm::Value *&Receiver,
+                         llvm::Value *cmd, llvm::MDNode *node,
+                         MessageSendInfo &MSI) override {
     CGBuilderTy &Builder = CGF.Builder;
     llvm::Value *args[] = {
             EnforceType(Builder, Receiver, IdTy),
@@ -830,10 +808,8 @@
     return imp.getInstruction();
   }
 
-  virtual llvm::Value *LookupIMPSuper(CodeGenFunction &CGF,
-                                      llvm::Value *ObjCSuper,
-                                      llvm::Value *cmd,
-                                      MessageSendInfo &MSI) {
+  llvm::Value *LookupIMPSuper(CodeGenFunction &CGF, llvm::Value *ObjCSuper,
+                              llvm::Value *cmd, MessageSendInfo &MSI) override {
       CGBuilderTy &Builder = CGF.Builder;
       llvm::Value *lookupArgs[] = {EnforceType(Builder, ObjCSuper,
           PtrToObjCSuperTy), cmd};
@@ -844,8 +820,8 @@
         return CGF.EmitNounwindRuntimeCall(MsgLookupSuperFn, lookupArgs);
     }
 
-  virtual llvm::Value *GetClassNamed(CodeGenFunction &CGF,
-                                     const std::string &Name, bool isWeak) {
+  llvm::Value *GetClassNamed(CodeGenFunction &CGF,
+                             const std::string &Name, bool isWeak) override {
     if (isWeak)
       return CGObjCGNU::GetClassNamed(CGF, Name, isWeak);
 
@@ -948,7 +924,7 @@
   Int64Ty = llvm::Type::getInt64Ty(VMContext);
 
   IntPtrTy =
-      TheModule.getPointerSize() == llvm::Module::Pointer32 ? Int32Ty : Int64Ty;
+      CGM.getDataLayout().getPointerSizeInBits() == 32 ? Int32Ty : Int64Ty;
 
   // Object type
   QualType UnqualIdTy = CGM.getContext().getObjCIdType();
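
TheModule.getPointerSize() is retired here in favor of asking the DataLayout, which is the authoritative source for target pointer widths (including per-address-space sizes). The query in isolation, with Int32Ty and Int64Ty as defined just above:

#include "llvm/IR/DataLayout.h"

// DL is assumed to be CGM.getDataLayout().
unsigned Bits = DL.getPointerSizeInBits();               // address space 0
llvm::Type *IntPtrTy = (Bits == 32) ? Int32Ty : Int64Ty;
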
@@ -1779,24 +1755,22 @@
     PD = Def;
 
   SmallVector<std::string, 16> Protocols;
-  for (ObjCProtocolDecl::protocol_iterator PI = PD->protocol_begin(),
-       E = PD->protocol_end(); PI != E; ++PI)
-    Protocols.push_back((*PI)->getNameAsString());
+  for (const auto *PI : PD->protocols())
+    Protocols.push_back(PI->getNameAsString());
   SmallVector<llvm::Constant*, 16> InstanceMethodNames;
   SmallVector<llvm::Constant*, 16> InstanceMethodTypes;
   SmallVector<llvm::Constant*, 16> OptionalInstanceMethodNames;
   SmallVector<llvm::Constant*, 16> OptionalInstanceMethodTypes;
-  for (ObjCProtocolDecl::instmeth_iterator iter = PD->instmeth_begin(),
-       E = PD->instmeth_end(); iter != E; iter++) {
+  for (const auto *I : PD->instance_methods()) {
     std::string TypeStr;
-    Context.getObjCEncodingForMethodDecl(*iter, TypeStr);
-    if ((*iter)->getImplementationControl() == ObjCMethodDecl::Optional) {
+    Context.getObjCEncodingForMethodDecl(I, TypeStr);
+    if (I->getImplementationControl() == ObjCMethodDecl::Optional) {
       OptionalInstanceMethodNames.push_back(
-          MakeConstantString((*iter)->getSelector().getAsString()));
+          MakeConstantString(I->getSelector().getAsString()));
       OptionalInstanceMethodTypes.push_back(MakeConstantString(TypeStr));
     } else {
       InstanceMethodNames.push_back(
-          MakeConstantString((*iter)->getSelector().getAsString()));
+          MakeConstantString(I->getSelector().getAsString()));
       InstanceMethodTypes.push_back(MakeConstantString(TypeStr));
     }
   }
@@ -1805,18 +1779,16 @@
   SmallVector<llvm::Constant*, 16> ClassMethodTypes;
   SmallVector<llvm::Constant*, 16> OptionalClassMethodNames;
   SmallVector<llvm::Constant*, 16> OptionalClassMethodTypes;
-  for (ObjCProtocolDecl::classmeth_iterator
-         iter = PD->classmeth_begin(), endIter = PD->classmeth_end();
-       iter != endIter ; iter++) {
+  for (const auto *I : PD->class_methods()) {
     std::string TypeStr;
-    Context.getObjCEncodingForMethodDecl((*iter),TypeStr);
-    if ((*iter)->getImplementationControl() == ObjCMethodDecl::Optional) {
+    Context.getObjCEncodingForMethodDecl(I, TypeStr);
+    if (I->getImplementationControl() == ObjCMethodDecl::Optional) {
       OptionalClassMethodNames.push_back(
-          MakeConstantString((*iter)->getSelector().getAsString()));
+          MakeConstantString(I->getSelector().getAsString()));
       OptionalClassMethodTypes.push_back(MakeConstantString(TypeStr));
     } else {
       ClassMethodNames.push_back(
-          MakeConstantString((*iter)->getSelector().getAsString()));
+          MakeConstantString(I->getSelector().getAsString()));
       ClassMethodTypes.push_back(MakeConstantString(TypeStr));
     }
   }
@@ -1846,11 +1818,8 @@
 
   // Add all of the property methods that need adding to the method list and
   // to the property metadata list.
-  for (ObjCContainerDecl::prop_iterator
-         iter = PD->prop_begin(), endIter = PD->prop_end();
-       iter != endIter ; iter++) {
+  for (auto *property : PD->properties()) {
     std::vector<llvm::Constant*> Fields;
-    ObjCPropertyDecl *property = *iter;
 
     Fields.push_back(MakePropertyEncodingString(property, 0));
     PushPropertyAttributes(Fields, property);
@@ -1996,8 +1965,7 @@
 /// bitfield / with the 63rd bit set will be 1<<64.
 llvm::Constant *CGObjCGNU::MakeBitField(ArrayRef<bool> bits) {
   int bitCount = bits.size();
-  int ptrBits =
-        (TheModule.getPointerSize() == llvm::Module::Pointer32) ? 32 : 64;
+  int ptrBits = CGM.getDataLayout().getPointerSizeInBits();
   if (bitCount < ptrBits) {
     uint64_t val = 1;
     for (int i=0 ; i<bitCount ; ++i) {
@@ -2032,24 +2000,20 @@
   // Collect information about instance methods
   SmallVector<Selector, 16> InstanceMethodSels;
   SmallVector<llvm::Constant*, 16> InstanceMethodTypes;
-  for (ObjCCategoryImplDecl::instmeth_iterator
-         iter = OCD->instmeth_begin(), endIter = OCD->instmeth_end();
-       iter != endIter ; iter++) {
-    InstanceMethodSels.push_back((*iter)->getSelector());
+  for (const auto *I : OCD->instance_methods()) {
+    InstanceMethodSels.push_back(I->getSelector());
     std::string TypeStr;
-    CGM.getContext().getObjCEncodingForMethodDecl(*iter,TypeStr);
+    CGM.getContext().getObjCEncodingForMethodDecl(I, TypeStr);
     InstanceMethodTypes.push_back(MakeConstantString(TypeStr));
   }
 
   // Collect information about class methods
   SmallVector<Selector, 16> ClassMethodSels;
   SmallVector<llvm::Constant*, 16> ClassMethodTypes;
-  for (ObjCCategoryImplDecl::classmeth_iterator
-         iter = OCD->classmeth_begin(), endIter = OCD->classmeth_end();
-       iter != endIter ; iter++) {
-    ClassMethodSels.push_back((*iter)->getSelector());
+  for (const auto *I : OCD->class_methods()) {
+    ClassMethodSels.push_back(I->getSelector());
     std::string TypeStr;
-    CGM.getContext().getObjCEncodingForMethodDecl(*iter,TypeStr);
+    CGM.getContext().getObjCEncodingForMethodDecl(I, TypeStr);
     ClassMethodTypes.push_back(MakeConstantString(TypeStr));
   }
 
@@ -2093,12 +2057,9 @@
 
   // Add all of the property methods that need adding to the method list and
   // to the property metadata list.
-  for (ObjCImplDecl::propimpl_iterator
-         iter = OID->propimpl_begin(), endIter = OID->propimpl_end();
-       iter != endIter ; iter++) {
+  for (auto *propertyImpl : OID->property_impls()) {
     std::vector<llvm::Constant*> Fields;
-    ObjCPropertyDecl *property = iter->getPropertyDecl();
-    ObjCPropertyImplDecl *propertyImpl = *iter;
+    ObjCPropertyDecl *property = propertyImpl->getPropertyDecl();
     bool isSynthesized = (propertyImpl->getPropertyImplementation() == 
         ObjCPropertyImplDecl::Synthesize);
     bool isDynamic = (propertyImpl->getPropertyImplementation() == 
@@ -2265,12 +2226,10 @@
   // Collect information about instance methods
   SmallVector<Selector, 16> InstanceMethodSels;
   SmallVector<llvm::Constant*, 16> InstanceMethodTypes;
-  for (ObjCImplementationDecl::instmeth_iterator
-         iter = OID->instmeth_begin(), endIter = OID->instmeth_end();
-       iter != endIter ; iter++) {
-    InstanceMethodSels.push_back((*iter)->getSelector());
+  for (const auto *I : OID->instance_methods()) {
+    InstanceMethodSels.push_back(I->getSelector());
     std::string TypeStr;
-    Context.getObjCEncodingForMethodDecl((*iter),TypeStr);
+    Context.getObjCEncodingForMethodDecl(I, TypeStr);
     InstanceMethodTypes.push_back(MakeConstantString(TypeStr));
   }
 
@@ -2281,22 +2240,16 @@
   // Collect information about class methods
   SmallVector<Selector, 16> ClassMethodSels;
   SmallVector<llvm::Constant*, 16> ClassMethodTypes;
-  for (ObjCImplementationDecl::classmeth_iterator
-         iter = OID->classmeth_begin(), endIter = OID->classmeth_end();
-       iter != endIter ; iter++) {
-    ClassMethodSels.push_back((*iter)->getSelector());
+  for (const auto *I : OID->class_methods()) {
+    ClassMethodSels.push_back(I->getSelector());
     std::string TypeStr;
-    Context.getObjCEncodingForMethodDecl((*iter),TypeStr);
+    Context.getObjCEncodingForMethodDecl(I, TypeStr);
     ClassMethodTypes.push_back(MakeConstantString(TypeStr));
   }
   // Collect the names of referenced protocols
   SmallVector<std::string, 16> Protocols;
-  for (ObjCInterfaceDecl::protocol_iterator
-         I = ClassDecl->protocol_begin(),
-         E = ClassDecl->protocol_end(); I != E; ++I)
-    Protocols.push_back((*I)->getNameAsString());
-
-
+  for (const auto *I : ClassDecl->protocols())
+    Protocols.push_back(I->getNameAsString());
 
   // Get the superclass pointer.
   llvm::Constant *SuperClass;
@@ -2595,7 +2548,7 @@
             llvm::Constant::getNullValue(RegisterAlias->getType()));
     Builder.CreateCondBr(HasRegisterAlias, AliasBB, NoAliasBB);
 
-    // The true branch (has alias registration fucntion):
+    // The true branch (has alias registration function):
     Builder.SetInsertPoint(AliasBB);
     // Emit alias registration calls:
     for (std::vector<ClassAliasPair>::iterator iter = ClassAliases.begin();
diff --git a/lib/CodeGen/CGObjCMac.cpp b/lib/CodeGen/CGObjCMac.cpp
index 2b2a5b8..8f5969c 100644
--- a/lib/CodeGen/CGObjCMac.cpp
+++ b/lib/CodeGen/CGObjCMac.cpp
@@ -29,12 +29,12 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstdio>
 
@@ -174,6 +174,7 @@
 public:
   llvm::Type *ShortTy, *IntTy, *LongTy, *LongLongTy;
   llvm::Type *Int8PtrTy, *Int8PtrPtrTy;
+  llvm::Type *IvarOffsetVarTy;
 
   /// ObjectPtrTy - LLVM type for object handles (typeof(id))
   llvm::Type *ObjectPtrTy;
@@ -243,8 +244,8 @@
     Params.push_back(Ctx.getPointerDiffType()->getCanonicalTypeUnqualified());
     Params.push_back(Ctx.BoolTy);
     llvm::FunctionType *FTy =
-      Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(IdType, Params,
-                                                    FunctionType::ExtInfo(),
+      Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(IdType, false, Params,
+                                                          FunctionType::ExtInfo(),
                                                           RequiredArgs::All));
     return CGM.CreateRuntimeFunction(FTy, "objc_getProperty");
   }
@@ -263,8 +264,9 @@
     Params.push_back(Ctx.BoolTy);
     Params.push_back(Ctx.BoolTy);
     llvm::FunctionType *FTy =
-      Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(Ctx.VoidTy, Params,
-                                                     FunctionType::ExtInfo(),
+      Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(Ctx.VoidTy, false,
+                                                          Params,
+                                                          FunctionType::ExtInfo(),
                                                           RequiredArgs::All));
     return CGM.CreateRuntimeFunction(FTy, "objc_setProperty");
   }
@@ -289,7 +291,8 @@
     Params.push_back(IdType);
     Params.push_back(Ctx.getPointerDiffType()->getCanonicalTypeUnqualified());
     llvm::FunctionType *FTy =
-    Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(Ctx.VoidTy, Params,
+    Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(Ctx.VoidTy, false,
+                                                        Params,
                                                         FunctionType::ExtInfo(),
                                                         RequiredArgs::All));
     const char *name;
@@ -316,8 +319,9 @@
     Params.push_back(Ctx.BoolTy);
     Params.push_back(Ctx.BoolTy);
     llvm::FunctionType *FTy =
-      Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(Ctx.VoidTy, Params,
-                                                     FunctionType::ExtInfo(),
+      Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(Ctx.VoidTy, false,
+                                                          Params,
+                                                          FunctionType::ExtInfo(),
                                                           RequiredArgs::All));
     return CGM.CreateRuntimeFunction(FTy, "objc_copyStruct");
   }
@@ -335,8 +339,9 @@
     Params.push_back(Ctx.VoidPtrTy);
     Params.push_back(Ctx.VoidPtrTy);
     llvm::FunctionType *FTy =
-      Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(Ctx.VoidTy, Params,
-                                                     FunctionType::ExtInfo(),
+      Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(Ctx.VoidTy, false,
+                                                          Params,
+                                                          FunctionType::ExtInfo(),
                                                           RequiredArgs::All));
     return CGM.CreateRuntimeFunction(FTy, "objc_copyCppObjectAtomic");
   }
@@ -348,8 +353,9 @@
     SmallVector<CanQualType,1> Params;
     Params.push_back(Ctx.getCanonicalParamType(Ctx.getObjCIdType()));
     llvm::FunctionType *FTy =
-      Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(Ctx.VoidTy, Params,
-                                                      FunctionType::ExtInfo(),
+      Types.GetFunctionType(Types.arrangeLLVMFunctionInfo(Ctx.VoidTy, false,
+                                                          Params,
+                                                          FunctionType::ExtInfo(),
                                                       RequiredArgs::All));
     return CGM.CreateRuntimeFunction(FTy, "objc_enumerationMutation");
   }
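Note: every `arrangeLLVMFunctionInfo` call in this file gains a new second argument, passed as `false` here; our reading of the updated signature in this Clang revision is that the extra boolean flags instance methods for ABI purposes, so treat that as an assumption. A condensed sketch of the new call shape, assuming the usual clang CodeGen context; `objc_exampleHelper` is an invented name:

    // Sketch only: mirrors the call shape in the hunks above.
    static llvm::Constant *getExampleFn(CodeGen::CodeGenModule &CGM,
                                        CodeGen::CodeGenTypes &Types,
                                        ASTContext &Ctx,
                                        llvm::ArrayRef<CanQualType> Params) {
      llvm::FunctionType *FTy = Types.GetFunctionType(
          Types.arrangeLLVMFunctionInfo(Ctx.VoidTy, /*instance method*/ false,
                                        Params, FunctionType::ExtInfo(),
                                        RequiredArgs::All));
      return CGM.CreateRuntimeFunction(FTy, "objc_exampleHelper");
    }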
@@ -885,6 +891,9 @@
 
   /// DefinedClasses - List of defined classes.
   SmallVector<llvm::GlobalValue*, 16> DefinedClasses;
+
+  /// ImplementedClasses - List of @implemented classes.
+  SmallVector<const ObjCInterfaceDecl*, 16> ImplementedClasses;
 
   /// DefinedNonLazyClasses - List of defined "non-lazy" classes.
   SmallVector<llvm::GlobalValue*, 16> DefinedNonLazyClasses;
@@ -991,7 +1000,7 @@
     llvm::SmallPtrSet<const IdentifierInfo*, 16> &PropertySet,
     SmallVectorImpl<llvm::Constant*> &Properties,
     const Decl *Container,
-    const ObjCProtocolDecl *PROTO,
+    const ObjCProtocolDecl *Proto,
     const ObjCCommonTypesHelper &ObjCTypes);
 
   /// GetProtocolRef - Return a reference to the internal protocol
@@ -1038,12 +1047,12 @@
   CGObjCCommonMac(CodeGen::CodeGenModule &cgm) :
     CGObjCRuntime(cgm), VMContext(cgm.getLLVMContext()) { }
 
-  virtual llvm::Constant *GenerateConstantString(const StringLiteral *SL);
-  
-  virtual llvm::Function *GenerateMethod(const ObjCMethodDecl *OMD,
-                                         const ObjCContainerDecl *CD=0);
+  llvm::Constant *GenerateConstantString(const StringLiteral *SL) override;
 
-  virtual void GenerateProtocol(const ObjCProtocolDecl *PD);
+  llvm::Function *GenerateMethod(const ObjCMethodDecl *OMD,
+                                 const ObjCContainerDecl *CD=0) override;
+
+  void GenerateProtocol(const ObjCProtocolDecl *PD) override;
 
   /// GetOrEmitProtocol - Get the protocol object for the given
   /// declaration, emitting it if necessary. The return value has type
@@ -1055,13 +1064,13 @@
   /// forward references will be filled in with empty bodies if no
   /// definition is seen. The return value has type ProtocolPtrTy.
   virtual llvm::Constant *GetOrEmitProtocolRef(const ObjCProtocolDecl *PD)=0;
-  virtual llvm::Constant *BuildGCBlockLayout(CodeGen::CodeGenModule &CGM,
-                                             const CGBlockInfo &blockInfo);
-  virtual llvm::Constant *BuildRCBlockLayout(CodeGen::CodeGenModule &CGM,
-                                             const CGBlockInfo &blockInfo);
-  
-  virtual llvm::Constant *BuildByrefLayout(CodeGen::CodeGenModule &CGM,
-                                           QualType T);
+  llvm::Constant *BuildGCBlockLayout(CodeGen::CodeGenModule &CGM,
+                                     const CGBlockInfo &blockInfo) override;
+  llvm::Constant *BuildRCBlockLayout(CodeGen::CodeGenModule &CGM,
+                                     const CGBlockInfo &blockInfo) override;
+
+  llvm::Constant *BuildByrefLayout(CodeGen::CodeGenModule &CGM,
+                                   QualType T) override;
 };
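Note: the long `virtual ...;` to `... override;` runs in this class and the ones below are another C++11 cleanup: `override` makes the compiler verify that each declaration really overrides a virtual function from `CGObjCRuntime`, so a drifting signature becomes a compile error instead of a silent new overload. A toy illustration:

    struct Base {
      virtual ~Base() {}
      virtual void generate(int ID) {}
    };

    struct Derived : Base {
      void generate(int ID) override {}     // OK: matches Base::generate.
      // void generate(long ID) override {} // Error: overrides nothing.
    };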
 
 class CGObjCMac : public CGObjCCommonMac {
@@ -1092,9 +1101,9 @@
   
   llvm::Value *EmitClassRefFromId(CodeGenFunction &CGF,
                                   IdentifierInfo *II);
-  
-  llvm::Value *EmitNSAutoreleasePoolClassRef(CodeGenFunction &CGF);
-  
+
+  llvm::Value *EmitNSAutoreleasePoolClassRef(CodeGenFunction &CGF) override;
+
   /// EmitSuperClassRef - Emits reference to class's main metadata class.
   llvm::Value *EmitSuperClassRef(const ObjCInterfaceDecl *ID);
 
@@ -1145,13 +1154,13 @@
   /// GetOrEmitProtocol - Get the protocol object for the given
   /// declaration, emitting it if necessary. The return value has type
   /// ProtocolPtrTy.
-  virtual llvm::Constant *GetOrEmitProtocol(const ObjCProtocolDecl *PD);
+  llvm::Constant *GetOrEmitProtocol(const ObjCProtocolDecl *PD) override;
 
   /// GetOrEmitProtocolRef - Get a forward reference to the protocol
   /// object for the given declaration, emitting it if needed. These
   /// forward references will be filled in with empty bodies if no
   /// definition is seen. The return value has type ProtocolPtrTy.
-  virtual llvm::Constant *GetOrEmitProtocolRef(const ObjCProtocolDecl *PD);
+  llvm::Constant *GetOrEmitProtocolRef(const ObjCProtocolDecl *PD) override;
 
   /// EmitProtocolExtension - Generate the protocol extension
   /// structure used to store optional instance and class methods, and
@@ -1177,97 +1186,90 @@
 public:
   CGObjCMac(CodeGen::CodeGenModule &cgm);
 
-  virtual llvm::Function *ModuleInitFunction();
+  llvm::Function *ModuleInitFunction() override;
 
-  virtual CodeGen::RValue GenerateMessageSend(CodeGen::CodeGenFunction &CGF,
-                                              ReturnValueSlot Return,
-                                              QualType ResultType,
-                                              Selector Sel,
-                                              llvm::Value *Receiver,
-                                              const CallArgList &CallArgs,
-                                              const ObjCInterfaceDecl *Class,
-                                              const ObjCMethodDecl *Method);
+  CodeGen::RValue GenerateMessageSend(CodeGen::CodeGenFunction &CGF,
+                                      ReturnValueSlot Return,
+                                      QualType ResultType,
+                                      Selector Sel, llvm::Value *Receiver,
+                                      const CallArgList &CallArgs,
+                                      const ObjCInterfaceDecl *Class,
+                                      const ObjCMethodDecl *Method) override;
 
-  virtual CodeGen::RValue
+  CodeGen::RValue
   GenerateMessageSendSuper(CodeGen::CodeGenFunction &CGF,
-                           ReturnValueSlot Return,
-                           QualType ResultType,
-                           Selector Sel,
-                           const ObjCInterfaceDecl *Class,
-                           bool isCategoryImpl,
-                           llvm::Value *Receiver,
-                           bool IsClassMessage,
-                           const CallArgList &CallArgs,
-                           const ObjCMethodDecl *Method);
+                           ReturnValueSlot Return, QualType ResultType,
+                           Selector Sel, const ObjCInterfaceDecl *Class,
+                           bool isCategoryImpl, llvm::Value *Receiver,
+                           bool IsClassMessage, const CallArgList &CallArgs,
+                           const ObjCMethodDecl *Method) override;
 
-  virtual llvm::Value *GetClass(CodeGenFunction &CGF,
-                                const ObjCInterfaceDecl *ID);
+  llvm::Value *GetClass(CodeGenFunction &CGF,
+                        const ObjCInterfaceDecl *ID) override;
 
-  virtual llvm::Value *GetSelector(CodeGenFunction &CGF, Selector Sel, 
-                                   bool lval = false);
+  llvm::Value *GetSelector(CodeGenFunction &CGF, Selector Sel,
+                           bool lval = false) override;
 
   /// The NeXT/Apple runtimes do not support typed selectors; just emit an
   /// untyped one.
-  virtual llvm::Value *GetSelector(CodeGenFunction &CGF,
-                                   const ObjCMethodDecl *Method);
+  llvm::Value *GetSelector(CodeGenFunction &CGF,
+                           const ObjCMethodDecl *Method) override;
 
-  virtual llvm::Constant *GetEHType(QualType T);
+  llvm::Constant *GetEHType(QualType T) override;
 
-  virtual void GenerateCategory(const ObjCCategoryImplDecl *CMD);
+  void GenerateCategory(const ObjCCategoryImplDecl *CMD) override;
 
-  virtual void GenerateClass(const ObjCImplementationDecl *ClassDecl);
+  void GenerateClass(const ObjCImplementationDecl *ClassDecl) override;
 
-  virtual void RegisterAlias(const ObjCCompatibleAliasDecl *OAD) {}
+  void RegisterAlias(const ObjCCompatibleAliasDecl *OAD) override {}
 
-  virtual llvm::Value *GenerateProtocolRef(CodeGenFunction &CGF,
-                                           const ObjCProtocolDecl *PD);
+  llvm::Value *GenerateProtocolRef(CodeGenFunction &CGF,
+                                   const ObjCProtocolDecl *PD) override;
 
-  virtual llvm::Constant *GetPropertyGetFunction();
-  virtual llvm::Constant *GetPropertySetFunction();
-  virtual llvm::Constant *GetOptimizedPropertySetFunction(bool atomic, 
-                                                          bool copy);
-  virtual llvm::Constant *GetGetStructFunction();
-  virtual llvm::Constant *GetSetStructFunction();
-  virtual llvm::Constant *GetCppAtomicObjectGetFunction();
-  virtual llvm::Constant *GetCppAtomicObjectSetFunction();
-  virtual llvm::Constant *EnumerationMutationFunction();
+  llvm::Constant *GetPropertyGetFunction() override;
+  llvm::Constant *GetPropertySetFunction() override;
+  llvm::Constant *GetOptimizedPropertySetFunction(bool atomic,
+                                                  bool copy) override;
+  llvm::Constant *GetGetStructFunction() override;
+  llvm::Constant *GetSetStructFunction() override;
+  llvm::Constant *GetCppAtomicObjectGetFunction() override;
+  llvm::Constant *GetCppAtomicObjectSetFunction() override;
+  llvm::Constant *EnumerationMutationFunction() override;
 
-  virtual void EmitTryStmt(CodeGen::CodeGenFunction &CGF,
-                           const ObjCAtTryStmt &S);
-  virtual void EmitSynchronizedStmt(CodeGen::CodeGenFunction &CGF,
-                                    const ObjCAtSynchronizedStmt &S);
+  void EmitTryStmt(CodeGen::CodeGenFunction &CGF,
+                   const ObjCAtTryStmt &S) override;
+  void EmitSynchronizedStmt(CodeGen::CodeGenFunction &CGF,
+                            const ObjCAtSynchronizedStmt &S) override;
   void EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, const Stmt &S);
-  virtual void EmitThrowStmt(CodeGen::CodeGenFunction &CGF,
-                             const ObjCAtThrowStmt &S,
-                             bool ClearInsertionPoint=true);
-  virtual llvm::Value * EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF,
-                                         llvm::Value *AddrWeakObj);
-  virtual void EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF,
-                                  llvm::Value *src, llvm::Value *dst);
-  virtual void EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF,
-                                    llvm::Value *src, llvm::Value *dest,
-                                    bool threadlocal = false);
-  virtual void EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF,
-                                  llvm::Value *src, llvm::Value *dest,
-                                  llvm::Value *ivarOffset);
-  virtual void EmitObjCStrongCastAssign(CodeGen::CodeGenFunction &CGF,
-                                        llvm::Value *src, llvm::Value *dest);
-  virtual void EmitGCMemmoveCollectable(CodeGen::CodeGenFunction &CGF,
-                                        llvm::Value *dest, llvm::Value *src,
-                                        llvm::Value *size);
+  void EmitThrowStmt(CodeGen::CodeGenFunction &CGF, const ObjCAtThrowStmt &S,
+                     bool ClearInsertionPoint=true) override;
+  llvm::Value *EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF,
+                                llvm::Value *AddrWeakObj) override;
+  void EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF,
+                          llvm::Value *src, llvm::Value *dst) override;
+  void EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF,
+                            llvm::Value *src, llvm::Value *dest,
+                            bool threadlocal = false) override;
+  void EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF,
+                          llvm::Value *src, llvm::Value *dest,
+                          llvm::Value *ivarOffset) override;
+  void EmitObjCStrongCastAssign(CodeGen::CodeGenFunction &CGF,
+                                llvm::Value *src, llvm::Value *dest) override;
+  void EmitGCMemmoveCollectable(CodeGen::CodeGenFunction &CGF,
+                                llvm::Value *dest, llvm::Value *src,
+                                llvm::Value *size) override;
 
-  virtual LValue EmitObjCValueForIvar(CodeGen::CodeGenFunction &CGF,
-                                      QualType ObjectTy,
-                                      llvm::Value *BaseValue,
-                                      const ObjCIvarDecl *Ivar,
-                                      unsigned CVRQualifiers);
-  virtual llvm::Value *EmitIvarOffset(CodeGen::CodeGenFunction &CGF,
-                                      const ObjCInterfaceDecl *Interface,
-                                      const ObjCIvarDecl *Ivar);
-  
+  LValue EmitObjCValueForIvar(CodeGen::CodeGenFunction &CGF, QualType ObjectTy,
+                              llvm::Value *BaseValue, const ObjCIvarDecl *Ivar,
+                              unsigned CVRQualifiers) override;
+  llvm::Value *EmitIvarOffset(CodeGen::CodeGenFunction &CGF,
+                              const ObjCInterfaceDecl *Interface,
+                              const ObjCIvarDecl *Ivar) override;
+
   /// GetClassGlobal - Return the global variable for the Objective-C
   /// class of the given name.
-  virtual llvm::GlobalVariable *GetClassGlobal(const std::string &Name) {
+  llvm::GlobalVariable *GetClassGlobal(const std::string &Name,
+                                       bool Weak = false) override {
     llvm_unreachable("CGObjCMac::GetClassGlobal");
   }
 };
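Note: `GetClassGlobal` now takes a `Weak` flag so references to weak-imported classes get the right linkage when the global is created, instead of being patched to `ExternalWeakLinkage` after the fact (see the deleted fixup loops in `FinishNonFragileABIModule` further down). A hedged sketch of the contract:

    // Sketch: choose the linkage when the class global is created.
    llvm::GlobalVariable *getClassGlobalSketch(llvm::Module &M,
                                               llvm::Type *ClassTy,
                                               const std::string &Name,
                                               bool Weak) {
      llvm::GlobalValue::LinkageTypes L =
          Weak ? llvm::GlobalValue::ExternalWeakLinkage
               : llvm::GlobalValue::ExternalLinkage;
      llvm::GlobalVariable *GV = M.getGlobalVariable(Name);
      if (!GV)
        GV = new llvm::GlobalVariable(M, ClassTy, false, L, 0, Name);
      return GV;
    }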
@@ -1316,7 +1318,8 @@
                                             llvm::Constant *IsAGV,
                                             llvm::Constant *SuperClassGV,
                                             llvm::Constant *ClassRoGV,
-                                            bool HiddenVisibility);
+                                            bool HiddenVisibility,
+                                            bool Weak);
 
   llvm::Constant *GetMethodConstant(const ObjCMethodDecl *MD);
 
@@ -1341,13 +1344,13 @@
   /// GetOrEmitProtocol - Get the protocol object for the given
   /// declaration, emitting it if necessary. The return value has type
   /// ProtocolPtrTy.
-  virtual llvm::Constant *GetOrEmitProtocol(const ObjCProtocolDecl *PD);
+  llvm::Constant *GetOrEmitProtocol(const ObjCProtocolDecl *PD) override;
 
   /// GetOrEmitProtocolRef - Get a forward reference to the protocol
   /// object for the given declaration, emitting it if needed. These
   /// forward references will be filled in with empty bodies if no
   /// definition is seen. The return value has type ProtocolPtrTy.
-  virtual llvm::Constant *GetOrEmitProtocolRef(const ObjCProtocolDecl *PD);
+  llvm::Constant *GetOrEmitProtocolRef(const ObjCProtocolDecl *PD) override;
 
   /// EmitProtocolList - Generate the list of referenced
   /// protocols. The return value has type ProtocolListPtrTy.
@@ -1367,17 +1370,18 @@
   
   /// GetClassGlobal - Return the global variable for the Objective-C
   /// class of the given name.
-  llvm::GlobalVariable *GetClassGlobal(const std::string &Name);
-    
+  llvm::GlobalVariable *GetClassGlobal(const std::string &Name,
+                                       bool Weak = false) override;
+
   /// EmitClassRef - Return a Value*, of type ObjCTypes.ClassPtrTy,
   /// for the given class reference.
   llvm::Value *EmitClassRef(CodeGenFunction &CGF,
                             const ObjCInterfaceDecl *ID);
   
   llvm::Value *EmitClassRefFromId(CodeGenFunction &CGF,
-                                  IdentifierInfo *II);
-  
-  llvm::Value *EmitNSAutoreleasePoolClassRef(CodeGenFunction &CGF);
+                                  IdentifierInfo *II, bool Weak);
+
+  llvm::Value *EmitNSAutoreleasePoolClassRef(CodeGenFunction &CGF) override;
 
   /// EmitSuperClassRef - Return a Value*, of type ObjCTypes.ClassPtrTy,
   /// for the given super class reference.
@@ -1434,11 +1438,10 @@
   bool ImplementationIsNonLazy(const ObjCImplDecl *OD) const;
 
   bool IsIvarOffsetKnownIdempotent(const CodeGen::CodeGenFunction &CGF,
-                                   const ObjCInterfaceDecl *ID,
                                    const ObjCIvarDecl *IV) {
-    // Annotate the load as an invariant load iff the object type is the type,
-    // or a derived type, of the class containing the ivar within an ObjC
-    // method.  This check is needed because the ivar offset is a lazily
+    // Annotate the load as an invariant load iff we are inside an instance
+    // method and the ivar belongs to that method's class or one of its
+    // superclasses. This check is needed because the ivar offset is a lazily
     // initialised value that may depend on objc_msgSend to perform a fixup on
     // the first message dispatch.
     //
@@ -1446,121 +1449,115 @@
     // base of the ivar access is a parameter to an Objective C method.
     // However, because the parameters are not available in the current
     // interface, we cannot perform this check.
-    if (CGF.CurFuncDecl && isa<ObjCMethodDecl>(CGF.CurFuncDecl))
-      if (IV->getContainingInterface()->isSuperClassOf(ID))
-        return true;
+    if (const ObjCMethodDecl *MD =
+          dyn_cast_or_null<ObjCMethodDecl>(CGF.CurFuncDecl))
+      if (MD->isInstanceMethod())
+        if (const ObjCInterfaceDecl *ID = MD->getClassInterface())
+          return IV->getContainingInterface()->isSuperClassOf(ID);
     return false;
   }
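Note: `IsIvarOffsetKnownIdempotent` now derives the interface from the current method instead of taking it as a parameter: the offset load may be marked invariant only inside an instance method whose class contains (or inherits) the ivar, because a prior message send to `self` must already have triggered the runtime's offset fix-up by then. A hedged sketch of how the result is consumed; `Builder`, `IvarOffsetGV`, and `VMContext` are assumed to be in scope:

    // Sketch: tag the ivar-offset load as invariant when it is safe to do so.
    llvm::LoadInst *Offset = Builder.CreateLoad(IvarOffsetGV, "ivar.offset");
    if (IsIvarOffsetKnownIdempotent(CGF, IV))
      Offset->setMetadata(llvm::LLVMContext::MD_invariant_load,
                          llvm::MDNode::get(VMContext, llvm::None));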
 
 public:
   CGObjCNonFragileABIMac(CodeGen::CodeGenModule &cgm);
   // FIXME. All stubs for now!
-  virtual llvm::Function *ModuleInitFunction();
+  llvm::Function *ModuleInitFunction() override;
 
-  virtual CodeGen::RValue GenerateMessageSend(CodeGen::CodeGenFunction &CGF,
-                                              ReturnValueSlot Return,
-                                              QualType ResultType,
-                                              Selector Sel,
-                                              llvm::Value *Receiver,
-                                              const CallArgList &CallArgs,
-                                              const ObjCInterfaceDecl *Class,
-                                              const ObjCMethodDecl *Method);
+  CodeGen::RValue GenerateMessageSend(CodeGen::CodeGenFunction &CGF,
+                                      ReturnValueSlot Return,
+                                      QualType ResultType, Selector Sel,
+                                      llvm::Value *Receiver,
+                                      const CallArgList &CallArgs,
+                                      const ObjCInterfaceDecl *Class,
+                                      const ObjCMethodDecl *Method) override;
 
-  virtual CodeGen::RValue
+  CodeGen::RValue
   GenerateMessageSendSuper(CodeGen::CodeGenFunction &CGF,
-                           ReturnValueSlot Return,
-                           QualType ResultType,
-                           Selector Sel,
-                           const ObjCInterfaceDecl *Class,
-                           bool isCategoryImpl,
-                           llvm::Value *Receiver,
-                           bool IsClassMessage,
-                           const CallArgList &CallArgs,
-                           const ObjCMethodDecl *Method);
+                           ReturnValueSlot Return, QualType ResultType,
+                           Selector Sel, const ObjCInterfaceDecl *Class,
+                           bool isCategoryImpl, llvm::Value *Receiver,
+                           bool IsClassMessage, const CallArgList &CallArgs,
+                           const ObjCMethodDecl *Method) override;
 
-  virtual llvm::Value *GetClass(CodeGenFunction &CGF,
-                                const ObjCInterfaceDecl *ID);
+  llvm::Value *GetClass(CodeGenFunction &CGF,
+                        const ObjCInterfaceDecl *ID) override;
 
-  virtual llvm::Value *GetSelector(CodeGenFunction &CGF, Selector Sel,
-                                   bool lvalue = false)
+  llvm::Value *GetSelector(CodeGenFunction &CGF, Selector Sel,
+                           bool lvalue = false) override
     { return EmitSelector(CGF, Sel, lvalue); }
 
   /// The NeXT/Apple runtimes do not support typed selectors; just emit an
   /// untyped one.
-  virtual llvm::Value *GetSelector(CodeGenFunction &CGF,
-                                   const ObjCMethodDecl *Method)
+  llvm::Value *GetSelector(CodeGenFunction &CGF,
+                           const ObjCMethodDecl *Method) override
     { return EmitSelector(CGF, Method->getSelector()); }
 
-  virtual void GenerateCategory(const ObjCCategoryImplDecl *CMD);
+  void GenerateCategory(const ObjCCategoryImplDecl *CMD) override;
 
-  virtual void GenerateClass(const ObjCImplementationDecl *ClassDecl);
+  void GenerateClass(const ObjCImplementationDecl *ClassDecl) override;
 
-  virtual void RegisterAlias(const ObjCCompatibleAliasDecl *OAD) {}
+  void RegisterAlias(const ObjCCompatibleAliasDecl *OAD) override {}
 
-  virtual llvm::Value *GenerateProtocolRef(CodeGenFunction &CGF,
-                                           const ObjCProtocolDecl *PD);
+  llvm::Value *GenerateProtocolRef(CodeGenFunction &CGF,
+                                   const ObjCProtocolDecl *PD) override;
 
-  virtual llvm::Constant *GetEHType(QualType T);
+  llvm::Constant *GetEHType(QualType T) override;
 
-  virtual llvm::Constant *GetPropertyGetFunction() {
+  llvm::Constant *GetPropertyGetFunction() override {
     return ObjCTypes.getGetPropertyFn();
   }
-  virtual llvm::Constant *GetPropertySetFunction() {
+  llvm::Constant *GetPropertySetFunction() override {
     return ObjCTypes.getSetPropertyFn();
   }
-  
-  virtual llvm::Constant *GetOptimizedPropertySetFunction(bool atomic, 
-                                                          bool copy) {
+
+  llvm::Constant *GetOptimizedPropertySetFunction(bool atomic,
+                                                  bool copy) override {
     return ObjCTypes.getOptimizedSetPropertyFn(atomic, copy);
   }
-  
-  virtual llvm::Constant *GetSetStructFunction() {
+
+  llvm::Constant *GetSetStructFunction() override {
     return ObjCTypes.getCopyStructFn();
   }
-  virtual llvm::Constant *GetGetStructFunction() {
+  llvm::Constant *GetGetStructFunction() override {
     return ObjCTypes.getCopyStructFn();
   }
-  virtual llvm::Constant *GetCppAtomicObjectSetFunction() {
+  llvm::Constant *GetCppAtomicObjectSetFunction() override {
     return ObjCTypes.getCppAtomicObjectFunction();
   }
-  virtual llvm::Constant *GetCppAtomicObjectGetFunction() {
+  llvm::Constant *GetCppAtomicObjectGetFunction() override {
     return ObjCTypes.getCppAtomicObjectFunction();
   }
-  
-  virtual llvm::Constant *EnumerationMutationFunction() {
+
+  llvm::Constant *EnumerationMutationFunction() override {
     return ObjCTypes.getEnumerationMutationFn();
   }
 
-  virtual void EmitTryStmt(CodeGen::CodeGenFunction &CGF,
-                           const ObjCAtTryStmt &S);
-  virtual void EmitSynchronizedStmt(CodeGen::CodeGenFunction &CGF,
-                                    const ObjCAtSynchronizedStmt &S);
-  virtual void EmitThrowStmt(CodeGen::CodeGenFunction &CGF,
-                             const ObjCAtThrowStmt &S,
-                             bool ClearInsertionPoint=true);
-  virtual llvm::Value * EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF,
-                                         llvm::Value *AddrWeakObj);
-  virtual void EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF,
-                                  llvm::Value *src, llvm::Value *dst);
-  virtual void EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF,
-                                    llvm::Value *src, llvm::Value *dest,
-                                    bool threadlocal = false);
-  virtual void EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF,
-                                  llvm::Value *src, llvm::Value *dest,
-                                  llvm::Value *ivarOffset);
-  virtual void EmitObjCStrongCastAssign(CodeGen::CodeGenFunction &CGF,
-                                        llvm::Value *src, llvm::Value *dest);
-  virtual void EmitGCMemmoveCollectable(CodeGen::CodeGenFunction &CGF,
-                                        llvm::Value *dest, llvm::Value *src,
-                                        llvm::Value *size);
-  virtual LValue EmitObjCValueForIvar(CodeGen::CodeGenFunction &CGF,
-                                      QualType ObjectTy,
-                                      llvm::Value *BaseValue,
-                                      const ObjCIvarDecl *Ivar,
-                                      unsigned CVRQualifiers);
-  virtual llvm::Value *EmitIvarOffset(CodeGen::CodeGenFunction &CGF,
-                                      const ObjCInterfaceDecl *Interface,
-                                      const ObjCIvarDecl *Ivar);
+  void EmitTryStmt(CodeGen::CodeGenFunction &CGF,
+                   const ObjCAtTryStmt &S) override;
+  void EmitSynchronizedStmt(CodeGen::CodeGenFunction &CGF,
+                            const ObjCAtSynchronizedStmt &S) override;
+  void EmitThrowStmt(CodeGen::CodeGenFunction &CGF, const ObjCAtThrowStmt &S,
+                     bool ClearInsertionPoint=true) override;
+  llvm::Value *EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF,
+                                llvm::Value *AddrWeakObj) override;
+  void EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF,
+                          llvm::Value *src, llvm::Value *dst) override;
+  void EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF,
+                            llvm::Value *src, llvm::Value *dest,
+                            bool threadlocal = false) override;
+  void EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF,
+                          llvm::Value *src, llvm::Value *dest,
+                          llvm::Value *ivarOffset) override;
+  void EmitObjCStrongCastAssign(CodeGen::CodeGenFunction &CGF,
+                                llvm::Value *src, llvm::Value *dest) override;
+  void EmitGCMemmoveCollectable(CodeGen::CodeGenFunction &CGF,
+                                llvm::Value *dest, llvm::Value *src,
+                                llvm::Value *size) override;
+  LValue EmitObjCValueForIvar(CodeGen::CodeGenFunction &CGF, QualType ObjectTy,
+                              llvm::Value *BaseValue, const ObjCIvarDecl *Ivar,
+                              unsigned CVRQualifiers) override;
+  llvm::Value *EmitIvarOffset(CodeGen::CodeGenFunction &CGF,
+                              const ObjCInterfaceDecl *Interface,
+                              const ObjCIvarDecl *Ivar) override;
 };
 
 /// A helper class for performing the null-initialization of a return
@@ -1878,14 +1875,14 @@
   MessageSendInfo MSI = getMessageSendInfo(Method, ResultType, ActualArgs);
 
   if (Method)
-    assert(CGM.getContext().getCanonicalType(Method->getResultType()) ==
-           CGM.getContext().getCanonicalType(ResultType) &&
+    assert(CGM.getContext().getCanonicalType(Method->getReturnType()) ==
+               CGM.getContext().getCanonicalType(ResultType) &&
            "Result type mismatch!");
 
   NullReturnState nullReturn;
 
   llvm::Constant *Fn = NULL;
-  if (CGM.ReturnTypeUsesSRet(MSI.CallInfo)) {
+  if (CGM.ReturnSlotInterferesWithArgs(MSI.CallInfo)) {
     if (!IsSuper) nullReturn.init(CGF, Arg0);
     Fn = (ObjCABI == 2) ?  ObjCTypes.getSendStretFn2(IsSuper)
       : ObjCTypes.getSendStretFn(IsSuper);
@@ -1896,15 +1893,17 @@
     Fn = (ObjCABI == 2) ? ObjCTypes.getSendFp2RetFn2(IsSuper)
       : ObjCTypes.getSendFp2retFn(IsSuper);
   } else {
+    // arm64 uses objc_msgSend for stret methods, yet a null-receiver check
+    // must still be made for them.
+    if (!IsSuper && CGM.ReturnTypeUsesSRet(MSI.CallInfo))
+      nullReturn.init(CGF, Arg0);
     Fn = (ObjCABI == 2) ? ObjCTypes.getSendFn2(IsSuper)
       : ObjCTypes.getSendFn(IsSuper);
   }
   
   bool requiresnullCheck = false;
   if (CGM.getLangOpts().ObjCAutoRefCount && Method)
-    for (ObjCMethodDecl::param_const_iterator i = Method->param_begin(),
-         e = Method->param_end(); i != e; ++i) {
-      const ParmVarDecl *ParamDecl = (*i);
+    for (const auto *ParamDecl : Method->params()) {
       if (ParamDecl->hasAttr<NSConsumedAttr>()) {
         if (!nullReturn.NullBB)
           nullReturn.init(CGF, Arg0);
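Background for the restructured dispatch above: `objc_msgSend_stret` returns aggregates through a hidden pointer and leaves that buffer untouched when the receiver is nil, so `NullReturnState` must zero the slot on the nil path. The new `ReturnSlotInterferesWithArgs` check appears to select the `_stret` entry points only where the hidden slot actually displaces an argument register; arm64 keeps using plain `objc_msgSend` for aggregate returns, which is why the fallback branch now also initializes `nullReturn`. The contract, as a toy sketch:

    // Toy model of the nil-receiver rule for aggregate returns.
    struct Rect { double X, Y, W, H; };

    Rect sendGetFrame(const void *Receiver) {
      if (!Receiver)
        return Rect();  // what NullReturnState arranges: nil yields zeroes
      // ... a real send would dispatch via objc_msgSend(_stret) here ...
      return Rect{1, 2, 3, 4};
    }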
@@ -1967,9 +1966,8 @@
   // to be GC'ed.
 
   // Walk the captured variables.
-  for (BlockDecl::capture_const_iterator ci = blockDecl->capture_begin(),
-         ce = blockDecl->capture_end(); ci != ce; ++ci) {
-    const VarDecl *variable = ci->getVariable();
+  for (const auto &CI : blockDecl->captures()) {
+    const VarDecl *variable = CI.getVariable();
     QualType type = variable->getType();
 
     const CGBlockInfo::Capture &capture = blockInfo.getCapture(variable);
@@ -1980,7 +1978,7 @@
     uint64_t fieldOffset = layout->getElementOffset(capture.getIndex());
 
     // __block variables are passed by their descriptor address.
-    if (ci->isByRef()) {
+    if (CI.isByRef()) {
       IvarsInfo.push_back(GC_IVAR(fieldOffset, /*size in words*/ 1));
       continue;
     }
@@ -2118,9 +2116,6 @@
         ElCount *= CArray->getSize().getZExtValue();
         FQT = CArray->getElementType();
       }
-      
-      assert(!FQT->isUnionType() &&
-             "layout for array of unions not supported");
       if (FQT->isRecordType() && ElCount) {
         int OldIndex = RunSkipBlockVars.size() - 1;
         const RecordType *RT = FQT->getAs<RecordType>();
@@ -2196,10 +2191,7 @@
                                                   bool &HasUnion,
                                                   bool ByrefLayout) {
   const RecordDecl *RD = RT->getDecl();
-  SmallVector<const FieldDecl*, 16> Fields;
-  for (RecordDecl::field_iterator i = RD->field_begin(),
-       e = RD->field_end(); i != e; ++i)
-    Fields.push_back(*i);
+  SmallVector<const FieldDecl*, 16> Fields(RD->fields());
   llvm::Type *Ty = CGM.getTypes().ConvertType(QualType(RT, 0));
   const llvm::StructLayout *RecLayout =
     CGM.getDataLayout().getStructLayout(cast<llvm::StructType>(Ty));
@@ -2486,9 +2478,8 @@
                            blockInfo.BlockHeaderForcedGapOffset,
                            blockInfo.BlockHeaderForcedGapSize);
   // Walk the captured variables.
-  for (BlockDecl::capture_const_iterator ci = blockDecl->capture_begin(),
-       ce = blockDecl->capture_end(); ci != ce; ++ci) {
-    const VarDecl *variable = ci->getVariable();
+  for (const auto &CI : blockDecl->captures()) {
+    const VarDecl *variable = CI.getVariable();
     QualType type = variable->getType();
     
     const CGBlockInfo::Capture &capture = blockInfo.getCapture(variable);
@@ -2500,17 +2491,17 @@
        CharUnits::fromQuantity(layout->getElementOffset(capture.getIndex()));
     
     assert(!type->isArrayType() && "array variable should not be caught");
-    if (!ci->isByRef())
+    if (!CI.isByRef())
       if (const RecordType *record = type->getAs<RecordType>()) {
         BuildRCBlockVarRecordLayout(record, fieldOffset, hasUnion);
         continue;
       }
     CharUnits fieldSize;
-    if (ci->isByRef())
+    if (CI.isByRef())
       fieldSize = CharUnits::fromQuantity(WordSizeInBytes);
     else
       fieldSize = CGM.getContext().getTypeSizeInChars(type);
-    UpdateRunSkipBlockVars(ci->isByRef(), getBlockCaptureLifetime(type, false),
+    UpdateRunSkipBlockVars(CI.isByRef(), getBlockCaptureLifetime(type, false),
                            fieldOffset, fieldSize);
   }
   return getBitmapBlockLayout(false);
@@ -2562,8 +2553,16 @@
   return GetOrEmitProtocolRef(PD);
 }
 
+static void assertPrivateName(const llvm::GlobalValue *GV) {
+  StringRef NameRef = GV->getName();
+  (void)NameRef;
+  assert(NameRef[0] == '\01' && (NameRef[1] == 'L' || NameRef[1] == 'l'));
+  assert(GV->getVisibility() == llvm::GlobalValue::DefaultVisibility);
+  assert(GV->getLinkage() == llvm::GlobalValue::PrivateLinkage);
+}
+
 /*
-// APPLE LOCAL radar 4585769 - Objective-C 1.0 extensions
+// Objective-C 1.0 extensions
 struct _objc_protocol {
 struct _objc_protocol_extension *isa;
 char *protocol_name;
@@ -2593,9 +2592,7 @@
   std::vector<llvm::Constant*> InstanceMethods, ClassMethods;
   std::vector<llvm::Constant*> OptInstanceMethods, OptClassMethods;
   std::vector<llvm::Constant*> MethodTypesExt, OptMethodTypesExt;
-  for (ObjCProtocolDecl::instmeth_iterator
-         i = PD->instmeth_begin(), e = PD->instmeth_end(); i != e; ++i) {
-    ObjCMethodDecl *MD = *i;
+  for (const auto *MD : PD->instance_methods()) {
     llvm::Constant *C = GetMethodDescriptionConstant(MD);
     if (!C)
       return GetOrEmitProtocolRef(PD);
@@ -2609,9 +2606,7 @@
     }
   }
 
-  for (ObjCProtocolDecl::classmeth_iterator
-         i = PD->classmeth_begin(), e = PD->classmeth_end(); i != e; ++i) {
-    ObjCMethodDecl *MD = *i;
+  for (const auto *MD : PD->class_methods()) {
     llvm::Constant *C = GetMethodDescriptionConstant(MD);
     if (!C)
       return GetOrEmitProtocolRef(PD);
@@ -2646,13 +2641,13 @@
                                                    Values);
 
   if (Entry) {
-    // Already created, fix the linkage and update the initializer.
-    Entry->setLinkage(llvm::GlobalValue::InternalLinkage);
+    // Already created, update the initializer.
+    assert(Entry->getLinkage() == llvm::GlobalValue::PrivateLinkage);
     Entry->setInitializer(Init);
   } else {
     Entry =
       new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ProtocolTy, false,
-                               llvm::GlobalValue::InternalLinkage,
+                               llvm::GlobalValue::PrivateLinkage,
                                Init,
                                "\01L_OBJC_PROTOCOL_" + PD->getName());
     Entry->setSection("__OBJC,__protocol,regular,no_dead_strip");
@@ -2661,7 +2656,8 @@
 
     Protocols[PD->getIdentifier()] = Entry;
   }
-  CGM.AddUsedGlobal(Entry);
+  assertPrivateName(Entry);
+  CGM.addCompilerUsedGlobal(Entry);
 
   return Entry;
 }
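Two themes recur throughout the metadata emission from here on: globals are now created with private rather than internal linkage, which `assertPrivateName` checks together with the Mach-O-local `\01L`/`\01l` name prefixes, and they are registered via `addCompilerUsedGlobal` (llvm.compiler.used) instead of `AddUsedGlobal` (llvm.used), so the compiler keeps them alive while the linker may still dead-strip whatever the section attributes do not pin. The pattern in one place, with placeholder symbol and section names:

    // Sketch of the emission pattern; the name and section are placeholders.
    llvm::GlobalVariable *emitPrivateMetadata(CodeGen::CodeGenModule &CGM,
                                              llvm::Constant *Init) {
      llvm::GlobalVariable *GV = new llvm::GlobalVariable(
          CGM.getModule(), Init->getType(), false,
          llvm::GlobalValue::PrivateLinkage, Init, "\01L_OBJC_EXAMPLE");
      GV->setSection("__OBJC,__example,regular,no_dead_strip");
      CGM.addCompilerUsedGlobal(GV);  // llvm.compiler.used, not llvm.used
      return GV;
    }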
@@ -2675,13 +2671,14 @@
     // contents for protocols which were referenced but never defined.
     Entry =
       new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ProtocolTy, false,
-                               llvm::GlobalValue::ExternalLinkage,
+                               llvm::GlobalValue::PrivateLinkage,
                                0,
                                "\01L_OBJC_PROTOCOL_" + PD->getName());
     Entry->setSection("__OBJC,__protocol,regular,no_dead_strip");
     // FIXME: Is this necessary? Why only for protocol?
     Entry->setAlignment(4);
   }
+  assertPrivateName(Entry);
 
   return Entry;
 }
@@ -2726,7 +2723,7 @@
     llvm::ConstantStruct::get(ObjCTypes.ProtocolExtensionTy, Values);
 
   // No special section, but goes in llvm.used
-  return CreateMetadataVar("\01L_OBJC_PROTOCOLEXT_" + PD->getName(),
+  return CreateMetadataVar("\01l_OBJC_PROTOCOLEXT_" + PD->getName(),
                            Init,
                            0, 0, true);
 }
@@ -2775,14 +2772,11 @@
 PushProtocolProperties(llvm::SmallPtrSet<const IdentifierInfo*,16> &PropertySet,
                        SmallVectorImpl<llvm::Constant *> &Properties,
                        const Decl *Container,
-                       const ObjCProtocolDecl *PROTO,
+                       const ObjCProtocolDecl *Proto,
                        const ObjCCommonTypesHelper &ObjCTypes) {
-  for (ObjCProtocolDecl::protocol_iterator P = PROTO->protocol_begin(),
-         E = PROTO->protocol_end(); P != E; ++P) 
-    PushProtocolProperties(PropertySet, Properties, Container, (*P), ObjCTypes);
-  for (ObjCContainerDecl::prop_iterator I = PROTO->prop_begin(),
-       E = PROTO->prop_end(); I != E; ++I) {
-    const ObjCPropertyDecl *PD = *I;
+  for (const auto *P : Proto->protocols())
+    PushProtocolProperties(PropertySet, Properties, Container, P, ObjCTypes);
+  for (const auto *PD : Proto->properties()) {
     if (!PropertySet.insert(PD->getIdentifier()))
       continue;
     llvm::Constant *Prop[] = {
@@ -2811,9 +2805,7 @@
                                        const ObjCCommonTypesHelper &ObjCTypes) {
   SmallVector<llvm::Constant *, 16> Properties;
   llvm::SmallPtrSet<const IdentifierInfo*, 16> PropertySet;
-  for (ObjCContainerDecl::prop_iterator I = OCD->prop_begin(),
-         E = OCD->prop_end(); I != E; ++I) {
-    const ObjCPropertyDecl *PD = *I;
+  for (const auto *PD : OCD->properties()) {
     PropertySet.insert(PD->getIdentifier());
     llvm::Constant *Prop[] = {
       GetPropertyName(PD->getIdentifier()),
@@ -2823,17 +2815,12 @@
                                                    Prop));
   }
   if (const ObjCInterfaceDecl *OID = dyn_cast<ObjCInterfaceDecl>(OCD)) {
-    for (ObjCInterfaceDecl::all_protocol_iterator
-         P = OID->all_referenced_protocol_begin(),
-         E = OID->all_referenced_protocol_end(); P != E; ++P)
-      PushProtocolProperties(PropertySet, Properties, Container, (*P), 
-                             ObjCTypes);
+    for (const auto *P : OID->all_referenced_protocols())
+      PushProtocolProperties(PropertySet, Properties, Container, P, ObjCTypes);
   }
   else if (const ObjCCategoryDecl *CD = dyn_cast<ObjCCategoryDecl>(OCD)) {
-    for (ObjCCategoryDecl::protocol_iterator P = CD->protocol_begin(),
-         E = CD->protocol_end(); P != E; ++P)
-      PushProtocolProperties(PropertySet, Properties, Container, (*P), 
-                             ObjCTypes);
+    for (const auto *P : CD->protocols())
+      PushProtocolProperties(PropertySet, Properties, Container, P, ObjCTypes);
   }
 
   // Return null for empty list.
@@ -2945,16 +2932,13 @@
                                      << OCD->getName();
 
   SmallVector<llvm::Constant *, 16> InstanceMethods, ClassMethods;
-  for (ObjCCategoryImplDecl::instmeth_iterator
-         i = OCD->instmeth_begin(), e = OCD->instmeth_end(); i != e; ++i) {
+  for (const auto *I : OCD->instance_methods())
     // Instance methods should always be defined.
-    InstanceMethods.push_back(GetMethodConstant(*i));
-  }
-  for (ObjCCategoryImplDecl::classmeth_iterator
-         i = OCD->classmeth_begin(), e = OCD->classmeth_end(); i != e; ++i) {
+    InstanceMethods.push_back(GetMethodConstant(I));
+
+  for (const auto *I : OCD->class_methods())
     // Class methods should always be defined.
-    ClassMethods.push_back(GetMethodConstant(*i));
-  }
+    ClassMethods.push_back(GetMethodConstant(I));
 
   llvm::Constant *Values[7];
   Values[0] = GetClassName(OCD->getIdentifier());
@@ -3073,21 +3057,15 @@
     Flags |= FragileABI_Class_Hidden;
 
   SmallVector<llvm::Constant *, 16> InstanceMethods, ClassMethods;
-  for (ObjCImplementationDecl::instmeth_iterator
-         i = ID->instmeth_begin(), e = ID->instmeth_end(); i != e; ++i) {
+  for (const auto *I : ID->instance_methods())
     // Instance methods should always be defined.
-    InstanceMethods.push_back(GetMethodConstant(*i));
-  }
-  for (ObjCImplementationDecl::classmeth_iterator
-         i = ID->classmeth_begin(), e = ID->classmeth_end(); i != e; ++i) {
+    InstanceMethods.push_back(GetMethodConstant(I));
+
+  for (const auto *I : ID->class_methods())
     // Class methods should always be defined.
-    ClassMethods.push_back(GetMethodConstant(*i));
-  }
+    ClassMethods.push_back(GetMethodConstant(I));
 
-  for (ObjCImplementationDecl::propimpl_iterator
-         i = ID->propimpl_begin(), e = ID->propimpl_end(); i != e; ++i) {
-    ObjCPropertyImplDecl *PID = *i;
-
+  for (const auto *PID : ID->property_impls()) {
     if (PID->getPropertyImplementation() == ObjCPropertyImplDecl::Synthesize) {
       ObjCPropertyDecl *PD = PID->getPropertyDecl();
 
@@ -3133,19 +3111,19 @@
   Name += ClassName;
   const char *Section = "__OBJC,__class,regular,no_dead_strip";
   // Check for a forward reference.
-  llvm::GlobalVariable *GV = CGM.getModule().getGlobalVariable(Name);
+  llvm::GlobalVariable *GV = CGM.getModule().getGlobalVariable(Name, true);
   if (GV) {
     assert(GV->getType()->getElementType() == ObjCTypes.ClassTy &&
            "Forward metaclass reference has incorrect type.");
-    GV->setLinkage(llvm::GlobalValue::InternalLinkage);
     GV->setInitializer(Init);
     GV->setSection(Section);
     GV->setAlignment(4);
-    CGM.AddUsedGlobal(GV);
-  } 
-  else
+    CGM.addCompilerUsedGlobal(GV);
+  } else
     GV = CreateMetadataVar(Name, Init, Section, 4, true);
+  assertPrivateName(GV);
   DefinedClasses.push_back(GV);
+  ImplementedClasses.push_back(Interface);
   // Method definition entries must be cleared for the next implementation.
   MethodDefinitions.clear();
 }
@@ -3201,20 +3179,20 @@
   Name += ID->getName();
 
   // Check for a forward reference.
-  llvm::GlobalVariable *GV = CGM.getModule().getGlobalVariable(Name);
+  llvm::GlobalVariable *GV = CGM.getModule().getGlobalVariable(Name, true);
   if (GV) {
     assert(GV->getType()->getElementType() == ObjCTypes.ClassTy &&
            "Forward metaclass reference has incorrect type.");
-    GV->setLinkage(llvm::GlobalValue::InternalLinkage);
     GV->setInitializer(Init);
   } else {
     GV = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassTy, false,
-                                  llvm::GlobalValue::InternalLinkage,
+                                  llvm::GlobalValue::PrivateLinkage,
                                   Init, Name);
   }
+  assertPrivateName(GV);
   GV->setSection("__OBJC,__meta_class,regular,no_dead_strip");
   GV->setAlignment(4);
-  CGM.AddUsedGlobal(GV);
+  CGM.addCompilerUsedGlobal(GV);
 
   return GV;
 }
@@ -3230,35 +3208,29 @@
   // Check for an existing forward reference.
   // Previously, a metaclass with internal linkage may have been defined.
   // Pass 'true' as the 2nd argument so it is returned.
-  if (llvm::GlobalVariable *GV = CGM.getModule().getGlobalVariable(Name,
-                                                                   true)) {
-    assert(GV->getType()->getElementType() == ObjCTypes.ClassTy &&
-           "Forward metaclass reference has incorrect type.");
-    return GV;
-  } else {
-    // Generate as an external reference to keep a consistent
-    // module. This will be patched up when we emit the metaclass.
-    return new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassTy, false,
-                                    llvm::GlobalValue::ExternalLinkage,
-                                    0,
-                                    Name);
-  }
+  llvm::GlobalVariable *GV = CGM.getModule().getGlobalVariable(Name, true);
+  if (!GV)
+    GV = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassTy, false,
+                                  llvm::GlobalValue::PrivateLinkage, 0, Name);
+
+  assert(GV->getType()->getElementType() == ObjCTypes.ClassTy &&
+         "Forward metaclass reference has incorrect type.");
+  assertPrivateName(GV);
+  return GV;
 }
 
 llvm::Value *CGObjCMac::EmitSuperClassRef(const ObjCInterfaceDecl *ID) {
   std::string Name = "\01L_OBJC_CLASS_" + ID->getNameAsString();
-  
-  if (llvm::GlobalVariable *GV = CGM.getModule().getGlobalVariable(Name,
-                                                                   true)) {
-    assert(GV->getType()->getElementType() == ObjCTypes.ClassTy &&
-           "Forward class metadata reference has incorrect type.");
-    return GV;
-  } else {
-    return new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassTy, false,
-                                    llvm::GlobalValue::ExternalLinkage,
-                                    0,
-                                    Name);
-  }
+  llvm::GlobalVariable *GV = CGM.getModule().getGlobalVariable(Name, true);
+
+  if (!GV)
+    GV = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassTy, false,
+                                  llvm::GlobalValue::PrivateLinkage, 0, Name);
+
+  assert(GV->getType()->getElementType() == ObjCTypes.ClassTy &&
+         "Forward class metadata reference has incorrect type.");
+  assertPrivateName(GV);
+  return GV;
 }
 
 /*
@@ -3430,13 +3402,14 @@
   llvm::Type *Ty = Init->getType();
   llvm::GlobalVariable *GV =
     new llvm::GlobalVariable(CGM.getModule(), Ty, false,
-                             llvm::GlobalValue::InternalLinkage, Init, Name);
+                             llvm::GlobalValue::PrivateLinkage, Init, Name);
+  assertPrivateName(GV);
   if (Section)
     GV->setSection(Section);
   if (Align)
     GV->setAlignment(Align);
   if (AddToUsed)
-    CGM.AddUsedGlobal(GV);
+    CGM.addCompilerUsedGlobal(GV);
   return GV;
 }
 
@@ -3501,7 +3474,7 @@
       : S(*S), SyncArgSlot(SyncArgSlot), CallTryExitVar(CallTryExitVar),
         ExceptionData(ExceptionData), ObjCTypes(*ObjCTypes) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       // Check whether we need to call objc_exception_try_exit.
       // In optimized code, this branch will always be folded.
       llvm::BasicBlock *FinallyCallExit =
@@ -4293,14 +4266,14 @@
 ///   unsigned flags;
 /// };
 enum ImageInfoFlags {
-  eImageInfo_FixAndContinue      = (1 << 0),
+  eImageInfo_FixAndContinue      = (1 << 0), // This flag is no longer set by clang.
   eImageInfo_GarbageCollected    = (1 << 1),
   eImageInfo_GCOnly              = (1 << 2),
-  eImageInfo_OptimizedByDyld     = (1 << 3), // FIXME: When is this set.
+  eImageInfo_OptimizedByDyld     = (1 << 3), // This flag is set by the dyld shared cache.
 
   // A flag indicating that the module has no instances of a @synthesize of a
   // superclass variable. <rdar://problem/6803242>
-  eImageInfo_CorrectedSynthesize = (1 << 4),
+  eImageInfo_CorrectedSynthesize = (1 << 4), // This flag is no longer set by clang.
   eImageInfo_ImageIsSimulated    = (1 << 5)
 };
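For context, the GC-related bits of this flags word are derived from the language options; a hedged sketch using Clang's `LangOptions` accessors:

    // Sketch: assemble the GC bits of the image-info flags word.
    unsigned computeGCFlags(const LangOptions &Opts) {
      unsigned Flags = 0;
      if (Opts.getGC() != LangOptions::NonGC)
        Flags |= eImageInfo_GarbageCollected;
      if (Opts.getGC() == LangOptions::GCOnly)
        Flags |= eImageInfo_GCOnly;
      return Flags;
    }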
 
@@ -4399,9 +4372,17 @@
   // The runtime expects exactly the list of defined classes followed
   // by the list of defined categories, in a single array.
   SmallVector<llvm::Constant*, 8> Symbols(NumClasses + NumCategories);
-  for (unsigned i=0; i<NumClasses; i++)
+  for (unsigned i=0; i<NumClasses; i++) {
+    const ObjCInterfaceDecl *ID = ImplementedClasses[i];
+    assert(ID);
+    if (ObjCImplementationDecl *IMP = ID->getImplementation())
+      // We are implementing a weak imported interface. Give it external linkage
+      if (ID->isWeakImported() && !IMP->isWeakImported())
+        DefinedClasses[i]->setLinkage(llvm::GlobalVariable::ExternalLinkage);
+
     Symbols[i] = llvm::ConstantExpr::getBitCast(DefinedClasses[i],
                                                 ObjCTypes.Int8PtrTy);
+  }
   for (unsigned i=0; i<NumCategories; i++)
     Symbols[NumClasses + i] =
       llvm::ConstantExpr::getBitCast(DefinedCategories[i],
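Note: `isWeakImported()` is true for interfaces declared `__attribute__((weak_import))`. When this translation unit supplies the actual `@implementation`, the emitted class symbol must still be a strong external definition so that weak references from other images bind to it; the new `ImplementedClasses` list exists so this check can be made per interface (the non-fragile ABI path repeats it below). Restated as a hypothetical helper:

    // Hypothetical helper restating the fixup above.
    void fixupWeakImportLinkage(const ObjCInterfaceDecl *ID,
                                llvm::GlobalValue *ClassGV) {
      if (const ObjCImplementationDecl *IMP = ID->getImplementation())
        // Weak-imported interface, real implementation here: export strongly.
        if (ID->isWeakImported() && !IMP->isWeakImported())
          ClassGV->setLinkage(llvm::GlobalValue::ExternalLinkage);
    }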
@@ -4507,10 +4488,7 @@
                                                 bool &HasUnion) {
   const RecordDecl *RD = RT->getDecl();
   // FIXME - Use iterator.
-  SmallVector<const FieldDecl*, 16> Fields;
-  for (RecordDecl::field_iterator i = RD->field_begin(),
-                                  e = RD->field_end(); i != e; ++i)
-    Fields.push_back(*i);
+  SmallVector<const FieldDecl*, 16> Fields(RD->fields());
   llvm::Type *Ty = CGM.getTypes().ConvertType(QualType(RT, 0));
   const llvm::StructLayout *RecLayout =
     CGM.getDataLayout().getStructLayout(cast<llvm::StructType>(Ty));
@@ -4589,9 +4567,6 @@
         ElCount *= CArray->getSize().getZExtValue();
         FQT = CArray->getElementType();
       }
-
-      assert(!FQT->isUnionType() &&
-             "layout for array of unions not supported");
       if (FQT->isRecordType() && ElCount) {
         int OldIndex = IvarsInfo.size() - 1;
         int OldSkIndex = SkipIvars.size() -1;
@@ -4996,10 +4971,10 @@
     Values[2] = llvm::Constant::getNullValue(ObjCTypes.ProtocolListPtrTy);
     Values[3] = Values[4] =
       llvm::Constant::getNullValue(ObjCTypes.MethodDescriptionListPtrTy);
-    I->second->setLinkage(llvm::GlobalValue::InternalLinkage);
+    assertPrivateName(I->second);
     I->second->setInitializer(llvm::ConstantStruct::get(ObjCTypes.ProtocolTy,
                                                         Values));
-    CGM.AddUsedGlobal(I->second);
+    CGM.addCompilerUsedGlobal(I->second);
   }
 
   // Add assembler directives to add lazy undefined symbol references
@@ -5054,6 +5029,13 @@
   Int8PtrTy = CGM.Int8PtrTy;
   Int8PtrPtrTy = CGM.Int8PtrPtrTy;
 
+  // arm64 targets use "int" ivar offset variables. All others,
+  // including OS X x86_64 and Windows x86_64, use "long" ivar offsets.
+  if (CGM.getTarget().getTriple().getArch() == llvm::Triple::arm64)
+    IvarOffsetVarTy = IntTy;
+  else
+    IvarOffsetVarTy = LongTy;
+
   ObjectPtrTy = Types.ConvertType(Ctx.getObjCIdType());
   PtrObjectPtrTy = llvm::PointerType::getUnqual(ObjectPtrTy);
   SelectorPtrTy = Types.ConvertType(Ctx.getObjCSelType());
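The new `IvarOffsetVarTy` feeds `IvarnfABITy` below: arm64 runtimes use 32-bit `ivar_offset` variables, while the other targets keep the historical `long`. The selection, isolated (note that `arm64` was still an arch enum distinct from `aarch64` in this LLVM revision):

    #include "llvm/ADT/Triple.h"
    #include "llvm/IR/Type.h"

    // Sketch: pick the ivar-offset variable type from the target triple.
    llvm::Type *pickIvarOffsetTy(const llvm::Triple &T, llvm::Type *IntTy,
                                 llvm::Type *LongTy) {
      return T.getArch() == llvm::Triple::arm64 ? IntTy : LongTy;
    }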
@@ -5357,16 +5339,15 @@
   ProtocolListnfABIPtrTy = llvm::PointerType::getUnqual(ProtocolListnfABITy);
 
   // struct _ivar_t {
-  //   unsigned long int *offset;  // pointer to ivar offset location
+  //   unsigned [long] int *offset;  // pointer to ivar offset location
   //   char *name;
   //   char *type;
   //   uint32_t alignment;
   //   uint32_t size;
   // }
-  IvarnfABITy =
-    llvm::StructType::create("struct._ivar_t",
-                             llvm::PointerType::getUnqual(LongTy),
-                             Int8PtrTy, Int8PtrTy, IntTy, IntTy, NULL);
+  IvarnfABITy = llvm::StructType::create(
+      "struct._ivar_t", llvm::PointerType::getUnqual(IvarOffsetVarTy),
+      Int8PtrTy, Int8PtrTy, IntTy, IntTy, NULL);
 
   // struct _ivar_list_t {
   //   uint32 entsize;  // sizeof(struct _ivar_t)
@@ -5521,12 +5502,13 @@
 
   llvm::GlobalVariable *GV =
     new llvm::GlobalVariable(CGM.getModule(), Init->getType(), false,
-                             llvm::GlobalValue::InternalLinkage,
+                             llvm::GlobalValue::PrivateLinkage,
                              Init,
                              SymbolName);
+  assertPrivateName(GV);
   GV->setAlignment(CGM.getDataLayout().getABITypeAlignment(Init->getType()));
   GV->setSection(SectionName);
-  CGM.AddUsedGlobal(GV);
+  CGM.addCompilerUsedGlobal(GV);
 }
 
 void CGObjCNonFragileABIMac::FinishNonFragileABIModule() {
@@ -5534,24 +5516,20 @@
 
   // Build list of all implemented class addresses in array
   // L_OBJC_LABEL_CLASS_$.
+
+  for (unsigned i=0, NumClasses=ImplementedClasses.size(); i<NumClasses; i++) {
+    const ObjCInterfaceDecl *ID = ImplementedClasses[i];
+    assert(ID);
+    if (ObjCImplementationDecl *IMP = ID->getImplementation())
+      // We are implementing a weak imported interface. Give it external linkage
+      if (ID->isWeakImported() && !IMP->isWeakImported())
+        DefinedClasses[i]->setLinkage(llvm::GlobalVariable::ExternalLinkage);
+  }
+
   AddModuleClassList(DefinedClasses,
                      "\01L_OBJC_LABEL_CLASS_$",
                      "__DATA, __objc_classlist, regular, no_dead_strip");
-  
-  for (unsigned i = 0, e = DefinedClasses.size(); i < e; i++) {
-    llvm::GlobalValue *IMPLGV = DefinedClasses[i];
-    if (IMPLGV->getLinkage() != llvm::GlobalValue::ExternalWeakLinkage)
-      continue;
-    IMPLGV->setLinkage(llvm::GlobalValue::ExternalLinkage);
-  }
-  
-  for (unsigned i = 0, e = DefinedMetaClasses.size(); i < e; i++) {
-    llvm::GlobalValue *IMPLGV = DefinedMetaClasses[i];
-    if (IMPLGV->getLinkage() != llvm::GlobalValue::ExternalWeakLinkage)
-      continue;
-    IMPLGV->setLinkage(llvm::GlobalValue::ExternalLinkage);
-  }    
-  
+
   AddModuleClassList(DefinedNonLazyClasses,
                      "\01L_OBJC_LABEL_NONLAZY_CLASS_$",
                      "__DATA, __objc_nlclslist, regular, no_dead_strip");
@@ -5669,22 +5647,16 @@
   std::string MethodListName("\01l_OBJC_$_");
   if (flags & NonFragileABI_Class_Meta) {
     MethodListName += "CLASS_METHODS_" + ID->getNameAsString();
-    for (ObjCImplementationDecl::classmeth_iterator
-           i = ID->classmeth_begin(), e = ID->classmeth_end(); i != e; ++i) {
+    for (const auto *I : ID->class_methods())
       // Class methods should always be defined.
-      Methods.push_back(GetMethodConstant(*i));
-    }
+      Methods.push_back(GetMethodConstant(I));
   } else {
     MethodListName += "INSTANCE_METHODS_" + ID->getNameAsString();
-    for (ObjCImplementationDecl::instmeth_iterator
-           i = ID->instmeth_begin(), e = ID->instmeth_end(); i != e; ++i) {
+    for (const auto *I : ID->instance_methods())
       // Instance methods should always be defined.
-      Methods.push_back(GetMethodConstant(*i));
-    }
-    for (ObjCImplementationDecl::propimpl_iterator
-           i = ID->propimpl_begin(), e = ID->propimpl_end(); i != e; ++i) {
-      ObjCPropertyImplDecl *PID = *i;
+      Methods.push_back(GetMethodConstant(I));
 
+    for (const auto *PID : ID->property_impls()) {
       if (PID->getPropertyImplementation() == ObjCPropertyImplDecl::Synthesize){
         ObjCPropertyDecl *PD = PID->getPropertyDecl();
 
@@ -5721,11 +5693,12 @@
                                                    Values);
   llvm::GlobalVariable *CLASS_RO_GV =
     new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassRonfABITy, false,
-                             llvm::GlobalValue::InternalLinkage,
+                             llvm::GlobalValue::PrivateLinkage,
                              Init,
                              (flags & NonFragileABI_Class_Meta) ?
                              std::string("\01l_OBJC_METACLASS_RO_$_")+ClassName :
                              std::string("\01l_OBJC_CLASS_RO_$_")+ClassName);
+  assertPrivateName(CLASS_RO_GV);
   CLASS_RO_GV->setAlignment(
     CGM.getDataLayout().getABITypeAlignment(ObjCTypes.ClassRonfABITy));
   CLASS_RO_GV->setSection("__DATA, __objc_const");
@@ -5743,12 +5716,9 @@
 ///   struct class_ro_t *ro;
 /// }
 ///
-llvm::GlobalVariable * CGObjCNonFragileABIMac::BuildClassMetaData(
-  std::string &ClassName,
-  llvm::Constant *IsAGV,
-  llvm::Constant *SuperClassGV,
-  llvm::Constant *ClassRoGV,
-  bool HiddenVisibility) {
+llvm::GlobalVariable *CGObjCNonFragileABIMac::BuildClassMetaData(
+    std::string &ClassName, llvm::Constant *IsAGV, llvm::Constant *SuperClassGV,
+    llvm::Constant *ClassRoGV, bool HiddenVisibility, bool Weak) {
   llvm::Constant *Values[] = {
     IsAGV,
     SuperClassGV,
@@ -5763,7 +5733,7 @@
                   llvm::PointerType::getUnqual(ObjCTypes.ImpnfABITy));
   llvm::Constant *Init = llvm::ConstantStruct::get(ObjCTypes.ClassnfABITy,
                                                    Values);
-  llvm::GlobalVariable *GV = GetClassGlobal(ClassName);
+  llvm::GlobalVariable *GV = GetClassGlobal(ClassName, Weak);
   GV->setInitializer(Init);
   GV->setSection("__DATA, __objc_data");
   GV->setAlignment(
@@ -5849,31 +5819,37 @@
   if (!ID->getClassInterface()->getSuperClass()) {
     // class is root
     flags |= NonFragileABI_Class_Root;
-    SuperClassGV = GetClassGlobal(ObjCClassName + ClassName);
-    IsAGV = GetClassGlobal(ObjCMetaClassName + ClassName);
+    SuperClassGV = GetClassGlobal(ObjCClassName + ClassName,
+                                  ID->getClassInterface()->isWeakImported());
+    IsAGV = GetClassGlobal(ObjCMetaClassName + ClassName,
+                           ID->getClassInterface()->isWeakImported());
+
+    // We are implementing a weak imported interface. Give it external
+    // linkage.
+    if (!ID->isWeakImported() && ID->getClassInterface()->isWeakImported())
+      IsAGV->setLinkage(llvm::GlobalVariable::ExternalLinkage);
   } else {
     // Has a root. Current class is not a root.
     const ObjCInterfaceDecl *Root = ID->getClassInterface();
     while (const ObjCInterfaceDecl *Super = Root->getSuperClass())
       Root = Super;
-    IsAGV = GetClassGlobal(ObjCMetaClassName + Root->getNameAsString());
-    if (Root->isWeakImported())
-      IsAGV->setLinkage(llvm::GlobalValue::ExternalWeakLinkage);
+    IsAGV = GetClassGlobal(ObjCMetaClassName + Root->getNameAsString(),
+                           Root->isWeakImported());
     // work on super class metadata symbol.
     std::string SuperClassName =
       ObjCMetaClassName + 
         ID->getClassInterface()->getSuperClass()->getNameAsString();
-    SuperClassGV = GetClassGlobal(SuperClassName);
-    if (ID->getClassInterface()->getSuperClass()->isWeakImported())
-      SuperClassGV->setLinkage(llvm::GlobalValue::ExternalWeakLinkage);
+    SuperClassGV = GetClassGlobal(
+        SuperClassName,
+        ID->getClassInterface()->getSuperClass()->isWeakImported());
   }
   llvm::GlobalVariable *CLASS_RO_GV = BuildClassRoTInitializer(flags,
                                                                InstanceStart,
                                                                InstanceSize,ID);
   std::string TClassName = ObjCMetaClassName + ClassName;
-  llvm::GlobalVariable *MetaTClass =
-    BuildClassMetaData(TClassName, IsAGV, SuperClassGV, CLASS_RO_GV,
-                       classIsHidden);
+  llvm::GlobalVariable *MetaTClass = BuildClassMetaData(
+      TClassName, IsAGV, SuperClassGV, CLASS_RO_GV, classIsHidden,
+      ID->isWeakImported());
   DefinedMetaClasses.push_back(MetaTClass);
 
   // Metadata for the class
@@ -5904,9 +5880,9 @@
     // Has a root. Current class is not a root.
     std::string RootClassName =
       ID->getClassInterface()->getSuperClass()->getNameAsString();
-    SuperClassGV = GetClassGlobal(ObjCClassName + RootClassName);
-    if (ID->getClassInterface()->getSuperClass()->isWeakImported())
-      SuperClassGV->setLinkage(llvm::GlobalValue::ExternalWeakLinkage);
+    SuperClassGV = GetClassGlobal(
+        ObjCClassName + RootClassName,
+        ID->getClassInterface()->getSuperClass()->isWeakImported());
   }
   GetClassSizeInfo(ID, InstanceStart, InstanceSize);
   CLASS_RO_GV = BuildClassRoTInitializer(flags,
@@ -5917,8 +5893,10 @@
   TClassName = ObjCClassName + ClassName;
   llvm::GlobalVariable *ClassMD =
     BuildClassMetaData(TClassName, MetaTClass, SuperClassGV, CLASS_RO_GV,
-                       classIsHidden);
+                       classIsHidden,
+                       ID->getClassInterface()->isWeakImported());
   DefinedClasses.push_back(ClassMD);
+  ImplementedClasses.push_back(ID->getClassInterface());
 
   // Determine if this class is also "non-lazy".
   if (ImplementationIsNonLazy(ID))
@@ -5963,7 +5941,7 @@
     ProtocolName);
   PTGV->setSection("__DATA, __objc_protorefs, coalesced, no_dead_strip");
   PTGV->setVisibility(llvm::GlobalValue::HiddenVisibility);
-  CGM.AddUsedGlobal(PTGV);
+  CGM.addCompilerUsedGlobal(PTGV);
   return CGF.Builder.CreateLoad(PTGV);
 }
 
@@ -5988,21 +5966,18 @@
   llvm::Constant *Values[6];
   Values[0] = GetClassName(OCD->getIdentifier());
   // meta-class entry symbol
-  llvm::GlobalVariable *ClassGV = GetClassGlobal(ExtClassName);
-  if (Interface->isWeakImported())
-    ClassGV->setLinkage(llvm::GlobalValue::ExternalWeakLinkage);
-  
+  llvm::GlobalVariable *ClassGV =
+      GetClassGlobal(ExtClassName, Interface->isWeakImported());
+
   Values[1] = ClassGV;
   std::vector<llvm::Constant*> Methods;
   std::string MethodListName(Prefix);
   MethodListName += "INSTANCE_METHODS_" + Interface->getNameAsString() +
     "_$_" + OCD->getNameAsString();
 
-  for (ObjCCategoryImplDecl::instmeth_iterator
-         i = OCD->instmeth_begin(), e = OCD->instmeth_end(); i != e; ++i) {
+  for (const auto *I : OCD->instance_methods())
     // Instance methods should always be defined.
-    Methods.push_back(GetMethodConstant(*i));
-  }
+    Methods.push_back(GetMethodConstant(I));
 
   Values[2] = EmitMethodList(MethodListName,
                              "__DATA, __objc_const",
@@ -6012,11 +5987,9 @@
   MethodListName += "CLASS_METHODS_" + Interface->getNameAsString() + "_$_" +
     OCD->getNameAsString();
   Methods.clear();
-  for (ObjCCategoryImplDecl::classmeth_iterator
-         i = OCD->classmeth_begin(), e = OCD->classmeth_end(); i != e; ++i) {
+  for (const auto *I : OCD->class_methods())
     // Class methods should always be defined.
-    Methods.push_back(GetMethodConstant(*i));
-  }
+    Methods.push_back(GetMethodConstant(I));
 
   Values[3] = EmitMethodList(MethodListName,
                              "__DATA, __objc_const",
@@ -6045,13 +6018,14 @@
   llvm::GlobalVariable *GCATV
     = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.CategorynfABITy,
                                false,
-                               llvm::GlobalValue::InternalLinkage,
+                               llvm::GlobalValue::PrivateLinkage,
                                Init,
                                ExtCatName);
+  assertPrivateName(GCATV);
   GCATV->setAlignment(
     CGM.getDataLayout().getABITypeAlignment(ObjCTypes.CategorynfABITy));
   GCATV->setSection("__DATA, __objc_const");
-  CGM.AddUsedGlobal(GCATV);
+  CGM.addCompilerUsedGlobal(GCATV);
   DefinedCategories.push_back(GCATV);
 
   // Determine if this category is also "non-lazy".
@@ -6107,10 +6081,11 @@
 
   llvm::GlobalVariable *GV =
     new llvm::GlobalVariable(CGM.getModule(), Init->getType(), false,
-                             llvm::GlobalValue::InternalLinkage, Init, Name);
+                             llvm::GlobalValue::PrivateLinkage, Init, Name);
+  assertPrivateName(GV);
   GV->setAlignment(CGM.getDataLayout().getABITypeAlignment(Init->getType()));
   GV->setSection(Section);
-  CGM.AddUsedGlobal(GV);
+  CGM.addCompilerUsedGlobal(GV);
   return llvm::ConstantExpr::getBitCast(GV, ObjCTypes.MethodListnfABIPtrTy);
 }
 
@@ -6125,12 +6100,9 @@
   llvm::GlobalVariable *IvarOffsetGV =
     CGM.getModule().getGlobalVariable(Name);
   if (!IvarOffsetGV)
-    IvarOffsetGV =
-      new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.LongTy,
-                               false,
-                               llvm::GlobalValue::ExternalLinkage,
-                               0,
-                               Name);
+    IvarOffsetGV = new llvm::GlobalVariable(
+        CGM.getModule(), ObjCTypes.IvarOffsetVarTy, false,
+        llvm::GlobalValue::ExternalLinkage, 0, Name);
   return IvarOffsetGV;
 }
 
@@ -6139,10 +6111,10 @@
                                           const ObjCIvarDecl *Ivar,
                                           unsigned long int Offset) {
   llvm::GlobalVariable *IvarOffsetGV = ObjCIvarOffsetVariable(ID, Ivar);
-  IvarOffsetGV->setInitializer(llvm::ConstantInt::get(ObjCTypes.LongTy,
-                                                      Offset));
+  IvarOffsetGV->setInitializer(
+      llvm::ConstantInt::get(ObjCTypes.IvarOffsetVarTy, Offset));
   IvarOffsetGV->setAlignment(
-    CGM.getDataLayout().getABITypeAlignment(ObjCTypes.LongTy));
+      CGM.getDataLayout().getABITypeAlignment(ObjCTypes.IvarOffsetVarTy));
 
   // FIXME: This matches gcc, but shouldn't the visibility be set on the use as
   // well (i.e., in ObjCIvarOffsetVariable).
@@ -6160,7 +6132,7 @@
 /// implementation. The return value has type
 /// IvarListnfABIPtrTy.
 ///  struct _ivar_t {
-///   unsigned long int *offset;  // pointer to ivar offset location
+///   unsigned [long] int *offset;  // pointer to ivar offset location
 ///   char *name;
 ///   char *type;
 ///   uint32_t alignment;
@@ -6223,14 +6195,15 @@
   const char *Prefix = "\01l_OBJC_$_INSTANCE_VARIABLES_";
   llvm::GlobalVariable *GV =
     new llvm::GlobalVariable(CGM.getModule(), Init->getType(), false,
-                             llvm::GlobalValue::InternalLinkage,
+                             llvm::GlobalValue::PrivateLinkage,
                              Init,
                              Prefix + OID->getName());
+  assertPrivateName(GV);
   GV->setAlignment(
     CGM.getDataLayout().getABITypeAlignment(Init->getType()));
   GV->setSection("__DATA, __objc_const");
 
-  CGM.AddUsedGlobal(GV);
+  CGM.addCompilerUsedGlobal(GV);
   return llvm::ConstantExpr::getBitCast(GV, ObjCTypes.IvarListnfABIPtrTy);
 }
 
@@ -6243,10 +6216,9 @@
     // reference or not. At module finalization we add the empty
     // contents for protocols which were referenced but never defined.
     Entry =
-      new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ProtocolnfABITy, false,
-                               llvm::GlobalValue::ExternalLinkage,
-                               0,
-                               "\01l_OBJC_PROTOCOL_$_" + PD->getName());
+        new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ProtocolnfABITy,
+                                 false, llvm::GlobalValue::WeakAnyLinkage, 0,
+                                 "\01l_OBJC_PROTOCOL_$_" + PD->getName());
     Entry->setSection("__DATA,__datacoal_nt,coalesced");
   }
 
@@ -6287,9 +6259,7 @@
   std::vector<llvm::Constant*> InstanceMethods, ClassMethods;
   std::vector<llvm::Constant*> OptInstanceMethods, OptClassMethods;
   std::vector<llvm::Constant*> MethodTypesExt, OptMethodTypesExt;
-  for (ObjCProtocolDecl::instmeth_iterator
-         i = PD->instmeth_begin(), e = PD->instmeth_end(); i != e; ++i) {
-    ObjCMethodDecl *MD = *i;
+  for (const auto *MD : PD->instance_methods()) {
     llvm::Constant *C = GetMethodDescriptionConstant(MD);
     if (!C)
       return GetOrEmitProtocolRef(PD);
@@ -6303,9 +6273,7 @@
     }
   }
 
-  for (ObjCProtocolDecl::classmeth_iterator
-         i = PD->classmeth_begin(), e = PD->classmeth_end(); i != e; ++i) {
-    ObjCMethodDecl *MD = *i;
+  for (const auto *MD : PD->class_methods()) {
     llvm::Constant *C = GetMethodDescriptionConstant(MD);
     if (!C)
       return GetOrEmitProtocolRef(PD);
@@ -6359,8 +6327,8 @@
                                                    Values);
 
   if (Entry) {
-    // Already created, fix the linkage and update the initializer.
-    Entry->setLinkage(llvm::GlobalValue::WeakAnyLinkage);
+    // Already created, update the initializer.
+    assert(Entry->getLinkage() == llvm::GlobalValue::WeakAnyLinkage);
     Entry->setInitializer(Init);
   } else {
     Entry =
@@ -6374,7 +6342,7 @@
     Protocols[PD->getIdentifier()] = Entry;
   }
   Entry->setVisibility(llvm::GlobalValue::HiddenVisibility);
-  CGM.AddUsedGlobal(Entry);
+  CGM.addCompilerUsedGlobal(Entry);
 
   // Use this protocol meta-data to build protocol list table in section
   // __DATA, __objc_protolist
@@ -6386,7 +6354,7 @@
     CGM.getDataLayout().getABITypeAlignment(ObjCTypes.ProtocolnfABIPtrTy));
   PTGV->setSection("__DATA, __objc_protolist, coalesced, no_dead_strip");
   PTGV->setVisibility(llvm::GlobalValue::HiddenVisibility);
-  CGM.AddUsedGlobal(PTGV);
+  CGM.addCompilerUsedGlobal(PTGV);
   return Entry;
 }
 
@@ -6433,12 +6401,13 @@
 
   llvm::Constant *Init = llvm::ConstantStruct::getAnon(Values);
   GV = new llvm::GlobalVariable(CGM.getModule(), Init->getType(), false,
-                                llvm::GlobalValue::InternalLinkage,
+                                llvm::GlobalValue::PrivateLinkage,
                                 Init, Name);
+  assertPrivateName(GV);
   GV->setSection("__DATA, __objc_const");
   GV->setAlignment(
     CGM.getDataLayout().getABITypeAlignment(Init->getType()));
-  CGM.AddUsedGlobal(GV);
+  CGM.addCompilerUsedGlobal(GV);
   return llvm::ConstantExpr::getBitCast(GV,
                                         ObjCTypes.ProtocolListnfABIPtrTy);
 }
@@ -6479,12 +6448,6 @@
                                                unsigned CVRQualifiers) {
   ObjCInterfaceDecl *ID = ObjectTy->getAs<ObjCObjectType>()->getInterface();
   llvm::Value *Offset = EmitIvarOffset(CGF, ID, Ivar);
-
-  if (IsIvarOffsetKnownIdempotent(CGF, ID, Ivar))
-    if (llvm::LoadInst *LI = cast<llvm::LoadInst>(Offset))
-      LI->setMetadata(CGM.getModule().getMDKindID("invariant.load"),
-                      llvm::MDNode::get(VMContext, ArrayRef<llvm::Value*>()));
-
   return EmitValueForIvarAtOffset(CGF, ID, BaseValue, Ivar, CVRQualifiers,
                                   Offset);
 }
@@ -6493,7 +6456,20 @@
   CodeGen::CodeGenFunction &CGF,
   const ObjCInterfaceDecl *Interface,
   const ObjCIvarDecl *Ivar) {
-  return CGF.Builder.CreateLoad(ObjCIvarOffsetVariable(Interface, Ivar),"ivar");
+  llvm::Value *IvarOffsetValue = ObjCIvarOffsetVariable(Interface, Ivar);
+  IvarOffsetValue = CGF.Builder.CreateLoad(IvarOffsetValue, "ivar");
+  if (IsIvarOffsetKnownIdempotent(CGF, Ivar))
+    cast<llvm::LoadInst>(IvarOffsetValue)
+        ->setMetadata(CGM.getModule().getMDKindID("invariant.load"),
+                      llvm::MDNode::get(VMContext, ArrayRef<llvm::Value *>()));
+
+  // This could be a 32-bit int or a 64-bit integer depending on the
+  // architecture.  Cast it to a 64-bit integer value if it is a 32-bit
+  // integer ivar offset value, as this is what the caller always expects.
+  if (ObjCTypes.IvarOffsetVarTy == ObjCTypes.IntTy)
+    IvarOffsetValue = CGF.Builder.CreateIntCast(
+        IvarOffsetValue, ObjCTypes.LongTy, true, "ivar.conv");
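+  // For example, on a 64-bit target that emits 32-bit ivar offset variables,
+  // the sequence above produces roughly (names illustrative):
+  //   %ivar = load i32* @"OBJC_IVAR_$_SomeClass._ivar", !invariant.load !0
+  //   %ivar.conv = sext i32 %ivar to i64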
+  return IvarOffsetValue;
 }
 
 static void appendSelectorForMessageRefTable(std::string &buffer,
@@ -6556,7 +6532,7 @@
   // FIXME: don't use this for that.
   llvm::Constant *fn = 0;
   std::string messageRefName("\01l_");
-  if (CGM.ReturnTypeUsesSRet(MSI.CallInfo)) {
+  if (CGM.ReturnSlotInterferesWithArgs(MSI.CallInfo)) {
     if (isSuper) {
       fn = ObjCTypes.getMessageSendSuper2StretFixupFn();
       messageRefName += "objc_msgSendSuper2_stret_fixup";
@@ -6603,9 +6579,7 @@
   
   bool requiresnullCheck = false;
   if (CGM.getLangOpts().ObjCAutoRefCount && method)
-    for (ObjCMethodDecl::param_const_iterator i = method->param_begin(),
-         e = method->param_end(); i != e; ++i) {
-      const ParmVarDecl *ParamDecl = (*i);
+    for (const auto *ParamDecl : method->params()) {
       if (ParamDecl->hasAttr<NSConsumedAttr>()) {
         if (!nullReturn.NullBB)
           nullReturn.init(CGF, arg0);
@@ -6652,49 +6626,53 @@
 }
 
 llvm::GlobalVariable *
-CGObjCNonFragileABIMac::GetClassGlobal(const std::string &Name) {
+CGObjCNonFragileABIMac::GetClassGlobal(const std::string &Name, bool Weak) {
+  llvm::GlobalValue::LinkageTypes L =
+      Weak ? llvm::GlobalValue::ExternalWeakLinkage
+           : llvm::GlobalValue::ExternalLinkage;
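+  // A weakly imported class may be absent at load time, so its metadata
+  // symbol is declared extern_weak (it may resolve to null) instead of a
+  // plain external reference.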
+
   llvm::GlobalVariable *GV = CGM.getModule().getGlobalVariable(Name);
 
-  if (!GV) {
+  if (!GV)
     GV = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassnfABITy,
-                                  false, llvm::GlobalValue::ExternalLinkage,
-                                  0, Name);
-  }
+                                  false, L, 0, Name);
 
+  assert(GV->getLinkage() == L);
   return GV;
 }
 
 llvm::Value *CGObjCNonFragileABIMac::EmitClassRefFromId(CodeGenFunction &CGF,
-                                                        IdentifierInfo *II) {
+                                                        IdentifierInfo *II,
+                                                        bool Weak) {
   llvm::GlobalVariable *&Entry = ClassReferences[II];
   
   if (!Entry) {
     std::string ClassName(getClassSymbolPrefix() + II->getName().str());
-    llvm::GlobalVariable *ClassGV = GetClassGlobal(ClassName);
+    llvm::GlobalVariable *ClassGV = GetClassGlobal(ClassName, Weak);
     Entry =
     new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassnfABIPtrTy,
-                             false, llvm::GlobalValue::InternalLinkage,
+                             false, llvm::GlobalValue::PrivateLinkage,
                              ClassGV,
                              "\01L_OBJC_CLASSLIST_REFERENCES_$_");
     Entry->setAlignment(
                         CGM.getDataLayout().getABITypeAlignment(
                                                                 ObjCTypes.ClassnfABIPtrTy));
     Entry->setSection("__DATA, __objc_classrefs, regular, no_dead_strip");
-    CGM.AddUsedGlobal(Entry);
+    CGM.addCompilerUsedGlobal(Entry);
   }
-  
+  assertPrivateName(Entry);
   return CGF.Builder.CreateLoad(Entry);
 }
 
 llvm::Value *CGObjCNonFragileABIMac::EmitClassRef(CodeGenFunction &CGF,
                                                   const ObjCInterfaceDecl *ID) {
-  return EmitClassRefFromId(CGF, ID->getIdentifier());
+  return EmitClassRefFromId(CGF, ID->getIdentifier(), ID->isWeakImported());
 }
 
 llvm::Value *CGObjCNonFragileABIMac::EmitNSAutoreleasePoolClassRef(
                                                     CodeGenFunction &CGF) {
   IdentifierInfo *II = &CGM.getContext().Idents.get("NSAutoreleasePool");
-  return EmitClassRefFromId(CGF, II);
+  return EmitClassRefFromId(CGF, II, false);
 }
 
 llvm::Value *
@@ -6704,19 +6682,20 @@
 
   if (!Entry) {
     std::string ClassName(getClassSymbolPrefix() + ID->getNameAsString());
-    llvm::GlobalVariable *ClassGV = GetClassGlobal(ClassName);
+    llvm::GlobalVariable *ClassGV = GetClassGlobal(ClassName,
+                                                   ID->isWeakImported());
     Entry =
       new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassnfABIPtrTy,
-                               false, llvm::GlobalValue::InternalLinkage,
+                               false, llvm::GlobalValue::PrivateLinkage,
                                ClassGV,
                                "\01L_OBJC_CLASSLIST_SUP_REFS_$_");
     Entry->setAlignment(
       CGM.getDataLayout().getABITypeAlignment(
         ObjCTypes.ClassnfABIPtrTy));
     Entry->setSection("__DATA, __objc_superrefs, regular, no_dead_strip");
-    CGM.AddUsedGlobal(Entry);
+    CGM.addCompilerUsedGlobal(Entry);
   }
-
+  assertPrivateName(Entry);
   return CGF.Builder.CreateLoad(Entry);
 }
 
@@ -6726,23 +6705,23 @@
 llvm::Value *CGObjCNonFragileABIMac::EmitMetaClassRef(CodeGenFunction &CGF,
                                                       const ObjCInterfaceDecl *ID) {
   llvm::GlobalVariable * &Entry = MetaClassReferences[ID->getIdentifier()];
-  if (Entry)
-    return CGF.Builder.CreateLoad(Entry);
+  if (!Entry) {
 
-  std::string MetaClassName(getMetaclassSymbolPrefix() + ID->getNameAsString());
-  llvm::GlobalVariable *MetaClassGV = GetClassGlobal(MetaClassName);
-  Entry =
-    new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassnfABIPtrTy, false,
-                             llvm::GlobalValue::InternalLinkage,
-                             MetaClassGV,
-                             "\01L_OBJC_CLASSLIST_SUP_REFS_$_");
-  Entry->setAlignment(
-    CGM.getDataLayout().getABITypeAlignment(
-      ObjCTypes.ClassnfABIPtrTy));
+    std::string MetaClassName(getMetaclassSymbolPrefix() +
+                              ID->getNameAsString());
+    llvm::GlobalVariable *MetaClassGV = GetClassGlobal(MetaClassName);
+    Entry = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ClassnfABIPtrTy,
+                                     false, llvm::GlobalValue::PrivateLinkage,
+                                     MetaClassGV,
+                                     "\01L_OBJC_CLASSLIST_SUP_REFS_$_");
+    Entry->setAlignment(
+        CGM.getDataLayout().getABITypeAlignment(ObjCTypes.ClassnfABIPtrTy));
 
-  Entry->setSection("__DATA, __objc_superrefs, regular, no_dead_strip");
-  CGM.AddUsedGlobal(Entry);
+    Entry->setSection("__DATA, __objc_superrefs, regular, no_dead_strip");
+    CGM.addCompilerUsedGlobal(Entry);
+  }
 
+  assertPrivateName(Entry);
   return CGF.Builder.CreateLoad(Entry);
 }
 
@@ -6752,8 +6731,9 @@
                                               const ObjCInterfaceDecl *ID) {
   if (ID->isWeakImported()) {
     std::string ClassName(getClassSymbolPrefix() + ID->getNameAsString());
-    llvm::GlobalVariable *ClassGV = GetClassGlobal(ClassName);
-    ClassGV->setLinkage(llvm::GlobalValue::ExternalWeakLinkage);
+    llvm::GlobalVariable *ClassGV = GetClassGlobal(ClassName, true);
+    (void)ClassGV;
+    assert(ClassGV->getLinkage() == llvm::GlobalValue::ExternalWeakLinkage);
   }
   
   return EmitClassRef(CGF, ID);
@@ -6819,12 +6799,13 @@
                                      ObjCTypes.SelectorPtrTy);
     Entry =
       new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.SelectorPtrTy, false,
-                               llvm::GlobalValue::InternalLinkage,
+                               llvm::GlobalValue::PrivateLinkage,
                                Casted, "\01L_OBJC_SELECTOR_REFERENCES_");
     Entry->setExternallyInitialized(true);
     Entry->setSection("__DATA, __objc_selrefs, literal_pointers, no_dead_strip");
-    CGM.AddUsedGlobal(Entry);
+    CGM.addCompilerUsedGlobal(Entry);
   }
+  assertPrivateName(Entry);
 
   if (lval)
     return Entry;
@@ -7054,27 +7035,29 @@
   llvm::Constant *Init =
     llvm::ConstantStruct::get(ObjCTypes.EHTypeTy, Values);
 
+  llvm::GlobalValue::LinkageTypes L = ForDefinition
+                                          ? llvm::GlobalValue::ExternalLinkage
+                                          : llvm::GlobalValue::WeakAnyLinkage;
   if (Entry) {
     Entry->setInitializer(Init);
   } else {
     Entry = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.EHTypeTy, false,
-                                     llvm::GlobalValue::WeakAnyLinkage,
+                                     L,
                                      Init,
                                      ("OBJC_EHTYPE_$_" +
                                       ID->getIdentifier()->getName()));
   }
+  assert(Entry->getLinkage() == L);
 
   if (ID->getVisibility() == HiddenVisibility)
     Entry->setVisibility(llvm::GlobalValue::HiddenVisibility);
   Entry->setAlignment(CGM.getDataLayout().getABITypeAlignment(
       ObjCTypes.EHTypeTy));
 
-  if (ForDefinition) {
+  if (ForDefinition)
     Entry->setSection("__DATA,__objc_const");
-    Entry->setLinkage(llvm::GlobalValue::ExternalLinkage);
-  } else {
+  else
     Entry->setSection("__DATA,__datacoal_nt,coalesced");
-  }
 
   return Entry;
 }
diff --git a/lib/CodeGen/CGObjCRuntime.cpp b/lib/CodeGen/CGObjCRuntime.cpp
index d097b6f..8d6c653 100644
--- a/lib/CodeGen/CGObjCRuntime.cpp
+++ b/lib/CodeGen/CGObjCRuntime.cpp
@@ -21,7 +21,7 @@
 #include "clang/AST/RecordLayout.h"
 #include "clang/AST/StmtObjC.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
-#include "llvm/Support/CallSite.h"
+#include "llvm/IR/CallSite.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -158,7 +158,7 @@
     bool MightThrow;
     llvm::Value *Fn;
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       if (!MightThrow) {
         CGF.Builder.CreateCall(Fn)->setDoesNotThrow();
         return;
@@ -303,7 +303,7 @@
     CallSyncExit(llvm::Value *SyncExitFn, llvm::Value *SyncArg)
       : SyncExitFn(SyncExitFn), SyncArg(SyncArg) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       CGF.Builder.CreateCall(SyncExitFn, SyncArg)->setDoesNotThrow();
     }
   };
diff --git a/lib/CodeGen/CGObjCRuntime.h b/lib/CodeGen/CGObjCRuntime.h
index 7f030f2..2a8ae90 100644
--- a/lib/CodeGen/CGObjCRuntime.h
+++ b/lib/CodeGen/CGObjCRuntime.h
@@ -268,7 +268,8 @@
                                   const CodeGen::CGBlockInfo &blockInfo) = 0;
   virtual llvm::Constant *BuildByrefLayout(CodeGen::CodeGenModule &CGM,
                                            QualType T) = 0;
-  virtual llvm::GlobalVariable *GetClassGlobal(const std::string &Name) = 0;
+  virtual llvm::GlobalVariable *GetClassGlobal(const std::string &Name,
+                                               bool Weak = false) = 0;
 
   struct MessageSendInfo {
     const CGFunctionInfo &CallInfo;
diff --git a/lib/CodeGen/CGOpenCLRuntime.cpp b/lib/CodeGen/CGOpenCLRuntime.cpp
index 7c454ac..6e1a3c9 100644
--- a/lib/CodeGen/CGOpenCLRuntime.cpp
+++ b/lib/CodeGen/CGOpenCLRuntime.cpp
@@ -33,32 +33,35 @@
   assert(T->isOpenCLSpecificType() &&
          "Not an OpenCL specific type!");
 
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+  uint32_t ImgAddrSpc =
+    CGM.getContext().getTargetAddressSpace(LangAS::opencl_global);
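+  // For example, on a target where the OpenCL global address space maps to
+  // addrspace(1), image2d_t lowers to roughly:
+  //   %opencl.image2d_t addrspace(1)*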
   switch (cast<BuiltinType>(T)->getKind()) {
   default: 
     llvm_unreachable("Unexpected opencl builtin type!");
     return 0;
   case BuiltinType::OCLImage1d:
     return llvm::PointerType::get(llvm::StructType::create(
-                           CGM.getLLVMContext(), "opencl.image1d_t"), 0);
+                           Ctx, "opencl.image1d_t"), ImgAddrSpc);
   case BuiltinType::OCLImage1dArray:
     return llvm::PointerType::get(llvm::StructType::create(
-                           CGM.getLLVMContext(), "opencl.image1d_array_t"), 0);
+                           Ctx, "opencl.image1d_array_t"), ImgAddrSpc);
   case BuiltinType::OCLImage1dBuffer:
     return llvm::PointerType::get(llvm::StructType::create(
-                           CGM.getLLVMContext(), "opencl.image1d_buffer_t"), 0);
+                           Ctx, "opencl.image1d_buffer_t"), ImgAddrSpc);
   case BuiltinType::OCLImage2d:
     return llvm::PointerType::get(llvm::StructType::create(
-                           CGM.getLLVMContext(), "opencl.image2d_t"), 0);
+                           Ctx, "opencl.image2d_t"), ImgAddrSpc);
   case BuiltinType::OCLImage2dArray:
     return llvm::PointerType::get(llvm::StructType::create(
-                           CGM.getLLVMContext(), "opencl.image2d_array_t"), 0);
+                           Ctx, "opencl.image2d_array_t"), ImgAddrSpc);
   case BuiltinType::OCLImage3d:
     return llvm::PointerType::get(llvm::StructType::create(
-                           CGM.getLLVMContext(), "opencl.image3d_t"), 0);
+                           Ctx, "opencl.image3d_t"), ImgAddrSpc);
   case BuiltinType::OCLSampler:
-    return llvm::IntegerType::get(CGM.getLLVMContext(),32);
+    return llvm::IntegerType::get(Ctx, 32);
   case BuiltinType::OCLEvent:
     return llvm::PointerType::get(llvm::StructType::create(
-                           CGM.getLLVMContext(), "opencl.event_t"), 0);
+                           Ctx, "opencl.event_t"), 0);
   }
 }
diff --git a/lib/CodeGen/CGRTTI.cpp b/lib/CodeGen/CGRTTI.cpp
index aa687b9..7049df7 100644
--- a/lib/CodeGen/CGRTTI.cpp
+++ b/lib/CodeGen/CGRTTI.cpp
@@ -332,11 +332,11 @@
   
   switch (Ty->getLinkage()) {
   case NoLinkage:
-  case VisibleNoLinkage:
   case InternalLinkage:
   case UniqueExternalLinkage:
     return llvm::GlobalValue::InternalLinkage;
 
+  case VisibleNoLinkage:
   case ExternalLinkage:
     if (!CGM.getLangOpts().RTTI) {
       // RTTI is not enabled, which means that this type info struct is going
@@ -544,8 +544,25 @@
   
   // And the name.
   llvm::GlobalVariable *TypeName = GetAddrOfTypeName(Ty, Linkage);
+  llvm::Constant *TypeNameField;
 
-  Fields.push_back(llvm::ConstantExpr::getBitCast(TypeName, CGM.Int8PtrTy));
+  // If we're supposed to demote the visibility, be sure to set a flag
+  // to use a string comparison for type_info comparisons.
+  CGCXXABI::RTTIUniquenessKind RTTIUniqueness =
+      CGM.getCXXABI().classifyRTTIUniqueness(Ty, Linkage);
+  if (RTTIUniqueness != CGCXXABI::RUK_Unique) {
+    // The flag is the sign bit, which on ARM64 is defined to be clear
+    // for global pointers.  This is very ARM64-specific.
+    TypeNameField = llvm::ConstantExpr::getPtrToInt(TypeName, CGM.Int64Ty);
+    llvm::Constant *flag =
+        llvm::ConstantInt::get(CGM.Int64Ty, ((uint64_t)1) << 63);
+    TypeNameField = llvm::ConstantExpr::getAdd(TypeNameField, flag);
+    TypeNameField =
+        llvm::ConstantExpr::getIntToPtr(TypeNameField, CGM.Int8PtrTy);
+  } else {
+    TypeNameField = llvm::ConstantExpr::getBitCast(TypeName, CGM.Int8PtrTy);
+  }
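+  // For illustration: with the flag set, a name pointer such as
+  // 0x0000007fdeadbeef is emitted as 0x8000007fdeadbeef; a runtime that sees
+  // the sign bit set compares the type names as strings rather than assuming
+  // pointer equality.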
+  Fields.push_back(TypeNameField);
 
   switch (Ty->getTypeClass()) {
 #define TYPE(Class, Base)
@@ -644,32 +661,35 @@
     OldGV->eraseFromParent();
   }
 
-  // GCC only relies on the uniqueness of the type names, not the
-  // type_infos themselves, so we can emit these as hidden symbols.
-  // But don't do this if we're worried about strict visibility
-  // compatibility.
-  if (const RecordType *RT = dyn_cast<RecordType>(Ty)) {
-    const CXXRecordDecl *RD = cast<CXXRecordDecl>(RT->getDecl());
+  // The Itanium ABI specifies that type_info objects must be globally
+  // unique, with one exception: if the type is an incomplete class
+  // type or a (possibly indirect) pointer to one.  That exception
+  // affects the general case of comparing type_info objects produced
+  // by the typeid operator, which is why the comparison operators on
+  // std::type_info generally use the type_info name pointers instead
+  // of the object addresses.  However, the language's built-in uses
+  // of RTTI generally require class types to be complete, even when
+  // manipulating pointers to those class types.  This allows the
+  // implementation of dynamic_cast to rely on address equality tests,
+  // which is much faster.
 
-    CGM.setTypeVisibility(GV, RD, CodeGenModule::TVK_ForRTTI);
-    CGM.setTypeVisibility(TypeName, RD, CodeGenModule::TVK_ForRTTIName);
-  } else {
-    Visibility TypeInfoVisibility = DefaultVisibility;
-    if (CGM.getCodeGenOpts().HiddenWeakVTables &&
-        Linkage == llvm::GlobalValue::LinkOnceODRLinkage)
-      TypeInfoVisibility = HiddenVisibility;
+  // All of this is to say that it's important that both the type_info
+  // object and the type_info name be uniqued when weakly emitted.
 
-    // The type name should have the same visibility as the type itself.
-    Visibility ExplicitVisibility = Ty->getVisibility();
-    TypeName->setVisibility(CodeGenModule::
-                            GetLLVMVisibility(ExplicitVisibility));
-  
-    TypeInfoVisibility = minVisibility(TypeInfoVisibility, Ty->getVisibility());
-    GV->setVisibility(CodeGenModule::GetLLVMVisibility(TypeInfoVisibility));
+  // Give the type_info object and name the formal visibility of the
+  // type itself.
+  Visibility formalVisibility = Ty->getVisibility();
+  llvm::GlobalValue::VisibilityTypes llvmVisibility =
+    CodeGenModule::GetLLVMVisibility(formalVisibility);
+  TypeName->setVisibility(llvmVisibility);
+  GV->setVisibility(llvmVisibility);
+
+  // FIXME: integrate this better into the above when we move to trunk
+  if (RTTIUniqueness == CGCXXABI::RUK_NonUniqueHidden) {
+    TypeName->setVisibility(llvm::GlobalValue::HiddenVisibility);
+    GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
   }
 
-  GV->setUnnamedAddr(true);
-
   return llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy);
 }
 
@@ -766,9 +786,8 @@
   }
 
   // Walk all bases.
-  for (CXXRecordDecl::base_class_const_iterator I = BaseDecl->bases_begin(),
-       E = BaseDecl->bases_end(); I != E; ++I) 
-    Flags |= ComputeVMIClassTypeInfoFlags(I, Bases);
+  for (const auto &I : BaseDecl->bases()) 
+    Flags |= ComputeVMIClassTypeInfoFlags(&I, Bases);
   
   return Flags;
 }
@@ -778,9 +797,8 @@
   SeenBases Bases;
   
   // Walk all bases.
-  for (CXXRecordDecl::base_class_const_iterator I = RD->bases_begin(),
-       E = RD->bases_end(); I != E; ++I) 
-    Flags |= ComputeVMIClassTypeInfoFlags(I, Bases);
+  for (const auto &I : RD->bases()) 
+    Flags |= ComputeVMIClassTypeInfoFlags(&I, Bases);
   
   return Flags;
 }
@@ -827,15 +845,12 @@
   //       __offset_shift = 8
   //     };
   //   };
-  for (CXXRecordDecl::base_class_const_iterator I = RD->bases_begin(),
-       E = RD->bases_end(); I != E; ++I) {
-    const CXXBaseSpecifier *Base = I;
-
+  for (const auto &Base : RD->bases()) {
     // The __base_type member points to the RTTI for the base type.
-    Fields.push_back(RTTIBuilder(CGM).BuildTypeInfo(Base->getType()));
+    Fields.push_back(RTTIBuilder(CGM).BuildTypeInfo(Base.getType()));
 
     const CXXRecordDecl *BaseDecl = 
-      cast<CXXRecordDecl>(Base->getType()->getAs<RecordType>()->getDecl());
+      cast<CXXRecordDecl>(Base.getType()->getAs<RecordType>()->getDecl());
 
     int64_t OffsetFlags = 0;
     
@@ -844,7 +859,7 @@
     // subobject. For a virtual base, this is the offset in the virtual table of
     // the virtual base offset for the virtual base referenced (negative).
     CharUnits Offset;
-    if (Base->isVirtual())
+    if (Base.isVirtual())
       Offset = 
         CGM.getItaniumVTableContext().getVirtualBaseOffsetOffset(RD, BaseDecl);
     else {
@@ -856,9 +871,9 @@
     
     // The low-order byte of __offset_flags contains flags, as given by the 
     // masks from the enumeration __offset_flags_masks.
-    if (Base->isVirtual())
+    if (Base.isVirtual())
       OffsetFlags |= BCTI_Virtual;
-    if (Base->getAccessSpecifier() == AS_public)
+    if (Base.getAccessSpecifier() == AS_public)
       OffsetFlags |= BCTI_Public;
 
     Fields.push_back(llvm::ConstantInt::get(LongLTy, OffsetFlags));
diff --git a/lib/CodeGen/CGRecordLayout.h b/lib/CodeGen/CGRecordLayout.h
index b29fc98..0fc7b8a 100644
--- a/lib/CodeGen/CGRecordLayout.h
+++ b/lib/CodeGen/CGRecordLayout.h
@@ -130,7 +130,7 @@
   llvm::DenseMap<const FieldDecl *, CGBitFieldInfo> BitFields;
 
   // FIXME: Maybe we could use a CXXBaseSpecifier as the key and use a single
-  // map for both virtual and non virtual bases.
+  // map for both virtual and non-virtual bases.
   llvm::DenseMap<const CXXRecordDecl *, unsigned> NonVirtualBases;
 
   /// Map from virtual bases to their field index in the complete object.
@@ -201,7 +201,7 @@
 
   /// \brief Return the BitFieldInfo that corresponds to the field FD.
   const CGBitFieldInfo &getBitFieldInfo(const FieldDecl *FD) const {
-    assert(FD->isBitField() && "Invalid call for non bit-field decl!");
+    assert(FD->isBitField() && "Invalid call for non-bit-field decl!");
     llvm::DenseMap<const FieldDecl *, CGBitFieldInfo>::const_iterator
       it = BitFields.find(FD);
     assert(it != BitFields.end() && "Unable to find bitfield info");
diff --git a/lib/CodeGen/CGRecordLayoutBuilder.cpp b/lib/CodeGen/CGRecordLayoutBuilder.cpp
index ab92563..75b4504 100644
--- a/lib/CodeGen/CGRecordLayoutBuilder.cpp
+++ b/lib/CodeGen/CGRecordLayoutBuilder.cpp
@@ -25,205 +25,560 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace clang;
 using namespace CodeGen;
 
 namespace {
-
-class CGRecordLayoutBuilder {
-public:
-  /// FieldTypes - Holds the LLVM types that the struct is created from.
-  /// 
+/// The CGRecordLowering is responsible for lowering an ASTRecordLayout to an
+/// llvm::Type.  Some of the lowering is straightforward, some is not.  Here we
+/// detail some of the complexities and weirdnesses.
+/// * LLVM does not have unions - Unions can, in theory be represented by any
+///   llvm::Type with correct size.  We choose a field via a specific heuristic
+///   and add padding if necessary.
+/// * LLVM does not have bitfields - Bitfields are collected into contiguous
+///   runs and allocated as a single storage type for the run.  ASTRecordLayout
+///   contains enough information to determine where the runs break.  Microsoft
+///   and Itanium follow different rules and use different codepaths.
+/// * It is desired that, when possible, bitfields use the appropriate iN type
+///   when lowered to llvm types.  For example, unsigned x : 24 gets lowered
+///   to i24.  This isn't always possible because i24 has a storage size of
+///   32 bits, and if it is possible to use that extra byte of padding we must
+///   use [i8 x 3] instead of i24.  The function clipTailPadding does this.
+///   C++ examples that require clipping:
+///   struct { int a : 24; char b; }; // a must be clipped, b goes at offset 3
+///   struct A { int a : 24; }; // a must be clipped because a struct like B
+///   could exist: struct B : A { char b; }; // b goes at offset 3
+/// * Clang ignores zero-sized bitfields and zero-sized bases but *not*
+///   zero-sized fields.  The existing asserts suggest that LLVM assumes that
+///   *every* field has an underlying storage type.  Therefore empty
+///   structures containing zero-sized subobjects such as empty records or
+///   zero-sized arrays still get a zero-sized (empty struct) storage type.
+/// * Clang reads the complete type rather than the base type when generating
+///   code to access fields.  Bitfields in tail position with tail padding may
+///   be clipped in the base class but not the complete class (we may discover
+///   that the tail padding is not used in the complete class.) However,
+///   because LLVM reads from the complete type it can generate incorrect code
+///   if we do not clip the tail padding off of the bitfield in the complete
+///   layout.  This introduces a somewhat awkward extra unnecessary clip stage.
+///   The location of the clip is stored internally as a sentinel of type
+///   SCISSOR.  If LLVM were updated to read base types (which it probably
+///   should because locations of things such as VBases are bogus in the llvm
+///   type anyway) then we could eliminate the SCISSOR.
+/// * Itanium allows nearly empty primary virtual bases.  These bases don't
+///   get their own storage because they're laid out as part of another base
+///   or at the beginning of the structure.  Determining if a VBase actually
+///   gets storage awkwardly involves a walk of all bases.
+/// * VFPtrs and VBPtrs do *not* make a record NotZeroInitializable.
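+///
+/// As a concrete sketch of the clipping case above (not normative):
+///   struct { int a : 24; char b; };
+/// cannot keep i24 storage for the bitfield run because i24's alloc size of
+/// 4 bytes would overlap 'b' at offset 3, so it lowers to roughly
+///   { [3 x i8], i8 }.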
+struct CGRecordLowering {
+  // MemberInfo is a helper structure that contains information about a record
+  // member.  In addition to the standard member types, there exists a
+  // sentinel member type that ensures correct rounding.
+  struct MemberInfo {
+    CharUnits Offset;
+    enum InfoKind { VFPtr, VBPtr, Field, Base, VBase, Scissor } Kind;
+    llvm::Type *Data;
+    union {
+      const FieldDecl *FD;
+      const CXXRecordDecl *RD;
+    };
+    MemberInfo(CharUnits Offset, InfoKind Kind, llvm::Type *Data,
+               const FieldDecl *FD = 0)
+      : Offset(Offset), Kind(Kind), Data(Data), FD(FD) {}
+    MemberInfo(CharUnits Offset, InfoKind Kind, llvm::Type *Data,
+               const CXXRecordDecl *RD)
+      : Offset(Offset), Kind(Kind), Data(Data), RD(RD) {}
+    // MemberInfos are sorted so we define a < operator.
+    bool operator <(const MemberInfo& a) const { return Offset < a.Offset; }
+  };
+  // The constructor.
+  CGRecordLowering(CodeGenTypes &Types, const RecordDecl *D);
+  // Short helper routines.
+  /// \brief Constructs a MemberInfo instance from an offset and llvm::Type *.
+  MemberInfo StorageInfo(CharUnits Offset, llvm::Type *Data) {
+    return MemberInfo(Offset, MemberInfo::Field, Data);
+  }
+  bool useMSABI() {
+    return Context.getTargetInfo().getCXXABI().isMicrosoft() ||
+           D->isMsStruct(Context);
+  }
+  /// \brief Wraps llvm::Type::getIntNTy with some implicit arguments.
+  llvm::Type *getIntNType(uint64_t NumBits) {
+    return llvm::Type::getIntNTy(Types.getLLVMContext(),
+        (unsigned)llvm::RoundUpToAlignment(NumBits, 8));
+  }
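+  // e.g., getIntNType(17) rounds up to whole bytes and yields i24.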
+  /// \brief Gets an llvm type of size NumBytes and alignment 1.
+  llvm::Type *getByteArrayType(CharUnits NumBytes) {
+    assert(!NumBytes.isZero() && "Empty byte arrays aren't allowed.");
+    llvm::Type *Type = llvm::Type::getInt8Ty(Types.getLLVMContext());
+    return NumBytes == CharUnits::One() ? Type :
+        (llvm::Type *)llvm::ArrayType::get(Type, NumBytes.getQuantity());
+  }
+  /// \brief Gets the storage type for a field decl and handles storage
+  /// for Itanium bitfields that are smaller than their declared type.
+  llvm::Type *getStorageType(const FieldDecl *FD) {
+    llvm::Type *Type = Types.ConvertTypeForMem(FD->getType());
+    return useMSABI() || !FD->isBitField() ? Type :
+        getIntNType(std::min(FD->getBitWidthValue(Context),
+                             (unsigned)Context.toBits(getSize(Type))));
+  }
+  /// \brief Gets the LLVM base subobject type from a CXXRecordDecl.
+  llvm::Type *getStorageType(const CXXRecordDecl *RD) {
+    return Types.getCGRecordLayout(RD).getBaseSubobjectLLVMType();
+  }
+  CharUnits bitsToCharUnits(uint64_t BitOffset) {
+    return Context.toCharUnitsFromBits(BitOffset);
+  }
+  CharUnits getSize(llvm::Type *Type) {
+    return CharUnits::fromQuantity(DataLayout.getTypeAllocSize(Type));
+  }
+  CharUnits getAlignment(llvm::Type *Type) {
+    return CharUnits::fromQuantity(DataLayout.getABITypeAlignment(Type));
+  }
+  bool isZeroInitializable(const FieldDecl *FD) {
+    const Type *Type = FD->getType()->getBaseElementTypeUnsafe();
+    if (const MemberPointerType *MPT = Type->getAs<MemberPointerType>())
+      return Types.getCXXABI().isZeroInitializable(MPT);
+    if (const RecordType *RT = Type->getAs<RecordType>())
+      return isZeroInitializable(RT->getDecl());
+    return true;
+  }
+  bool isZeroInitializable(const RecordDecl *RD) {
+    return Types.getCGRecordLayout(RD).isZeroInitializable();
+  }
+  void appendPaddingBytes(CharUnits Size) {
+    if (!Size.isZero())
+      FieldTypes.push_back(getByteArrayType(Size));
+  }
+  uint64_t getFieldBitOffset(const FieldDecl *FD) {
+    return Layout.getFieldOffset(FD->getFieldIndex());
+  }
+  // Layout routines.
+  void setBitFieldInfo(const FieldDecl *FD, CharUnits StartOffset, 
+                       llvm::Type *StorageType);
+  /// \brief Lowers an ASTRecordLayout to an llvm type.
+  void lower(bool NonVirtualBaseType);
+  void lowerUnion();
+  void accumulateFields();
+  void accumulateBitFields(RecordDecl::field_iterator Field,
+                           RecordDecl::field_iterator FieldEnd);
+  void accumulateBases();
+  void accumulateVPtrs();
+  void accumulateVBases();
+  /// \brief Recursively searches all of the bases to find out if a vbase is
+  /// not the primary vbase of some base class.
+  bool hasOwnStorage(const CXXRecordDecl *Decl, const CXXRecordDecl *Query);
+  void calculateZeroInit();
+  /// \brief Lowers bitfield storage types to i8 arrays for bitfields with tail
+  /// padding that is or can potentially be used.
+  void clipTailPadding();
+  /// \brief Determines if we need a packed llvm struct.
+  void determinePacked();
+  /// \brief Inserts padding everywhere it's needed.
+  void insertPadding();
+  /// \brief Fills out the structures that are ultimately consumed.
+  void fillOutputFields();
+  // Input memoization fields.
+  CodeGenTypes &Types;
+  const ASTContext &Context;
+  const RecordDecl *D;
+  const CXXRecordDecl *RD;
+  const ASTRecordLayout &Layout;
+  const llvm::DataLayout &DataLayout;
+  // Helpful intermediate data-structures.
+  std::vector<MemberInfo> Members;
+  // Output fields, consumed by CodeGenTypes::ComputeRecordLayout.
   SmallVector<llvm::Type *, 16> FieldTypes;
-
-  /// BaseSubobjectType - Holds the LLVM type for the non-virtual part
-  /// of the struct. For example, consider:
-  ///
-  /// struct A { int i; };
-  /// struct B { void *v; };
-  /// struct C : virtual A, B { };
-  ///
-  /// The LLVM type of C will be
-  /// %struct.C = type { i32 (...)**, %struct.A, i32, %struct.B }
-  ///
-  /// And the LLVM type of the non-virtual base struct will be
-  /// %struct.C.base = type { i32 (...)**, %struct.A, i32 }
-  ///
-  /// This only gets initialized if the base subobject type is
-  /// different from the complete-object type.
-  llvm::StructType *BaseSubobjectType;
-
-  /// FieldInfo - Holds a field and its corresponding LLVM field number.
   llvm::DenseMap<const FieldDecl *, unsigned> Fields;
-
-  /// BitFieldInfo - Holds location and size information about a bit field.
   llvm::DenseMap<const FieldDecl *, CGBitFieldInfo> BitFields;
-
   llvm::DenseMap<const CXXRecordDecl *, unsigned> NonVirtualBases;
   llvm::DenseMap<const CXXRecordDecl *, unsigned> VirtualBases;
-
-  /// IndirectPrimaryBases - Virtual base classes, direct or indirect, that are
-  /// primary base classes for some other direct or indirect base class.
-  CXXIndirectPrimaryBaseSet IndirectPrimaryBases;
-
-  /// LaidOutVirtualBases - A set of all laid out virtual bases, used to avoid
-  /// avoid laying out virtual bases more than once.
-  llvm::SmallPtrSet<const CXXRecordDecl *, 4> LaidOutVirtualBases;
-  
-  /// IsZeroInitializable - Whether this struct can be C++
-  /// zero-initialized with an LLVM zeroinitializer.
-  bool IsZeroInitializable;
-  bool IsZeroInitializableAsBase;
-
-  /// Packed - Whether the resulting LLVM struct will be packed or not.
-  bool Packed;
-
+  bool IsZeroInitializable : 1;
+  bool IsZeroInitializableAsBase : 1;
+  bool Packed : 1;
 private:
-  CodeGenTypes &Types;
-
-  /// LastLaidOutBaseInfo - Contains the offset and non-virtual size of the
-  /// last base laid out. Used so that we can replace the last laid out base
-  /// type with an i8 array if needed.
-  struct LastLaidOutBaseInfo {
-    CharUnits Offset;
-    CharUnits NonVirtualSize;
-
-    bool isValid() const { return !NonVirtualSize.isZero(); }
-    void invalidate() { NonVirtualSize = CharUnits::Zero(); }
-  
-  } LastLaidOutBase;
-
-  /// Alignment - Contains the alignment of the RecordDecl.
-  CharUnits Alignment;
-
-  /// NextFieldOffset - Holds the next field offset.
-  CharUnits NextFieldOffset;
-
-  /// LayoutUnionField - Will layout a field in an union and return the type
-  /// that the field will have.
-  llvm::Type *LayoutUnionField(const FieldDecl *Field,
-                               const ASTRecordLayout &Layout);
-  
-  /// LayoutUnion - Will layout a union RecordDecl.
-  void LayoutUnion(const RecordDecl *D);
-
-  /// Lay out a sequence of contiguous bitfields.
-  bool LayoutBitfields(const ASTRecordLayout &Layout,
-                       unsigned &FirstFieldNo,
-                       RecordDecl::field_iterator &FI,
-                       RecordDecl::field_iterator FE);
-
-  /// LayoutFields - try to layout all fields in the record decl.
-  /// Returns false if the operation failed because the struct is not packed.
-  bool LayoutFields(const RecordDecl *D);
-
-  /// Layout a single base, virtual or non-virtual
-  bool LayoutBase(const CXXRecordDecl *base,
-                  const CGRecordLayout &baseLayout,
-                  CharUnits baseOffset);
-
-  /// LayoutVirtualBase - layout a single virtual base.
-  bool LayoutVirtualBase(const CXXRecordDecl *base,
-                         CharUnits baseOffset);
-
-  /// LayoutVirtualBases - layout the virtual bases of a record decl.
-  bool LayoutVirtualBases(const CXXRecordDecl *RD,
-                          const ASTRecordLayout &Layout);
-
-  /// MSLayoutVirtualBases - layout the virtual bases of a record decl,
-  /// like MSVC.
-  bool MSLayoutVirtualBases(const CXXRecordDecl *RD,
-                            const ASTRecordLayout &Layout);
-  
-  /// LayoutNonVirtualBase - layout a single non-virtual base.
-  bool LayoutNonVirtualBase(const CXXRecordDecl *base,
-                            CharUnits baseOffset);
-  
-  /// LayoutNonVirtualBases - layout the virtual bases of a record decl.
-  bool LayoutNonVirtualBases(const CXXRecordDecl *RD, 
-                             const ASTRecordLayout &Layout);
-
-  /// ComputeNonVirtualBaseType - Compute the non-virtual base field types.
-  bool ComputeNonVirtualBaseType(const CXXRecordDecl *RD);
-  
-  /// LayoutField - layout a single field. Returns false if the operation failed
-  /// because the current struct is not packed.
-  bool LayoutField(const FieldDecl *D, uint64_t FieldOffset);
-
-  /// LayoutBitField - layout a single bit field.
-  void LayoutBitField(const FieldDecl *D, uint64_t FieldOffset);
-
-  /// AppendField - Appends a field with the given offset and type.
-  void AppendField(CharUnits fieldOffset, llvm::Type *FieldTy);
-
-  /// AppendPadding - Appends enough padding bytes so that the total
-  /// struct size is a multiple of the field alignment.
-  void AppendPadding(CharUnits fieldOffset, CharUnits fieldAlignment);
-
-  /// ResizeLastBaseFieldIfNecessary - Fields and bases can be laid out in the
-  /// tail padding of a previous base. If this happens, the type of the previous
-  /// base needs to be changed to an array of i8. Returns true if the last
-  /// laid out base was resized.
-  bool ResizeLastBaseFieldIfNecessary(CharUnits offset);
-
-  /// getByteArrayType - Returns a byte array type with the given number of
-  /// elements.
-  llvm::Type *getByteArrayType(CharUnits NumBytes);
-  
-  /// AppendBytes - Append a given number of bytes to the record.
-  void AppendBytes(CharUnits numBytes);
-
-  /// AppendTailPadding - Append enough tail padding so that the type will have
-  /// the passed size.
-  void AppendTailPadding(CharUnits RecordSize);
-
-  CharUnits getTypeAlignment(llvm::Type *Ty) const;
-
-  /// getAlignmentAsLLVMStruct - Returns the maximum alignment of all the
-  /// LLVM element types.
-  CharUnits getAlignmentAsLLVMStruct() const;
-
-  /// CheckZeroInitializable - Check if the given type contains a pointer
-  /// to data member.
-  void CheckZeroInitializable(QualType T);
-
-public:
-  CGRecordLayoutBuilder(CodeGenTypes &Types)
-    : BaseSubobjectType(0),
-      IsZeroInitializable(true), IsZeroInitializableAsBase(true),
-      Packed(false), Types(Types) { }
-
-  /// Layout - Will layout a RecordDecl.
-  void Layout(const RecordDecl *D);
+  CGRecordLowering(const CGRecordLowering &) LLVM_DELETED_FUNCTION;
+  void operator =(const CGRecordLowering &) LLVM_DELETED_FUNCTION;
 };
+} // namespace
 
+CGRecordLowering::CGRecordLowering(CodeGenTypes &Types, const RecordDecl *D)
+  : Types(Types), Context(Types.getContext()), D(D),
+    RD(dyn_cast<CXXRecordDecl>(D)),
+    Layout(Types.getContext().getASTRecordLayout(D)),
+    DataLayout(Types.getDataLayout()), IsZeroInitializable(true),
+    IsZeroInitializableAsBase(true), Packed(false) {}
+
+void CGRecordLowering::setBitFieldInfo(
+    const FieldDecl *FD, CharUnits StartOffset, llvm::Type *StorageType) {
+  CGBitFieldInfo &Info = BitFields[FD];
+  Info.IsSigned = FD->getType()->isSignedIntegerOrEnumerationType();
+  Info.Offset = (unsigned)(getFieldBitOffset(FD) - Context.toBits(StartOffset));
+  Info.Size = FD->getBitWidthValue(Context);
+  Info.StorageSize = (unsigned)DataLayout.getTypeAllocSizeInBits(StorageType);
+  // Here we calculate the actual storage alignment of the bits.  E.g., if
+  // we've got an alignment >= 2 and the bitfield starts at offset 6, we've
+  // got an alignment of 2.
+  Info.StorageAlignment =
+      Layout.getAlignment().alignmentAtOffset(StartOffset).getQuantity();
+  if (Info.Size > Info.StorageSize)
+    Info.Size = Info.StorageSize;
+  // Reverse the bit offsets for big-endian machines. Because we represent
+  // a bitfield as a single large integer load, we can imagine the bits
+  // counting from the most-significant-bit instead of the
+  // least-significant-bit.
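+  // e.g., with an i16 storage unit, a 4-bit field at little-endian bit
+  // offset 2 ends up at big-endian bit offset 16 - (2 + 4) = 10.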
+  if (DataLayout.isBigEndian())
+    Info.Offset = Info.StorageSize - (Info.Offset + Info.Size);
 }
 
-void CGRecordLayoutBuilder::Layout(const RecordDecl *D) {
-  const ASTRecordLayout &Layout = Types.getContext().getASTRecordLayout(D);
-  Alignment = Layout.getAlignment();
-  Packed = D->hasAttr<PackedAttr>() || Layout.getSize() % Alignment != 0;
+void CGRecordLowering::lower(bool NVBaseType) {
+  // The lowering process implemented in this function proceeds in a number of
+  // carefully ordered phases.
+  // 1) Store all members (fields and bases) in a list and sort them by offset.
+  // 2) Add a 1-byte capstone member at the Size of the structure.
+  // 3) Clip bitfield storage members if their tail padding is or might be
+  //    used by another field or base.  The clipping process uses the capstone
+  //    by treating it as another object that occurs after the record.
+  // 4) Determine if the llvm-struct requires packing.  It's important that this
+  //    phase occur after clipping, because clipping changes the llvm type.
+  //    This phase reads the offset of the capstone when determining packedness
+  //    and updates the alignment of the capstone to be equal to the alignment
+  //    of the record after doing so.
+  // 5) Insert padding everywhere it is needed.  This phase requires 'Packed' to
+  //    have been computed and needs to know the alignment of the record in
+  //    order to understand if explicit tail padding is needed.
+  // 6) Remove the capstone; we don't need it anymore.
+  // 7) Determine if this record can be zero-initialized.  This phase could have
+  //    been placed anywhere after phase 1.
+  // 8) Format the complete list of members in a way that can be consumed by
+  //    CodeGenTypes::ComputeRecordLayout.
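+  // For illustration: for a record of size 8 bytes, the one-byte capstone
+  // lands at offset 8; clipTailPadding and determinePacked treat it as a
+  // trailing neighbor, and it is popped off before the fields are emitted.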
+  CharUnits Size = NVBaseType ? Layout.getNonVirtualSize() : Layout.getSize();
+  if (D->isUnion())
+    return lowerUnion();
+  accumulateFields();
+  // RD implies C++.
+  if (RD) {
+    accumulateVPtrs();
+    accumulateBases();
+    if (Members.empty())
+      return appendPaddingBytes(Size);
+    if (!NVBaseType)
+      accumulateVBases();
+  }
+  std::stable_sort(Members.begin(), Members.end());
+  Members.push_back(StorageInfo(Size, getIntNType(8)));
+  clipTailPadding();
+  determinePacked();
+  insertPadding();
+  Members.pop_back();
+  calculateZeroInit();
+  fillOutputFields();
+}
 
-  if (D->isUnion()) {
-    LayoutUnion(D);
+void CGRecordLowering::lowerUnion() {
+  CharUnits LayoutSize = Layout.getSize();
+  llvm::Type *StorageType = 0;
+  // Compute zero-initializable status.
+  if (!D->field_empty() && !isZeroInitializable(*D->field_begin()))
+    IsZeroInitializable = IsZeroInitializableAsBase = false;
+  // Iterate through the fields, setting bitFieldInfo and the Fields array.
+  // Also locate the "most appropriate" storage type.  The heuristic for
+  // finding the storage type isn't necessary; the first (non-zero-length-
+  // bitfield) field's type would work fine and be simpler, but it would be
+  // different from what we've been doing and would cause lit tests to change.
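+  // e.g., in union { char c; double d; } the double field wins (it has the
+  // higher alignment) and the union lowers to roughly { double }.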
+  for (const auto *Field : D->fields()) {
+    if (Field->isBitField()) {
+      // Skip zero-sized bitfields.
+      if (Field->getBitWidthValue(Context) == 0)
+        continue;
+      llvm::Type *FieldType = getStorageType(Field);
+      if (LayoutSize < getSize(FieldType))
+        FieldType = getByteArrayType(LayoutSize);
+      setBitFieldInfo(Field, CharUnits::Zero(), FieldType);
+    }
+    Fields[Field] = 0;
+    llvm::Type *FieldType = getStorageType(Field);
+    // Conditionally update our storage type if we've got a new "better" one.
+    if (!StorageType ||
+        getAlignment(FieldType) > getAlignment(StorageType) ||
+        (getAlignment(FieldType) == getAlignment(StorageType) &&
+        getSize(FieldType) > getSize(StorageType)))
+      StorageType = FieldType;
+  }
+  // If we have no storage type just pad to the appropriate size and return.
+  if (!StorageType)
+    return appendPaddingBytes(LayoutSize);
+  // If our storage size was bigger than our required size (can happen in the
+  // case of packed bitfields on Itanium) then just use an i8 array.
+  if (LayoutSize < getSize(StorageType))
+    StorageType = getByteArrayType(LayoutSize);
+  FieldTypes.push_back(StorageType);
+  appendPaddingBytes(LayoutSize - getSize(StorageType));
+  // Set packed if we need it.
+  if (LayoutSize % getAlignment(StorageType))
+    Packed = true;
+}
+
+void CGRecordLowering::accumulateFields() {
+  for (RecordDecl::field_iterator Field = D->field_begin(),
+                                  FieldEnd = D->field_end();
+    Field != FieldEnd;)
+    if (Field->isBitField()) {
+      RecordDecl::field_iterator Start = Field;
+      // Iterate to gather the list of bitfields.
+      for (++Field; Field != FieldEnd && Field->isBitField(); ++Field);
+      accumulateBitFields(Start, Field);
+    } else {
+      Members.push_back(MemberInfo(
+          bitsToCharUnits(getFieldBitOffset(*Field)), MemberInfo::Field,
+          getStorageType(*Field), *Field));
+      ++Field;
+    }
+}
+
+void
+CGRecordLowering::accumulateBitFields(RecordDecl::field_iterator Field,
+                                      RecordDecl::field_iterator FieldEnd) {
+  // Run stores the first element of the current run of bitfields.  FieldEnd is
+  // used as a special value to note that we don't have a current run.  A
+  // bitfield run is a contiguous collection of bitfields that can be stored in
+  // the same storage block.  Zero-sized bitfields and bitfields that would
+  // cross an alignment boundary break a run and start a new one.
+  RecordDecl::field_iterator Run = FieldEnd;
+  // Tail is the offset of the first bit past the end of the current run.  It
+  // is used to determine whether the ASTRecordLayout treats the next bitfield
+  // as contiguous with the run.  StartBitOffset is the offset of the
+  // beginning of the run.
+  uint64_t StartBitOffset, Tail = 0;
+  if (useMSABI()) {
+    for (; Field != FieldEnd; ++Field) {
+      uint64_t BitOffset = getFieldBitOffset(*Field);
+      // Zero-width bitfields end runs.
+      if (Field->getBitWidthValue(Context) == 0) {
+        Run = FieldEnd;
+        continue;
+      }
+      llvm::Type *Type = Types.ConvertTypeForMem(Field->getType());
+      // If we don't have a run yet, or don't live within the previous run's
+      // allocated storage, then we allocate some storage and start a new run.
+      if (Run == FieldEnd || BitOffset >= Tail) {
+        Run = Field;
+        StartBitOffset = BitOffset;
+        Tail = StartBitOffset + DataLayout.getTypeAllocSizeInBits(Type);
+        // Add the storage member to the record.  This must be added to the
+        // record before the bitfield members so that it gets laid out before
+        // the bitfields it contains get laid out.
+        Members.push_back(StorageInfo(bitsToCharUnits(StartBitOffset), Type));
+      }
+      // Bitfields get the offset of their storage but come afterward and remain
+      // there after a stable sort.
+      Members.push_back(MemberInfo(bitsToCharUnits(StartBitOffset),
+                                   MemberInfo::Field, 0, *Field));
+    }
     return;
   }
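+  // For example (a sketch, assuming 4-byte, 4-byte-aligned int): given
+  //   int a : 3; int b : 6; int : 0; int c : 2;
+  // a and b share one run (bits 0-8, lowered to an i16 storage member at
+  // offset 0), the zero-width bitfield terminates that run, and c starts a
+  // new one (an i8 storage member at offset 4).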
+  for (;;) {
+    // Check to see if we need to start a new run.
+    if (Run == FieldEnd) {
+      // If we're out of fields, return.
+      if (Field == FieldEnd)
+        break;
+      // Any non-zero-length bitfield can start a new run.
+      if (Field->getBitWidthValue(Context) != 0) {
+        Run = Field;
+        StartBitOffset = getFieldBitOffset(*Field);
+        Tail = StartBitOffset + Field->getBitWidthValue(Context);
+      }
+      ++Field;
+      continue;
+    }
+    // Add bitfields to the run as long as they qualify.
+    if (Field != FieldEnd && Field->getBitWidthValue(Context) != 0 &&
+        Tail == getFieldBitOffset(*Field)) {
+      Tail += Field->getBitWidthValue(Context);
+      ++Field;
+      continue;
+    }
+    // We've hit a break-point in the run and need to emit a storage field.
+    llvm::Type *Type = getIntNType(Tail - StartBitOffset);
+    // Add the storage member to the record and set the bitfield info for all of
+    // the bitfields in the run.  Bitfields get the offset of their storage but
+    // come afterward and remain there after a stable sort.
+    Members.push_back(StorageInfo(bitsToCharUnits(StartBitOffset), Type));
+    for (; Run != Field; ++Run)
+      Members.push_back(MemberInfo(bitsToCharUnits(StartBitOffset),
+                                   MemberInfo::Field, 0, *Run));
+    Run = FieldEnd;
+  }
+}
 
-  if (LayoutFields(D))
+void CGRecordLowering::accumulateBases() {
+  // If we've got a primary virtual base, we need to add it with the bases.
+  if (Layout.isPrimaryBaseVirtual())
+    Members.push_back(StorageInfo(
+      CharUnits::Zero(),
+      getStorageType(Layout.getPrimaryBase())));
+  // Accumulate the non-virtual bases.
+  for (const auto &Base : RD->bases()) {
+    if (Base.isVirtual())
+      continue;
+    const CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl();
+    if (!BaseDecl->isEmpty())
+      Members.push_back(MemberInfo(Layout.getBaseClassOffset(BaseDecl),
+          MemberInfo::Base, getStorageType(BaseDecl), BaseDecl));
+  }
+}
+
+void CGRecordLowering::accumulateVPtrs() {
+  if (Layout.hasOwnVFPtr())
+    Members.push_back(MemberInfo(CharUnits::Zero(), MemberInfo::VFPtr,
+        llvm::FunctionType::get(getIntNType(32), /*isVarArg=*/true)->
+            getPointerTo()->getPointerTo()));
+  if (Layout.hasOwnVBPtr())
+    Members.push_back(MemberInfo(Layout.getVBPtrOffset(), MemberInfo::VBPtr,
+        llvm::Type::getInt32PtrTy(Types.getLLVMContext())));
+}
+
+void CGRecordLowering::accumulateVBases() {
+  Members.push_back(MemberInfo(Layout.getNonVirtualSize(),
+                               MemberInfo::Scissor, 0, RD));
+  for (const auto &Base : RD->vbases()) {
+    const CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl();
+    if (BaseDecl->isEmpty())
+      continue;
+    CharUnits Offset = Layout.getVBaseClassOffset(BaseDecl);
+    // If the vbase is a primary virtual base of some base, then it doesn't
+    // get its own storage location but instead lives inside of that base.
+    if (!useMSABI() && Context.isNearlyEmpty(BaseDecl) &&
+        !hasOwnStorage(RD, BaseDecl)) {
+      Members.push_back(MemberInfo(Offset, MemberInfo::VBase, 0, BaseDecl));
+      continue;
+    }
+    // If we've got a vtordisp, add it as a storage type.
+    if (Layout.getVBaseOffsetsMap().find(BaseDecl)->second.hasVtorDisp())
+      Members.push_back(StorageInfo(Offset - CharUnits::fromQuantity(4),
+                                    getIntNType(32)));
+    Members.push_back(MemberInfo(Offset, MemberInfo::VBase,
+                                 getStorageType(BaseDecl), BaseDecl));
+  }
+}
+
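+// Returns false if Query is the primary virtual base of Decl or of any of
+// Decl's bases, i.e. if Query's storage is already provided by some base
+// subobject rather than by a slot of its own.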
+bool CGRecordLowering::hasOwnStorage(const CXXRecordDecl *Decl,
+                                     const CXXRecordDecl *Query) {
+  const ASTRecordLayout &DeclLayout = Context.getASTRecordLayout(Decl);
+  if (DeclLayout.isPrimaryBaseVirtual() && DeclLayout.getPrimaryBase() == Query)
+    return false;
+  for (const auto &Base : Decl->bases())
+    if (!hasOwnStorage(Base.getType()->getAsCXXRecordDecl(), Query))
+      return false;
+  return true;
+}
+
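+// Propagates non-zero-initializability from members: a non-zero-
+// initializable field or non-virtual base clears both IsZeroInitializable
+// and IsZeroInitializableAsBase, while a virtual base clears only the
+// former.  (Illustrative: under the Itanium C++ ABI a pointer to data
+// member has an all-ones null value, so a field of that type makes the
+// record not zero-initializable.)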
+void CGRecordLowering::calculateZeroInit() {
+  for (std::vector<MemberInfo>::const_iterator Member = Members.begin(),
+                                               MemberEnd = Members.end();
+       IsZeroInitializableAsBase && Member != MemberEnd; ++Member) {
+    if (Member->Kind == MemberInfo::Field) {
+      if (!Member->FD || isZeroInitializable(Member->FD))
+        continue;
+      IsZeroInitializable = IsZeroInitializableAsBase = false;
+    } else if (Member->Kind == MemberInfo::Base ||
+               Member->Kind == MemberInfo::VBase) {
+      if (isZeroInitializable(Member->RD))
+        continue;
+      IsZeroInitializable = false;
+      if (Member->Kind == MemberInfo::Base)
+        IsZeroInitializableAsBase = false;
+    }
+  }
+}
+
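+// Illustrative note: a bitfield storage member such as i24 occupies four
+// allocated bytes on typical targets, but only three of them carry data.
+// If a following member or the capstone starts three bytes in, the i24 is
+// clipped to [3 x i8] so that the fourth byte can be reused.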
+void CGRecordLowering::clipTailPadding() {
+  std::vector<MemberInfo>::iterator Prior = Members.begin();
+  CharUnits Tail = getSize(Prior->Data);
+  for (std::vector<MemberInfo>::iterator Member = Prior + 1,
+                                         MemberEnd = Members.end();
+       Member != MemberEnd; ++Member) {
+    // Only members with data, plus the scissor, can cut into tail padding.
+    if (!Member->Data && Member->Kind != MemberInfo::Scissor)
+      continue;
+    if (Member->Offset < Tail) {
+      assert(Prior->Kind == MemberInfo::Field && !Prior->FD &&
+             "Only storage fields have tail padding!");
+      Prior->Data = getByteArrayType(bitsToCharUnits(llvm::RoundUpToAlignment(
+          cast<llvm::IntegerType>(Prior->Data)->getIntegerBitWidth(), 8)));
+    }
+    if (Member->Data)
+      Prior = Member;
+    Tail = Prior->Offset + getSize(Prior->Data);
+  }
+}
+
+void CGRecordLowering::determinePacked() {
+  CharUnits Alignment = CharUnits::One();
+  for (std::vector<MemberInfo>::const_iterator Member = Members.begin(),
+                                               MemberEnd = Members.end();
+       Member != MemberEnd; ++Member) {
+    if (!Member->Data)
+      continue;
+    // If any member falls at an offset that is not a multiple of its
+    // alignment, then the entire record must be packed.
+    if (Member->Offset % getAlignment(Member->Data))
+      Packed = true;
+    Alignment = std::max(Alignment, getAlignment(Member->Data));
+  }
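+  // For example (illustrative): in 'struct __attribute__((packed)) S
+  // { char c; int i; };' the int lands at offset 1, which is not a multiple
+  // of its 4-byte alignment, so the llvm-struct must be packed.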
+  // If the size of the record (the capstone's offset) is not a multiple of the
+  // record's alignment, it must be packed.
+  if (Members.back().Offset % Alignment)
+    Packed = true;
+  // Update the alignment of the capstone.
+  if (!Packed)
+    Members.back().Data = getIntNType(Context.toBits(Alignment));
+}
+
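+// Illustrative: if the laid-out bytes end at offset 2 and the next data
+// member is an i16 at offset 4 (LLVM's natural layout would place it at
+// offset 2), a [2 x i8] padding member is recorded for the gap and spliced
+// into place by the final stable sort.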
+void CGRecordLowering::insertPadding() {
+  std::vector<std::pair<CharUnits, CharUnits> > Padding;
+  CharUnits Size = CharUnits::Zero();
+  for (std::vector<MemberInfo>::const_iterator Member = Members.begin(),
+                                               MemberEnd = Members.end();
+       Member != MemberEnd; ++Member) {
+    if (!Member->Data)
+      continue;
+    CharUnits Offset = Member->Offset;
+    assert(Offset >= Size);
+    // Insert padding if we need to.
+    if (Offset != Size.RoundUpToAlignment(Packed ? CharUnits::One() :
+                                          getAlignment(Member->Data)))
+      Padding.push_back(std::make_pair(Size, Offset - Size));
+    Size = Offset + getSize(Member->Data);
+  }
+  if (Padding.empty())
     return;
+  // Add the padding to the Members list and sort it.
+  for (std::vector<std::pair<CharUnits, CharUnits> >::const_iterator
+        Pad = Padding.begin(), PadEnd = Padding.end();
+        Pad != PadEnd; ++Pad)
+    Members.push_back(StorageInfo(Pad->first, getByteArrayType(Pad->second)));
+  std::stable_sort(Members.begin(), Members.end());
+}
 
-  // We weren't able to layout the struct. Try again with a packed struct
-  Packed = true;
-  LastLaidOutBase.invalidate();
-  NextFieldOffset = CharUnits::Zero();
-  FieldTypes.clear();
-  Fields.clear();
-  BitFields.clear();
-  NonVirtualBases.clear();
-  VirtualBases.clear();
-
-  LayoutFields(D);
+void CGRecordLowering::fillOutputFields() {
+  for (std::vector<MemberInfo>::const_iterator Member = Members.begin(),
+                                               MemberEnd = Members.end();
+       Member != MemberEnd; ++Member) {
+    if (Member->Data)
+      FieldTypes.push_back(Member->Data);
+    if (Member->Kind == MemberInfo::Field) {
+      if (Member->FD)
+        Fields[Member->FD] = FieldTypes.size() - 1;
+      // A field without storage must be a bitfield.
+      if (!Member->Data)
+        setBitFieldInfo(Member->FD, Member->Offset, FieldTypes.back());
+    } else if (Member->Kind == MemberInfo::Base)
+      NonVirtualBases[Member->RD] = FieldTypes.size() - 1;
+    else if (Member->Kind == MemberInfo::VBase)
+      VirtualBases[Member->RD] = FieldTypes.size() - 1;
+  }
 }
 
 CGBitFieldInfo CGBitFieldInfo::MakeInfo(CodeGenTypes &Types,
@@ -231,6 +586,9 @@
                                         uint64_t Offset, uint64_t Size,
                                         uint64_t StorageSize,
                                         uint64_t StorageAlignment) {
+  // This function is vestigial from CGRecordLayoutBuilder days but is still
+  // used in GCObjCRuntime.cpp.  That usage has a FIXME attached; once it is
+  // addressed, this function can be removed.
   llvm::Type *Ty = Types.ConvertTypeForMem(FD->getType());
   CharUnits TypeSizeInBytes =
     CharUnits::fromQuantity(Types.getDataLayout().getTypeAllocSize(Ty));
@@ -262,709 +620,33 @@
   return CGBitFieldInfo(Offset, Size, IsSigned, StorageSize, StorageAlignment);
 }
 
-/// \brief Layout the range of bitfields from BFI to BFE as contiguous storage.
-bool CGRecordLayoutBuilder::LayoutBitfields(const ASTRecordLayout &Layout,
-                                            unsigned &FirstFieldNo,
-                                            RecordDecl::field_iterator &FI,
-                                            RecordDecl::field_iterator FE) {
-  assert(FI != FE);
-  uint64_t FirstFieldOffset = Layout.getFieldOffset(FirstFieldNo);
-  uint64_t NextFieldOffsetInBits = Types.getContext().toBits(NextFieldOffset);
-
-  unsigned CharAlign = Types.getTarget().getCharAlign();
-  assert(FirstFieldOffset % CharAlign == 0 &&
-         "First field offset is misaligned");
-  CharUnits FirstFieldOffsetInBytes
-    = Types.getContext().toCharUnitsFromBits(FirstFieldOffset);
-
-  unsigned StorageAlignment
-    = llvm::MinAlign(Alignment.getQuantity(),
-                     FirstFieldOffsetInBytes.getQuantity());
-
-  if (FirstFieldOffset < NextFieldOffsetInBits) {
-    CharUnits FieldOffsetInCharUnits =
-      Types.getContext().toCharUnitsFromBits(FirstFieldOffset);
-
-    // Try to resize the last base field.
-    if (!ResizeLastBaseFieldIfNecessary(FieldOffsetInCharUnits))
-      llvm_unreachable("We must be able to resize the last base if we need to "
-                       "pack bits into it.");
-
-    NextFieldOffsetInBits = Types.getContext().toBits(NextFieldOffset);
-    assert(FirstFieldOffset >= NextFieldOffsetInBits);
-  }
-
-  // Append padding if necessary.
-  AppendPadding(Types.getContext().toCharUnitsFromBits(FirstFieldOffset),
-                CharUnits::One());
-
-  // Find the last bitfield in a contiguous run of bitfields.
-  RecordDecl::field_iterator BFI = FI;
-  unsigned LastFieldNo = FirstFieldNo;
-  uint64_t NextContiguousFieldOffset = FirstFieldOffset;
-  for (RecordDecl::field_iterator FJ = FI;
-       (FJ != FE && (*FJ)->isBitField() &&
-        NextContiguousFieldOffset == Layout.getFieldOffset(LastFieldNo) &&
-        (*FJ)->getBitWidthValue(Types.getContext()) != 0); FI = FJ++) {
-    NextContiguousFieldOffset += (*FJ)->getBitWidthValue(Types.getContext());
-    ++LastFieldNo;
-
-    // We must use packed structs for packed fields, and also unnamed bit
-    // fields since they don't affect the struct alignment.
-    if (!Packed && ((*FJ)->hasAttr<PackedAttr>() || !(*FJ)->getDeclName()))
-      return false;
-  }
-  RecordDecl::field_iterator BFE = llvm::next(FI);
-  --LastFieldNo;
-  assert(LastFieldNo >= FirstFieldNo && "Empty run of contiguous bitfields");
-  FieldDecl *LastFD = *FI;
-
-  // Find the last bitfield's offset, add its size, and round it up to the
-  // character alignment to compute the storage required.
-  uint64_t LastFieldOffset = Layout.getFieldOffset(LastFieldNo);
-  uint64_t LastFieldSize = LastFD->getBitWidthValue(Types.getContext());
-  uint64_t TotalBits = (LastFieldOffset + LastFieldSize) - FirstFieldOffset;
-  CharUnits StorageBytes = Types.getContext().toCharUnitsFromBits(
-    llvm::RoundUpToAlignment(TotalBits, CharAlign));
-  uint64_t StorageBits = Types.getContext().toBits(StorageBytes);
-
-  // Grow the storage to encompass any known padding in the layout when doing
-  // so will make the storage a power-of-two. There are two cases when we can
-  // do this. The first is when we have a subsequent field and can widen up to
-  // its offset. The second is when the data size of the AST record layout is
-  // past the end of the current storage. The latter is true when there is tail
-  // padding on a struct and no members of a super class can be packed into it.
-  //
-  // Note that we widen the storage as much as possible here to express the
-  // maximum latitude the language provides, and rely on the backend to lower
-  // these in conjunction with shifts and masks to narrower operations where
-  // beneficial.
-  uint64_t EndOffset = Types.getContext().toBits(Layout.getDataSize());
-  if (BFE != FE)
-    // If there are more fields to be laid out, the offset at the end of the
-    // bitfield is the offset of the next field in the record.
-    EndOffset = Layout.getFieldOffset(LastFieldNo + 1);
-  assert(EndOffset >= (FirstFieldOffset + TotalBits) &&
-         "End offset is not past the end of the known storage bits.");
-  uint64_t SpaceBits = EndOffset - FirstFieldOffset;
-  uint64_t LongBits = Types.getTarget().getLongWidth();
-  uint64_t WidenedBits = (StorageBits / LongBits) * LongBits +
-                         llvm::NextPowerOf2(StorageBits % LongBits - 1);
-  assert(WidenedBits >= StorageBits && "Widening shrunk the bits!");
-  if (WidenedBits <= SpaceBits) {
-    StorageBits = WidenedBits;
-    StorageBytes = Types.getContext().toCharUnitsFromBits(StorageBits);
-    assert(StorageBits == (uint64_t)Types.getContext().toBits(StorageBytes));
-  }
-
-  unsigned FieldIndex = FieldTypes.size();
-  AppendBytes(StorageBytes);
-
-  // Now walk the bitfields associating them with this field of storage and
-  // building up the bitfield specific info.
-  unsigned FieldNo = FirstFieldNo;
-  for (; BFI != BFE; ++BFI, ++FieldNo) {
-    FieldDecl *FD = *BFI;
-    uint64_t FieldOffset = Layout.getFieldOffset(FieldNo) - FirstFieldOffset;
-    uint64_t FieldSize = FD->getBitWidthValue(Types.getContext());
-    Fields[FD] = FieldIndex;
-    BitFields[FD] = CGBitFieldInfo::MakeInfo(Types, FD, FieldOffset, FieldSize,
-                                             StorageBits, StorageAlignment);
-  }
-  FirstFieldNo = LastFieldNo;
-  return true;
-}
-
-bool CGRecordLayoutBuilder::LayoutField(const FieldDecl *D,
-                                        uint64_t fieldOffset) {
-  // If the field is packed, then we need a packed struct.
-  if (!Packed && D->hasAttr<PackedAttr>())
-    return false;
-
-  assert(!D->isBitField() && "Bitfields should be laid out seperately.");
-
-  CheckZeroInitializable(D->getType());
-
-  assert(fieldOffset % Types.getTarget().getCharWidth() == 0
-         && "field offset is not on a byte boundary!");
-  CharUnits fieldOffsetInBytes
-    = Types.getContext().toCharUnitsFromBits(fieldOffset);
-
-  llvm::Type *Ty = Types.ConvertTypeForMem(D->getType());
-  CharUnits typeAlignment = getTypeAlignment(Ty);
-
-  // If the type alignment is larger then the struct alignment, we must use
-  // a packed struct.
-  if (typeAlignment > Alignment) {
-    assert(!Packed && "Alignment is wrong even with packed struct!");
-    return false;
-  }
-
-  if (!Packed) {
-    if (const RecordType *RT = D->getType()->getAs<RecordType>()) {
-      const RecordDecl *RD = cast<RecordDecl>(RT->getDecl());
-      if (const MaxFieldAlignmentAttr *MFAA =
-            RD->getAttr<MaxFieldAlignmentAttr>()) {
-        if (MFAA->getAlignment() != Types.getContext().toBits(typeAlignment))
-          return false;
-      }
-    }
-  }
-
-  // Round up the field offset to the alignment of the field type.
-  CharUnits alignedNextFieldOffsetInBytes =
-    NextFieldOffset.RoundUpToAlignment(typeAlignment);
-
-  if (fieldOffsetInBytes < alignedNextFieldOffsetInBytes) {
-    // Try to resize the last base field.
-    if (ResizeLastBaseFieldIfNecessary(fieldOffsetInBytes)) {
-      alignedNextFieldOffsetInBytes = 
-        NextFieldOffset.RoundUpToAlignment(typeAlignment);
-    }
-  }
-
-  if (fieldOffsetInBytes < alignedNextFieldOffsetInBytes) {
-    assert(!Packed && "Could not place field even with packed struct!");
-    return false;
-  }
-
-  AppendPadding(fieldOffsetInBytes, typeAlignment);
-
-  // Now append the field.
-  Fields[D] = FieldTypes.size();
-  AppendField(fieldOffsetInBytes, Ty);
-
-  LastLaidOutBase.invalidate();
-  return true;
-}
-
-llvm::Type *
-CGRecordLayoutBuilder::LayoutUnionField(const FieldDecl *Field,
-                                        const ASTRecordLayout &Layout) {
-  Fields[Field] = 0;
-  if (Field->isBitField()) {
-    uint64_t FieldSize = Field->getBitWidthValue(Types.getContext());
-
-    // Ignore zero sized bit fields.
-    if (FieldSize == 0)
-      return 0;
-
-    unsigned StorageBits = llvm::RoundUpToAlignment(
-      FieldSize, Types.getTarget().getCharAlign());
-    CharUnits NumBytesToAppend
-      = Types.getContext().toCharUnitsFromBits(StorageBits);
-
-    llvm::Type *FieldTy = llvm::Type::getInt8Ty(Types.getLLVMContext());
-    if (NumBytesToAppend > CharUnits::One())
-      FieldTy = llvm::ArrayType::get(FieldTy, NumBytesToAppend.getQuantity());
-
-    // Add the bit field info.
-    BitFields[Field] = CGBitFieldInfo::MakeInfo(Types, Field, 0, FieldSize,
-                                                StorageBits,
-                                                Alignment.getQuantity());
-    return FieldTy;
-  }
-
-  // This is a regular union field.
-  return Types.ConvertTypeForMem(Field->getType());
-}
-
-void CGRecordLayoutBuilder::LayoutUnion(const RecordDecl *D) {
-  assert(D->isUnion() && "Can't call LayoutUnion on a non-union record!");
-
-  const ASTRecordLayout &layout = Types.getContext().getASTRecordLayout(D);
-
-  llvm::Type *unionType = 0;
-  CharUnits unionSize = CharUnits::Zero();
-  CharUnits unionAlign = CharUnits::Zero();
-
-  bool hasOnlyZeroSizedBitFields = true;
-  bool checkedFirstFieldZeroInit = false;
-
-  unsigned fieldNo = 0;
-  for (RecordDecl::field_iterator field = D->field_begin(),
-       fieldEnd = D->field_end(); field != fieldEnd; ++field, ++fieldNo) {
-    assert(layout.getFieldOffset(fieldNo) == 0 &&
-          "Union field offset did not start at the beginning of record!");
-    llvm::Type *fieldType = LayoutUnionField(*field, layout);
-
-    if (!fieldType)
-      continue;
-
-    if (field->getDeclName() && !checkedFirstFieldZeroInit) {
-      CheckZeroInitializable(field->getType());
-      checkedFirstFieldZeroInit = true;
-    }
-
-    hasOnlyZeroSizedBitFields = false;
-
-    CharUnits fieldAlign = CharUnits::fromQuantity(
-                          Types.getDataLayout().getABITypeAlignment(fieldType));
-    CharUnits fieldSize = CharUnits::fromQuantity(
-                             Types.getDataLayout().getTypeAllocSize(fieldType));
-
-    if (fieldAlign < unionAlign)
-      continue;
-
-    if (fieldAlign > unionAlign || fieldSize > unionSize) {
-      unionType = fieldType;
-      unionAlign = fieldAlign;
-      unionSize = fieldSize;
-    }
-  }
-
-  // Now add our field.
-  if (unionType) {
-    AppendField(CharUnits::Zero(), unionType);
-
-    if (getTypeAlignment(unionType) > layout.getAlignment()) {
-      // We need a packed struct.
-      Packed = true;
-      unionAlign = CharUnits::One();
-    }
-  }
-  if (unionAlign.isZero()) {
-    (void)hasOnlyZeroSizedBitFields;
-    assert(hasOnlyZeroSizedBitFields &&
-           "0-align record did not have all zero-sized bit-fields!");
-    unionAlign = CharUnits::One();
-  }
-
-  // Append tail padding.
-  CharUnits recordSize = layout.getSize();
-  if (recordSize > unionSize)
-    AppendPadding(recordSize, unionAlign);
-}
-
-bool CGRecordLayoutBuilder::LayoutBase(const CXXRecordDecl *base,
-                                       const CGRecordLayout &baseLayout,
-                                       CharUnits baseOffset) {
-  ResizeLastBaseFieldIfNecessary(baseOffset);
-
-  AppendPadding(baseOffset, CharUnits::One());
-
-  const ASTRecordLayout &baseASTLayout
-    = Types.getContext().getASTRecordLayout(base);
-
-  LastLaidOutBase.Offset = NextFieldOffset;
-  LastLaidOutBase.NonVirtualSize = baseASTLayout.getNonVirtualSize();
-
-  llvm::StructType *subobjectType = baseLayout.getBaseSubobjectLLVMType();
-  if (getTypeAlignment(subobjectType) > Alignment)
-    return false;
-
-  AppendField(baseOffset, subobjectType);
-  return true;
-}
-
-bool CGRecordLayoutBuilder::LayoutNonVirtualBase(const CXXRecordDecl *base,
-                                                 CharUnits baseOffset) {
-  // Ignore empty bases.
-  if (base->isEmpty()) return true;
-
-  const CGRecordLayout &baseLayout = Types.getCGRecordLayout(base);
-  if (IsZeroInitializableAsBase) {
-    assert(IsZeroInitializable &&
-           "class zero-initializable as base but not as complete object");
-
-    IsZeroInitializable = IsZeroInitializableAsBase =
-      baseLayout.isZeroInitializableAsBase();
-  }
-
-  if (!LayoutBase(base, baseLayout, baseOffset))
-    return false;
-  NonVirtualBases[base] = (FieldTypes.size() - 1);
-  return true;
-}
-
-bool
-CGRecordLayoutBuilder::LayoutVirtualBase(const CXXRecordDecl *base,
-                                         CharUnits baseOffset) {
-  // Ignore empty bases.
-  if (base->isEmpty()) return true;
-
-  const CGRecordLayout &baseLayout = Types.getCGRecordLayout(base);
-  if (IsZeroInitializable)
-    IsZeroInitializable = baseLayout.isZeroInitializableAsBase();
-
-  if (!LayoutBase(base, baseLayout, baseOffset))
-    return false;
-  VirtualBases[base] = (FieldTypes.size() - 1);
-  return true;
-}
-
-bool
-CGRecordLayoutBuilder::MSLayoutVirtualBases(const CXXRecordDecl *RD,
-                                          const ASTRecordLayout &Layout) {
-  if (!RD->getNumVBases())
-    return true;
-
-  // The vbases list is uniqued and ordered by a depth-first
-  // traversal, which is what we need here.
-  for (CXXRecordDecl::base_class_const_iterator I = RD->vbases_begin(),
-        E = RD->vbases_end(); I != E; ++I) {
-
-    const CXXRecordDecl *BaseDecl = 
-      cast<CXXRecordDecl>(I->getType()->castAs<RecordType>()->getDecl());
-
-    CharUnits vbaseOffset = Layout.getVBaseClassOffset(BaseDecl);
-    if (!LayoutVirtualBase(BaseDecl, vbaseOffset))
-      return false;
-  }
-  return true;
-}
-
-/// LayoutVirtualBases - layout the non-virtual bases of a record decl.
-bool
-CGRecordLayoutBuilder::LayoutVirtualBases(const CXXRecordDecl *RD,
-                                          const ASTRecordLayout &Layout) {
-  for (CXXRecordDecl::base_class_const_iterator I = RD->bases_begin(),
-       E = RD->bases_end(); I != E; ++I) {
-    const CXXRecordDecl *BaseDecl = 
-      cast<CXXRecordDecl>(I->getType()->getAs<RecordType>()->getDecl());
-
-    // We only want to lay out virtual bases that aren't indirect primary bases
-    // of some other base.
-    if (I->isVirtual() && !IndirectPrimaryBases.count(BaseDecl)) {
-      // Only lay out the base once.
-      if (!LaidOutVirtualBases.insert(BaseDecl))
-        continue;
-
-      CharUnits vbaseOffset = Layout.getVBaseClassOffset(BaseDecl);
-      if (!LayoutVirtualBase(BaseDecl, vbaseOffset))
-        return false;
-    }
-
-    if (!BaseDecl->getNumVBases()) {
-      // This base isn't interesting since it doesn't have any virtual bases.
-      continue;
-    }
-    
-    if (!LayoutVirtualBases(BaseDecl, Layout))
-      return false;
-  }
-  return true;
-}
-
-bool
-CGRecordLayoutBuilder::LayoutNonVirtualBases(const CXXRecordDecl *RD,
-                                             const ASTRecordLayout &Layout) {
-  const CXXRecordDecl *PrimaryBase = Layout.getPrimaryBase();
-
-  // If we have a primary base, lay it out first.
-  if (PrimaryBase) {
-    if (!Layout.isPrimaryBaseVirtual()) {
-      if (!LayoutNonVirtualBase(PrimaryBase, CharUnits::Zero()))
-        return false;
-    } else {
-      if (!LayoutVirtualBase(PrimaryBase, CharUnits::Zero()))
-        return false;
-    }
-
-  // Otherwise, add a vtable / vf-table if the layout says to do so.
-  } else if (Layout.hasOwnVFPtr()) {
-    llvm::Type *FunctionType =
-      llvm::FunctionType::get(llvm::Type::getInt32Ty(Types.getLLVMContext()),
-                              /*isVarArg=*/true);
-    llvm::Type *VTableTy = FunctionType->getPointerTo();
-
-    if (getTypeAlignment(VTableTy) > Alignment) {
-      // FIXME: Should we allow this to happen in Sema?
-      assert(!Packed && "Alignment is wrong even with packed struct!");
-      return false;
-    }
-
-    assert(NextFieldOffset.isZero() &&
-           "VTable pointer must come first!");
-    AppendField(CharUnits::Zero(), VTableTy->getPointerTo());
-  }
-
-  // Layout the non-virtual bases.
-  for (CXXRecordDecl::base_class_const_iterator I = RD->bases_begin(),
-       E = RD->bases_end(); I != E; ++I) {
-    if (I->isVirtual())
-      continue;
-
-    const CXXRecordDecl *BaseDecl = 
-      cast<CXXRecordDecl>(I->getType()->getAs<RecordType>()->getDecl());
-
-    // We've already laid out the primary base.
-    if (BaseDecl == PrimaryBase && !Layout.isPrimaryBaseVirtual())
-      continue;
-
-    if (!LayoutNonVirtualBase(BaseDecl, Layout.getBaseClassOffset(BaseDecl)))
-      return false;
-  }
-
-  // Add a vb-table pointer if the layout insists.
-    if (Layout.hasOwnVBPtr()) {
-    CharUnits VBPtrOffset = Layout.getVBPtrOffset();
-    llvm::Type *Vbptr = llvm::Type::getInt32PtrTy(Types.getLLVMContext());
-    AppendPadding(VBPtrOffset, getTypeAlignment(Vbptr));
-    AppendField(VBPtrOffset, Vbptr);
-  }
-
-  return true;
-}
-
-bool
-CGRecordLayoutBuilder::ComputeNonVirtualBaseType(const CXXRecordDecl *RD) {
-  const ASTRecordLayout &Layout = Types.getContext().getASTRecordLayout(RD);
-
-  CharUnits NonVirtualSize  = Layout.getNonVirtualSize();
-  CharUnits NonVirtualAlign = Layout.getNonVirtualAlign();
-  CharUnits AlignedNonVirtualTypeSize =
-    NonVirtualSize.RoundUpToAlignment(NonVirtualAlign);
-  
-  // First check if we can use the same fields as for the complete class.
-  CharUnits RecordSize = Layout.getSize();
-  if (AlignedNonVirtualTypeSize == RecordSize)
-    return true;
-
-  // Check if we need padding.
-  CharUnits AlignedNextFieldOffset =
-    NextFieldOffset.RoundUpToAlignment(getAlignmentAsLLVMStruct());
-
-  if (AlignedNextFieldOffset > AlignedNonVirtualTypeSize) {
-    assert(!Packed && "cannot layout even as packed struct");
-    return false; // Needs packing.
-  }
-
-  bool needsPadding = (AlignedNonVirtualTypeSize != AlignedNextFieldOffset);
-  if (needsPadding) {
-    CharUnits NumBytes = AlignedNonVirtualTypeSize - AlignedNextFieldOffset;
-    FieldTypes.push_back(getByteArrayType(NumBytes));
-  }
-  
-  BaseSubobjectType = llvm::StructType::create(Types.getLLVMContext(),
-                                               FieldTypes, "", Packed);
-  Types.addRecordTypeName(RD, BaseSubobjectType, ".base");
-
-  // Pull the padding back off.
-  if (needsPadding)
-    FieldTypes.pop_back();
-
-  return true;
-}
-
-bool CGRecordLayoutBuilder::LayoutFields(const RecordDecl *D) {
-  assert(!D->isUnion() && "Can't call LayoutFields on a union!");
-  assert(!Alignment.isZero() && "Did not set alignment!");
-
-  const ASTRecordLayout &Layout = Types.getContext().getASTRecordLayout(D);
-
-  const CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(D);
-  if (RD)
-    if (!LayoutNonVirtualBases(RD, Layout))
-      return false;
-
-  unsigned FieldNo = 0;
-  
-  for (RecordDecl::field_iterator FI = D->field_begin(), FE = D->field_end();
-       FI != FE; ++FI, ++FieldNo) {
-    FieldDecl *FD = *FI;
-
-    // If this field is a bitfield, layout all of the consecutive
-    // non-zero-length bitfields and the last zero-length bitfield; these will
-    // all share storage.
-    if (FD->isBitField()) {
-      // If all we have is a zero-width bitfield, skip it.
-      if (FD->getBitWidthValue(Types.getContext()) == 0)
-        continue;
-
-      // Layout this range of bitfields.
-      if (!LayoutBitfields(Layout, FieldNo, FI, FE)) {
-        assert(!Packed &&
-               "Could not layout bitfields even with a packed LLVM struct!");
-        return false;
-      }
-      assert(FI != FE && "Advanced past the last bitfield");
-      continue;
-    }
-
-    if (!LayoutField(FD, Layout.getFieldOffset(FieldNo))) {
-      assert(!Packed &&
-             "Could not layout fields even with a packed LLVM struct!");
-      return false;
-    }
-  }
-
-  if (RD) {
-    // We've laid out the non-virtual bases and the fields, now compute the
-    // non-virtual base field types.
-    if (!ComputeNonVirtualBaseType(RD)) {
-      assert(!Packed && "Could not layout even with a packed LLVM struct!");
-      return false;
-    }
-
-    // Lay out the virtual bases.  The MS ABI uses a different
-    // algorithm here due to the lack of primary virtual bases.
-    if (Types.getTarget().getCXXABI().hasPrimaryVBases()) {
-      RD->getIndirectPrimaryBases(IndirectPrimaryBases);
-      if (Layout.isPrimaryBaseVirtual())
-        IndirectPrimaryBases.insert(Layout.getPrimaryBase());
-
-      if (!LayoutVirtualBases(RD, Layout))
-        return false;
-    } else {
-      if (!MSLayoutVirtualBases(RD, Layout))
-        return false;
-    }
-  }
-  
-  // Append tail padding if necessary.
-  AppendTailPadding(Layout.getSize());
-
-  return true;
-}
-
-void CGRecordLayoutBuilder::AppendTailPadding(CharUnits RecordSize) {
-  ResizeLastBaseFieldIfNecessary(RecordSize);
-
-  assert(NextFieldOffset <= RecordSize && "Size mismatch!");
-
-  CharUnits AlignedNextFieldOffset =
-    NextFieldOffset.RoundUpToAlignment(getAlignmentAsLLVMStruct());
-
-  if (AlignedNextFieldOffset == RecordSize) {
-    // We don't need any padding.
-    return;
-  }
-
-  CharUnits NumPadBytes = RecordSize - NextFieldOffset;
-  AppendBytes(NumPadBytes);
-}
-
-void CGRecordLayoutBuilder::AppendField(CharUnits fieldOffset,
-                                        llvm::Type *fieldType) {
-  CharUnits fieldSize =
-    CharUnits::fromQuantity(Types.getDataLayout().getTypeAllocSize(fieldType));
-
-  FieldTypes.push_back(fieldType);
-
-  NextFieldOffset = fieldOffset + fieldSize;
-}
-
-void CGRecordLayoutBuilder::AppendPadding(CharUnits fieldOffset,
-                                          CharUnits fieldAlignment) {
-  assert(NextFieldOffset <= fieldOffset &&
-         "Incorrect field layout!");
-
-  // Do nothing if we're already at the right offset.
-  if (fieldOffset == NextFieldOffset) return;
-
-  // If we're not emitting a packed LLVM type, try to avoid adding
-  // unnecessary padding fields.
-  if (!Packed) {
-    // Round up the field offset to the alignment of the field type.
-    CharUnits alignedNextFieldOffset =
-      NextFieldOffset.RoundUpToAlignment(fieldAlignment);
-    assert(alignedNextFieldOffset <= fieldOffset);
-
-    // If that's the right offset, we're done.
-    if (alignedNextFieldOffset == fieldOffset) return;
-  }
-
-  // Otherwise we need explicit padding.
-  CharUnits padding = fieldOffset - NextFieldOffset;
-  AppendBytes(padding);
-}
-
-bool CGRecordLayoutBuilder::ResizeLastBaseFieldIfNecessary(CharUnits offset) {
-  // Check if we have a base to resize.
-  if (!LastLaidOutBase.isValid())
-    return false;
-
-  // This offset does not overlap with the tail padding.
-  if (offset >= NextFieldOffset)
-    return false;
-
-  // Restore the field offset and append an i8 array instead.
-  FieldTypes.pop_back();
-  NextFieldOffset = LastLaidOutBase.Offset;
-  AppendBytes(LastLaidOutBase.NonVirtualSize);
-  LastLaidOutBase.invalidate();
-
-  return true;
-}
-
-llvm::Type *CGRecordLayoutBuilder::getByteArrayType(CharUnits numBytes) {
-  assert(!numBytes.isZero() && "Empty byte arrays aren't allowed.");
-
-  llvm::Type *Ty = llvm::Type::getInt8Ty(Types.getLLVMContext());
-  if (numBytes > CharUnits::One())
-    Ty = llvm::ArrayType::get(Ty, numBytes.getQuantity());
-
-  return Ty;
-}
-
-void CGRecordLayoutBuilder::AppendBytes(CharUnits numBytes) {
-  if (numBytes.isZero())
-    return;
-
-  // Append the padding field
-  AppendField(NextFieldOffset, getByteArrayType(numBytes));
-}
-
-CharUnits CGRecordLayoutBuilder::getTypeAlignment(llvm::Type *Ty) const {
-  if (Packed)
-    return CharUnits::One();
-
-  return CharUnits::fromQuantity(Types.getDataLayout().getABITypeAlignment(Ty));
-}
-
-CharUnits CGRecordLayoutBuilder::getAlignmentAsLLVMStruct() const {
-  if (Packed)
-    return CharUnits::One();
-
-  CharUnits maxAlignment = CharUnits::One();
-  for (size_t i = 0; i != FieldTypes.size(); ++i)
-    maxAlignment = std::max(maxAlignment, getTypeAlignment(FieldTypes[i]));
-
-  return maxAlignment;
-}
-
-/// Merge in whether a field of the given type is zero-initializable.
-void CGRecordLayoutBuilder::CheckZeroInitializable(QualType T) {
-  // This record already contains a member pointer.
-  if (!IsZeroInitializableAsBase)
-    return;
-
-  // Can only have member pointers if we're compiling C++.
-  if (!Types.getContext().getLangOpts().CPlusPlus)
-    return;
-
-  const Type *elementType = T->getBaseElementTypeUnsafe();
-
-  if (const MemberPointerType *MPT = elementType->getAs<MemberPointerType>()) {
-    if (!Types.getCXXABI().isZeroInitializable(MPT))
-      IsZeroInitializable = IsZeroInitializableAsBase = false;
-  } else if (const RecordType *RT = elementType->getAs<RecordType>()) {
-    const CXXRecordDecl *RD = cast<CXXRecordDecl>(RT->getDecl());
-    const CGRecordLayout &Layout = Types.getCGRecordLayout(RD);
-    if (!Layout.isZeroInitializable())
-      IsZeroInitializable = IsZeroInitializableAsBase = false;
-  }
-}
-
 CGRecordLayout *CodeGenTypes::ComputeRecordLayout(const RecordDecl *D,
                                                   llvm::StructType *Ty) {
-  CGRecordLayoutBuilder Builder(*this);
+  CGRecordLowering Builder(*this, D);
 
-  Builder.Layout(D);
-
-  Ty->setBody(Builder.FieldTypes, Builder.Packed);
+  Builder.lower(false);
 
   // If we're in C++, compute the base subobject type.
   llvm::StructType *BaseTy = 0;
-  if (isa<CXXRecordDecl>(D) && !D->isUnion()) {
-    BaseTy = Builder.BaseSubobjectType;
-    if (!BaseTy) BaseTy = Ty;
+  if (isa<CXXRecordDecl>(D) && !D->isUnion() && !D->hasAttr<FinalAttr>()) {
+    BaseTy = Ty;
+    if (Builder.Layout.getNonVirtualSize() != Builder.Layout.getSize()) {
+      CGRecordLowering BaseBuilder(*this, D);
+      BaseBuilder.lower(true);
+      BaseTy = llvm::StructType::create(
+          getLLVMContext(), BaseBuilder.FieldTypes, "", BaseBuilder.Packed);
+      addRecordTypeName(D, BaseTy, ".base");
+    }
   }
 
+  // Fill in the struct *after* computing the base type.  Filling in the body
+  // signifies that the type is no longer opaque and record layout is complete,
+  // but we may need to recursively lay out D while laying it out as a base
+  // type.
+  Ty->setBody(Builder.FieldTypes, Builder.Packed);
+
   CGRecordLayout *RL =
     new CGRecordLayout(Ty, BaseTy, Builder.IsZeroInitializable,
-                       Builder.IsZeroInitializableAsBase);
+                       Builder.IsZeroInitializableAsBase);
 
   RL->NonVirtualBases.swap(Builder.NonVirtualBases);
   RL->CompleteObjectVirtualBases.swap(Builder.VirtualBases);
@@ -994,12 +676,9 @@
 
   if (BaseTy) {
     CharUnits NonVirtualSize  = Layout.getNonVirtualSize();
-    CharUnits NonVirtualAlign = Layout.getNonVirtualAlign();
-    CharUnits AlignedNonVirtualTypeSize = 
-      NonVirtualSize.RoundUpToAlignment(NonVirtualAlign);
 
     uint64_t AlignedNonVirtualTypeSizeInBits = 
-      getContext().toBits(AlignedNonVirtualTypeSize);
+      getContext().toBits(NonVirtualSize);
 
     assert(AlignedNonVirtualTypeSizeInBits == 
            getDataLayout().getTypeAllocSizeInBits(BaseTy) &&
diff --git a/lib/CodeGen/CGStmt.cpp b/lib/CodeGen/CGStmt.cpp
index 0bc51dd..cba9e6c 100644
--- a/lib/CodeGen/CGStmt.cpp
+++ b/lib/CodeGen/CGStmt.cpp
@@ -16,14 +16,14 @@
 #include "CodeGenModule.h"
 #include "TargetInfo.h"
 #include "clang/AST/StmtVisitor.h"
-#include "clang/Sema/SemaDiagnostic.h"
 #include "clang/Basic/PrettyStackTrace.h"
 #include "clang/Basic/TargetInfo.h"
+#include "clang/Sema/SemaDiagnostic.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/CallSite.h"
 using namespace clang;
 using namespace CodeGen;
 
@@ -43,6 +43,7 @@
 
 void CodeGenFunction::EmitStmt(const Stmt *S) {
   assert(S && "Null statement?");
+  PGO.setCurrentStmt(S);
 
   // These statements have their own debug info handling.
   if (EmitSimpleStmt(S))
@@ -76,6 +77,7 @@
   case Stmt::SEHFinallyStmtClass:
   case Stmt::MSDependentExistsStmtClass:
   case Stmt::OMPParallelDirectiveClass:
+  case Stmt::OMPSimdDirectiveClass:
     llvm_unreachable("invalid statement class to emit generically");
   case Stmt::NullStmtClass:
   case Stmt::CompoundStmtClass:
@@ -242,7 +244,7 @@
       EmitAnyExprToMem(cast<Expr>(LastStmt), RetAlloca, Qualifiers(),
                        /*IsInit*/false);
     }
-      
+
   }
 
   return RetAlloca;
@@ -309,9 +311,8 @@
 
 void CodeGenFunction::EmitBlockAfterUses(llvm::BasicBlock *block) {
   bool inserted = false;
-  for (llvm::BasicBlock::use_iterator
-         i = block->use_begin(), e = block->use_end(); i != e; ++i) {
-    if (llvm::Instruction *insn = dyn_cast<llvm::Instruction>(*i)) {
+  for (llvm::User *u : block->users()) {
+    if (llvm::Instruction *insn = dyn_cast<llvm::Instruction>(u)) {
       CurFn->getBasicBlockList().insertAfter(insn->getParent(), block);
       inserted = true;
       break;
@@ -358,7 +359,9 @@
     ResolveBranchFixups(Dest.getBlock());
   }
 
+  RegionCounter Cnt = getPGORegionCounter(D->getStmt());
   EmitBlock(Dest.getBlock());
+  Cnt.beginRegion(Builder);
 }
 
 /// Change the cleanup scope of the labels in this lexical scope to
@@ -430,6 +433,7 @@
   // C99 6.8.4.1: The first substatement is executed if the expression compares
   // unequal to 0.  The condition must be a scalar type.
   LexicalScope ConditionScope(*this, S.getSourceRange());
+  RegionCounter Cnt = getPGORegionCounter(&S);
 
   if (S.getConditionVariable())
     EmitAutoVarDecl(*S.getConditionVariable());
@@ -447,6 +451,8 @@
     // If the skipped block has no labels in it, just emit the executed block.
     // This avoids emitting dead code and simplifies the CFG substantially.
     if (!ContainsLabel(Skipped)) {
+      if (CondConstant)
+        Cnt.beginRegion(Builder);
       if (Executed) {
         RunCleanupsScope ExecutedScope(*this);
         EmitStmt(Executed);
@@ -462,10 +468,12 @@
   llvm::BasicBlock *ElseBlock = ContBlock;
   if (S.getElse())
     ElseBlock = createBasicBlock("if.else");
-  EmitBranchOnBoolExpr(S.getCond(), ThenBlock, ElseBlock);
+
+  EmitBranchOnBoolExpr(S.getCond(), ThenBlock, ElseBlock, Cnt.getCount());
 
   // Emit the 'then' code.
-  EmitBlock(ThenBlock); 
+  EmitBlock(ThenBlock);
+  Cnt.beginRegion(Builder);
   {
     RunCleanupsScope ThenScope(*this);
     EmitStmt(S.getThen());
@@ -493,6 +501,8 @@
 }
 
 void CodeGenFunction::EmitWhileStmt(const WhileStmt &S) {
+  RegionCounter Cnt = getPGORegionCounter(&S);
+
   // Emit the header for the loop, which will also become
   // the continue target.
   JumpDest LoopHeader = getJumpDestInCurrentScope("while.cond");
@@ -535,8 +545,8 @@
     llvm::BasicBlock *ExitBlock = LoopExit.getBlock();
     if (ConditionScope.requiresCleanups())
       ExitBlock = createBasicBlock("while.exit");
-
-    Builder.CreateCondBr(BoolCondVal, LoopBody, ExitBlock);
+    Builder.CreateCondBr(BoolCondVal, LoopBody, ExitBlock,
+                         PGO.createLoopWeights(S.getCond(), Cnt));
 
     if (ExitBlock != LoopExit.getBlock()) {
       EmitBlock(ExitBlock);
@@ -549,6 +559,7 @@
   {
     RunCleanupsScope BodyScope(*this);
     EmitBlock(LoopBody);
+    Cnt.beginRegion(Builder);
     EmitStmt(S.getBody());
   }
 
@@ -573,19 +584,19 @@
   JumpDest LoopExit = getJumpDestInCurrentScope("do.end");
   JumpDest LoopCond = getJumpDestInCurrentScope("do.cond");
 
+  RegionCounter Cnt = getPGORegionCounter(&S);
+
   // Store the blocks to use for break and continue.
   BreakContinueStack.push_back(BreakContinue(LoopExit, LoopCond));
 
   // Emit the body of the loop.
   llvm::BasicBlock *LoopBody = createBasicBlock("do.body");
-  EmitBlock(LoopBody);
+  EmitBlockWithFallThrough(LoopBody, Cnt);
   {
     RunCleanupsScope BodyScope(*this);
     EmitStmt(S.getBody());
   }
 
-  BreakContinueStack.pop_back();
-
   EmitBlock(LoopCond.getBlock());
 
   // C99 6.8.5.2: "The evaluation of the controlling expression takes place
@@ -596,6 +607,8 @@
   // compares unequal to 0.  The condition must be a scalar type.
   llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond());
 
+  BreakContinueStack.pop_back();
+
   // "do {} while (0)" is common in macros, avoid extra blocks.  Be sure
   // to correctly handle break/continue though.
   bool EmitBoolCondBranch = true;
@@ -605,7 +618,8 @@
 
   // As long as the condition is true, iterate the loop.
   if (EmitBoolCondBranch)
-    Builder.CreateCondBr(BoolCondVal, LoopBody, LoopExit.getBlock());
+    Builder.CreateCondBr(BoolCondVal, LoopBody, LoopExit.getBlock(),
+                         PGO.createLoopWeights(S.getCond(), Cnt));
 
   // Emit the exit block.
   EmitBlock(LoopExit.getBlock());
@@ -629,6 +643,8 @@
   if (S.getInit())
     EmitStmt(S.getInit());
 
+  RegionCounter Cnt = getPGORegionCounter(&S);
+
   // Start the loop with a block that tests the condition.
   // If there's an increment, the continue scope will be overwritten
   // later.
@@ -636,6 +652,16 @@
   llvm::BasicBlock *CondBlock = Continue.getBlock();
   EmitBlock(CondBlock);
 
+  // If the for loop doesn't have an increment, we can just use the condition
+  // as the continue block.  Otherwise we'll need to create a block for it (in
+  // the current scope, i.e. in the scope of the condition), and that will
+  // become our continue block.
+  if (S.getInc())
+    Continue = getJumpDestInCurrentScope("for.inc");
+
+  // Store the blocks to use for break and continue.
+  BreakContinueStack.push_back(BreakContinue(LoopExit, Continue));
+
   // Create a cleanup scope for the condition variable cleanups.
   RunCleanupsScope ConditionScope(*this);
 
@@ -657,7 +683,9 @@
 
     // C99 6.8.5p2/p4: The first substatement is executed if the expression
     // compares unequal to 0.  The condition must be a scalar type.
-    EmitBranchOnBoolExpr(S.getCond(), ForBody, ExitBlock);
+    llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond());
+    Builder.CreateCondBr(BoolCondVal, ForBody, ExitBlock,
+                         PGO.createLoopWeights(S.getCond(), Cnt));
 
     if (ExitBlock != LoopExit.getBlock()) {
       EmitBlock(ExitBlock);
@@ -669,16 +697,7 @@
     // Treat it as a non-zero constant.  Don't even create a new block for the
     // body, just fall into it.
   }
-
-  // If the for loop doesn't have an increment we can just use the
-  // condition as the continue block.  Otherwise we'll need to create
-  // a block for it (in the current scope, i.e. in the scope of the
-  // condition), and that we will become our continue block.
-  if (S.getInc())
-    Continue = getJumpDestInCurrentScope("for.inc");
-
-  // Store the blocks to use for break and continue.
-  BreakContinueStack.push_back(BreakContinue(LoopExit, Continue));
+  Cnt.beginRegion(Builder);
 
   {
     // Create a separate cleanup scope for the body, in case it is not
@@ -720,6 +739,8 @@
   EmitStmt(S.getRangeStmt());
   EmitStmt(S.getBeginEndStmt());
 
+  RegionCounter Cnt = getPGORegionCounter(&S);
+
   // Start the loop with a block that tests the condition.
   // If there's an increment, the continue scope will be overwritten
   // later.
@@ -737,7 +758,9 @@
 
   // The body is executed if the expression, contextually converted
   // to bool, is true.
-  EmitBranchOnBoolExpr(S.getCond(), ForBody, ExitBlock);
+  llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond());
+  Builder.CreateCondBr(BoolCondVal, ForBody, ExitBlock,
+                       PGO.createLoopWeights(S.getCond(), Cnt));
 
   if (ExitBlock != LoopExit.getBlock()) {
     EmitBlock(ExitBlock);
@@ -745,6 +768,7 @@
   }
 
   EmitBlock(ForBody);
+  Cnt.beginRegion(Builder);
 
   // Create a block for the increment. In case of a 'continue', we jump there.
   JumpDest Continue = getJumpDestInCurrentScope("for.inc");
@@ -809,7 +833,8 @@
 
   // FIXME: Clean this up by using an LValue for ReturnTemp,
   // EmitStoreThroughLValue, and EmitAnyExpr.
-  if (S.getNRVOCandidate() && S.getNRVOCandidate()->isNRVOVariable()) {
+  if (getLangOpts().ElideConstructors &&
+      S.getNRVOCandidate() && S.getNRVOCandidate()->isNRVOVariable()) {
     // Apply the named return value optimization for this return statement,
     // which means doing nothing: the appropriate result has already been
     // constructed into the NRVO variable.
@@ -818,7 +843,7 @@
     // that the cleanup code should not destroy the variable.
     if (llvm::Value *NRVOFlag = NRVOFlags[S.getNRVOCandidate()])
       Builder.CreateStore(Builder.getTrue(), NRVOFlag);
-  } else if (!ReturnValue) {
+  } else if (!ReturnValue || (RV && RV->getType()->isVoidType())) {
     // Make sure not to return anything, but evaluate the expression
     // for side effects.
     if (RV)
@@ -866,9 +891,8 @@
   if (HaveInsertPoint())
     EmitStopPoint(&S);
 
-  for (DeclStmt::const_decl_iterator I = S.decl_begin(), E = S.decl_end();
-       I != E; ++I)
-    EmitDecl(**I);
+  for (const auto *I : S.decls())
+    EmitDecl(*I);
 }
 
 void CodeGenFunction::EmitBreakStmt(const BreakStmt &S) {
@@ -880,8 +904,7 @@
   if (HaveInsertPoint())
     EmitStopPoint(&S);
 
-  JumpDest Block = BreakContinueStack.back().BreakBlock;
-  EmitBranchThroughCleanup(Block);
+  EmitBranchThroughCleanup(BreakContinueStack.back().BreakBlock);
 }
 
 void CodeGenFunction::EmitContinueStmt(const ContinueStmt &S) {
@@ -893,8 +916,7 @@
   if (HaveInsertPoint())
     EmitStopPoint(&S);
 
-  JumpDest Block = BreakContinueStack.back().ContinueBlock;
-  EmitBranchThroughCleanup(Block);
+  EmitBranchThroughCleanup(BreakContinueStack.back().ContinueBlock);
 }
 
 /// EmitCaseStmtRange - If case statement range is not too big then
@@ -906,11 +928,13 @@
   llvm::APSInt LHS = S.getLHS()->EvaluateKnownConstInt(getContext());
   llvm::APSInt RHS = S.getRHS()->EvaluateKnownConstInt(getContext());
 
+  RegionCounter CaseCnt = getPGORegionCounter(&S);
+
   // Emit the code for this case. We do this first to make sure it is
   // properly chained from our predecessor before generating the
   // switch machinery to enter this block.
-  EmitBlock(createBasicBlock("sw.bb"));
-  llvm::BasicBlock *CaseDest = Builder.GetInsertBlock();
+  llvm::BasicBlock *CaseDest = createBasicBlock("sw.bb");
+  EmitBlockWithFallThrough(CaseDest, CaseCnt);
   EmitStmt(S.getSubStmt());
 
   // If range is empty, do nothing.
@@ -921,7 +945,18 @@
   // FIXME: parameters such as this should not be hardcoded.
   if (Range.ult(llvm::APInt(Range.getBitWidth(), 64))) {
     // Range is small enough to add multiple switch instruction cases.
-    for (unsigned i = 0, e = Range.getZExtValue() + 1; i != e; ++i) {
+    uint64_t Total = CaseCnt.getCount();
+    unsigned NCases = Range.getZExtValue() + 1;
+    // We only have one region counter for the entire set of cases here, so we
+    // need to divide the weights evenly between the generated cases, ensuring
+    // that the total weight is preserved. E.g., a weight of 5 over three cases
+    // will be distributed as weights of 2, 2, and 1.
+    uint64_t Weight = Total / NCases, Rem = Total % NCases;
+    for (unsigned I = 0; I != NCases; ++I) {
+      if (SwitchWeights)
+        SwitchWeights->push_back(Weight + (Rem ? 1 : 0));
+      if (Rem)
+        Rem--;
       SwitchInsn->addCase(Builder.getInt(LHS), CaseDest);
       LHS++;
     }
@@ -946,7 +981,19 @@
     Builder.CreateSub(SwitchInsn->getCondition(), Builder.getInt(LHS));
   llvm::Value *Cond =
     Builder.CreateICmpULE(Diff, Builder.getInt(Range), "inbounds");
-  Builder.CreateCondBr(Cond, CaseDest, FalseDest);
+
+  llvm::MDNode *Weights = 0;
+  if (SwitchWeights) {
+    uint64_t ThisCount = CaseCnt.getCount();
+    uint64_t DefaultCount = (*SwitchWeights)[0];
+    Weights = PGO.createBranchWeights(ThisCount, DefaultCount);
+
+    // Since we're chaining the switch default through each large case range,
+    // we need to update the weight for the default, i.e., the first case, to
+    // include this case.
+    (*SwitchWeights)[0] += ThisCount;
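+    // (Illustrative: with a default count of 10 and range counts of 4 and 6
+    // emitted in turn, the first range branches 4:10 and the second 6:14,
+    // since the default edge now also carries the first range's count.)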
+  }
+  Builder.CreateCondBr(Cond, CaseDest, FalseDest, Weights);
 
   // Restore the appropriate insertion point.
   if (RestoreBB)
@@ -959,7 +1006,7 @@
   // If there is no enclosing switch instance that we're aware of, then this
   // case statement and its block can be elided.  This situation only happens
   // when we've constant-folded the switch, are emitting the constant case,
-  // and part of the constant case includes another case statement.  For 
+  // and part of the constant case includes another case statement.  For
   // instance: switch (4) { case 4: do { case 5: } while (1); }
   if (!SwitchInsn) {
     EmitStmt(S.getSubStmt());
@@ -972,17 +1019,22 @@
     return;
   }
 
+  RegionCounter CaseCnt = getPGORegionCounter(&S);
   llvm::ConstantInt *CaseVal =
     Builder.getInt(S.getLHS()->EvaluateKnownConstInt(getContext()));
 
-  // If the body of the case is just a 'break', and if there was no fallthrough,
-  // try to not emit an empty block.
-  if ((CGM.getCodeGenOpts().OptimizationLevel > 0) &&
+  // If the body of the case is just a 'break', try to not emit an empty block.
+  // If we're profiling or we're not optimizing, leave the block in for better
+  // debug and coverage analysis.
+  if (!CGM.getCodeGenOpts().ProfileInstrGenerate &&
+      CGM.getCodeGenOpts().OptimizationLevel > 0 &&
       isa<BreakStmt>(S.getSubStmt())) {
     JumpDest Block = BreakContinueStack.back().BreakBlock;
 
     // Only do this optimization if there are no cleanups that need emitting.
     if (isObviouslyBranchWithoutCleanups(Block)) {
+      if (SwitchWeights)
+        SwitchWeights->push_back(CaseCnt.getCount());
       SwitchInsn->addCase(CaseVal, Block.getBlock());
 
       // If there was a fallthrough into this case, make sure to redirect it to
@@ -995,8 +1047,10 @@
     }
   }
 
-  EmitBlock(createBasicBlock("sw.bb"));
-  llvm::BasicBlock *CaseDest = Builder.GetInsertBlock();
+  llvm::BasicBlock *CaseDest = createBasicBlock("sw.bb");
+  EmitBlockWithFallThrough(CaseDest, CaseCnt);
+  if (SwitchWeights)
+    SwitchWeights->push_back(CaseCnt.getCount());
   SwitchInsn->addCase(CaseVal, CaseDest);
 
   // Recursively emitting the statement is acceptable, but is not wonderful for
@@ -1014,8 +1068,17 @@
   // Otherwise, iteratively add consecutive cases to this switch stmt.
   while (NextCase && NextCase->getRHS() == 0) {
     CurCase = NextCase;
-    llvm::ConstantInt *CaseVal = 
+    llvm::ConstantInt *CaseVal =
       Builder.getInt(CurCase->getLHS()->EvaluateKnownConstInt(getContext()));
+
+    CaseCnt = getPGORegionCounter(NextCase);
+    if (SwitchWeights)
+      SwitchWeights->push_back(CaseCnt.getCount());
+    if (CGM.getCodeGenOpts().ProfileInstrGenerate) {
+      CaseDest = createBasicBlock("sw.bb");
+      EmitBlockWithFallThrough(CaseDest, CaseCnt);
+    }
+
     SwitchInsn->addCase(CaseVal, CaseDest);
     NextCase = dyn_cast<CaseStmt>(CurCase->getSubStmt());
   }
@@ -1028,7 +1091,10 @@
   llvm::BasicBlock *DefaultBlock = SwitchInsn->getDefaultDest();
   assert(DefaultBlock->empty() &&
          "EmitDefaultStmt: Default block already defined?");
-  EmitBlock(DefaultBlock);
+
+  RegionCounter Cnt = getPGORegionCounter(&S);
+  EmitBlockWithFallThrough(DefaultBlock, Cnt);
+
   EmitStmt(S.getSubStmt());
 }
 
@@ -1185,7 +1251,8 @@
 static bool FindCaseStatementsForValue(const SwitchStmt &S,
                                        const llvm::APSInt &ConstantCondValue,
                                 SmallVectorImpl<const Stmt*> &ResultStmts,
-                                       ASTContext &C) {
+                                       ASTContext &C,
+                                       const SwitchCase *&ResultCase) {
   // First step, find the switch case that is being branched to.  We can do this
   // efficiently by scanning the SwitchCase list.
   const SwitchCase *Case = S.getSwitchCaseList();
@@ -1228,6 +1295,7 @@
   //   while (1) {
   //     case 4: ...
   bool FoundCase = false;
+  ResultCase = Case;
   return CollectStatementsForCase(S.getBody(), Case, FoundCase,
                                   ResultStmts) != CSFC_Failure &&
          FoundCase;
@@ -1243,6 +1311,7 @@
 
   // Handle nested switch statements.
   llvm::SwitchInst *SavedSwitchInsn = SwitchInsn;
+  SmallVector<uint64_t, 16> *SavedSwitchWeights = SwitchWeights;
   llvm::BasicBlock *SavedCRBlock = CaseRangeBlock;
 
   // See if we can constant fold the condition of the switch and therefore only
@@ -1250,8 +1319,13 @@
   llvm::APSInt ConstantCondValue;
   if (ConstantFoldsToSimpleInteger(S.getCond(), ConstantCondValue)) {
     SmallVector<const Stmt*, 4> CaseStmts;
+    const SwitchCase *Case = 0;
     if (FindCaseStatementsForValue(S, ConstantCondValue, CaseStmts,
-                                   getContext())) {
+                                   getContext(), Case)) {
+      if (Case) {
+        RegionCounter CaseCnt = getPGORegionCounter(Case);
+        CaseCnt.beginRegion(Builder);
+      }
       RunCleanupsScope ExecutedScope(*this);
 
       // At this point, we are no longer "within" a switch instance, so
@@ -1263,6 +1337,8 @@
       // specified series of statements and we're good.
       for (unsigned i = 0, e = CaseStmts.size(); i != e; ++i)
         EmitStmt(CaseStmts[i]);
+      RegionCounter ExitCnt = getPGORegionCounter(&S);
+      ExitCnt.beginRegion(Builder);
 
       // Now we want to restore the saved switch instance so that nested
       // switches continue to function properly
@@ -1280,12 +1356,29 @@
   // failure.
   llvm::BasicBlock *DefaultBlock = createBasicBlock("sw.default");
   SwitchInsn = Builder.CreateSwitch(CondV, DefaultBlock);
+  if (PGO.haveRegionCounts()) {
+    // Walk the SwitchCase list to find how many cases there are.
+    uint64_t DefaultCount = 0;
+    unsigned NumCases = 0;
+    for (const SwitchCase *Case = S.getSwitchCaseList();
+         Case;
+         Case = Case->getNextSwitchCase()) {
+      if (isa<DefaultStmt>(Case))
+        DefaultCount = getPGORegionCounter(Case).getCount();
+      NumCases += 1;
+    }
+    SwitchWeights = new SmallVector<uint64_t, 16>();
+    SwitchWeights->reserve(NumCases);
+    // The default needs to be first. We store the edge count, so we already
+    // know the right weight.
+    SwitchWeights->push_back(DefaultCount);
+  }
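+  // Sketch of the resulting layout (hypothetical counts): for a switch whose
+  // cases 1 and 2 run 10 and 5 times with a default that runs 85 times,
+  // SwitchWeights ends up as [85, 10, 5]: default first, then cases in
+  // addCase() order, matching the operand order of the branch_weights
+  // metadata attached below.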
   CaseRangeBlock = DefaultBlock;
 
   // Clear the insertion point to indicate we are in unreachable code.
   Builder.ClearInsertionPoint();
 
-  // All break statements jump to NextBlock. If BreakContinueStack is non empty
+  // All break statements jump to NextBlock. If BreakContinueStack is non-empty
   // then reuse last ContinueBlock.
   JumpDest OuterContinue;
   if (!BreakContinueStack.empty())
@@ -1320,8 +1413,20 @@
 
   // Emit continuation.
   EmitBlock(SwitchExit.getBlock(), true);
+  RegionCounter ExitCnt = getPGORegionCounter(&S);
+  ExitCnt.beginRegion(Builder);
 
+  if (SwitchWeights) {
+    assert(SwitchWeights->size() == 1 + SwitchInsn->getNumCases() &&
+           "switch weights do not match switch cases");
+    // If there's only one jump destination, there's no sense weighting it.
+    if (SwitchWeights->size() > 1)
+      SwitchInsn->setMetadata(llvm::LLVMContext::MD_prof,
+                              PGO.createBranchWeights(*SwitchWeights));
+    delete SwitchWeights;
+  }
   SwitchInsn = SavedSwitchInsn;
+  SwitchWeights = SavedSwitchWeights;
   CaseRangeBlock = SavedCRBlock;
 }
 
@@ -1493,7 +1598,7 @@
       Name = GAS->getOutputName(i);
     TargetInfo::ConstraintInfo Info(S.getOutputConstraint(i), Name);
     bool IsValid = getTarget().validateOutputConstraint(Info); (void)IsValid;
-    assert(IsValid && "Failed to parse output constraint"); 
+    assert(IsValid && "Failed to parse output constraint");
     OutputConstraintInfos.push_back(Info);
   }
 
@@ -1829,8 +1934,8 @@
   // Create the function declaration.
   FunctionType::ExtInfo ExtInfo;
   const CGFunctionInfo &FuncInfo =
-    CGM.getTypes().arrangeFunctionDeclaration(Ctx.VoidTy, Args, ExtInfo,
-                                              /*IsVariadic=*/false);
+      CGM.getTypes().arrangeFreeFunctionDeclaration(Ctx.VoidTy, Args, ExtInfo,
+                                                    /*IsVariadic=*/false);
   llvm::FunctionType *FuncLLVMTy = CGM.getTypes().GetFunctionType(FuncInfo);
 
   llvm::Function *F =
diff --git a/lib/CodeGen/CGVTT.cpp b/lib/CodeGen/CGVTT.cpp
index bfff470..bd280ea 100644
--- a/lib/CodeGen/CGVTT.cpp
+++ b/lib/CodeGen/CGVTT.cpp
@@ -66,7 +66,8 @@
     if (VTTVT.getBase() == RD) {
       // Just get the address point for the regular vtable.
       AddressPoint =
-          ItaniumVTContext.getVTableLayout(RD).getAddressPoint(i->VTableBase);
+          getItaniumVTableContext().getVTableLayout(RD).getAddressPoint(
+              i->VTableBase);
       assert(AddressPoint != 0 && "Did not find vtable address point!");
     } else {
       AddressPoint = VTableAddressPoints[i->VTableIndex].lookup(i->VTableBase);
@@ -94,7 +95,7 @@
   VTT->setLinkage(Linkage);
 
   // Set the right visibility.
-  CGM.setTypeVisibility(VTT, RD, CodeGenModule::TVK_ForVTT);
+  CGM.setGlobalVisibility(VTT, RD);
 }
 
 llvm::GlobalVariable *CodeGenVTables::GetAddrOfVTT(const CXXRecordDecl *RD) {
diff --git a/lib/CodeGen/CGVTables.cpp b/lib/CodeGen/CGVTables.cpp
index f28d9b6..0ad765c 100644
--- a/lib/CodeGen/CGVTables.cpp
+++ b/lib/CodeGen/CGVTables.cpp
@@ -30,14 +30,7 @@
 using namespace CodeGen;
 
 CodeGenVTables::CodeGenVTables(CodeGenModule &CGM)
-  : CGM(CGM), ItaniumVTContext(CGM.getContext()) {
-  if (CGM.getTarget().getCXXABI().isMicrosoft()) {
-    // FIXME: Eventually, we should only have one of V*TContexts available.
-    // Today we use both in the Microsoft ABI as MicrosoftVFTableContext
-    // is not completely supported in CodeGen yet.
-    MicrosoftVTContext.reset(new MicrosoftVTableContext(CGM.getContext()));
-  }
-}
+    : CGM(CGM), VTContext(CGM.getContext().getVTableContext()) {}
 
 llvm::Constant *CodeGenModule::GetAddrOfThunk(GlobalDecl GD, 
                                               const ThunkInfo &Thunk) {
@@ -54,54 +47,13 @@
   Out.flush();
 
   llvm::Type *Ty = getTypes().GetFunctionTypeForVTable(GD);
-  return GetOrCreateLLVMFunction(Name, Ty, GD, /*ForVTable=*/true);
+  return GetOrCreateLLVMFunction(Name, Ty, GD, /*ForVTable=*/true,
+                                 /*DontDefer*/ true);
 }
 
 static void setThunkVisibility(CodeGenModule &CGM, const CXXMethodDecl *MD,
                                const ThunkInfo &Thunk, llvm::Function *Fn) {
   CGM.setGlobalVisibility(Fn, MD);
-
-  if (!CGM.getCodeGenOpts().HiddenWeakVTables)
-    return;
-
-  // If the thunk has weak/linkonce linkage, but the function must be
-  // emitted in every translation unit that references it, then we can
-  // emit its thunks with hidden visibility, since its thunks must be
-  // emitted when the function is.
-
-  // This follows CodeGenModule::setTypeVisibility; see the comments
-  // there for explanation.
-
-  if ((Fn->getLinkage() != llvm::GlobalVariable::LinkOnceODRLinkage &&
-       Fn->getLinkage() != llvm::GlobalVariable::WeakODRLinkage) ||
-      Fn->getVisibility() != llvm::GlobalVariable::DefaultVisibility)
-    return;
-
-  if (MD->getExplicitVisibility(ValueDecl::VisibilityForValue))
-    return;
-
-  switch (MD->getTemplateSpecializationKind()) {
-  case TSK_ExplicitInstantiationDefinition:
-  case TSK_ExplicitInstantiationDeclaration:
-    return;
-
-  case TSK_Undeclared:
-    break;
-
-  case TSK_ExplicitSpecialization:
-  case TSK_ImplicitInstantiation:
-    return;
-    break;
-  }
-
-  // If there's an explicit definition, and that definition is
-  // out-of-line, then we can't assume that all users will have a
-  // definition to emit.
-  const FunctionDecl *Def = 0;
-  if (MD->hasBody(Def) && Def->isOutOfLine())
-    return;
-
-  Fn->setVisibility(llvm::GlobalValue::HiddenVisibility);
 }
 
 #ifndef NDEBUG
@@ -177,7 +129,7 @@
                                       GlobalDecl GD, const ThunkInfo &Thunk) {
   const CXXMethodDecl *MD = cast<CXXMethodDecl>(GD.getDecl());
   const FunctionProtoType *FPT = MD->getType()->getAs<FunctionProtoType>();
-  QualType ResultType = FPT->getResultType();
+  QualType ResultType = FPT->getReturnType();
 
   // Get the original function
   assert(FnInfo.isVariadic());
@@ -248,11 +200,11 @@
   QualType ThisType = MD->getThisType(getContext());
   const FunctionProtoType *FPT = MD->getType()->getAs<FunctionProtoType>();
   QualType ResultType =
-    CGM.getCXXABI().HasThisReturn(GD) ? ThisType : FPT->getResultType();
+      CGM.getCXXABI().HasThisReturn(GD) ? ThisType : FPT->getReturnType();
   FunctionArgList FunctionArgs;
 
   // Create the implicit 'this' parameter declaration.
-  CGM.getCXXABI().BuildInstanceFunctionParams(*this, ResultType, FunctionArgs);
+  CGM.getCXXABI().buildThisParam(*this, FunctionArgs);
 
   // Add the rest of the parameters.
   for (FunctionDecl::param_const_iterator I = MD->param_begin(),
@@ -260,6 +212,9 @@
        I != E; ++I)
     FunctionArgs.push_back(*I);
 
+  if (isa<CXXDestructorDecl>(MD))
+    CGM.getCXXABI().addImplicitStructorParams(*this, ResultType, FunctionArgs);
+
   // Start defining the function.
   StartFunction(GlobalDecl(), ResultType, Fn, FnInfo, FunctionArgs,
                 SourceLocation());
@@ -316,7 +271,7 @@
 
   // Determine whether we have a return value slot to use.
   QualType ResultType =
-    CGM.getCXXABI().HasThisReturn(GD) ? ThisType : FPT->getResultType();
+      CGM.getCXXABI().HasThisReturn(GD) ? ThisType : FPT->getReturnType();
   ReturnValueSlot Slot;
   if (!ResultType->isVoidType() &&
       CurFnInfo->getReturnInfo().getKind() == ABIArgInfo::Indirect &&
@@ -461,12 +416,8 @@
   if (isa<CXXDestructorDecl>(MD) && GD.getDtorType() == Dtor_Base)
     return;
 
-  const VTableContextBase::ThunkInfoVectorTy *ThunkInfoVector;
-  if (MicrosoftVTContext.isValid()) {
-    ThunkInfoVector = MicrosoftVTContext->getThunkInfo(GD);
-  } else {
-    ThunkInfoVector = ItaniumVTContext.getThunkInfo(GD);
-  }
+  const VTableContextBase::ThunkInfoVectorTy *ThunkInfoVector =
+      VTContext->getThunkInfo(GD);
 
   if (!ThunkInfoVector)
     return;
@@ -603,8 +554,8 @@
   if (CGDebugInfo *DI = CGM.getModuleDebugInfo())
     DI->completeClassData(Base.getBase());
 
-  OwningPtr<VTableLayout> VTLayout(
-      ItaniumVTContext.createConstructionVTableLayout(
+  std::unique_ptr<VTableLayout> VTLayout(
+      getItaniumVTableContext().createConstructionVTableLayout(
           Base.getBase(), Base.getBaseOffset(), BaseIsVirtual, RD));
 
   // Add the address points.
@@ -633,7 +584,7 @@
   // Create the variable that will hold the construction vtable.
   llvm::GlobalVariable *VTable = 
     CGM.CreateOrReplaceCXXRuntimeVariable(Name, ArrayType, Linkage);
-  CGM.setTypeVisibility(VTable, RD, CodeGenModule::TVK_ForConstructionVTable);
+  CGM.setGlobalVisibility(VTable, RD);
 
   // V-tables are always unnamed_addr.
   VTable->setUnnamedAddr(true);
@@ -752,7 +703,7 @@
 /// strongly elsewhere.  Otherwise, we'd just like to avoid emitting
 /// v-tables when unnecessary.
 bool CodeGenVTables::isVTableExternal(const CXXRecordDecl *RD) {
-  assert(RD->isDynamicClass() && "Non dynamic classes have no VTable.");
+  assert(RD->isDynamicClass() && "Non-dynamic classes have no VTable.");
 
   // If we have an explicit instantiation declaration (and not a
   // definition), the v-table is defined elsewhere.
diff --git a/lib/CodeGen/CGVTables.h b/lib/CodeGen/CGVTables.h
index e8cd55e..e1554be 100644
--- a/lib/CodeGen/CGVTables.h
+++ b/lib/CodeGen/CGVTables.h
@@ -31,11 +31,8 @@
 class CodeGenVTables {
   CodeGenModule &CGM;
 
-  // FIXME: Consider moving ItaniumVTContext and MicrosoftVTContext into
-  // respective CXXABI classes?
-  ItaniumVTableContext ItaniumVTContext;
-  OwningPtr<MicrosoftVTableContext> MicrosoftVTContext;
-  
+  VTableContextBase *VTContext;
+
   /// VTableAddressPointsMapTy - Address points for a single vtable.
   typedef llvm::DenseMap<BaseSubobject, uint64_t> VTableAddressPointsMapTy;
 
@@ -72,10 +69,12 @@
 
   CodeGenVTables(CodeGenModule &CGM);
 
-  ItaniumVTableContext &getItaniumVTableContext() { return ItaniumVTContext; }
+  ItaniumVTableContext &getItaniumVTableContext() {
+    return *cast<ItaniumVTableContext>(VTContext);
+  }
 
   MicrosoftVTableContext &getMicrosoftVTableContext() {
-    return *MicrosoftVTContext.get();
+    return *cast<MicrosoftVTableContext>(VTContext);
   }
 
   /// getSubVTTIndex - Return the index of the sub-VTT for the base class of the
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 83dbbf0..12cb1ab 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -1,12 +1,18 @@
 set(LLVM_LINK_COMPONENTS
-  asmparser
-  bitreader
-  bitwriter
-  irreader
-  instrumentation
-  ipo
-  linker
-  vectorize
+  BitReader
+  BitWriter
+  Core
+  IPO
+  IRReader
+  InstCombine
+  Instrumentation
+  Linker
+  MC
+  ObjCARCOpts
+  ScalarOpts
+  Support
+  Target
+  TransformUtils
   )
 
 add_clang_library(clangCodeGen
@@ -14,12 +20,12 @@
   CGAtomic.cpp
   CGBlocks.cpp
   CGBuiltin.cpp
-  CGCall.cpp
-  CGClass.cpp
   CGCUDANV.cpp
   CGCUDARuntime.cpp
   CGCXX.cpp
   CGCXXABI.cpp
+  CGCall.cpp
+  CGClass.cpp
   CGCleanup.cpp
   CGDebugInfo.cpp
   CGDecl.cpp
@@ -27,45 +33,36 @@
   CGException.cpp
   CGExpr.cpp
   CGExprAgg.cpp
+  CGExprCXX.cpp
   CGExprComplex.cpp
   CGExprConstant.cpp
-  CGExprCXX.cpp
   CGExprScalar.cpp
   CGObjC.cpp
   CGObjCGNU.cpp
   CGObjCMac.cpp
   CGObjCRuntime.cpp
   CGOpenCLRuntime.cpp
-  CGRecordLayoutBuilder.cpp
   CGRTTI.cpp
+  CGRecordLayoutBuilder.cpp
   CGStmt.cpp
-  CGVTables.cpp
   CGVTT.cpp
+  CGVTables.cpp
   CodeGenABITypes.cpp
   CodeGenAction.cpp
   CodeGenFunction.cpp
   CodeGenModule.cpp
+  CodeGenPGO.cpp
   CodeGenTBAA.cpp
   CodeGenTypes.cpp
   ItaniumCXXABI.cpp
   MicrosoftCXXABI.cpp
-  MicrosoftVBTables.cpp
   ModuleBuilder.cpp
   TargetInfo.cpp
-  )
 
-add_dependencies(clangCodeGen
-  ClangARMNeon
-  ClangAttrClasses
-  ClangAttrList
-  ClangCommentNodes
-  ClangDeclNodes
-  ClangDiagnosticCommon
-  ClangDiagnosticFrontend
-  ClangStmtNodes
-  )
+  DEPENDS
+  intrinsics_gen
 
-target_link_libraries(clangCodeGen
+  LINK_LIBS
   clangBasic
   clangAST
   clangFrontend
diff --git a/lib/CodeGen/CodeGenABITypes.cpp b/lib/CodeGen/CodeGenABITypes.cpp
index 18c836c..fba7184 100644
--- a/lib/CodeGen/CodeGenABITypes.cpp
+++ b/lib/CodeGen/CodeGenABITypes.cpp
@@ -17,23 +17,23 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/CodeGen/CodeGenABITypes.h"
-
-#include "clang/CodeGen/CGFunctionInfo.h"
 #include "CodeGenModule.h"
+#include "clang/CodeGen/CGFunctionInfo.h"
+#include "clang/Frontend/CodeGenOptions.h"
 
 using namespace clang;
 using namespace CodeGen;
 
 CodeGenABITypes::CodeGenABITypes(ASTContext &C,
-                                 const CodeGenOptions &CodeGenOpts,
                                  llvm::Module &M,
-                                 const llvm::DataLayout &TD,
-                                 DiagnosticsEngine &Diags)
-  : CGM(new CodeGen::CodeGenModule(C, CodeGenOpts, M, TD, Diags)) {
+                                 const llvm::DataLayout &TD)
+  : CGO(new CodeGenOptions),
+    CGM(new CodeGen::CodeGenModule(C, *CGO, M, TD, C.getDiagnostics())) {
 }
 
 CodeGenABITypes::~CodeGenABITypes()
 {
+  delete CGO;
   delete CGM;
 }
 
@@ -60,10 +60,10 @@
 }
 
 const CGFunctionInfo &
-CodeGenABITypes::arrangeLLVMFunctionInfo(CanQualType returnType,
+CodeGenABITypes::arrangeFreeFunctionCall(CanQualType returnType,
                                          llvm::ArrayRef<CanQualType> argTypes,
                                          FunctionType::ExtInfo info,
                                          RequiredArgs args) {
-  return CGM->getTypes().arrangeLLVMFunctionInfo(returnType, argTypes,
-                                                info, args);
+  return CGM->getTypes().arrangeLLVMFunctionInfo(
+      returnType, /*IsInstanceMethod=*/false, argTypes, info, args);
 }
diff --git a/lib/CodeGen/CodeGenAction.cpp b/lib/CodeGen/CodeGenAction.cpp
index 3072204..573f8e9 100644
--- a/lib/CodeGen/CodeGenAction.cpp
+++ b/lib/CodeGen/CodeGenAction.cpp
@@ -18,17 +18,19 @@
 #include "clang/CodeGen/ModuleBuilder.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
-#include "llvm/Linker.h"
+#include "llvm/Linker/Linker.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/Timer.h"
+#include <memory>
 using namespace clang;
 using namespace llvm;
 
@@ -45,42 +47,33 @@
 
     Timer LLVMIRGeneration;
 
-    OwningPtr<CodeGenerator> Gen;
+    std::unique_ptr<CodeGenerator> Gen;
 
-    OwningPtr<llvm::Module> TheModule, LinkModule;
+    std::unique_ptr<llvm::Module> TheModule, LinkModule;
 
   public:
     BackendConsumer(BackendAction action, DiagnosticsEngine &_Diags,
                     const CodeGenOptions &compopts,
                     const TargetOptions &targetopts,
-                    const LangOptions &langopts,
-                    bool TimePasses,
-                    const std::string &infile,
-                    llvm::Module *LinkModule,
-                    raw_ostream *OS,
-                    LLVMContext &C) :
-      Diags(_Diags),
-      Action(action),
-      CodeGenOpts(compopts),
-      TargetOpts(targetopts),
-      LangOpts(langopts),
-      AsmOutStream(OS),
-      Context(), 
-      LLVMIRGeneration("LLVM IR Generation Time"),
-      Gen(CreateLLVMCodeGen(Diags, infile, compopts, targetopts, C)),
-      LinkModule(LinkModule)
-    {
+                    const LangOptions &langopts, bool TimePasses,
+                    const std::string &infile, llvm::Module *LinkModule,
+                    raw_ostream *OS, LLVMContext &C)
+        : Diags(_Diags), Action(action), CodeGenOpts(compopts),
+          TargetOpts(targetopts), LangOpts(langopts), AsmOutStream(OS),
+          Context(), LLVMIRGeneration("LLVM IR Generation Time"),
+          Gen(CreateLLVMCodeGen(Diags, infile, compopts, targetopts, C)),
+          LinkModule(LinkModule) {
       llvm::TimePassesIsEnabled = TimePasses;
     }
 
-    llvm::Module *takeModule() { return TheModule.take(); }
-    llvm::Module *takeLinkModule() { return LinkModule.take(); }
+    llvm::Module *takeModule() { return TheModule.release(); }
+    llvm::Module *takeLinkModule() { return LinkModule.release(); }
 
-    virtual void HandleCXXStaticMemberVarInstantiation(VarDecl *VD) {
+    void HandleCXXStaticMemberVarInstantiation(VarDecl *VD) override {
       Gen->HandleCXXStaticMemberVarInstantiation(VD);
     }
 
-    virtual void Initialize(ASTContext &Ctx) {
+    void Initialize(ASTContext &Ctx) override {
       Context = &Ctx;
 
       if (llvm::TimePassesIsEnabled)
@@ -94,7 +87,7 @@
         LLVMIRGeneration.stopTimer();
     }
 
-    virtual bool HandleTopLevelDecl(DeclGroupRef D) {
+    bool HandleTopLevelDecl(DeclGroupRef D) override {
       PrettyStackTraceDecl CrashInfo(*D.begin(), SourceLocation(),
                                      Context->getSourceManager(),
                                      "LLVM IR generation of declaration");
@@ -110,7 +103,7 @@
       return true;
     }
 
-    virtual void HandleTranslationUnit(ASTContext &C) {
+    void HandleTranslationUnit(ASTContext &C) override {
       {
         PrettyStackTraceString CrashInfo("Per-file LLVM IR generation");
         if (llvm::TimePassesIsEnabled)
@@ -132,7 +125,7 @@
       if (!M) {
         // The module has been released by IR gen on failures, do not double
         // free.
-        TheModule.take();
+        TheModule.release();
         return;
       }
 
@@ -158,41 +151,49 @@
       void *OldContext = Ctx.getInlineAsmDiagnosticContext();
       Ctx.setInlineAsmDiagnosticHandler(InlineAsmDiagHandler, this);
 
+      LLVMContext::DiagnosticHandlerTy OldDiagnosticHandler =
+          Ctx.getDiagnosticHandler();
+      void *OldDiagnosticContext = Ctx.getDiagnosticContext();
+      Ctx.setDiagnosticHandler(DiagnosticHandler, this);
+
       EmitBackendOutput(Diags, CodeGenOpts, TargetOpts, LangOpts,
+                        C.getTargetInfo().getTargetDescription(),
                         TheModule.get(), Action, AsmOutStream);
-      
+
       Ctx.setInlineAsmDiagnosticHandler(OldHandler, OldContext);
+
+      Ctx.setDiagnosticHandler(OldDiagnosticHandler, OldDiagnosticContext);
     }
 
-    virtual void HandleTagDeclDefinition(TagDecl *D) {
+    void HandleTagDeclDefinition(TagDecl *D) override {
       PrettyStackTraceDecl CrashInfo(D, SourceLocation(),
                                      Context->getSourceManager(),
                                      "LLVM IR generation of declaration");
       Gen->HandleTagDeclDefinition(D);
     }
 
-    virtual void HandleTagDeclRequiredDefinition(const TagDecl *D) {
+    void HandleTagDeclRequiredDefinition(const TagDecl *D) override {
       Gen->HandleTagDeclRequiredDefinition(D);
     }
 
-    virtual void CompleteTentativeDefinition(VarDecl *D) {
+    void CompleteTentativeDefinition(VarDecl *D) override {
       Gen->CompleteTentativeDefinition(D);
     }
 
-    virtual void HandleVTable(CXXRecordDecl *RD, bool DefinitionRequired) {
+    void HandleVTable(CXXRecordDecl *RD, bool DefinitionRequired) override {
       Gen->HandleVTable(RD, DefinitionRequired);
     }
 
-    virtual void HandleLinkerOptionPragma(llvm::StringRef Opts) {
+    void HandleLinkerOptionPragma(llvm::StringRef Opts) override {
       Gen->HandleLinkerOptionPragma(Opts);
     }
 
-    virtual void HandleDetectMismatch(llvm::StringRef Name,
-                                      llvm::StringRef Value) {
+    void HandleDetectMismatch(llvm::StringRef Name,
+                              llvm::StringRef Value) override {
       Gen->HandleDetectMismatch(Name, Value);
     }
 
-    virtual void HandleDependentLibrary(llvm::StringRef Opts) {
+    void HandleDependentLibrary(llvm::StringRef Opts) override {
       Gen->HandleDependentLibrary(Opts);
     }
 
@@ -202,8 +203,23 @@
       ((BackendConsumer*)Context)->InlineAsmDiagHandler2(SM, Loc);
     }
 
+    static void DiagnosticHandler(const llvm::DiagnosticInfo &DI,
+                                  void *Context) {
+      ((BackendConsumer *)Context)->DiagnosticHandlerImpl(DI);
+    }
+
     void InlineAsmDiagHandler2(const llvm::SMDiagnostic &,
                                SourceLocation LocCookie);
+
+    void DiagnosticHandlerImpl(const llvm::DiagnosticInfo &DI);
+    /// \brief Specialized handler for the InlineAsm diagnostic.
+    /// \return True if the diagnostic has been successfully reported, false
+    /// otherwise.
+    bool InlineAsmDiagHandler(const llvm::DiagnosticInfoInlineAsm &D);
+    /// \brief Specialized handler for the StackSize diagnostic.
+    /// \return True if the diagnostic has been successfully reported, false
+    /// otherwise.
+    bool StackSizeDiagHandler(const llvm::DiagnosticInfoStackSize &D);
   };
   
   void BackendConsumer::anchor() {}
@@ -282,7 +298,116 @@
   Diags.Report(Loc, diag::err_fe_inline_asm).AddString(Message);
 }
 
-//
+#define ComputeDiagID(Severity, GroupName, DiagID)                             \
+  do {                                                                         \
+    switch (Severity) {                                                        \
+    case llvm::DS_Error:                                                       \
+      DiagID = diag::err_fe_##GroupName;                                       \
+      break;                                                                   \
+    case llvm::DS_Warning:                                                     \
+      DiagID = diag::warn_fe_##GroupName;                                      \
+      break;                                                                   \
+    case llvm::DS_Remark:                                                      \
+      llvm_unreachable("'remark' severity not expected");                      \
+      break;                                                                   \
+    case llvm::DS_Note:                                                        \
+      DiagID = diag::note_fe_##GroupName;                                      \
+      break;                                                                   \
+    }                                                                          \
+  } while (false)
+
+#define ComputeDiagRemarkID(Severity, GroupName, DiagID)                       \
+  do {                                                                         \
+    switch (Severity) {                                                        \
+    case llvm::DS_Error:                                                       \
+      DiagID = diag::err_fe_##GroupName;                                       \
+      break;                                                                   \
+    case llvm::DS_Warning:                                                     \
+      DiagID = diag::warn_fe_##GroupName;                                      \
+      break;                                                                   \
+    case llvm::DS_Remark:                                                      \
+      DiagID = diag::remark_fe_##GroupName;                                    \
+      break;                                                                   \
+    case llvm::DS_Note:                                                        \
+      DiagID = diag::note_fe_##GroupName;                                      \
+      break;                                                                   \
+    }                                                                          \
+  } while (false)
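+
+// For instance, ComputeDiagID(llvm::DS_Warning, inline_asm, DiagID) expands
+// to a switch that selects diag::warn_fe_inline_asm; the remark variant
+// additionally maps llvm::DS_Remark to the remark_fe_ group (illustrative
+// expansion of the macros above).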
+
+bool
+BackendConsumer::InlineAsmDiagHandler(const llvm::DiagnosticInfoInlineAsm &D) {
+  unsigned DiagID;
+  ComputeDiagID(D.getSeverity(), inline_asm, DiagID);
+  std::string Message = D.getMsgStr().str();
+
+  // If this problem has clang-level source location information, report the
+  // issue as being a problem in the source with a note showing the instantiated
+  // code.
+  SourceLocation LocCookie =
+      SourceLocation::getFromRawEncoding(D.getLocCookie());
+  if (LocCookie.isValid())
+    Diags.Report(LocCookie, DiagID).AddString(Message);
+  else {
+    // Otherwise, report the backend diagnostic as occurring in the generated
+    // .s file.
+    // If Loc is invalid, we still need to report the diagnostic; it just
+    // gets no location info.
+    FullSourceLoc Loc;
+    Diags.Report(Loc, DiagID).AddString(Message);
+  }
+  // We handled all the possible severities.
+  return true;
+}
+
+bool
+BackendConsumer::StackSizeDiagHandler(const llvm::DiagnosticInfoStackSize &D) {
+  if (D.getSeverity() != llvm::DS_Warning)
+    // For now, the only severity we support for the StackSize diagnostic is
+    // a warning; we do not know how to format other severities.
+    return false;
+
+  // FIXME: We should demangle the function name.
+  // FIXME: Is there a way to get a location for that function?
+  FullSourceLoc Loc;
+  Diags.Report(Loc, diag::warn_fe_backend_frame_larger_than)
+      << D.getStackSize() << D.getFunction().getName();
+  return true;
+}
+
+/// \brief This function is invoked when the backend needs
+/// to report something to the user.
+void BackendConsumer::DiagnosticHandlerImpl(const DiagnosticInfo &DI) {
+  unsigned DiagID = diag::err_fe_inline_asm;
+  llvm::DiagnosticSeverity Severity = DI.getSeverity();
+  // Get the diagnostic ID based on the kind of the diagnostic.
+  switch (DI.getKind()) {
+  case llvm::DK_InlineAsm:
+    if (InlineAsmDiagHandler(cast<DiagnosticInfoInlineAsm>(DI)))
+      return;
+    ComputeDiagID(Severity, inline_asm, DiagID);
+    break;
+  case llvm::DK_StackSize:
+    if (StackSizeDiagHandler(cast<DiagnosticInfoStackSize>(DI)))
+      return;
+    ComputeDiagID(Severity, backend_frame_larger_than, DiagID);
+    break;
+  default:
+    // Plugin IDs are not bound to any value as they are set dynamically.
+    ComputeDiagRemarkID(Severity, backend_plugin, DiagID);
+    break;
+  }
+  std::string MsgStorage;
+  {
+    raw_string_ostream Stream(MsgStorage);
+    DiagnosticPrinterRawOStream DP(Stream);
+    DI.print(DP);
+  }
+
+  // Report the backend message using the usual diagnostic mechanism.
+  FullSourceLoc Loc;
+  Diags.Report(Loc, DiagID).AddString(MsgStorage);
+}
+#undef ComputeDiagID
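+// Routing sketch for the handler above: a DK_InlineAsm diagnostic with a
+// valid LocCookie is reported at that source location; a DK_StackSize
+// warning becomes warn_fe_backend_frame_larger_than; anything else is
+// printed through DiagnosticPrinterRawOStream and reported under a
+// plugin-group ID.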
 
 CodeGenAction::CodeGenAction(unsigned _Act, LLVMContext *_VMContext)
   : Act(_Act), LinkModule(0),
@@ -310,9 +435,7 @@
   TheModule.reset(BEConsumer->takeModule());
 }
 
-llvm::Module *CodeGenAction::takeModule() {
-  return TheModule.take();
-}
+llvm::Module *CodeGenAction::takeModule() { return TheModule.release(); }
 
 llvm::LLVMContext *CodeGenAction::takeLLVMContext() {
   OwnsVMContext = false;
@@ -342,7 +465,7 @@
 ASTConsumer *CodeGenAction::CreateASTConsumer(CompilerInstance &CI,
                                               StringRef InFile) {
   BackendAction BA = static_cast<BackendAction>(Act);
-  OwningPtr<raw_ostream> OS(GetOutputStream(CI, InFile, BA));
+  std::unique_ptr<raw_ostream> OS(GetOutputStream(CI, InFile, BA));
   if (BA != Backend_EmitNothing && !OS)
     return 0;
 
@@ -362,20 +485,20 @@
       return 0;
     }
 
-    LinkModuleToUse = getLazyBitcodeModule(BCBuf, *VMContext, &ErrorStr);
-    if (!LinkModuleToUse) {
+    ErrorOr<llvm::Module *> ModuleOrErr =
+        getLazyBitcodeModule(BCBuf, *VMContext);
+    if (error_code EC = ModuleOrErr.getError()) {
       CI.getDiagnostics().Report(diag::err_cannot_open_file)
-        << LinkBCFile << ErrorStr;
+        << LinkBCFile << EC.message();
       return 0;
     }
+    LinkModuleToUse = ModuleOrErr.get();
   }
 
-  BEConsumer = 
-      new BackendConsumer(BA, CI.getDiagnostics(),
-                          CI.getCodeGenOpts(), CI.getTargetOpts(),
-                          CI.getLangOpts(),
-                          CI.getFrontendOpts().ShowTimers, InFile,
-                          LinkModuleToUse, OS.take(), *VMContext);
+  BEConsumer = new BackendConsumer(BA, CI.getDiagnostics(), CI.getCodeGenOpts(),
+                                   CI.getTargetOpts(), CI.getLangOpts(),
+                                   CI.getFrontendOpts().ShowTimers, InFile,
+                                   LinkModuleToUse, OS.release(), *VMContext);
   return BEConsumer;
 }
 
@@ -408,31 +531,30 @@
         SM.getFileEntryForID(SM.getMainFileID()), Err.getLineNo(),
         Err.getColumnNo() + 1);
 
-      // Get a custom diagnostic for the error. We strip off a leading
-      // diagnostic code if there is one.
+      // Strip off a leading diagnostic code if there is one.
       StringRef Msg = Err.getMessage();
       if (Msg.startswith("error: "))
         Msg = Msg.substr(7);
 
-      // Escape '%', which is interpreted as a format character.
-      SmallString<128> EscapedMessage;
-      for (unsigned i = 0, e = Msg.size(); i != e; ++i) {
-        if (Msg[i] == '%')
-          EscapedMessage += '%';
-        EscapedMessage += Msg[i];
-      }
+      unsigned DiagID =
+          CI.getDiagnostics().getCustomDiagID(DiagnosticsEngine::Error, "%0");
 
-      unsigned DiagID = CI.getDiagnostics().getCustomDiagID(
-          DiagnosticsEngine::Error, EscapedMessage);
-
-      CI.getDiagnostics().Report(Loc, DiagID);
+      CI.getDiagnostics().Report(Loc, DiagID) << Msg;
       return;
     }
+    const TargetOptions &TargetOpts = CI.getTargetOpts();
+    if (TheModule->getTargetTriple() != TargetOpts.Triple) {
+      unsigned DiagID = CI.getDiagnostics().getCustomDiagID(
+          DiagnosticsEngine::Warning,
+          "overriding the module target triple with %0");
 
-    EmitBackendOutput(CI.getDiagnostics(), CI.getCodeGenOpts(),
-                      CI.getTargetOpts(), CI.getLangOpts(),
-                      TheModule.get(),
-                      BA, OS);
+      CI.getDiagnostics().Report(SourceLocation(), DiagID) << TargetOpts.Triple;
+      TheModule->setTargetTriple(TargetOpts.Triple);
+    }
+
+    EmitBackendOutput(CI.getDiagnostics(), CI.getCodeGenOpts(), TargetOpts,
+                      CI.getLangOpts(), CI.getTarget().getTargetDescription(),
+                      TheModule.get(), BA, OS);
     return;
   }
 
diff --git a/lib/CodeGen/CodeGenFunction.cpp b/lib/CodeGen/CodeGenFunction.cpp
index ce1b445..806540a 100644
--- a/lib/CodeGen/CodeGenFunction.cpp
+++ b/lib/CodeGen/CodeGenFunction.cpp
@@ -16,12 +16,12 @@
 #include "CGCXXABI.h"
 #include "CGDebugInfo.h"
 #include "CodeGenModule.h"
+#include "CodeGenPGO.h"
 #include "TargetInfo.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/StmtCXX.h"
-#include "clang/Basic/OpenCL.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "clang/Frontend/CodeGenOptions.h"
@@ -44,7 +44,8 @@
       NextCleanupDestIndex(1), FirstBlockInfo(0), EHResumeBlock(0),
       ExceptionSlot(0), EHSelectorSlot(0), DebugInfo(CGM.getModuleDebugInfo()),
       DisableDebugInfo(false), DidCallStackSave(false), IndirectBranch(0),
-      SwitchInsn(0), CaseRangeBlock(0), UnreachableBlock(0), NumReturnExprs(0),
+      PGO(cgm), SwitchInsn(0), SwitchWeights(0),
+      CaseRangeBlock(0), UnreachableBlock(0), NumReturnExprs(0),
       NumSimpleReturnExprs(0), CXXABIThisDecl(0), CXXABIThisValue(0),
       CXXThisValue(0), CXXDefaultInitExprThis(0),
       CXXStructorImplicitParamDecl(0), CXXStructorImplicitParamValue(0),
@@ -157,7 +158,7 @@
   // cleans up functions which started with a unified return block.
   if (ReturnBlock.getBlock()->hasOneUse()) {
     llvm::BranchInst *BI =
-      dyn_cast<llvm::BranchInst>(*ReturnBlock.getBlock()->use_begin());
+      dyn_cast<llvm::BranchInst>(*ReturnBlock.getBlock()->user_begin());
     if (BI && BI->isUnconditional() &&
         BI->getSuccessor(0) == ReturnBlock.getBlock()) {
       // Reset insertion point, including debug location, and delete the
@@ -275,6 +276,14 @@
 
   if (CGM.getCodeGenOpts().EmitDeclMetadata)
     EmitDeclMetadata();
+
+  for (SmallVectorImpl<std::pair<llvm::Instruction *, llvm::Value *> >::iterator
+           I = DeferredReplacements.begin(),
+           E = DeferredReplacements.end();
+       I != E; ++I) {
+    I->first->replaceAllUsesWith(I->second);
+    I->first->eraseFromParent();
+  }
 }
 
 /// ShouldInstrumentFunction - Return true if the current function should be
@@ -381,7 +390,12 @@
       if (pointeeTy.isVolatileQualified())
         typeQuals += typeQuals.empty() ? "volatile" : " volatile";
     } else {
-      addressQuals.push_back(Builder.getInt32(0));
+      uint32_t AddrSpc = 0;
+      if (ty->isImageType())
+        AddrSpc =
+          CGM.getContext().getTargetAddressSpace(LangAS::opencl_global);
+
+      addressQuals.push_back(Builder.getInt32(AddrSpc));
 
       // Get argument type name.
       std::string typeName = ty.getUnqualifiedType().getAsString();
@@ -399,16 +413,17 @@
       if (ty.isVolatileQualified())
         typeQuals += typeQuals.empty() ? "volatile" : " volatile";
     }
-    
+
     argTypeQuals.push_back(llvm::MDString::get(Context, typeQuals));
 
     // Get image access qualifier:
     if (ty->isImageType()) {
-      if (parm->hasAttr<OpenCLImageAccessAttr>() &&
-          parm->getAttr<OpenCLImageAccessAttr>()->getAccess() == CLIA_write_only)
+      const OpenCLImageAccessAttr *A = parm->getAttr<OpenCLImageAccessAttr>();
+      if (A && A->isWriteOnly())
         accessQuals.push_back(llvm::MDString::get(Context, "write_only"));
       else
         accessQuals.push_back(llvm::MDString::get(Context, "read_only"));
+      // FIXME: what about read_write?
     } else
       accessQuals.push_back(llvm::MDString::get(Context, "none"));
 
@@ -438,16 +453,15 @@
     GenOpenCLArgMetadata(FD, Fn, CGM, Context, kernelMDArgs,
                          Builder, getContext());
 
-  if (FD->hasAttr<VecTypeHintAttr>()) {
-    VecTypeHintAttr *attr = FD->getAttr<VecTypeHintAttr>();
-    QualType hintQTy = attr->getTypeHint();
+  if (const VecTypeHintAttr *A = FD->getAttr<VecTypeHintAttr>()) {
+    QualType hintQTy = A->getTypeHint();
     const ExtVectorType *hintEltQTy = hintQTy->getAs<ExtVectorType>();
     bool isSignedInteger =
         hintQTy->isSignedIntegerType() ||
         (hintEltQTy && hintEltQTy->getElementType()->isSignedIntegerType());
     llvm::Value *attrMDArgs[] = {
       llvm::MDString::get(Context, "vec_type_hint"),
-      llvm::UndefValue::get(CGM.getTypes().ConvertType(attr->getTypeHint())),
+      llvm::UndefValue::get(CGM.getTypes().ConvertType(A->getTypeHint())),
       llvm::ConstantInt::get(
           llvm::IntegerType::get(Context, 32),
           llvm::APInt(32, (uint64_t)(isSignedInteger ? 1 : 0)))
@@ -455,24 +469,22 @@
     kernelMDArgs.push_back(llvm::MDNode::get(Context, attrMDArgs));
   }
 
-  if (FD->hasAttr<WorkGroupSizeHintAttr>()) {
-    WorkGroupSizeHintAttr *attr = FD->getAttr<WorkGroupSizeHintAttr>();
+  if (const WorkGroupSizeHintAttr *A = FD->getAttr<WorkGroupSizeHintAttr>()) {
     llvm::Value *attrMDArgs[] = {
       llvm::MDString::get(Context, "work_group_size_hint"),
-      Builder.getInt32(attr->getXDim()),
-      Builder.getInt32(attr->getYDim()),
-      Builder.getInt32(attr->getZDim())
+      Builder.getInt32(A->getXDim()),
+      Builder.getInt32(A->getYDim()),
+      Builder.getInt32(A->getZDim())
     };
     kernelMDArgs.push_back(llvm::MDNode::get(Context, attrMDArgs));
   }
 
-  if (FD->hasAttr<ReqdWorkGroupSizeAttr>()) {
-    ReqdWorkGroupSizeAttr *attr = FD->getAttr<ReqdWorkGroupSizeAttr>();
+  if (const ReqdWorkGroupSizeAttr *A = FD->getAttr<ReqdWorkGroupSizeAttr>()) {
     llvm::Value *attrMDArgs[] = {
       llvm::MDString::get(Context, "reqd_work_group_size"),
-      Builder.getInt32(attr->getXDim()),
-      Builder.getInt32(attr->getYDim()),
-      Builder.getInt32(attr->getZDim())
+      Builder.getInt32(A->getXDim()),
+      Builder.getInt32(A->getYDim()),
+      Builder.getInt32(A->getZDim())
     };
     kernelMDArgs.push_back(llvm::MDNode::get(Context, attrMDArgs));
   }
@@ -505,15 +517,18 @@
   }
 
   // Pass inline keyword to optimizer if it appears explicitly on any
-  // declaration.
-  if (!CGM.getCodeGenOpts().NoInline)
-    if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D))
-      for (FunctionDecl::redecl_iterator RI = FD->redecls_begin(),
-             RE = FD->redecls_end(); RI != RE; ++RI)
+  // declaration. Also, in the case of -fno-inline, attach the NoInline
+  // attribute to all functions that are not marked AlwaysInline.
+  if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D)) {
+    if (!CGM.getCodeGenOpts().NoInline) {
+      for (auto RI : FD->redecls())
         if (RI->isInlineSpecified()) {
           Fn->addFnAttr(llvm::Attribute::InlineHint);
           break;
         }
+    } else if (!FD->hasAttr<AlwaysInlineAttr>())
+      Fn->addFnAttr(llvm::Attribute::NoInline);
+  }
 
   if (getLangOpts().OpenCL) {
     // Add metadata for a kernel function.
@@ -581,6 +596,14 @@
     // Indirect aggregate return; emit returned value directly into sret slot.
     // This reduces code size, and affects correctness in C++.
     ReturnValue = CurFn->arg_begin();
+  } else if (CurFnInfo->getReturnInfo().getKind() == ABIArgInfo::InAlloca &&
+             !hasScalarEvaluationKind(CurFnInfo->getReturnType())) {
+    // Load the sret pointer from the argument struct and return into that.
+    unsigned Idx = CurFnInfo->getReturnInfo().getInAllocaFieldIndex();
+    llvm::Function::arg_iterator EI = CurFn->arg_end();
+    --EI;
+    llvm::Value *Addr = Builder.CreateStructGEP(EI, Idx);
+    ReturnValue = Builder.CreateLoad(Addr, "agg.result");
   } else {
     ReturnValue = CreateIRTemp(RetTy, "retval");
 
@@ -645,12 +668,34 @@
 
 void CodeGenFunction::EmitFunctionBody(FunctionArgList &Args,
                                        const Stmt *Body) {
+  RegionCounter Cnt = getPGORegionCounter(Body);
+  Cnt.beginRegion(Builder);
   if (const CompoundStmt *S = dyn_cast<CompoundStmt>(Body))
     EmitCompoundStmtWithoutScope(*S);
   else
     EmitStmt(Body);
 }
 
+/// When instrumenting to collect profile data, the counts for some blocks
+/// such as switch cases must not include the fall-through counts, so
+/// emit a branch around the instrumentation code. When not instrumenting,
+/// this just calls EmitBlock().
+void CodeGenFunction::EmitBlockWithFallThrough(llvm::BasicBlock *BB,
+                                               RegionCounter &Cnt) {
+  llvm::BasicBlock *SkipCountBB = 0;
+  if (HaveInsertPoint() && CGM.getCodeGenOpts().ProfileInstrGenerate) {
+    // When instrumenting for profiling, the fallthrough to certain
+    // statements needs to skip over the instrumentation code so that we
+    // get an accurate count.
+    SkipCountBB = createBasicBlock("skipcount");
+    EmitBranch(SkipCountBB);
+  }
+  EmitBlock(BB);
+  Cnt.beginRegion(Builder, /*AddIncomingFallThrough=*/true);
+  if (SkipCountBB)
+    EmitBlock(SkipCountBB);
+}
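+// A sketch of the CFG EmitBlockWithFallThrough produces when instrumenting
+// (hypothetical labels):
+//
+//   <fallthrough predecessor>
+//     br label %skipcount       ; bypass the counter update
+//   sw.bb:                      ; the switch branches here directly
+//     <increment region counter>
+//     br label %skipcount
+//   skipcount:                  ; counted and fall-through paths rejoin
+//
+// so the counter in sw.bb only sees executions that entered via the switch.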
+
 /// Tries to mark the given function nounwind based on the
 /// non-existence of any throwing calls within it.  We believe this is
 /// lightweight enough to do at -O0.
@@ -691,19 +736,22 @@
     DebugInfo = NULL; // disable debug info indefinitely for this function
 
   FunctionArgList Args;
-  QualType ResTy = FD->getResultType();
+  QualType ResTy = FD->getReturnType();
 
   CurGD = GD;
-  const CXXMethodDecl *MD;
-  if ((MD = dyn_cast<CXXMethodDecl>(FD)) && MD->isInstance()) {
+  const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD);
+  if (MD && MD->isInstance()) {
     if (CGM.getCXXABI().HasThisReturn(GD))
       ResTy = MD->getThisType(getContext());
-    CGM.getCXXABI().BuildInstanceFunctionParams(*this, ResTy, Args);
+    CGM.getCXXABI().buildThisParam(*this, Args);
   }
 
   for (unsigned i = 0, e = FD->getNumParams(); i != e; ++i)
     Args.push_back(FD->getParamDecl(i));
 
+  if (MD && (isa<CXXConstructorDecl>(MD) || isa<CXXDestructorDecl>(MD)))
+    CGM.getCXXABI().addImplicitStructorParams(*this, ResTy, Args);
+
   SourceRange BodyRange;
   if (Stmt *Body = FD->getBody()) BodyRange = Body->getSourceRange();
   CurEHLocation = BodyRange.getEnd();
@@ -712,6 +760,7 @@
   StartFunction(GD, ResTy, Fn, FnInfo, Args, BodyRange.getBegin());
 
   // Generate the body of the function.
+  PGO.assignRegionCounters(GD.getDecl(), CurFn);
   if (isa<CXXDestructorDecl>(FD))
     EmitDestructorBody(Args);
   else if (isa<CXXConstructorDecl>(FD))
@@ -753,7 +802,7 @@
   //   If the '}' that terminates a function is reached, and the value of the
   //   function call is used by the caller, the behavior is undefined.
   if (getLangOpts().CPlusPlus && !FD->hasImplicitReturnZero() &&
-      !FD->getResultType()->isVoidType() && Builder.GetInsertBlock()) {
+      !FD->getReturnType()->isVoidType() && Builder.GetInsertBlock()) {
     if (SanOpts->Return)
       EmitCheck(Builder.getFalse(), "missing_return",
                 EmitCheckSourceLocation(FD->getLocation()),
@@ -771,6 +820,9 @@
   // a quick pass now to see if we can.
   if (!CurFn->doesNotThrow())
     TryMarkNoThrow(CurFn);
+
+  PGO.emitInstrumentationData();
+  PGO.destroyRegionCounters();
 }
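+
+// Rough per-function lifecycle of the PGO state wired up above (a sketch
+// summarizing the calls added in this patch, not new API):
+//   PGO.assignRegionCounters(GD.getDecl(), CurFn); // pick counters for stmts
+//   ... RegionCounter::beginRegion() updates counts during codegen ...
+//   PGO.emitInstrumentationData();  // emit this function's profile data
+//   PGO.destroyRegionCounters();    // release the counter storage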
 
 /// ContainsLabel - Return true if the statement contains a label in it.  If
@@ -869,19 +921,25 @@
 ///
 void CodeGenFunction::EmitBranchOnBoolExpr(const Expr *Cond,
                                            llvm::BasicBlock *TrueBlock,
-                                           llvm::BasicBlock *FalseBlock) {
+                                           llvm::BasicBlock *FalseBlock,
+                                           uint64_t TrueCount) {
   Cond = Cond->IgnoreParens();
 
   if (const BinaryOperator *CondBOp = dyn_cast<BinaryOperator>(Cond)) {
+
     // Handle X && Y in a condition.
     if (CondBOp->getOpcode() == BO_LAnd) {
+      RegionCounter Cnt = getPGORegionCounter(CondBOp);
+
       // If we have "1 && X", simplify the code.  "0 && X" would have constant
       // folded if the case was simple enough.
       bool ConstantBool = false;
       if (ConstantFoldsToSimpleInteger(CondBOp->getLHS(), ConstantBool) &&
           ConstantBool) {
         // br(1 && X) -> br(X).
-        return EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock);
+        Cnt.beginRegion(Builder);
+        return EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock,
+                                    TrueCount);
       }
 
       // If we have "X && 1", simplify the code to use an uncond branch.
@@ -889,33 +947,42 @@
       if (ConstantFoldsToSimpleInteger(CondBOp->getRHS(), ConstantBool) &&
           ConstantBool) {
         // br(X && 1) -> br(X).
-        return EmitBranchOnBoolExpr(CondBOp->getLHS(), TrueBlock, FalseBlock);
+        return EmitBranchOnBoolExpr(CondBOp->getLHS(), TrueBlock, FalseBlock,
+                                    TrueCount);
       }
 
       // Emit the LHS as a conditional.  If the LHS conditional is false, we
       // want to jump to the FalseBlock.
       llvm::BasicBlock *LHSTrue = createBasicBlock("land.lhs.true");
+      // The counter tells us how often we evaluate the RHS, and all of
+      // TrueCount can be propagated to that branch.
+      uint64_t RHSCount = Cnt.getCount();
 
       ConditionalEvaluation eval(*this);
-      EmitBranchOnBoolExpr(CondBOp->getLHS(), LHSTrue, FalseBlock);
+      EmitBranchOnBoolExpr(CondBOp->getLHS(), LHSTrue, FalseBlock, RHSCount);
       EmitBlock(LHSTrue);
 
       // Any temporaries created here are conditional.
+      Cnt.beginRegion(Builder);
       eval.begin(*this);
-      EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock);
+      EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock, TrueCount);
       eval.end(*this);
 
       return;
     }
 
     if (CondBOp->getOpcode() == BO_LOr) {
+      RegionCounter Cnt = getPGORegionCounter(CondBOp);
+
       // If we have "0 || X", simplify the code.  "1 || X" would have constant
       // folded if the case was simple enough.
       bool ConstantBool = false;
       if (ConstantFoldsToSimpleInteger(CondBOp->getLHS(), ConstantBool) &&
           !ConstantBool) {
         // br(0 || X) -> br(X).
-        return EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock);
+        Cnt.beginRegion(Builder);
+        return EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock,
+                                    TrueCount);
       }
 
       // If we have "X || 0", simplify the code to use an uncond branch.
@@ -923,20 +990,28 @@
       if (ConstantFoldsToSimpleInteger(CondBOp->getRHS(), ConstantBool) &&
           !ConstantBool) {
         // br(X || 0) -> br(X).
-        return EmitBranchOnBoolExpr(CondBOp->getLHS(), TrueBlock, FalseBlock);
+        return EmitBranchOnBoolExpr(CondBOp->getLHS(), TrueBlock, FalseBlock,
+                                    TrueCount);
       }
 
       // Emit the LHS as a conditional.  If the LHS conditional is true, we
       // want to jump to the TrueBlock.
       llvm::BasicBlock *LHSFalse = createBasicBlock("lor.lhs.false");
+      // We have the count for entry to the RHS and for the whole expression
+      // being true, so we can divvy up the true count between the short
+      // circuit and the RHS.
+      uint64_t LHSCount = Cnt.getParentCount() - Cnt.getCount();
+      uint64_t RHSCount = TrueCount - LHSCount;
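+      // Worked example (hypothetical counts): if the parent region runs 100
+      // times and the RHS is evaluated 30 times, the LHS short-circuits
+      // 100 - 30 = 70 times; with TrueCount == 80, the RHS must then be
+      // true 80 - 70 = 10 times.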
 
       ConditionalEvaluation eval(*this);
-      EmitBranchOnBoolExpr(CondBOp->getLHS(), TrueBlock, LHSFalse);
+      EmitBranchOnBoolExpr(CondBOp->getLHS(), TrueBlock, LHSFalse, LHSCount);
       EmitBlock(LHSFalse);
 
       // Any temporaries created here are conditional.
+      Cnt.beginRegion(Builder);
       eval.begin(*this);
-      EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock);
+      EmitBranchOnBoolExpr(CondBOp->getRHS(), TrueBlock, FalseBlock, RHSCount);
+
       eval.end(*this);
 
       return;
@@ -945,8 +1020,13 @@
 
   if (const UnaryOperator *CondUOp = dyn_cast<UnaryOperator>(Cond)) {
     // br(!x, t, f) -> br(x, f, t)
-    if (CondUOp->getOpcode() == UO_LNot)
-      return EmitBranchOnBoolExpr(CondUOp->getSubExpr(), FalseBlock, TrueBlock);
+    if (CondUOp->getOpcode() == UO_LNot) {
+      // Negate the count.
+      uint64_t FalseCount = PGO.getCurrentRegionCount() - TrueCount;
+      // Negate the condition and swap the destination blocks.
+      return EmitBranchOnBoolExpr(CondUOp->getSubExpr(), FalseBlock, TrueBlock,
+                                  FalseCount);
+    }
   }
 
   if (const ConditionalOperator *CondOp = dyn_cast<ConditionalOperator>(Cond)) {
@@ -954,17 +1034,32 @@
     llvm::BasicBlock *LHSBlock = createBasicBlock("cond.true");
     llvm::BasicBlock *RHSBlock = createBasicBlock("cond.false");
 
+    RegionCounter Cnt = getPGORegionCounter(CondOp);
     ConditionalEvaluation cond(*this);
-    EmitBranchOnBoolExpr(CondOp->getCond(), LHSBlock, RHSBlock);
+    EmitBranchOnBoolExpr(CondOp->getCond(), LHSBlock, RHSBlock, Cnt.getCount());
+
+    // When computing PGO branch weights, we only know the overall count for
+    // the true block. This code is essentially doing tail duplication of the
+    // naive code-gen, introducing new edges for which counts are not
+    // available. Divide the counts proportionally between the LHS and RHS of
+    // the conditional operator.
+    uint64_t LHSScaledTrueCount = 0;
+    if (TrueCount) {
+      double LHSRatio = Cnt.getCount() / (double) Cnt.getParentCount();
+      LHSScaledTrueCount = TrueCount * LHSRatio;
+    }
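+    // Worked example (hypothetical counts): if the conditional is reached
+    // 100 times, the LHS taken on 25 of them, and TrueCount is 40, the LHS
+    // is credited 40 * 0.25 = 10 and the RHS the remaining 40 - 10 = 30.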
 
     cond.begin(*this);
     EmitBlock(LHSBlock);
-    EmitBranchOnBoolExpr(CondOp->getLHS(), TrueBlock, FalseBlock);
+    Cnt.beginRegion(Builder);
+    EmitBranchOnBoolExpr(CondOp->getLHS(), TrueBlock, FalseBlock,
+                         LHSScaledTrueCount);
     cond.end(*this);
 
     cond.begin(*this);
     EmitBlock(RHSBlock);
-    EmitBranchOnBoolExpr(CondOp->getRHS(), TrueBlock, FalseBlock);
+    EmitBranchOnBoolExpr(CondOp->getRHS(), TrueBlock, FalseBlock,
+                         TrueCount - LHSScaledTrueCount);
     cond.end(*this);
 
     return;
@@ -980,9 +1075,15 @@
     return;
   }
 
+  // Create branch weights based on the number of times we get here and the
+  // number of times the condition should be true.
+  uint64_t CurrentCount = std::max(PGO.getCurrentRegionCount(), TrueCount);
+  llvm::MDNode *Weights = PGO.createBranchWeights(TrueCount,
+                                                  CurrentCount - TrueCount);
+
   // Emit the code with the fully general case.
   llvm::Value *CondV = EvaluateExprAsBool(Cond);
-  Builder.CreateCondBr(CondV, TrueBlock, FalseBlock);
+  Builder.CreateCondBr(CondV, TrueBlock, FalseBlock, Weights);
 }
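+// For instance (hypothetical counts): a current region count of 100 with a
+// TrueCount of 60 yields branch_weights of {60, 40} on the conditional
+// branch emitted above.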
 
 /// ErrorUnsupported - Print out an error that codegen doesn't support the
@@ -1075,7 +1176,7 @@
                                           getContext().getAsArrayType(Ty))) {
       QualType eltType;
       llvm::Value *numElts;
-      llvm::tie(numElts, eltType) = getVLASize(vlaType);
+      std::tie(numElts, eltType) = getVLASize(vlaType);
 
       SizeVal = numElts;
       CharUnits eltSize = getContext().getTypeSizeInChars(eltType);
@@ -1264,7 +1365,7 @@
       numElements = vlaSize;
     } else {
       // It's undefined behavior if this wraps around, so mark it that way.
-      // FIXME: Teach -fcatch-undefined-behavior to trap this.
+      // FIXME: Teach -fsanitize=undefined to trap this.
       numElements = Builder.CreateNUWMul(numElements, vlaSize);
     }
   } while ((type = getContext().getAsVariableArrayType(elementType)));
@@ -1308,6 +1409,10 @@
     case Type::ObjCObjectPointer:
       llvm_unreachable("type class is never variably-modified!");
 
+    case Type::Adjusted:
+      type = cast<AdjustedType>(ty)->getAdjustedType();
+      break;
+
     case Type::Decayed:
       type = cast<DecayedType>(ty)->getPointeeType();
       break;
@@ -1375,7 +1480,7 @@
 
     case Type::FunctionProto:
     case Type::FunctionNoProto:
-      type = cast<FunctionType>(ty)->getResultType();
+      type = cast<FunctionType>(ty)->getReturnType();
       break;
 
     case Type::Paren:
@@ -1464,12 +1569,10 @@
   assert(D->hasAttr<AnnotateAttr>() && "no annotate attribute");
   // FIXME We create a new bitcast for every annotation because that's what
   // llvm-gcc was doing.
-  for (specific_attr_iterator<AnnotateAttr>
-       ai = D->specific_attr_begin<AnnotateAttr>(),
-       ae = D->specific_attr_end<AnnotateAttr>(); ai != ae; ++ai)
+  for (const auto *I : D->specific_attrs<AnnotateAttr>())
     EmitAnnotationCall(CGM.getIntrinsic(llvm::Intrinsic::var_annotation),
                        Builder.CreateBitCast(V, CGM.Int8PtrTy, V->getName()),
-                       (*ai)->getAnnotation(), D->getLocation());
+                       I->getAnnotation(), D->getLocation());
 }
 
 llvm::Value *CodeGenFunction::EmitFieldAnnotations(const FieldDecl *D,
@@ -1479,15 +1582,13 @@
   llvm::Value *F = CGM.getIntrinsic(llvm::Intrinsic::ptr_annotation,
                                     CGM.Int8PtrTy);
 
-  for (specific_attr_iterator<AnnotateAttr>
-       ai = D->specific_attr_begin<AnnotateAttr>(),
-       ae = D->specific_attr_end<AnnotateAttr>(); ai != ae; ++ai) {
+  for (const auto *I : D->specific_attrs<AnnotateAttr>()) {
     // FIXME Always emit the cast inst so we can differentiate between
     // annotation on the first field of a struct and annotation on the struct
     // itself.
     if (VTy != CGM.Int8PtrTy)
       V = Builder.Insert(new llvm::BitCastInst(V, CGM.Int8PtrTy));
-    V = EmitAnnotationCall(F, V, (*ai)->getAnnotation(), D->getLocation());
+    V = EmitAnnotationCall(F, V, I->getAnnotation(), D->getLocation());
     V = Builder.CreateBitCast(V, VTy);
   }
 
diff --git a/lib/CodeGen/CodeGenFunction.h b/lib/CodeGen/CodeGenFunction.h
index db291e3..5b0653a 100644
--- a/lib/CodeGen/CodeGenFunction.h
+++ b/lib/CodeGen/CodeGenFunction.h
@@ -17,8 +17,9 @@
 #include "CGBuilder.h"
 #include "CGDebugInfo.h"
 #include "CGValue.h"
-#include "EHScopeStack.h"
 #include "CodeGenModule.h"
+#include "CodeGenPGO.h"
+#include "EHScopeStack.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/ExprObjC.h"
@@ -30,8 +31,8 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ValueHandle.h"
 
 namespace llvm {
   class BasicBlock;
@@ -687,8 +688,8 @@
       // act exactly like l-values but are formally required to be
       // r-values in C.
       return expr->isGLValue() ||
-             expr->getType()->isRecordType() ||
-             expr->getType()->isFunctionType();
+             expr->getType()->isFunctionType() ||
+             hasAggregateEvaluationKind(expr->getType());
     }
 
     static OpaqueValueMappingData bind(CodeGenFunction &CGF,
@@ -827,9 +828,21 @@
   };
   SmallVector<BreakContinue, 8> BreakContinueStack;
 
+  CodeGenPGO PGO;
+
+public:
+  /// Get a counter for instrumentation of the region associated with the given
+  /// statement.
+  RegionCounter getPGORegionCounter(const Stmt *S) {
+    return RegionCounter(PGO, S);
+  }
+private:
+
   /// SwitchInsn - This is nearest current switch instruction. It is null if
   /// current context is not in a switch.
   llvm::SwitchInst *SwitchInsn;
+  /// The branch weights of SwitchInsn when doing instrumentation based PGO.
+  SmallVector<uint64_t, 16> *SwitchWeights;
 
   /// CaseRangeBlock - This block holds if condition check for last case
   /// statement range in current switch instruction.
@@ -1019,6 +1032,7 @@
   void pushLifetimeExtendedDestroy(CleanupKind kind, llvm::Value *addr,
                                    QualType type, Destroyer *destroyer,
                                    bool useEHCleanupForArray);
+  void pushStackRestore(CleanupKind kind, llvm::Value *SPMem);
   void emitDestroy(llvm::Value *addr, QualType type, Destroyer *destroyer,
                    bool useEHCleanupForArray);
   llvm::Function *generateDestroyHelper(llvm::Constant *addr, QualType type,
@@ -1137,6 +1151,7 @@
   void EmitDestructorBody(FunctionArgList &Args);
   void emitImplicitAssignmentOperatorBody(FunctionArgList &Args);
   void EmitFunctionBody(FunctionArgList &Args, const Stmt *Body);
+  void EmitBlockWithFallThrough(llvm::BasicBlock *BB, RegionCounter &Cnt);
 
   void EmitForwardingCallToLambda(const CXXMethodDecl *LambdaCallOperator,
                                   CallArgList &CallArgs);
@@ -1380,6 +1395,10 @@
                                  AggValueSlot::IsNotAliased);
   }
 
+  /// CreateInAllocaTmp - Create a temporary memory object for the given
+  /// aggregate type.
+  AggValueSlot CreateInAllocaTmp(QualType T, const Twine &Name = "inalloca");
+
   /// Emit a cast to void* in the appropriate address space.
   llvm::Value *EmitCastToVoidPtr(llvm::Value *value);
 
@@ -1767,7 +1786,8 @@
                          llvm::GlobalValue::LinkageTypes Linkage);
 
   /// EmitParmDecl - Emit a ParmVarDecl or an ImplicitParamDecl.
-  void EmitParmDecl(const VarDecl &D, llvm::Value *Arg, unsigned ArgNo);
+  void EmitParmDecl(const VarDecl &D, llvm::Value *Arg, bool ArgIsPointer,
+                    unsigned ArgNo);
 
   /// protectFromPeepholes - Protect a value that we're intending to
   /// store to the side, but which will probably be used later, from
@@ -2159,6 +2179,18 @@
   llvm::Value *EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty);
   llvm::Value *EmitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E);
   llvm::Value *EmitARMBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
+
+  llvm::Value *EmitCommonNeonBuiltinExpr(unsigned BuiltinID,
+                                         unsigned LLVMIntrinsic,
+                                         unsigned AltLLVMIntrinsic,
+                                         const char *NameHint,
+                                         unsigned Modifier,
+                                         const CallExpr *E,
+                                         SmallVectorImpl<llvm::Value *> &Ops,
+                                         llvm::Value *Align = 0);
+  llvm::Function *LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
+                                          unsigned Modifier, llvm::Type *ArgTy,
+                                          const CallExpr *E);
   llvm::Value *EmitNeonCall(llvm::Function *F,
                             SmallVectorImpl<llvm::Value*> &O,
                             const char *name,
@@ -2168,6 +2200,20 @@
                                    bool negateForRightShift);
   llvm::Value *EmitNeonRShiftImm(llvm::Value *Vec, llvm::Value *Amt,
                                  llvm::Type *Ty, bool usgn, const char *name);
+  llvm::Value *EmitConcatVectors(llvm::Value *Lo, llvm::Value *Hi,
+                                 llvm::Type *ArgTy);
+  llvm::Value *EmitExtractHigh(llvm::Value *In, llvm::Type *ResTy);
+  // Helper functions for EmitARM64BuiltinExpr.
+  llvm::Value *vectorWrapScalar8(llvm::Value *Op);
+  llvm::Value *vectorWrapScalar16(llvm::Value *Op);
+  llvm::Value *emitVectorWrappedScalar8Intrinsic(
+      unsigned Int, SmallVectorImpl<llvm::Value *> &Ops, const char *Name);
+  llvm::Value *emitVectorWrappedScalar16Intrinsic(
+      unsigned Int, SmallVectorImpl<llvm::Value *> &Ops, const char *Name);
+  llvm::Value *EmitARM64BuiltinExpr(unsigned BuiltinID, const CallExpr *E);
+  llvm::Value *EmitNeon64Call(llvm::Function *F,
+                              llvm::SmallVectorImpl<llvm::Value *> &O,
+                              const char *name);
 
   llvm::Value *BuildVector(ArrayRef<llvm::Value*> Ops);
   llvm::Value *EmitX86BuiltinExpr(unsigned BuiltinID, const CallExpr *E);
@@ -2304,9 +2350,9 @@
 
   /// CreateStaticVarDecl - Create a zero-initialized LLVM global for
   /// a static local variable.
-  llvm::GlobalVariable *CreateStaticVarDecl(const VarDecl &D,
-                                            const char *Separator,
-                                       llvm::GlobalValue::LinkageTypes Linkage);
+  llvm::Constant *CreateStaticVarDecl(const VarDecl &D,
+                                      const char *Separator,
+                                      llvm::GlobalValue::LinkageTypes Linkage);
 
   /// AddInitializerToStaticVarDecl - Add the initializer for 'D' to the
   /// global variable that has already been created for it.  If the initializer
@@ -2413,8 +2459,10 @@
   /// EmitBranchOnBoolExpr - Emit a branch on a boolean condition (e.g. for an
   /// if statement) to the specified blocks.  Based on the condition, this might
   /// try to simplify the codegen of the conditional based on the branch.
+  /// TrueCount should be the number of times we expect the condition to
+  /// evaluate to true based on PGO data.
   void EmitBranchOnBoolExpr(const Expr *Cond, llvm::BasicBlock *TrueBlock,
-                            llvm::BasicBlock *FalseBlock);
+                            llvm::BasicBlock *FalseBlock, uint64_t TrueCount);
 
   /// \brief Emit a description of a type in a format suitable for passing to
   /// a runtime sanitizer handler.
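A hedged sketch of how a PGO TrueCount and a total region count can be folded
into the pair of 32-bit weights that branch_weights metadata accepts; the
scaling Clang actually performs lives in CodeGenPGO.cpp and may differ:

    #include <algorithm>
    #include <cstdint>
    #include <utility>

    static std::pair<uint32_t, uint32_t> makeBranchWeights(uint64_t TrueCount,
                                                           uint64_t Total) {
      uint64_t FalseCount = Total - TrueCount;
      // Scale both counts down if either overflows 32 bits; the +1 keeps a
      // weight from being zero.
      uint64_t Max = std::max(TrueCount, FalseCount);
      uint64_t Scale = Max > UINT32_MAX ? Max / UINT32_MAX + 1 : 1;
      return std::make_pair(uint32_t(TrueCount / Scale) + 1,
                            uint32_t(FalseCount / Scale) + 1);
    }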
@@ -2467,6 +2515,11 @@
   llvm::MDNode *getRangeForLoadFromType(QualType Ty);
   void EmitReturnOfRValue(RValue RV, QualType Ty);
 
+  void deferPlaceholderReplacement(llvm::Instruction *Old, llvm::Value *New);
+
+  llvm::SmallVector<std::pair<llvm::Instruction *, llvm::Value *>, 4>
+  DeferredReplacements;
+
   /// ExpandTypeFromArgs - Reconstruct a structure of type \arg Ty
   /// from function arguments into \arg Dst. See ABIArgInfo::Expand.
   ///
@@ -2492,69 +2545,81 @@
                                   std::string &ConstraintStr,
                                   SourceLocation Loc);
 
+public:
   /// EmitCallArgs - Emit call arguments for a function.
-  /// The CallArgTypeInfo parameter is used for iterating over the known
-  /// argument types of the function being called.
-  template<typename T>
-  void EmitCallArgs(CallArgList& Args, const T* CallArgTypeInfo,
+  template <typename T>
+  void EmitCallArgs(CallArgList &Args, const T *CallArgTypeInfo,
                     CallExpr::const_arg_iterator ArgBeg,
                     CallExpr::const_arg_iterator ArgEnd,
                     bool ForceColumnInfo = false) {
-    CGDebugInfo *DI = getDebugInfo();
-    SourceLocation CallLoc;
-    if (DI) CallLoc = DI->getLocation();
-
-    CallExpr::const_arg_iterator Arg = ArgBeg;
-
-    // First, use the argument types that the type info knows about
     if (CallArgTypeInfo) {
-      for (typename T::arg_type_iterator I = CallArgTypeInfo->arg_type_begin(),
-           E = CallArgTypeInfo->arg_type_end(); I != E; ++I, ++Arg) {
-        assert(Arg != ArgEnd && "Running over edge of argument list!");
-        QualType ArgType = *I;
-#ifndef NDEBUG
-        QualType ActualArgType = Arg->getType();
-        if (ArgType->isPointerType() && ActualArgType->isPointerType()) {
-          QualType ActualBaseType =
-            ActualArgType->getAs<PointerType>()->getPointeeType();
-          QualType ArgBaseType =
-            ArgType->getAs<PointerType>()->getPointeeType();
-          if (ArgBaseType->isVariableArrayType()) {
-            if (const VariableArrayType *VAT =
-                getContext().getAsVariableArrayType(ActualBaseType)) {
-              if (!VAT->getSizeExpr())
-                ActualArgType = ArgType;
-            }
-          }
-        }
-        assert(getContext().getCanonicalType(ArgType.getNonReferenceType()).
-               getTypePtr() ==
-               getContext().getCanonicalType(ActualArgType).getTypePtr() &&
-               "type mismatch in call argument!");
-#endif
-        EmitCallArg(Args, *Arg, ArgType);
-
-        // Each argument expression could modify the debug
-        // location. Restore it.
-        if (DI) DI->EmitLocation(Builder, CallLoc, ForceColumnInfo);
-      }
-
-      // Either we've emitted all the call args, or we have a call to a
-      // variadic function.
-      assert((Arg == ArgEnd || CallArgTypeInfo->isVariadic()) &&
-             "Extra arguments in non-variadic function!");
-
-    }
-
-    // If we still have any arguments, emit them using the type of the argument.
-    for (; Arg != ArgEnd; ++Arg) {
-      EmitCallArg(Args, *Arg, Arg->getType());
-
-      // Restore the debug location.
-      if (DI) DI->EmitLocation(Builder, CallLoc, ForceColumnInfo);
+      EmitCallArgs(Args, CallArgTypeInfo->isVariadic(),
+                   CallArgTypeInfo->param_type_begin(),
+                   CallArgTypeInfo->param_type_end(), ArgBeg, ArgEnd,
+                   ForceColumnInfo);
+    } else {
+      // T::param_type_iterator might not have a default ctor.
+      const QualType *NoIter = 0;
+      EmitCallArgs(Args, /*AllowExtraArguments=*/true, NoIter, NoIter, ArgBeg,
+                   ArgEnd, ForceColumnInfo);
     }
   }
 
+  template<typename ArgTypeIterator>
+  void EmitCallArgs(CallArgList& Args,
+                    bool AllowExtraArguments,
+                    ArgTypeIterator ArgTypeBeg,
+                    ArgTypeIterator ArgTypeEnd,
+                    CallExpr::const_arg_iterator ArgBeg,
+                    CallExpr::const_arg_iterator ArgEnd,
+                    bool ForceColumnInfo = false) {
+    SmallVector<QualType, 16> ArgTypes;
+    CallExpr::const_arg_iterator Arg = ArgBeg;
+
+    // First, use the argument types that the type info knows about
+    for (ArgTypeIterator I = ArgTypeBeg, E = ArgTypeEnd; I != E; ++I, ++Arg) {
+      assert(Arg != ArgEnd && "Running over edge of argument list!");
+#ifndef NDEBUG
+      QualType ArgType = *I;
+      QualType ActualArgType = Arg->getType();
+      if (ArgType->isPointerType() && ActualArgType->isPointerType()) {
+        QualType ActualBaseType =
+            ActualArgType->getAs<PointerType>()->getPointeeType();
+        QualType ArgBaseType =
+            ArgType->getAs<PointerType>()->getPointeeType();
+        if (ArgBaseType->isVariableArrayType()) {
+          if (const VariableArrayType *VAT =
+              getContext().getAsVariableArrayType(ActualBaseType)) {
+            if (!VAT->getSizeExpr())
+              ActualArgType = ArgType;
+          }
+        }
+      }
+      assert(getContext().getCanonicalType(ArgType.getNonReferenceType()).
+             getTypePtr() ==
+             getContext().getCanonicalType(ActualArgType).getTypePtr() &&
+             "type mismatch in call argument!");
+#endif
+      ArgTypes.push_back(*I);
+    }
+
+    // Either we've emitted all the call args, or we have a call to a
+    // variadic function or some other call that allows extra arguments.
+    assert((Arg == ArgEnd || AllowExtraArguments) &&
+           "Extra arguments in non-variadic function!");
+
+    // If we still have any arguments, emit them using the type of the argument.
+    for (; Arg != ArgEnd; ++Arg)
+      ArgTypes.push_back(Arg->getType());
+
+    EmitCallArgs(Args, ArgTypes, ArgBeg, ArgEnd, ForceColumnInfo);
+  }
+
+  void EmitCallArgs(CallArgList &Args, ArrayRef<QualType> ArgTypes,
+                    CallExpr::const_arg_iterator ArgBeg,
+                    CallExpr::const_arg_iterator ArgEnd, bool ForceColumnInfo);
+
+private:
   const TargetCodeGenInfo &getTargetHooks() const {
     return CGM.getTargetCodeGenInfo();
   }
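The EmitCallArgs rewrite above splits one monolithic template into a thin
dispatcher plus an iterator-pair worker, passing null pointers as an empty
range when there is no prototype to walk (T::param_type_iterator may not be
default-constructible, but a pointer pair always is). A generic sketch of that
dispatch shape; every name here is illustrative, not Clang's:

    #include <vector>

    template <typename ArgTypeIterator>
    void collectArgTypes(std::vector<int> &Out, ArgTypeIterator B,
                         ArgTypeIterator E) {
      for (ArgTypeIterator I = B; I != E; ++I)
        Out.push_back(*I);
    }

    void collectArgTypes(std::vector<int> &Out,
                         const std::vector<int> *Proto) {
      if (Proto) {
        collectArgTypes(Out, Proto->begin(), Proto->end());
      } else {
        // A null pointer pair is a valid empty range, mirroring NoIter above.
        const int *NoIter = nullptr;
        collectArgTypes(Out, NoIter, NoIter);
      }
    }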
diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp
index 792fbfc..c26f769 100644
--- a/lib/CodeGen/CodeGenModule.cpp
+++ b/lib/CodeGen/CodeGenModule.cpp
@@ -19,6 +19,7 @@
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CodeGenFunction.h"
+#include "CodeGenPGO.h"
 #include "CodeGenTBAA.h"
 #include "TargetInfo.h"
 #include "clang/AST/ASTContext.h"
@@ -40,30 +41,30 @@
 #include "clang/Sema/SemaDiagnostic.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/Mangler.h"
 
 using namespace clang;
 using namespace CodeGen;
 
 static const char AnnotationSection[] = "llvm.metadata";
 
-static CGCXXABI &createCXXABI(CodeGenModule &CGM) {
+static CGCXXABI *createCXXABI(CodeGenModule &CGM) {
   switch (CGM.getTarget().getCXXABI().getKind()) {
   case TargetCXXABI::GenericAArch64:
   case TargetCXXABI::GenericARM:
   case TargetCXXABI::iOS:
+  case TargetCXXABI::iOS64:
   case TargetCXXABI::GenericItanium:
-    return *CreateItaniumCXXABI(CGM);
+    return CreateItaniumCXXABI(CGM);
   case TargetCXXABI::Microsoft:
-    return *CreateMicrosoftCXXABI(CGM);
+    return CreateMicrosoftCXXABI(CGM);
   }
 
   llvm_unreachable("invalid C++ ABI kind");
@@ -77,7 +78,8 @@
       ABI(createCXXABI(*this)), VMContext(M.getContext()), TBAA(0),
       TheTargetCodeGenInfo(0), Types(*this), VTables(*this), ObjCRuntime(0),
       OpenCLRuntime(0), CUDARuntime(0), DebugInfo(0), ARCData(0),
-      NoObjCARCExceptionsMetadata(0), RRData(0), CFConstantStringClassRef(0),
+      NoObjCARCExceptionsMetadata(0), RRData(0), PGOData(0),
+      CFConstantStringClassRef(0),
       ConstantStringClassRef(0), NSConstantStringType(0),
       NSConcreteGlobalBlock(0), NSConcreteStackBlock(0), BlockObjectAssign(0),
       BlockObjectDispose(0), BlockDescriptorType(0), GenericBlockLiteralType(0),
@@ -117,7 +119,7 @@
   if (SanOpts.Thread ||
       (!CodeGenOpts.RelaxedAliasing && CodeGenOpts.OptimizationLevel > 0))
     TBAA = new CodeGenTBAA(Context, VMContext, CodeGenOpts, getLangOpts(),
-                           ABI.getMangleContext());
+                           getCXXABI().getMangleContext());
 
   // If debug info or coverage generation is enabled, create the CGDebugInfo
   // object.
@@ -131,6 +133,9 @@
   if (C.getLangOpts().ObjCAutoRefCount)
     ARCData = new ARCEntrypoints();
   RRData = new RREntrypoints();
+
+  if (!CodeGenOpts.InstrProfileInput.empty())
+    PGOData = new PGOProfileData(*this, CodeGenOpts.InstrProfileInput);
 }
 
 CodeGenModule::~CodeGenModule() {
@@ -138,7 +143,6 @@
   delete OpenCLRuntime;
   delete CUDARuntime;
   delete TheTargetCodeGenInfo;
-  delete &ABI;
   delete TBAA;
   delete DebugInfo;
   delete ARCData;
@@ -184,10 +188,14 @@
     llvm::Function *OldF = cast<llvm::Function>(Entry);
     llvm::Function *NewF = dyn_cast<llvm::Function>(Replacement);
     if (!NewF) {
-      llvm::ConstantExpr *CE = cast<llvm::ConstantExpr>(Replacement);
-      assert(CE->getOpcode() == llvm::Instruction::BitCast ||
-             CE->getOpcode() == llvm::Instruction::GetElementPtr);
-      NewF = dyn_cast<llvm::Function>(CE->getOperand(0));
+      if (llvm::GlobalAlias *Alias = dyn_cast<llvm::GlobalAlias>(Replacement)) {
+        NewF = dyn_cast<llvm::Function>(Alias->getAliasedGlobal());
+      } else {
+        llvm::ConstantExpr *CE = cast<llvm::ConstantExpr>(Replacement);
+        assert(CE->getOpcode() == llvm::Instruction::BitCast ||
+               CE->getOpcode() == llvm::Instruction::GetElementPtr);
+        NewF = dyn_cast<llvm::Function>(CE->getOperand(0));
+      }
     }
 
     // Replace old with new, but keep the old order.
@@ -201,6 +209,9 @@
 }
 
 void CodeGenModule::checkAliases() {
+  // Check that the constructed aliases are well-formed. It is really
+  // unfortunate that we have to do this in CodeGen, but we only construct
+  // mangled names and aliases during codegen.
   bool Error = false;
   for (std::vector<GlobalDecl>::iterator I = Aliases.begin(),
          E = Aliases.end(); I != E; ++I) {
@@ -211,12 +222,37 @@
     llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
     llvm::GlobalAlias *Alias = cast<llvm::GlobalAlias>(Entry);
     llvm::GlobalValue *GV = Alias->getAliasedGlobal();
-    if (GV->isDeclaration()) {
-      Error = true;
-      getDiags().Report(AA->getLocation(), diag::err_alias_to_undefined);
-    } else if (!Alias->resolveAliasedGlobal(/*stopOnWeak*/ false)) {
+    if (!GV) {
       Error = true;
       getDiags().Report(AA->getLocation(), diag::err_cyclic_alias);
+    } else if (GV->isDeclaration()) {
+      Error = true;
+      getDiags().Report(AA->getLocation(), diag::err_alias_to_undefined);
+    }
+
+    // We have to handle aliases to weak aliases here. LLVM itself disallows
+    // them, since the object file semantics would not match the IR ones. For
+    // compatibility with gcc we implement this by just pointing the alias
+    // to its aliasee's aliasee. We also warn, since the user is probably
+    // expecting the link to be weak.
+    llvm::Constant *Aliasee = Alias->getAliasee();
+    llvm::GlobalValue *AliaseeGV;
+    if (auto CE = dyn_cast<llvm::ConstantExpr>(Aliasee)) {
+      assert((CE->getOpcode() == llvm::Instruction::BitCast ||
+              CE->getOpcode() == llvm::Instruction::AddrSpaceCast) &&
+             "Unsupported aliasee");
+      AliaseeGV = cast<llvm::GlobalValue>(CE->getOperand(0));
+    } else {
+      AliaseeGV = cast<llvm::GlobalValue>(Aliasee);
+    }
+    if (auto GA = dyn_cast<llvm::GlobalAlias>(AliaseeGV)) {
+      if (GA->mayBeOverridden()) {
+        getDiags().Report(AA->getLocation(), diag::warn_alias_to_weak_alias)
+          << GA->getAliasedGlobal()->getName() << GA->getName();
+        Aliasee = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+            GA->getAliasee(), Alias->getType());
+        Alias->setAliasee(Aliasee);
+      }
     }
   }
   if (!Error)
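A compact restatement of the aliasee-stripping step above, assuming LLVM
headers; the real code additionally asserts that the ConstantExpr is a bitcast
or an addrspacecast:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/GlobalValue.h"
    #include "llvm/Support/Casting.h"

    // Peel a pointer-cast ConstantExpr off an aliasee to reach the global.
    static llvm::GlobalValue *stripAliaseeCasts(llvm::Constant *Aliasee) {
      if (auto *CE = llvm::dyn_cast<llvm::ConstantExpr>(Aliasee))
        return llvm::cast<llvm::GlobalValue>(CE->getOperand(0));
      return llvm::cast<llvm::GlobalValue>(Aliasee);
    }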
@@ -233,6 +269,10 @@
   }
 }
 
+void CodeGenModule::clear() {
+  DeferredDeclsToEmit.clear();
+}
+
 void CodeGenModule::Release() {
   EmitDeferred();
   applyReplacements();
@@ -247,7 +287,7 @@
   EmitCtorList(GlobalDtors, "llvm.global_dtors");
   EmitGlobalAnnotations();
   EmitStaticExternCAliases();
-  EmitLLVMUsed();
+  emitLLVMUsed();
 
   if (CodeGenOpts.Autolink &&
       (Context.getLangOpts().Modules || !LinkerOptionsMetadata.empty())) {
@@ -332,9 +372,9 @@
     Inst->setMetadata(llvm::LLVMContext::MD_tbaa, TBAAInfo);
 }
 
-void CodeGenModule::Error(SourceLocation loc, StringRef error) {
-  unsigned diagID = getDiags().getCustomDiagID(DiagnosticsEngine::Error, error);
-  getDiags().Report(Context.getFullLoc(loc), diagID);
+void CodeGenModule::Error(SourceLocation loc, StringRef message) {
+  unsigned diagID = getDiags().getCustomDiagID(DiagnosticsEngine::Error, "%0");
+  getDiags().Report(Context.getFullLoc(loc), diagID) << message;
 }
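Error() now routes the caller's text through a fixed "%0" format instead of
using the message itself as the format string, so a '%' in the message can no
longer be misparsed as a diagnostic directive. The same idiom in isolation,
assuming Clang headers; reportError is a hypothetical helper:

    #include "clang/Basic/Diagnostic.h"
    #include "clang/Basic/SourceLocation.h"
    #include "llvm/ADT/StringRef.h"

    static void reportError(clang::DiagnosticsEngine &Diags,
                            clang::SourceLocation Loc, llvm::StringRef Msg) {
      // "%0" is the whole format; Msg is substituted verbatim, so content
      // such as "100% unsupported" is emitted unmangled.
      unsigned DiagID =
          Diags.getCustomDiagID(clang::DiagnosticsEngine::Error, "%0");
      Diags.Report(Loc, DiagID) << Msg;
    }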
 
 /// ErrorUnsupported - Print out an error that codegen doesn't support the
@@ -405,81 +445,13 @@
   TLM = GetLLVMTLSModel(CodeGenOpts.getDefaultTLSModel());
 
   // Override the TLS model if it is explicitly specified.
-  if (D.hasAttr<TLSModelAttr>()) {
-    const TLSModelAttr *Attr = D.getAttr<TLSModelAttr>();
+  if (const TLSModelAttr *Attr = D.getAttr<TLSModelAttr>()) {
     TLM = GetLLVMTLSModel(Attr->getModel());
   }
 
   GV->setThreadLocalMode(TLM);
 }
 
-/// Set the symbol visibility of type information (vtable and RTTI)
-/// associated with the given type.
-void CodeGenModule::setTypeVisibility(llvm::GlobalValue *GV,
-                                      const CXXRecordDecl *RD,
-                                      TypeVisibilityKind TVK) const {
-  setGlobalVisibility(GV, RD);
-
-  if (!CodeGenOpts.HiddenWeakVTables)
-    return;
-
-  // We never want to drop the visibility for RTTI names.
-  if (TVK == TVK_ForRTTIName)
-    return;
-
-  // We want to drop the visibility to hidden for weak type symbols.
-  // This isn't possible if there might be unresolved references
-  // elsewhere that rely on this symbol being visible.
-
-  // This should be kept roughly in sync with setThunkVisibility
-  // in CGVTables.cpp.
-
-  // Preconditions.
-  if (GV->getLinkage() != llvm::GlobalVariable::LinkOnceODRLinkage ||
-      GV->getVisibility() != llvm::GlobalVariable::DefaultVisibility)
-    return;
-
-  // Don't override an explicit visibility attribute.
-  if (RD->getExplicitVisibility(NamedDecl::VisibilityForType))
-    return;
-
-  switch (RD->getTemplateSpecializationKind()) {
-  // We have to disable the optimization if this is an EI definition
-  // because there might be EI declarations in other shared objects.
-  case TSK_ExplicitInstantiationDefinition:
-  case TSK_ExplicitInstantiationDeclaration:
-    return;
-
-  // Every use of a non-template class's type information has to emit it.
-  case TSK_Undeclared:
-    break;
-
-  // In theory, implicit instantiations can ignore the possibility of
-  // an explicit instantiation declaration because there necessarily
-  // must be an EI definition somewhere with default visibility.  In
-  // practice, it's possible to have an explicit instantiation for
-  // an arbitrary template class, and linkers aren't necessarily able
-  // to deal with mixed-visibility symbols.
-  case TSK_ExplicitSpecialization:
-  case TSK_ImplicitInstantiation:
-    return;
-  }
-
-  // If there's a key function, there may be translation units
-  // that don't have the key function's definition.  But ignore
-  // this if we're emitting RTTI under -fno-rtti.
-  if (!(TVK != TVK_ForRTTI) || LangOpts.RTTI) {
-    // FIXME: what should we do if we "lose" the key function during
-    // the emission of the file?
-    if (Context.getCurrentKeyFunction(RD))
-      return;
-  }
-
-  // Otherwise, drop the visibility to hidden.
-  GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
-  GV->setUnnamedAddr(true);
-}
-
 StringRef CodeGenModule::getMangledName(GlobalDecl GD) {
   const NamedDecl *ND = cast<NamedDecl>(GD.getDecl());
 
@@ -581,18 +553,13 @@
 CodeGenModule::getFunctionLinkage(GlobalDecl GD) {
   const FunctionDecl *D = cast<FunctionDecl>(GD.getDecl());
 
-  if (isa<CXXDestructorDecl>(D) &&
-      getCXXABI().useThunkForDtorVariant(cast<CXXDestructorDecl>(D),
-                                         GD.getDtorType()))
-    return llvm::Function::LinkOnceODRLinkage;
-
   GVALinkage Linkage = getContext().GetGVALinkageForFunction(D);
 
   if (Linkage == GVA_Internal)
     return llvm::Function::InternalLinkage;
   
   if (D->hasAttr<DLLExportAttr>())
-    return llvm::Function::DLLExportLinkage;
+    return llvm::Function::ExternalLinkage;
   
   if (D->hasAttr<WeakAttr>())
     return llvm::Function::WeakAnyLinkage;
@@ -622,11 +589,18 @@
   // explicit instantiations can occur in multiple translation units
   // and must all be equivalent. However, we are not allowed to
   // throw away these explicit instantiations.
-  if (Linkage == GVA_ExplicitTemplateInstantiation)
+  if (Linkage == GVA_StrongODR)
     return !Context.getLangOpts().AppleKext
              ? llvm::Function::WeakODRLinkage
              : llvm::Function::ExternalLinkage;
-  
+
+  // Destructor variants in the Microsoft C++ ABI are always linkonce_odr thunks
+  // emitted on an as-needed basis.
+  if (isa<CXXDestructorDecl>(D) &&
+      getCXXABI().useThunkForDtorVariant(cast<CXXDestructorDecl>(D),
+                                         GD.getDtorType()))
+    return llvm::Function::LinkOnceODRLinkage;
+
   // Otherwise, we have strong external linkage.
   assert(Linkage == GVA_StrongExternal);
   return llvm::Function::ExternalLinkage;
@@ -686,10 +660,15 @@
     // Naked implies noinline: we should not be inlining such functions.
     B.addAttribute(llvm::Attribute::Naked);
     B.addAttribute(llvm::Attribute::NoInline);
+  } else if (D->hasAttr<OptimizeNoneAttr>()) {
+    // OptimizeNone implies noinline; we should not be inlining such functions.
+    B.addAttribute(llvm::Attribute::OptimizeNone);
+    B.addAttribute(llvm::Attribute::NoInline);
+  } else if (D->hasAttr<NoDuplicateAttr>()) {
+    B.addAttribute(llvm::Attribute::NoDuplicate);
   } else if (D->hasAttr<NoInlineAttr>()) {
     B.addAttribute(llvm::Attribute::NoInline);
-  } else if ((D->hasAttr<AlwaysInlineAttr>() ||
-              D->hasAttr<ForceInlineAttr>()) &&
+  } else if (D->hasAttr<AlwaysInlineAttr>() &&
              !F->getAttributes().hasAttribute(llvm::AttributeSet::FunctionIndex,
                                               llvm::Attribute::NoInline)) {
     // (noinline wins over always_inline, and we can't specify both in IR)
@@ -704,8 +683,16 @@
   if (D->hasAttr<MinSizeAttr>())
     B.addAttribute(llvm::Attribute::MinSize);
 
+  if (D->hasAttr<OptimizeNoneAttr>()) {
+    // OptimizeNone wins over OptimizeForSize and MinSize.
+    B.removeAttribute(llvm::Attribute::OptimizeForSize);
+    B.removeAttribute(llvm::Attribute::MinSize);
+  }
+
   if (LangOpts.getStackProtector() == LangOptions::SSPOn)
     B.addAttribute(llvm::Attribute::StackProtect);
+  else if (LangOpts.getStackProtector() == LangOptions::SSPStrong)
+    B.addAttribute(llvm::Attribute::StackProtectStrong);
   else if (LangOpts.getStackProtector() == LangOptions::SSPReq)
     B.addAttribute(llvm::Attribute::StackProtectReq);
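Earlier in this hunk OptimizeNone implies NoInline and then strips the size
attributes it conflicts with. The same AttrBuilder calls in isolation,
assuming LLVM headers; applyOptimizeNone is a hypothetical helper:

    #include "llvm/IR/Attributes.h"

    static void applyOptimizeNone(llvm::AttrBuilder &B) {
      B.addAttribute(llvm::Attribute::OptimizeNone);
      B.addAttribute(llvm::Attribute::NoInline);
      // OptimizeNone wins over the size optimizations, so drop them.
      B.removeAttribute(llvm::Attribute::OptimizeForSize);
      B.removeAttribute(llvm::Attribute::MinSize);
    }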
 
@@ -751,7 +738,7 @@
     GV->setVisibility(llvm::GlobalValue::DefaultVisibility);
 
   if (D->hasAttr<UsedAttr>())
-    AddUsedGlobal(GV);
+    addUsedGlobal(GV);
 
   if (const SectionAttr *SA = D->getAttr<SectionAttr>())
     GV->setSection(SA->getName());
@@ -788,7 +775,12 @@
   if (!IsIncompleteFunction)
     SetLLVMFunctionAttributes(FD, getTypes().arrangeGlobalDeclaration(GD), F);
 
-  if (getCXXABI().HasThisReturn(GD)) {
+  // Add the Returned attribute for "this", except for iOS 5 and earlier
+  // where substantial code, including the libstdc++ dylib, was compiled with
+  // GCC and does not actually return "this".
+  if (getCXXABI().HasThisReturn(GD) &&
+      !(getTarget().getTriple().isiOS() &&
+        getTarget().getTriple().isOSVersionLT(6))) {
     assert(!F->arg_empty() &&
            F->arg_begin()->getType()
              ->canLosslesslyBitCastTo(F->getReturnType()) &&
@@ -800,7 +792,8 @@
   // overridden by a definition.
 
   if (FD->hasAttr<DLLImportAttr>()) {
-    F->setLinkage(llvm::Function::DLLImportLinkage);
+    F->setLinkage(llvm::Function::ExternalLinkage);
+    F->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
   } else if (FD->hasAttr<WeakAttr>() ||
              FD->isWeakImported()) {
     // "extern_weak" is overloaded in LLVM; we probably should have
@@ -808,6 +801,8 @@
     F->setLinkage(llvm::Function::ExternalWeakLinkage);
   } else {
     F->setLinkage(llvm::Function::ExternalLinkage);
+    if (FD->hasAttr<DLLExportAttr>())
+      F->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
 
     LinkageInfo LV = FD->getLinkageAndVisibility();
     if (LV.getLinkage() == ExternalLinkage && LV.isVisibilityExplicit()) {
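Several hunks in this file track the same LLVM API change: dllimport and
dllexport are no longer linkage types; the global keeps ordinary external
linkage and carries a separate DLL storage class. A sketch, assuming LLVM
headers; markDLLImport is a hypothetical helper:

    #include "llvm/IR/GlobalValue.h"

    static void markDLLImport(llvm::GlobalValue *GV) {
      // Formerly a single setLinkage(DLLImportLinkage) call; linkage and
      // storage class are now orthogonal properties.
      GV->setLinkage(llvm::GlobalValue::ExternalLinkage);
      GV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
    }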
@@ -825,39 +820,51 @@
                     llvm::Attribute::NoBuiltin);
 }
 
-void CodeGenModule::AddUsedGlobal(llvm::GlobalValue *GV) {
+void CodeGenModule::addUsedGlobal(llvm::GlobalValue *GV) {
   assert(!GV->isDeclaration() &&
          "Only globals with definition can force usage.");
   LLVMUsed.push_back(GV);
 }
 
-void CodeGenModule::EmitLLVMUsed() {
+void CodeGenModule::addCompilerUsedGlobal(llvm::GlobalValue *GV) {
+  assert(!GV->isDeclaration() &&
+         "Only globals with definition can force usage.");
+  LLVMCompilerUsed.push_back(GV);
+}
+
+static void emitUsed(CodeGenModule &CGM, StringRef Name,
+                     std::vector<llvm::WeakVH> &List) {
   // Don't create llvm.used if there is no need.
-  if (LLVMUsed.empty())
+  if (List.empty())
     return;
 
-  // Convert LLVMUsed to what ConstantArray needs.
+  // Convert List to what ConstantArray needs.
   SmallVector<llvm::Constant*, 8> UsedArray;
-  UsedArray.resize(LLVMUsed.size());
-  for (unsigned i = 0, e = LLVMUsed.size(); i != e; ++i) {
+  UsedArray.resize(List.size());
+  for (unsigned i = 0, e = List.size(); i != e; ++i) {
     UsedArray[i] =
-     llvm::ConstantExpr::getBitCast(cast<llvm::Constant>(&*LLVMUsed[i]),
-                                    Int8PtrTy);
+     llvm::ConstantExpr::getBitCast(cast<llvm::Constant>(&*List[i]),
+                                    CGM.Int8PtrTy);
   }
 
   if (UsedArray.empty())
     return;
-  llvm::ArrayType *ATy = llvm::ArrayType::get(Int8PtrTy, UsedArray.size());
+  llvm::ArrayType *ATy = llvm::ArrayType::get(CGM.Int8PtrTy, UsedArray.size());
 
   llvm::GlobalVariable *GV =
-    new llvm::GlobalVariable(getModule(), ATy, false,
+    new llvm::GlobalVariable(CGM.getModule(), ATy, false,
                              llvm::GlobalValue::AppendingLinkage,
                              llvm::ConstantArray::get(ATy, UsedArray),
-                             "llvm.used");
+                             Name);
 
   GV->setSection("llvm.metadata");
 }
 
+void CodeGenModule::emitLLVMUsed() {
+  emitUsed(*this, "llvm.used", LLVMUsed);
+  emitUsed(*this, "llvm.compiler.used", LLVMCompilerUsed);
+}
+
 void CodeGenModule::AppendLinkerOptions(StringRef Opts) {
   llvm::Value *MDOpts = llvm::MDString::get(getLLVMContext(), Opts);
   LinkerOptionsMetadata.push_back(llvm::MDNode::get(getLLVMContext(), MDOpts));
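emitUsed now serves both llvm.used and llvm.compiler.used, keyed by the array
name. A standalone sketch of the same construction, assuming LLVM headers;
emitUsedArray is a hypothetical helper:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"

    static void emitUsedArray(llvm::Module &M, llvm::StringRef Name,
                              llvm::ArrayRef<llvm::Constant *> Values) {
      if (Values.empty())
        return;
      // Each entry is bitcast to i8* so one array type fits all globals.
      llvm::Type *Int8Ptr = llvm::Type::getInt8PtrTy(M.getContext());
      llvm::SmallVector<llvm::Constant *, 8> Casted;
      for (llvm::Constant *C : Values)
        Casted.push_back(llvm::ConstantExpr::getBitCast(C, Int8Ptr));
      llvm::ArrayType *ATy = llvm::ArrayType::get(Int8Ptr, Casted.size());
      llvm::GlobalVariable *GV = new llvm::GlobalVariable(
          M, ATy, /*isConstant=*/false, llvm::GlobalValue::AppendingLinkage,
          llvm::ConstantArray::get(ATy, Casted), Name);
      GV->setSection("llvm.metadata"); // both arrays live in this section
    }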
@@ -1002,31 +1009,23 @@
     // Stop if we're out of both deferred v-tables and deferred declarations.
     if (DeferredDeclsToEmit.empty()) break;
 
-    GlobalDecl D = DeferredDeclsToEmit.back();
+    DeferredGlobal &G = DeferredDeclsToEmit.back();
+    GlobalDecl D = G.GD;
+    llvm::GlobalValue *GV = G.GV;
     DeferredDeclsToEmit.pop_back();
 
+    assert(GV == GetGlobalValue(getMangledName(D)));
     // Check to see if we've already emitted this.  This is necessary
     // for a couple of reasons: first, decls can end up in the
     // deferred-decls queue multiple times, and second, decls can end
     // up with definitions in unusual ways (e.g. by an extern inline
     // function acquiring a strong function redefinition).  Just
     // ignore these cases.
-    //
-    // TODO: That said, looking this up multiple times is very wasteful.
-    StringRef Name = getMangledName(D);
-    llvm::GlobalValue *CGRef = GetGlobalValue(Name);
-    assert(CGRef && "Deferred decl wasn't referenced?");
-
-    if (!CGRef->isDeclaration())
-      continue;
-
-    // GlobalAlias::isDeclaration() defers to the aliasee, but for our
-    // purposes an alias counts as a definition.
-    if (isa<llvm::GlobalAlias>(CGRef))
+    if (!GV->isDeclaration())
       continue;
 
     // Otherwise, emit the definition and move on to the next one.
-    EmitGlobalDefinition(D);
+    EmitGlobalDefinition(D, GV);
   }
 }
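EmitDeferred now pops a DeferredGlobal that pairs the GlobalDecl with the
llvm::GlobalValue already created for it, removing the mangled-name relookup
the deleted TODO complained about. A sketch with stand-in types; only the
GD/GV fields and the addDeferredDeclToEmit(GV, GD) call shape are visible in
this patch, the rest is assumed:

    #include <vector>

    namespace llvm { class GlobalValue; }
    struct GlobalDecl {}; // stand-in for clang::GlobalDecl

    struct DeferredGlobal {
      llvm::GlobalValue *GV; // declaration already emitted for this decl
      GlobalDecl GD;
    };

    struct Emitter {
      std::vector<DeferredGlobal> DeferredDeclsToEmit;
      void addDeferredDeclToEmit(llvm::GlobalValue *GV, GlobalDecl GD) {
        DeferredDeclsToEmit.push_back(DeferredGlobal{GV, GD});
      }
    };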
 
@@ -1096,10 +1095,8 @@
                                          llvm::GlobalValue *GV) {
   assert(D->hasAttr<AnnotateAttr>() && "no annotate attribute");
   // Get the struct elements for these annotations.
-  for (specific_attr_iterator<AnnotateAttr>
-       ai = D->specific_attr_begin<AnnotateAttr>(),
-       ae = D->specific_attr_end<AnnotateAttr>(); ai != ae; ++ai)
-    Annotations.push_back(EmitAnnotateAttr(GV, *ai, D->getLocation()));
+  for (const auto *I : D->specific_attrs<AnnotateAttr>())
+    Annotations.push_back(EmitAnnotateAttr(GV, I, D->getLocation()));
 }
 
 bool CodeGenModule::MayDeferGeneration(const ValueDecl *Global) {
@@ -1196,12 +1193,14 @@
       if (!FD->doesDeclarationForceExternallyVisibleDefinition())
         return;
 
-      const FunctionDecl *InlineDefinition = 0;
-      FD->getBody(InlineDefinition);
-
       StringRef MangledName = getMangledName(GD);
-      DeferredDecls.erase(MangledName);
-      EmitGlobalDefinition(InlineDefinition);
+
+      // Compute the function info and LLVM type.
+      const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
+      llvm::Type *Ty = getTypes().GetFunctionType(FI);
+
+      GetOrCreateLLVMFunction(MangledName, Ty, GD, /*ForVTable=*/false,
+                              /*DontDefer=*/false);
       return;
     }
   } else {
@@ -1231,8 +1230,8 @@
   // If the value has already been used, add it directly to the
   // DeferredDeclsToEmit list.
   StringRef MangledName = getMangledName(GD);
-  if (GetGlobalValue(MangledName))
-    DeferredDeclsToEmit.push_back(GD);
+  if (llvm::GlobalValue *GV = GetGlobalValue(MangledName))
+    addDeferredDeclToEmit(GV, GD);
   else {
     // Otherwise, remember that we saw a deferred decl with this name.  The
     // first use of the mangled name will cause it to move into
@@ -1301,8 +1300,7 @@
   if (getFunctionLinkage(GD) != llvm::Function::AvailableExternallyLinkage)
     return true;
   const FunctionDecl *F = cast<FunctionDecl>(GD.getDecl());
-  if (CodeGenOpts.OptimizationLevel == 0 &&
-      !F->hasAttr<AlwaysInlineAttr>() && !F->hasAttr<ForceInlineAttr>())
+  if (CodeGenOpts.OptimizationLevel == 0 && !F->hasAttr<AlwaysInlineAttr>())
     return false;
   // PR9614. Avoid cases where the source code is lying to us. An available
   // externally function should have an equivalent function somewhere else,
@@ -1329,7 +1327,7 @@
     }
 }
 
-void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD) {
+void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) {
   const ValueDecl *D = cast<ValueDecl>(GD.getDecl());
 
   PrettyStackTraceDecl CrashInfo(const_cast<ValueDecl *>(D), D->getLocation(), 
@@ -1351,7 +1349,7 @@
       else if (const CXXDestructorDecl *DD =dyn_cast<CXXDestructorDecl>(Method))
         EmitCXXDestructor(DD, GD.getDtorType());
       else
-        EmitGlobalFunctionDefinition(GD);
+        EmitGlobalFunctionDefinition(GD, GV);
 
       if (Method->isVirtual())
         getVTables().EmitThunks(GD);
@@ -1359,7 +1357,7 @@
       return;
     }
 
-    return EmitGlobalFunctionDefinition(GD);
+    return EmitGlobalFunctionDefinition(GD, GV);
   }
   
   if (const VarDecl *VD = dyn_cast<VarDecl>(D))
@@ -1379,6 +1377,7 @@
 CodeGenModule::GetOrCreateLLVMFunction(StringRef MangledName,
                                        llvm::Type *Ty,
                                        GlobalDecl GD, bool ForVTable,
+                                       bool DontDefer,
                                        llvm::AttributeSet ExtraAttrs) {
   const Decl *D = GD.getDecl();
 
@@ -1398,14 +1397,6 @@
     return llvm::ConstantExpr::getBitCast(Entry, Ty->getPointerTo());
   }
 
-  // All MSVC dtors other than the base dtor are linkonce_odr and delegate to
-  // each other bottoming out with the base dtor.  Therefore we emit non-base
-  // dtors on usage, even if there is no dtor definition in the TU.
-  if (D && isa<CXXDestructorDecl>(D) &&
-      getCXXABI().useThunkForDtorVariant(cast<CXXDestructorDecl>(D),
-                                         GD.getDtorType()))
-    DeferredDeclsToEmit.push_back(GD);
-
   // This function doesn't have a complete type (for example, the return
   // type is an incomplete struct). Use a fake type instead, and make
   // sure not to try to set attributes.
@@ -1433,50 +1424,64 @@
                                              B));
   }
 
-  // This is the first use or definition of a mangled name.  If there is a
-  // deferred decl with this name, remember that we need to emit it at the end
-  // of the file.
-  llvm::StringMap<GlobalDecl>::iterator DDI = DeferredDecls.find(MangledName);
-  if (DDI != DeferredDecls.end()) {
-    // Move the potentially referenced deferred decl to the DeferredDeclsToEmit
-    // list, and remove it from DeferredDecls (since we don't need it anymore).
-    DeferredDeclsToEmit.push_back(DDI->second);
-    DeferredDecls.erase(DDI);
+  if (!DontDefer) {
+    // All MSVC dtors other than the base dtor are linkonce_odr and delegate to
+    // each other bottoming out with the base dtor.  Therefore we emit non-base
+    // dtors on usage, even if there is no dtor definition in the TU.
+    if (D && isa<CXXDestructorDecl>(D) &&
+        getCXXABI().useThunkForDtorVariant(cast<CXXDestructorDecl>(D),
+                                           GD.getDtorType()))
+      addDeferredDeclToEmit(F, GD);
 
-  // Otherwise, if this is a sized deallocation function, emit a weak definition
-  // for it at the end of the translation unit.
-  } else if (D && cast<FunctionDecl>(D)
-                      ->getCorrespondingUnsizedGlobalDeallocationFunction()) {
-    DeferredDeclsToEmit.push_back(GD);
+    // This is the first use or definition of a mangled name.  If there is a
+    // deferred decl with this name, remember that we need to emit it at the end
+    // of the file.
+    llvm::StringMap<GlobalDecl>::iterator DDI = DeferredDecls.find(MangledName);
+    if (DDI != DeferredDecls.end()) {
+      // Move the potentially referenced deferred decl to the
+      // DeferredDeclsToEmit list, and remove it from DeferredDecls (since we
+      // don't need it anymore).
+      addDeferredDeclToEmit(F, DDI->second);
+      DeferredDecls.erase(DDI);
 
-  // Otherwise, there are cases we have to worry about where we're
-  // using a declaration for which we must emit a definition but where
-  // we might not find a top-level definition:
-  //   - member functions defined inline in their classes
-  //   - friend functions defined inline in some class
-  //   - special member functions with implicit definitions
-  // If we ever change our AST traversal to walk into class methods,
-  // this will be unnecessary.
-  //
-  // We also don't emit a definition for a function if it's going to be an entry
-  // in a vtable, unless it's already marked as used.
-  } else if (getLangOpts().CPlusPlus && D) {
-    // Look for a declaration that's lexically in a record.
-    const FunctionDecl *FD = cast<FunctionDecl>(D);
-    FD = FD->getMostRecentDecl();
-    do {
-      if (isa<CXXRecordDecl>(FD->getLexicalDeclContext())) {
-        if (FD->isImplicit() && !ForVTable) {
-          assert(FD->isUsed() && "Sema didn't mark implicit function as used!");
-          DeferredDeclsToEmit.push_back(GD.getWithDecl(FD));
-          break;
-        } else if (FD->doesThisDeclarationHaveABody()) {
-          DeferredDeclsToEmit.push_back(GD.getWithDecl(FD));
-          break;
+      // Otherwise, if this is a sized deallocation function,
+      // emit a weak definition for it at the end of the
+      // translation unit.
+    } else if (D && cast<FunctionDecl>(D)
+                        ->getCorrespondingUnsizedGlobalDeallocationFunction()) {
+      addDeferredDeclToEmit(F, GD);
+
+      // Otherwise, there are cases we have to worry about where we're
+      // using a declaration for which we must emit a definition but where
+      // we might not find a top-level definition:
+      //   - member functions defined inline in their classes
+      //   - friend functions defined inline in some class
+      //   - special member functions with implicit definitions
+      // If we ever change our AST traversal to walk into class methods,
+      // this will be unnecessary.
+      //
+      // We also don't emit a definition for a function if it's
+      // going to be an entry in a vtable, unless it's already
+      // marked as used.
+    } else if (getLangOpts().CPlusPlus && D) {
+      // Look for a declaration that's lexically in a record.
+      const FunctionDecl *FD = cast<FunctionDecl>(D);
+      FD = FD->getMostRecentDecl();
+      do {
+        if (isa<CXXRecordDecl>(FD->getLexicalDeclContext())) {
+          if (FD->isImplicit() && !ForVTable) {
+            assert(FD->isUsed() &&
+                   "Sema didn't mark implicit function as used!");
+            addDeferredDeclToEmit(F, GD.getWithDecl(FD));
+            break;
+          } else if (FD->doesThisDeclarationHaveABody()) {
+            addDeferredDeclToEmit(F, GD.getWithDecl(FD));
+            break;
+          }
         }
-      }
-      FD = FD->getPreviousDecl();
-    } while (FD);
+        FD = FD->getPreviousDecl();
+      } while (FD);
+    }
   }
 
   // Make sure the result is of the requested type.
@@ -1494,13 +1499,14 @@
 /// create it (this occurs when we see a definition of the function).
 llvm::Constant *CodeGenModule::GetAddrOfFunction(GlobalDecl GD,
                                                  llvm::Type *Ty,
-                                                 bool ForVTable) {
+                                                 bool ForVTable,
+                                                 bool DontDefer) {
   // If there was no specific requested type, just convert it now.
   if (!Ty)
     Ty = getTypes().ConvertType(cast<ValueDecl>(GD.getDecl())->getType());
   
   StringRef MangledName = getMangledName(GD);
-  return GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable);
+  return GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable, DontDefer);
 }
 
 /// CreateRuntimeFunction - Create a new runtime function with the specified
@@ -1509,9 +1515,9 @@
 CodeGenModule::CreateRuntimeFunction(llvm::FunctionType *FTy,
                                      StringRef Name,
                                      llvm::AttributeSet ExtraAttrs) {
-  llvm::Constant *C
-    = GetOrCreateLLVMFunction(Name, FTy, GlobalDecl(), /*ForVTable=*/false,
-                              ExtraAttrs);
+  llvm::Constant *C =
+      GetOrCreateLLVMFunction(Name, FTy, GlobalDecl(), /*ForVTable=*/false,
+                              /*DontDefer=*/false, ExtraAttrs);
   if (llvm::Function *F = dyn_cast<llvm::Function>(C))
     if (F->empty())
       F->setCallingConv(getRuntimeCC());
@@ -1571,6 +1577,13 @@
     return llvm::ConstantExpr::getBitCast(Entry, Ty);
   }
 
+  unsigned AddrSpace = GetGlobalVarAddressSpace(D, Ty->getAddressSpace());
+  llvm::GlobalVariable *GV =
+    new llvm::GlobalVariable(getModule(), Ty->getElementType(), false,
+                             llvm::GlobalValue::ExternalLinkage,
+                             0, MangledName, 0,
+                             llvm::GlobalVariable::NotThreadLocal, AddrSpace);
+
   // This is the first use or definition of a mangled name.  If there is a
   // deferred decl with this name, remember that we need to emit it at the end
   // of the file.
@@ -1578,17 +1591,10 @@
   if (DDI != DeferredDecls.end()) {
     // Move the potentially referenced deferred decl to the DeferredDeclsToEmit
     // list, and remove it from DeferredDecls (since we don't need it anymore).
-    DeferredDeclsToEmit.push_back(DDI->second);
+    addDeferredDeclToEmit(GV, DDI->second);
     DeferredDecls.erase(DDI);
   }
 
-  unsigned AddrSpace = GetGlobalVarAddressSpace(D, Ty->getAddressSpace());
-  llvm::GlobalVariable *GV =
-    new llvm::GlobalVariable(getModule(), Ty->getElementType(), false,
-                             llvm::GlobalValue::ExternalLinkage,
-                             0, MangledName, 0,
-                             llvm::GlobalVariable::NotThreadLocal, AddrSpace);
-
   // Handle things which are present even on external declarations.
   if (D) {
     // FIXME: This code is overly simple and should be merged with other global
@@ -1600,9 +1606,10 @@
     if (LV.getLinkage() != ExternalLinkage) {
       // Don't set internal linkage on declarations.
     } else {
-      if (D->hasAttr<DLLImportAttr>())
-        GV->setLinkage(llvm::GlobalValue::DLLImportLinkage);
-      else if (D->hasAttr<WeakAttr>() || D->isWeakImported())
+      if (D->hasAttr<DLLImportAttr>()) {
+        GV->setLinkage(llvm::GlobalValue::ExternalLinkage);
+        GV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
+      } else if (D->hasAttr<WeakAttr>() || D->isWeakImported())
         GV->setLinkage(llvm::GlobalValue::ExternalWeakLinkage);
 
       // Set visibility on a declaration only if it's explicit.
@@ -1627,6 +1634,12 @@
   if (AddrSpace != Ty->getAddressSpace())
     return llvm::ConstantExpr::getAddrSpaceCast(GV, Ty);
 
+  if (getTarget().getTriple().getArch() == llvm::Triple::xcore &&
+      D->getLanguageLinkage() == CLanguageLinkage &&
+      D->getType().isConstant(Context) &&
+      isExternallyVisible(D->getLinkageAndVisibility().getLinkage()))
+    GV->setSection(".cp.rodata");
+
   return GV;
 }
 
@@ -1875,6 +1888,10 @@
   llvm::GlobalValue::LinkageTypes Linkage = 
     GetLLVMLinkageVarDefinition(D, GV->isConstant());
   GV->setLinkage(Linkage);
+  if (D->hasAttr<DLLImportAttr>())
+    GV->setDLLStorageClass(llvm::GlobalVariable::DLLImportStorageClass);
+  else if (D->hasAttr<DLLExportAttr>())
+    GV->setDLLStorageClass(llvm::GlobalVariable::DLLExportStorageClass);
 
   // If required by the ABI, give definitions of static data members with inline
   // initializers linkonce_odr linkage.
@@ -1917,9 +1934,9 @@
   if (Linkage == GVA_Internal)
     return llvm::Function::InternalLinkage;
   else if (D->hasAttr<DLLImportAttr>())
-    return llvm::Function::DLLImportLinkage;
+    return llvm::Function::ExternalLinkage;
   else if (D->hasAttr<DLLExportAttr>())
-    return llvm::Function::DLLExportLinkage;
+    return llvm::Function::ExternalLinkage;
   else if (D->hasAttr<SelectAnyAttr>()) {
     // selectany symbols are externally visible, so use weak instead of
     // linkonce.  MSVC optimizes away references to const selectany globals, so
@@ -1931,15 +1948,14 @@
       return llvm::GlobalVariable::WeakODRLinkage;
     else
       return llvm::GlobalVariable::WeakAnyLinkage;
-  } else if (Linkage == GVA_TemplateInstantiation ||
-             Linkage == GVA_ExplicitTemplateInstantiation)
+  } else if (Linkage == GVA_TemplateInstantiation || Linkage == GVA_StrongODR)
     return llvm::GlobalVariable::WeakODRLinkage;
   else if (!getLangOpts().CPlusPlus && 
-           ((!CodeGenOpts.NoCommon && !D->getAttr<NoCommonAttr>()) ||
-             D->getAttr<CommonAttr>()) &&
+           ((!CodeGenOpts.NoCommon && !D->hasAttr<NoCommonAttr>()) ||
+             D->hasAttr<CommonAttr>()) &&
            !D->hasExternalStorage() && !D->getInit() &&
-           !D->getAttr<SectionAttr>() && !D->getTLSKind() &&
-           !D->getAttr<WeakImportAttr>()) {
+           !D->hasAttr<SectionAttr>() && !D->getTLSKind() &&
+           !D->hasAttr<WeakImportAttr>()) {
     // Thread local vars aren't considered common linkage.
     return llvm::GlobalVariable::CommonLinkage;
   } else if (D->getTLSKind() == VarDecl::TLS_Dynamic &&
@@ -1965,7 +1981,7 @@
   for (llvm::Value::use_iterator ui = old->use_begin(), ue = old->use_end();
          ui != ue; ) {
     llvm::Value::use_iterator use = ui++; // Increment before the use is erased.
-    llvm::User *user = *use;
+    llvm::User *user = use->getUser();
 
     // Recognize and replace uses of bitcasts.  Most calls to
     // unprototyped functions will use bitcasts.
@@ -1978,7 +1994,7 @@
     // Recognize calls to the function.
     llvm::CallSite callSite(user);
     if (!callSite) continue;
-    if (!callSite.isCallee(use)) continue;
+    if (!callSite.isCallee(&*use)) continue;
 
     // If the return types don't match exactly, then we can't
     // transform this call unless it's dead.
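Another LLVM API change surfaces in these two hunks: dereferencing a
use_iterator now yields the llvm::Use itself, so the using instruction is
reached through getUser() and isCallee() takes a Use*. A sketch of the updated
loop shape, assuming LLVM headers:

    #include "llvm/IR/User.h"
    #include "llvm/IR/Value.h"

    static void visitUsers(llvm::Value *Old) {
      for (llvm::Value::use_iterator UI = Old->use_begin(),
                                     UE = Old->use_end();
           UI != UE;) {
        llvm::Value::use_iterator Use = UI++; // advance before any erasure
        llvm::User *U = Use->getUser();       // was: llvm::User *U = *Use;
        (void)U; // inspect or rewrite the user here
      }
    }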
@@ -2087,7 +2103,8 @@
   EmitTopLevelDecl(VD);
 }
 
-void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD) {
+void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
+                                                 llvm::GlobalValue *GV) {
   const FunctionDecl *D = cast<FunctionDecl>(GD.getDecl());
 
   // Compute the function info and LLVM type.
@@ -2095,7 +2112,9 @@
   llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
 
   // Get or create the prototype for the function.
-  llvm::Constant *Entry = GetAddrOfFunction(GD, Ty);
+  llvm::Constant *Entry =
+      GV ? GV
+         : GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer*/ true);
 
   // Strip off a bitcast if we got one back.
   if (llvm::ConstantExpr *CE = dyn_cast<llvm::ConstantExpr>(Entry)) {
@@ -2174,6 +2193,10 @@
     AddGlobalDtor(Fn, DA->getPriority());
   if (D->hasAttr<AnnotateAttr>())
     AddGlobalAnnotations(D, Fn);
+
+  llvm::Function *PGOInit = CodeGenPGO::emitInitialization(*this);
+  if (PGOInit)
+    AddGlobalCtor(PGOInit, 0);
 }
 
 void CodeGenModule::EmitAliasDefinition(GlobalDecl GD) {
@@ -2235,9 +2258,9 @@
     if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
       // The dllexport attribute is ignored for undefined symbols.
       if (FD->hasBody())
-        GA->setLinkage(llvm::Function::DLLExportLinkage);
+        GA->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
     } else {
-      GA->setLinkage(llvm::Function::DLLExportLinkage);
+      GA->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
     }
   } else if (D->hasAttr<WeakAttr>() ||
              D->hasAttr<WeakRefAttr>() ||
@@ -2355,30 +2378,25 @@
     C = llvm::ConstantDataArray::getString(VMContext, Entry.getKey());
   }
 
-  llvm::GlobalValue::LinkageTypes Linkage;
-  if (isUTF16)
-    // FIXME: why do utf strings get "_" labels instead of "L" labels?
-    Linkage = llvm::GlobalValue::InternalLinkage;
-  else
-    // FIXME: With OS X ld 123.2 (xcode 4) and LTO we would get a linker error
-    // when using private linkage. It is not clear if this is a bug in ld
-    // or a reasonable new restriction.
-    Linkage = llvm::GlobalValue::LinkerPrivateLinkage;
-  
   // Note: -fwritable-strings doesn't make the backing store strings of
   // CFStrings writable. (See <rdar://problem/10657500>)
   llvm::GlobalVariable *GV =
-    new llvm::GlobalVariable(getModule(), C->getType(), /*isConstant=*/true,
-                             Linkage, C, ".str");
+      new llvm::GlobalVariable(getModule(), C->getType(), /*isConstant=*/true,
+                               llvm::GlobalValue::PrivateLinkage, C, ".str");
   GV->setUnnamedAddr(true);
   // Don't enforce the target's minimum global alignment, since the only use
   // of the string is via this class initializer.
+  // FIXME: We set the section explicitly to avoid a bug in ld64 224.1. Without
+  // it, LLVM can merge the string with a non-unnamed_addr one during LTO,
+  // which changes the section it ends up in and surprises ld64.
   if (isUTF16) {
     CharUnits Align = getContext().getTypeAlignInChars(getContext().ShortTy);
     GV->setAlignment(Align.getQuantity());
+    GV->setSection("__TEXT,__ustring");
   } else {
     CharUnits Align = getContext().getTypeAlignInChars(getContext().CharTy);
     GV->setAlignment(Align.getQuantity());
+    GV->setSection("__TEXT,__cstring,cstring_literals");
   }
 
   // String.
@@ -2397,23 +2415,12 @@
   GV = new llvm::GlobalVariable(getModule(), C->getType(), true,
                                 llvm::GlobalVariable::PrivateLinkage, C,
                                 "_unnamed_cfstring_");
-  if (const char *Sect = getTarget().getCFStringSection())
-    GV->setSection(Sect);
+  GV->setSection("__DATA,__cfstring");
   Entry.setValue(GV);
 
   return GV;
 }
 
-static RecordDecl *
-CreateRecordDecl(const ASTContext &Ctx, RecordDecl::TagKind TK,
-                 DeclContext *DC, IdentifierInfo *Id) {
-  SourceLocation Loc;
-  if (Ctx.getLangOpts().CPlusPlus)
-    return CXXRecordDecl::Create(Ctx, TK, DC, Loc, Loc, Id);
-  else
-    return RecordDecl::Create(Ctx, TK, DC, Loc, Loc, Id);
-}
-
 llvm::Constant *
 CodeGenModule::GetAddrOfConstantString(const StringLiteral *Literal) {
   unsigned StringLength = 0;
@@ -2456,9 +2463,7 @@
 
   if (!NSConstantStringType) {
     // Construct the type for a constant NSString.
-    RecordDecl *D = CreateRecordDecl(Context, TTK_Struct, 
-                                     Context.getTranslationUnitDecl(),
-                                   &Context.Idents.get("__builtin_NSString"));
+    RecordDecl *D = Context.buildImplicitRecord("__builtin_NSString");
     D->startDefinition();
       
     QualType FieldTypes[3];
@@ -2521,12 +2526,13 @@
   GV = new llvm::GlobalVariable(getModule(), C->getType(), true,
                                 llvm::GlobalVariable::PrivateLinkage, C,
                                 "_unnamed_nsstring_");
+  const char *NSStringSection = "__OBJC,__cstring_object,regular,no_dead_strip";
+  const char *NSStringNonFragileABISection =
+      "__DATA,__objc_stringobj,regular,no_dead_strip";
   // FIXME. Fix section.
-  if (const char *Sect = 
-        LangOpts.ObjCRuntime.isNonFragile() 
-          ? getTarget().getNSStringNonFragileABISection() 
-          : getTarget().getNSStringSection())
-    GV->setSection(Sect);
+  GV->setSection(LangOpts.ObjCRuntime.isNonFragile()
+                     ? NSStringNonFragileABISection
+                     : NSStringSection);
   Entry.setValue(GV);
   
   return GV;
@@ -2534,9 +2540,7 @@
 
 QualType CodeGenModule::getObjCFastEnumerationStateType() {
   if (ObjCFastEnumerationStateType.isNull()) {
-    RecordDecl *D = CreateRecordDecl(Context, TTK_Struct, 
-                                     Context.getTranslationUnitDecl(),
-                      &Context.Idents.get("__objcFastEnumerationState"));
+    RecordDecl *D = Context.buildImplicitRecord("__objcFastEnumerationState");
     D->startDefinition();
     
     QualType FieldTypes[] = {
@@ -2613,25 +2617,67 @@
 llvm::Constant *
 CodeGenModule::GetAddrOfConstantStringFromLiteral(const StringLiteral *S) {
   CharUnits Align = getContext().getAlignOfGlobalVarInChars(S->getType());
-  if (S->isAscii() || S->isUTF8()) {
-    SmallString<64> Str(S->getString());
-    
-    // Resize the string to the right size, which is indicated by its type.
-    const ConstantArrayType *CAT = Context.getAsConstantArrayType(S->getType());
-    Str.resize(CAT->getSize().getZExtValue());
-    return GetAddrOfConstantString(Str, /*GlobalName*/ 0, Align.getQuantity());
+
+  llvm::StringMapEntry<llvm::GlobalVariable *> *Entry = nullptr;
+  llvm::GlobalVariable *GV = nullptr;
+  if (!LangOpts.WritableStrings) {
+    llvm::StringMap<llvm::GlobalVariable *> *ConstantStringMap = nullptr;
+    switch (S->getCharByteWidth()) {
+    case 1:
+      ConstantStringMap = &Constant1ByteStringMap;
+      break;
+    case 2:
+      ConstantStringMap = &Constant2ByteStringMap;
+      break;
+    case 4:
+      ConstantStringMap = &Constant4ByteStringMap;
+      break;
+    default:
+      llvm_unreachable("unhandled byte width!");
+    }
+    Entry = &ConstantStringMap->GetOrCreateValue(S->getBytes());
+    GV = Entry->getValue();
   }
 
-  // FIXME: the following does not memoize wide strings.
-  llvm::Constant *C = GetConstantArrayFromStringLiteral(S);
-  llvm::GlobalVariable *GV =
-    new llvm::GlobalVariable(getModule(),C->getType(),
-                             !LangOpts.WritableStrings,
-                             llvm::GlobalValue::PrivateLinkage,
-                             C,".str");
+  if (!GV) {
+    SmallString<256> MangledNameBuffer;
+    StringRef GlobalVariableName;
+    llvm::GlobalValue::LinkageTypes LT;
 
-  GV->setAlignment(Align.getQuantity());
-  GV->setUnnamedAddr(true);
+    // Mangle the string literal if the ABI allows for it.  However, we cannot
+    // do this if we are compiling with ASan or -fwritable-strings because they
+    // rely on strings having normal linkage.
+    if (!LangOpts.WritableStrings && !SanOpts.Address &&
+        getCXXABI().getMangleContext().shouldMangleStringLiteral(S)) {
+      llvm::raw_svector_ostream Out(MangledNameBuffer);
+      getCXXABI().getMangleContext().mangleStringLiteral(S, Out);
+      Out.flush();
+
+      LT = llvm::GlobalValue::LinkOnceODRLinkage;
+      GlobalVariableName = MangledNameBuffer;
+    } else {
+      LT = llvm::GlobalValue::PrivateLinkage;
+      GlobalVariableName = ".str";
+    }
+
+    // OpenCL v1.2 s6.5.3: a string literal is in the constant address space.
+    unsigned AddrSpace = 0;
+    if (getLangOpts().OpenCL)
+      AddrSpace = getContext().getTargetAddressSpace(LangAS::opencl_constant);
+
+    llvm::Constant *C = GetConstantArrayFromStringLiteral(S);
+    GV = new llvm::GlobalVariable(
+        getModule(), C->getType(), !LangOpts.WritableStrings, LT, C,
+        GlobalVariableName, /*InsertBefore=*/nullptr,
+        llvm::GlobalVariable::NotThreadLocal, AddrSpace);
+    GV->setUnnamedAddr(true);
+    if (Entry)
+      Entry->setValue(GV);
+  }
+
+  if (Align.getQuantity() > GV->getAlignment())
+    GV->setAlignment(Align.getQuantity());
+
   return GV;
 }
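String literals are now memoized in one map per character byte width, so
literals whose raw bytes compare equal but whose element types differ never
share a global. A sketch of the per-width lookup, assuming LLVM headers;
StringMaps and forWidth are hypothetical stand-ins for the
Constant{1,2,4}ByteStringMap members this patch presumably adds to
CodeGenModule.h:

    #include "llvm/ADT/StringMap.h"
    #include "llvm/Support/ErrorHandling.h"

    namespace llvm { class GlobalVariable; }

    struct StringMaps {
      llvm::StringMap<llvm::GlobalVariable *> OneByte, TwoByte, FourByte;

      llvm::StringMap<llvm::GlobalVariable *> &forWidth(unsigned W) {
        switch (W) {
        case 1: return OneByte;  // char and UTF-8 literals
        case 2: return TwoByte;  // char16_t / UTF-16 literals
        case 4: return FourByte; // char32_t / UTF-32 literals
        default: llvm_unreachable("unhandled byte width!");
        }
      }
    };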
 
@@ -2656,7 +2702,7 @@
   llvm::Constant *C =
       llvm::ConstantDataArray::getString(CGM.getLLVMContext(), str, false);
 
-  // OpenCL v1.1 s6.5.3: a string literal is in the constant address space.
+  // OpenCL v1.2 s6.5.3: a string literal is in the constant address space.
   unsigned AddrSpace = 0;
   if (CGM.getLangOpts().OpenCL)
     AddrSpace = CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant);
@@ -2695,7 +2741,7 @@
     return GenerateStringLiteral(Str, false, *this, GlobalName, Alignment);
 
   llvm::StringMapEntry<llvm::GlobalVariable *> &Entry =
-    ConstantStringMap.GetOrCreateValue(Str);
+    Constant1ByteStringMap.GetOrCreateValue(Str);
 
   if (llvm::GlobalVariable *GV = Entry.getValue()) {
     if (Alignment > GV->getAlignment()) {
@@ -2793,10 +2839,7 @@
 /// properties for an implementation.
 void CodeGenModule::EmitObjCPropertyImplementations(const
                                                     ObjCImplementationDecl *D) {
-  for (ObjCImplementationDecl::propimpl_iterator
-         i = D->propimpl_begin(), e = D->propimpl_end(); i != e; ++i) {
-    ObjCPropertyImplDecl *PID = *i;
-
+  for (const auto *PID : D->property_impls()) {
     // Dynamic is just for type-checking.
     if (PID->getPropertyImplementation() == ObjCPropertyImplDecl::Synthesize) {
       ObjCPropertyDecl *PD = PID->getPropertyDecl();
@@ -2871,13 +2914,12 @@
 
 /// EmitNamespace - Emit all declarations in a namespace.
 void CodeGenModule::EmitNamespace(const NamespaceDecl *ND) {
-  for (RecordDecl::decl_iterator I = ND->decls_begin(), E = ND->decls_end();
-       I != E; ++I) {
-    if (const VarDecl *VD = dyn_cast<VarDecl>(*I))
+  for (auto *I : ND->decls()) {
+    if (const auto *VD = dyn_cast<VarDecl>(I))
       if (VD->getTemplateSpecializationKind() != TSK_ExplicitSpecialization &&
           VD->getTemplateSpecializationKind() != TSK_Undeclared)
         continue;
-    EmitTopLevelDecl(*I);
+    EmitTopLevelDecl(I);
   }
 }
 
@@ -2889,17 +2931,14 @@
     return;
   }
 
-  for (RecordDecl::decl_iterator I = LSD->decls_begin(), E = LSD->decls_end();
-       I != E; ++I) {
+  for (auto *I : LSD->decls()) {
     // Meta-data for ObjC class includes references to implemented methods.
     // Generate class's method definitions first.
-    if (ObjCImplDecl *OID = dyn_cast<ObjCImplDecl>(*I)) {
-      for (ObjCContainerDecl::method_iterator M = OID->meth_begin(),
-           MEnd = OID->meth_end();
-           M != MEnd; ++M)
-        EmitTopLevelDecl(*M);
+    if (auto *OID = dyn_cast<ObjCImplDecl>(I)) {
+      for (auto *M : OID->methods())
+        EmitTopLevelDecl(M);
     }
-    EmitTopLevelDecl(*I);
+    EmitTopLevelDecl(I);
   }
 }
 
@@ -2940,7 +2979,6 @@
     break;
     // No code generation needed.
   case Decl::UsingShadow:
-  case Decl::Using:
   case Decl::ClassTemplate:
   case Decl::VarTemplate:
   case Decl::VarTemplatePartialSpecialization:
@@ -2949,6 +2987,10 @@
   case Decl::Block:
   case Decl::Empty:
     break;
+  case Decl::Using:          // using X; [C++]
+    if (CGDebugInfo *DI = getModuleDebugInfo())
+        DI->EmitUsingDecl(cast<UsingDecl>(*D));
+    return;
   case Decl::NamespaceAlias:
     if (CGDebugInfo *DI = getModuleDebugInfo())
         DI->EmitNamespaceAlias(cast<NamespaceAliasDecl>(*D));
@@ -3048,7 +3090,15 @@
 
     ImportedModules.insert(Import->getImportedModule());
     break;
- }
+  }
+
+  case Decl::ClassTemplateSpecialization: {
+    const ClassTemplateSpecializationDecl *Spec =
+        cast<ClassTemplateSpecializationDecl>(D);
+    if (DebugInfo &&
+        Spec->getSpecializationKind() == TSK_ExplicitInstantiationDefinition)
+      DebugInfo->completeTemplateDefinition(*Spec);
+    break;
+  }
 
   default:
     // Make sure we handled everything we should, every other kind is a
@@ -3094,7 +3144,7 @@
     IdentifierInfo *Name = I->first;
     llvm::GlobalValue *Val = I->second;
     if (Val && !getModule().getNamedValue(Name->getName()))
-      AddUsedGlobal(new llvm::GlobalAlias(Val->getType(), Val->getLinkage(),
+      addUsedGlobal(new llvm::GlobalAlias(Val->getType(), Val->getLinkage(),
                                           Name->getName(), Val, &getModule()));
   }
 }
diff --git a/lib/CodeGen/CodeGenModule.h b/lib/CodeGen/CodeGenModule.h
index c161224..0d13bdc 100644
--- a/lib/CodeGen/CodeGenModule.h
+++ b/lib/CodeGen/CodeGenModule.h
@@ -30,7 +30,7 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Support/ValueHandle.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/Transforms/Utils/SpecialCaseList.h"
 
 namespace llvm {
@@ -85,7 +85,8 @@
   class CGCUDARuntime;
   class BlockFieldFlags;
   class FunctionArgList;
-  
+  class PGOProfileData;
+
   struct OrderGlobalInits {
     unsigned int priority;
     unsigned int lex_order;
@@ -98,10 +99,8 @@
     }
     
     bool operator<(const OrderGlobalInits &RHS) const {
-      if (priority < RHS.priority)
-        return true;
-      
-      return priority == RHS.priority && lex_order < RHS.lex_order;
+      return std::tie(priority, lex_order) <
+             std::tie(RHS.priority, RHS.lex_order);
     }
   };
 
@@ -236,7 +235,7 @@
   DiagnosticsEngine &Diags;
   const llvm::DataLayout &TheDataLayout;
   const TargetInfo &Target;
-  CGCXXABI &ABI;
+  std::unique_ptr<CGCXXABI> ABI;
   llvm::LLVMContext &VMContext;
 
   CodeGenTBAA *TBAA;
@@ -258,6 +257,7 @@
   ARCEntrypoints *ARCData;
   llvm::MDNode *NoObjCARCExceptionsMetadata;
   RREntrypoints *RRData;
+  PGOProfileData *PGOData;
 
   // WeakRefReferences - A set of references that have only been seen via
   // a weakref so far. This is used to remove the weak of the reference if we
@@ -273,7 +273,15 @@
   /// DeferredDeclsToEmit - This is a list of deferred decls which we have seen
   /// that *are* actually referenced.  These get code generated when the module
   /// is done.
-  std::vector<GlobalDecl> DeferredDeclsToEmit;
+  struct DeferredGlobal {
+    DeferredGlobal(llvm::GlobalValue *GV, GlobalDecl GD) : GV(GV), GD(GD) {}
+    llvm::AssertingVH<llvm::GlobalValue> GV;
+    GlobalDecl GD;
+  };
+  std::vector<DeferredGlobal> DeferredDeclsToEmit;
+  void addDeferredDeclToEmit(llvm::GlobalValue *GV, GlobalDecl GD) {
+    DeferredDeclsToEmit.push_back(DeferredGlobal(GV, GD));
+  }
 
  /// List of aliases we have emitted. Used to make sure that what they point
  /// to is defined once we get to the end of the translation unit.
@@ -290,6 +298,7 @@
   /// forcing visibility of symbols which may otherwise be optimized
   /// out.
   std::vector<llvm::WeakVH> LLVMUsed;
+  std::vector<llvm::WeakVH> LLVMCompilerUsed;
 
   /// GlobalCtors - Store the list of global constructors and their respective
   /// priorities to be emitted when the translation unit is complete.
@@ -310,7 +319,10 @@
   llvm::StringMap<llvm::Constant*> AnnotationStrings;
 
   llvm::StringMap<llvm::Constant*> CFConstantStringMap;
-  llvm::StringMap<llvm::GlobalVariable*> ConstantStringMap;
+
+  llvm::StringMap<llvm::GlobalVariable *> Constant1ByteStringMap;
+  llvm::StringMap<llvm::GlobalVariable *> Constant2ByteStringMap;
+  llvm::StringMap<llvm::GlobalVariable *> Constant4ByteStringMap;
   llvm::DenseMap<const Decl*, llvm::Constant *> StaticLocalDeclMap;
   llvm::DenseMap<const Decl*, llvm::GlobalVariable*> StaticLocalDeclGuardMap;
   llvm::DenseMap<const Expr*, llvm::Constant *> MaterializedGlobalTemporaryMap;
@@ -421,7 +433,7 @@
 
   GlobalDecl initializedGlobalDecl;
 
-  llvm::OwningPtr<llvm::SpecialCaseList> SanitizerBlacklist;
+  std::unique_ptr<llvm::SpecialCaseList> SanitizerBlacklist;
 
   const SanitizerOptions &SanOpts;
 
@@ -433,6 +445,8 @@
 
   ~CodeGenModule();
 
+  void clear();
+
   /// Release - Finalize LLVM code generation.
   void Release();
 
@@ -469,6 +483,10 @@
     return *RRData;
   }
 
+  PGOProfileData *getPGOData() const {
+    return PGOData;
+  }
+
   llvm::Constant *getStaticLocalDeclAddress(const VarDecl *D) {
     return StaticLocalDeclMap[D];
   }
@@ -525,7 +543,7 @@
   DiagnosticsEngine &getDiags() const { return Diags; }
   const llvm::DataLayout &getDataLayout() const { return TheDataLayout; }
   const TargetInfo &getTarget() const { return Target; }
-  CGCXXABI &getCXXABI() { return ABI; }
+  CGCXXABI &getCXXABI() const { return *ABI; }
   llvm::LLVMContext &getLLVMContext() { return VMContext; }
   
   bool shouldUseTBAA() const { return TBAA != 0; }
@@ -577,21 +595,6 @@
   /// for the thread-local variable declaration D.
   void setTLSMode(llvm::GlobalVariable *GV, const VarDecl &D) const;
 
-  /// TypeVisibilityKind - The kind of global variable that is passed to 
-  /// setTypeVisibility
-  enum TypeVisibilityKind {
-    TVK_ForVTT,
-    TVK_ForVTable,
-    TVK_ForConstructionVTable,
-    TVK_ForRTTI,
-    TVK_ForRTTIName
-  };
-
-  /// setTypeVisibility - Set the visibility for the given global
-  /// value which holds information about a type.
-  void setTypeVisibility(llvm::GlobalValue *GV, const CXXRecordDecl *D,
-                         TypeVisibilityKind TVK) const;
-
   static llvm::GlobalValue::VisibilityTypes GetLLVMVisibility(Visibility V) {
     switch (V) {
     case DefaultVisibility:   return llvm::GlobalValue::DefaultVisibility;
@@ -639,9 +642,9 @@
   /// GetAddrOfFunction - Return the address of the given function.  If Ty is
   /// non-null, then this function will use the specified type if it has to
   /// create it.
-  llvm::Constant *GetAddrOfFunction(GlobalDecl GD,
-                                    llvm::Type *Ty = 0,
-                                    bool ForVTable = false);
+  llvm::Constant *GetAddrOfFunction(GlobalDecl GD, llvm::Type *Ty = 0,
+                                    bool ForVTable = false,
+                                    bool DontDefer = false);
 
   /// GetAddrOfRTTIDescriptor - Get the address of the RTTI descriptor 
   /// for the given type.
@@ -769,14 +772,16 @@
   /// given type.
   llvm::GlobalValue *GetAddrOfCXXConstructor(const CXXConstructorDecl *ctor,
                                              CXXCtorType ctorType,
-                                             const CGFunctionInfo *fnInfo = 0);
+                                             const CGFunctionInfo *fnInfo = 0,
+                                             bool DontDefer = false);
 
  /// GetAddrOfCXXDestructor - Return the address of the destructor of the
   /// given type.
   llvm::GlobalValue *GetAddrOfCXXDestructor(const CXXDestructorDecl *dtor,
                                             CXXDtorType dtorType,
                                             const CGFunctionInfo *fnInfo = 0,
-                                            llvm::FunctionType *fnType = 0);
+                                            llvm::FunctionType *fnType = 0,
+                                            bool DontDefer = false);
 
   /// getBuiltinLibFunction - Given a builtin id for a function like
   /// "__builtin_fabsf", return a Function* for "fabsf".
@@ -798,10 +803,11 @@
   template<typename SomeDecl>
   void MaybeHandleStaticInExternC(const SomeDecl *D, llvm::GlobalValue *GV);
 
-  /// AddUsedGlobal - Add a global which should be forced to be
-  /// present in the object file; these are emitted to the llvm.used
-  /// metadata global.
-  void AddUsedGlobal(llvm::GlobalValue *GV);
+  /// Add a global to a list to be added to the llvm.used metadata.
+  void addUsedGlobal(llvm::GlobalValue *GV);
+
+  /// Add a global to a list to be added to the llvm.compiler.used metadata.
+  void addCompilerUsedGlobal(llvm::GlobalValue *GV);
 
   /// AddCXXDtorEntry - Add a destructor and object to add to the C++ global
   /// destructor function.
@@ -902,6 +908,10 @@
   /// as a return type.
   bool ReturnTypeUsesSRet(const CGFunctionInfo &FI);
 
+  /// ReturnSlotInterferesWithArgs - Return true iff the given type uses an
+  /// argument slot when 'sret' is used as a return type.
+  bool ReturnSlotInterferesWithArgs(const CGFunctionInfo &FI);
+
   /// ReturnTypeUsesFPRet - Return true iff the given type uses 'fpret' when
   /// used as a return type.
   bool ReturnTypeUsesFPRet(QualType ResultType);
@@ -1009,12 +1019,11 @@
 private:
   llvm::GlobalValue *GetGlobalValue(StringRef Ref);
 
-  llvm::Constant *GetOrCreateLLVMFunction(StringRef MangledName,
-                                          llvm::Type *Ty,
-                                          GlobalDecl D,
-                                          bool ForVTable,
-                                          llvm::AttributeSet ExtraAttrs =
-                                            llvm::AttributeSet());
+  llvm::Constant *
+  GetOrCreateLLVMFunction(StringRef MangledName, llvm::Type *Ty, GlobalDecl D,
+                          bool ForVTable, bool DontDefer = false,
+                          llvm::AttributeSet ExtraAttrs = llvm::AttributeSet());
+
   llvm::Constant *GetOrCreateLLVMGlobal(StringRef MangledName,
                                         llvm::PointerType *PTy,
                                         const VarDecl *D,
@@ -1037,9 +1046,9 @@
                              llvm::Function *F,
                              bool IsIncompleteFunction);
 
-  void EmitGlobalDefinition(GlobalDecl D);
+  void EmitGlobalDefinition(GlobalDecl D, llvm::GlobalValue *GV = 0);
 
-  void EmitGlobalFunctionDefinition(GlobalDecl GD);
+  void EmitGlobalFunctionDefinition(GlobalDecl GD, llvm::GlobalValue *GV);
   void EmitGlobalVarDefinition(const VarDecl *D);
   void EmitAliasDefinition(GlobalDecl GD);
   void EmitObjCPropertyImplementations(const ObjCImplementationDecl *D);
@@ -1104,9 +1113,8 @@
   /// still have a use for.
   void EmitDeferredVTables();
 
-  /// EmitLLVMUsed - Emit the llvm.used metadata used to force
-  /// references to global which may otherwise be optimized out.
-  void EmitLLVMUsed();
+  /// Emit the llvm.used and llvm.compiler.used metadata.
+  void emitLLVMUsed();
 
   /// \brief Emit the link options introduced by imported modules.
   void EmitModuleLinkOptions();
diff --git a/lib/CodeGen/CodeGenPGO.cpp b/lib/CodeGen/CodeGenPGO.cpp
new file mode 100644
index 0000000..ac6b7bc
--- /dev/null
+++ b/lib/CodeGen/CodeGenPGO.cpp
@@ -0,0 +1,1033 @@
+//===--- CodeGenPGO.cpp - PGO Instrumentation for LLVM CodeGen --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Instrumentation-based profile-guided optimization
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenPGO.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/StmtVisitor.h"
+#include "llvm/Config/config.h" // for strtoull()/strtoll() define
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/FileSystem.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+static void ReportBadPGOData(CodeGenModule &CGM, const char *Message) {
+  DiagnosticsEngine &Diags = CGM.getDiags();
+  unsigned diagID = Diags.getCustomDiagID(DiagnosticsEngine::Error, "%0");
+  Diags.Report(diagID) << Message;
+}
+
+PGOProfileData::PGOProfileData(CodeGenModule &CGM, std::string Path)
+  : CGM(CGM) {
+  if (llvm::MemoryBuffer::getFile(Path, DataBuffer)) {
+    ReportBadPGOData(CGM, "failed to open pgo data file");
+    return;
+  }
+
+  if (DataBuffer->getBufferSize() > std::numeric_limits<unsigned>::max()) {
+    ReportBadPGOData(CGM, "pgo data file too big");
+    return;
+  }
+
+  // Scan through the data file and map each function to the corresponding
+  // file offset where its counts are stored.
+  const char *BufferStart = DataBuffer->getBufferStart();
+  const char *BufferEnd = DataBuffer->getBufferEnd();
+  const char *CurPtr = BufferStart;
+  uint64_t MaxCount = 0;
+  while (CurPtr < BufferEnd) {
+    // Read the function name.
+    const char *FuncStart = CurPtr;
+    // The name occupies the entire line, so it may safely contain whitespace,
+    // as in Objective-C method names. (This is a temporary hack since we are
+    // going to completely replace this file format in the near future.)
+    CurPtr = strchr(CurPtr, '\n');
+    if (!CurPtr) {
+      ReportBadPGOData(CGM, "pgo data file has malformed function entry");
+      return;
+    }
+    StringRef FuncName(FuncStart, CurPtr - FuncStart);
+
+    // Skip over the function hash.
+    CurPtr = strchr(++CurPtr, '\n');
+    if (!CurPtr) {
+      ReportBadPGOData(CGM, "pgo data file is missing the function hash");
+      return;
+    }
+
+    // Read the number of counters.
+    char *EndPtr;
+    unsigned NumCounters = strtol(++CurPtr, &EndPtr, 10);
+    if (EndPtr == CurPtr || *EndPtr != '\n' || NumCounters == 0) {
+      ReportBadPGOData(CGM, "pgo data file has unexpected number of counters");
+      return;
+    }
+    CurPtr = EndPtr;
+
+    // Read function count.
+    uint64_t Count = strtoll(CurPtr, &EndPtr, 10);
+    if (EndPtr == CurPtr || *EndPtr != '\n') {
+      ReportBadPGOData(CGM, "pgo-data file has bad count value");
+      return;
+    }
+    CurPtr = EndPtr; // Point to '\n'.
+    FunctionCounts[FuncName] = Count;
+    MaxCount = Count > MaxCount ? Count : MaxCount;
+
+    // There is one line for each counter; skip over those lines.
+    // Since the function count has already been read, start the loop at 1.
+    for (unsigned N = 1; N < NumCounters; ++N) {
+      CurPtr = strchr(++CurPtr, '\n');
+      if (!CurPtr) {
+        ReportBadPGOData(CGM, "pgo data file is missing some counter info");
+        return;
+      }
+    }
+
+    // Skip over the blank line separating functions.
+    CurPtr += 2;
+
+    DataOffsets[FuncName] = FuncStart - BufferStart;
+  }
+  MaxFunctionCount = MaxCount;
+}
+
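+// A sketch of the layout this parser expects (values are illustrative; the
+// hash currently equals the counter count, see mapRegionCounters):
+//
+//   main            <- function name, occupying the whole line
+//   2               <- function hash
+//   2               <- number of counters
+//   100             <- counter 0, the function entry count
+//   42              <- counter 1
+//                   <- blank line separating functions
+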
+bool PGOProfileData::getFunctionCounts(StringRef FuncName, uint64_t &FuncHash,
+                                       std::vector<uint64_t> &Counts) {
+  // Find the relevant section of the pgo-data file.
+  llvm::StringMap<unsigned>::const_iterator OffsetIter =
+    DataOffsets.find(FuncName);
+  if (OffsetIter == DataOffsets.end())
+    return true;
+  const char *CurPtr = DataBuffer->getBufferStart() + OffsetIter->getValue();
+
+  // Skip over the function name.
+  CurPtr = strchr(CurPtr, '\n');
+  assert(CurPtr && "pgo-data has corrupted function entry");
+
+  char *EndPtr;
+  // Read the function hash.
+  FuncHash = strtoll(++CurPtr, &EndPtr, 10);
+  assert(EndPtr != CurPtr && *EndPtr == '\n' &&
+         "pgo-data file has corrupted function hash");
+  CurPtr = EndPtr;
+
+  // Read the number of counters.
+  unsigned NumCounters = strtol(++CurPtr, &EndPtr, 10);
+  assert(EndPtr != CurPtr && *EndPtr == '\n' && NumCounters > 0 &&
+         "pgo-data file has corrupted number of counters");
+  CurPtr = EndPtr;
+
+  Counts.reserve(NumCounters);
+
+  for (unsigned N = 0; N < NumCounters; ++N) {
+    // Read the count value.
+    uint64_t Count = strtoll(CurPtr, &EndPtr, 10);
+    if (EndPtr == CurPtr || *EndPtr != '\n') {
+      ReportBadPGOData(CGM, "pgo-data file has bad count value");
+      return true;
+    }
+    Counts.push_back(Count);
+    CurPtr = EndPtr + 1;
+  }
+
+  // Make sure the number of counters matches up.
+  if (Counts.size() != NumCounters) {
+    ReportBadPGOData(CGM, "pgo-data file has inconsistent counters");
+    return true;
+  }
+
+  return false;
+}
+
+void CodeGenPGO::setFuncName(llvm::Function *Fn) {
+  RawFuncName = Fn->getName();
+
+  // Function names may be prefixed with a binary '\1' to indicate
+  // that the backend should not modify the symbols due to any platform
+  // naming convention. Do not include that '\1' in the PGO profile name.
+  if (RawFuncName[0] == '\1')
+    RawFuncName = RawFuncName.substr(1);
+
+  if (!Fn->hasLocalLinkage()) {
+    PrefixedFuncName.reset(new std::string(RawFuncName));
+    return;
+  }
+
+  // For local symbols, prepend the main file name to distinguish them.
+  // Do not include the full path in the file name since there's no guarantee
+  // that it will stay the same, e.g., if the files are checked out from
+  // version control in different locations.
+  PrefixedFuncName.reset(new std::string(CGM.getCodeGenOpts().MainFileName));
+  if (PrefixedFuncName->empty())
+    PrefixedFuncName->assign("<unknown>");
+  PrefixedFuncName->append(":");
+  PrefixedFuncName->append(RawFuncName);
+}
+
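+// For example (illustrative): a static function "bar" compiled with
+// -main-file-name foo.c is identified in the profile as "foo.c:bar", while an
+// external "bar" keeps the plain name "bar".
+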
+static llvm::Function *getRegisterFunc(CodeGenModule &CGM) {
+  return CGM.getModule().getFunction("__llvm_profile_register_functions");
+}
+
+static llvm::BasicBlock *getOrInsertRegisterBB(CodeGenModule &CGM) {
+  // Don't do this for Darwin.  compiler-rt uses linker magic.
+  if (CGM.getTarget().getTriple().isOSDarwin())
+    return nullptr;
+
+  // Only need to insert this once per module.
+  if (llvm::Function *RegisterF = getRegisterFunc(CGM))
+    return &RegisterF->getEntryBlock();
+
+  // Construct the function.
+  auto *VoidTy = llvm::Type::getVoidTy(CGM.getLLVMContext());
+  auto *RegisterFTy = llvm::FunctionType::get(VoidTy, false);
+  auto *RegisterF = llvm::Function::Create(RegisterFTy,
+                                           llvm::GlobalValue::InternalLinkage,
+                                           "__llvm_profile_register_functions",
+                                           &CGM.getModule());
+  RegisterF->setUnnamedAddr(true);
+  if (CGM.getCodeGenOpts().DisableRedZone)
+    RegisterF->addFnAttr(llvm::Attribute::NoRedZone);
+
+  // Construct and return the entry block.
+  auto *BB = llvm::BasicBlock::Create(CGM.getLLVMContext(), "", RegisterF);
+  CGBuilderTy Builder(BB);
+  Builder.CreateRetVoid();
+  return BB;
+}
+
+static llvm::Constant *getOrInsertRuntimeRegister(CodeGenModule &CGM) {
+  auto *VoidTy = llvm::Type::getVoidTy(CGM.getLLVMContext());
+  auto *VoidPtrTy = llvm::Type::getInt8PtrTy(CGM.getLLVMContext());
+  auto *RuntimeRegisterTy = llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
+  return CGM.getModule().getOrInsertFunction("__llvm_profile_register_function",
+                                             RuntimeRegisterTy);
+}
+
+static bool isMachO(const CodeGenModule &CGM) {
+  return CGM.getTarget().getTriple().isOSBinFormatMachO();
+}
+
+static StringRef getCountersSection(const CodeGenModule &CGM) {
+  return isMachO(CGM) ? "__DATA,__llvm_prf_cnts" : "__llvm_prf_cnts";
+}
+
+static StringRef getNameSection(const CodeGenModule &CGM) {
+  return isMachO(CGM) ? "__DATA,__llvm_prf_names" : "__llvm_prf_names";
+}
+
+static StringRef getDataSection(const CodeGenModule &CGM) {
+  return isMachO(CGM) ? "__DATA,__llvm_prf_data" : "__llvm_prf_data";
+}
+
+llvm::GlobalVariable *CodeGenPGO::buildDataVar() {
+  // Create name variable.
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+  auto *VarName = llvm::ConstantDataArray::getString(Ctx, getFuncName(),
+                                                     false);
+  auto *Name = new llvm::GlobalVariable(CGM.getModule(), VarName->getType(),
+                                        true, VarLinkage, VarName,
+                                        getFuncVarName("name"));
+  Name->setSection(getNameSection(CGM));
+  Name->setAlignment(1);
+
+  // Create data variable.
+  auto *Int32Ty = llvm::Type::getInt32Ty(Ctx);
+  auto *Int64Ty = llvm::Type::getInt64Ty(Ctx);
+  auto *Int8PtrTy = llvm::Type::getInt8PtrTy(Ctx);
+  auto *Int64PtrTy = llvm::Type::getInt64PtrTy(Ctx);
+  llvm::Type *DataTypes[] = {
+    Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int64PtrTy
+  };
+  auto *DataTy = llvm::StructType::get(Ctx, makeArrayRef(DataTypes));
+  llvm::Constant *DataVals[] = {
+    llvm::ConstantInt::get(Int32Ty, getFuncName().size()),
+    llvm::ConstantInt::get(Int32Ty, NumRegionCounters),
+    llvm::ConstantInt::get(Int64Ty, FunctionHash),
+    llvm::ConstantExpr::getBitCast(Name, Int8PtrTy),
+    llvm::ConstantExpr::getBitCast(RegionCounters, Int64PtrTy)
+  };
+  auto *Data =
+    new llvm::GlobalVariable(CGM.getModule(), DataTy, true, VarLinkage,
+                             llvm::ConstantStruct::get(DataTy, DataVals),
+                             getFuncVarName("data"));
+
+  // All the data should be packed into an array in its own section.
+  Data->setSection(getDataSection(CGM));
+  Data->setAlignment(8);
+
+  // Make sure the data doesn't get deleted.
+  CGM.addUsedGlobal(Data);
+  return Data;
+}
+
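+// Roughly, for a function "foo" with three counters this produces the
+// following (hypothetical IR, modulo linkage and exact attributes):
+//   @__llvm_profile_name_foo = constant [3 x i8] c"foo",
+//       section "__llvm_prf_names", align 1
+//   @__llvm_profile_data_foo = constant { i32, i32, i64, i8*, i64* }
+//       { i32 3, i32 3, i64 <hash>, i8* <name>, i64* <counters> },
+//       section "__llvm_prf_data", align 8
+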
+void CodeGenPGO::emitInstrumentationData() {
+  if (!CGM.getCodeGenOpts().ProfileInstrGenerate)
+    return;
+
+  // Build the data.
+  auto *Data = buildDataVar();
+
+  // Register the data.
+  auto *RegisterBB = getOrInsertRegisterBB(CGM);
+  if (!RegisterBB)
+    return;
+  CGBuilderTy Builder(RegisterBB->getTerminator());
+  auto *VoidPtrTy = llvm::Type::getInt8PtrTy(CGM.getLLVMContext());
+  Builder.CreateCall(getOrInsertRuntimeRegister(CGM),
+                     Builder.CreateBitCast(Data, VoidPtrTy));
+}
+
+llvm::Function *CodeGenPGO::emitInitialization(CodeGenModule &CGM) {
+  if (!CGM.getCodeGenOpts().ProfileInstrGenerate)
+    return nullptr;
+
+  // Only need to create this once per module.
+  if (CGM.getModule().getFunction("__llvm_profile_init"))
+    return nullptr;
+
+  // Get the function to call at initialization.
+  llvm::Constant *RegisterF = getRegisterFunc(CGM);
+  if (!RegisterF)
+    return nullptr;
+
+  // Create the initialization function.
+  auto *VoidTy = llvm::Type::getVoidTy(CGM.getLLVMContext());
+  auto *F = llvm::Function::Create(llvm::FunctionType::get(VoidTy, false),
+                                   llvm::GlobalValue::InternalLinkage,
+                                   "__llvm_profile_init", &CGM.getModule());
+  F->setUnnamedAddr(true);
+  F->addFnAttr(llvm::Attribute::NoInline);
+  if (CGM.getCodeGenOpts().DisableRedZone)
+    F->addFnAttr(llvm::Attribute::NoRedZone);
+
+  // Add the basic block and the necessary calls.
+  CGBuilderTy Builder(llvm::BasicBlock::Create(CGM.getLLVMContext(), "", F));
+  Builder.CreateCall(RegisterF);
+  Builder.CreateRetVoid();
+
+  return F;
+}
+
+namespace {
+  /// A StmtVisitor that fills a map of statements to PGO counters.
+  struct MapRegionCounters : public ConstStmtVisitor<MapRegionCounters> {
+    /// The next counter value to assign.
+    unsigned NextCounter;
+    /// The map of statements to counters.
+    llvm::DenseMap<const Stmt *, unsigned> &CounterMap;
+
+    MapRegionCounters(llvm::DenseMap<const Stmt *, unsigned> &CounterMap)
+        : NextCounter(0), CounterMap(CounterMap) {}
+
+    void VisitChildren(const Stmt *S) {
+      for (Stmt::const_child_range I = S->children(); I; ++I)
+        if (*I)
+          this->Visit(*I);
+    }
+    void VisitStmt(const Stmt *S) { VisitChildren(S); }
+
+    /// Assign a counter to track entry to the function body.
+    void VisitFunctionDecl(const FunctionDecl *S) {
+      CounterMap[S->getBody()] = NextCounter++;
+      Visit(S->getBody());
+    }
+    void VisitObjCMethodDecl(const ObjCMethodDecl *S) {
+      CounterMap[S->getBody()] = NextCounter++;
+      Visit(S->getBody());
+    }
+    void VisitBlockDecl(const BlockDecl *S) {
+      CounterMap[S->getBody()] = NextCounter++;
+      Visit(S->getBody());
+    }
+    /// Assign a counter to track the block following a label.
+    void VisitLabelStmt(const LabelStmt *S) {
+      CounterMap[S] = NextCounter++;
+      Visit(S->getSubStmt());
+    }
+    /// Assign a counter for the body of a while loop.
+    void VisitWhileStmt(const WhileStmt *S) {
+      CounterMap[S] = NextCounter++;
+      Visit(S->getCond());
+      Visit(S->getBody());
+    }
+    /// Assign a counter for the body of a do-while loop.
+    void VisitDoStmt(const DoStmt *S) {
+      CounterMap[S] = NextCounter++;
+      Visit(S->getBody());
+      Visit(S->getCond());
+    }
+    /// Assign a counter for the body of a for loop.
+    void VisitForStmt(const ForStmt *S) {
+      CounterMap[S] = NextCounter++;
+      if (S->getInit())
+        Visit(S->getInit());
+      const Expr *E;
+      if ((E = S->getCond()))
+        Visit(E);
+      if ((E = S->getInc()))
+        Visit(E);
+      Visit(S->getBody());
+    }
+    /// Assign a counter for the body of a for-range loop.
+    void VisitCXXForRangeStmt(const CXXForRangeStmt *S) {
+      CounterMap[S] = NextCounter++;
+      Visit(S->getRangeStmt());
+      Visit(S->getBeginEndStmt());
+      Visit(S->getCond());
+      Visit(S->getLoopVarStmt());
+      Visit(S->getBody());
+      Visit(S->getInc());
+    }
+    /// Assign a counter for the body of a for-collection loop.
+    void VisitObjCForCollectionStmt(const ObjCForCollectionStmt *S) {
+      CounterMap[S] = NextCounter++;
+      Visit(S->getElement());
+      Visit(S->getBody());
+    }
+    /// Assign a counter for the exit block of the switch statement.
+    void VisitSwitchStmt(const SwitchStmt *S) {
+      CounterMap[S] = NextCounter++;
+      Visit(S->getCond());
+      Visit(S->getBody());
+    }
+    /// Assign a counter for a particular case in a switch. This counts jumps
+    /// from the switch header as well as fallthrough from the case before this
+    /// one.
+    void VisitCaseStmt(const CaseStmt *S) {
+      CounterMap[S] = NextCounter++;
+      Visit(S->getSubStmt());
+    }
+    /// Assign a counter for the default case of a switch statement. The count
+    /// is the number of branches from the switch header to the default, and does
+    /// not include fallthrough from previous cases. If we have multiple
+    /// conditional branch blocks from the switch instruction to the default
+    /// block, as with large GNU case ranges, this is the counter for the last
+    /// edge in that series, rather than the first.
+    void VisitDefaultStmt(const DefaultStmt *S) {
+      CounterMap[S] = NextCounter++;
+      Visit(S->getSubStmt());
+    }
+    /// Assign a counter for the "then" part of an if statement. The count for
+    /// the "else" part, if it exists, will be calculated from this counter.
+    void VisitIfStmt(const IfStmt *S) {
+      CounterMap[S] = NextCounter++;
+      Visit(S->getCond());
+      Visit(S->getThen());
+      if (S->getElse())
+        Visit(S->getElse());
+    }
+    /// Assign a counter for the continuation block of a C++ try statement.
+    void VisitCXXTryStmt(const CXXTryStmt *S) {
+      CounterMap[S] = NextCounter++;
+      Visit(S->getTryBlock());
+      for (unsigned I = 0, E = S->getNumHandlers(); I < E; ++I)
+        Visit(S->getHandler(I));
+    }
+    /// Assign a counter for a catch statement's handler block.
+    void VisitCXXCatchStmt(const CXXCatchStmt *S) {
+      CounterMap[S] = NextCounter++;
+      Visit(S->getHandlerBlock());
+    }
+    /// Assign a counter for the "true" part of a conditional operator. The
+    /// count in the "false" part will be calculated from this counter.
+    void VisitConditionalOperator(const ConditionalOperator *E) {
+      CounterMap[E] = NextCounter++;
+      Visit(E->getCond());
+      Visit(E->getTrueExpr());
+      Visit(E->getFalseExpr());
+    }
+    /// Assign a counter for the right hand side of a logical and operator.
+    void VisitBinLAnd(const BinaryOperator *E) {
+      CounterMap[E] = NextCounter++;
+      Visit(E->getLHS());
+      Visit(E->getRHS());
+    }
+    /// Assign a counter for the right hand side of a logical or operator.
+    void VisitBinLOr(const BinaryOperator *E) {
+      CounterMap[E] = NextCounter++;
+      Visit(E->getLHS());
+      Visit(E->getRHS());
+    }
+  };
+
+  /// A StmtVisitor that propagates the raw counts through the AST and
+  /// records the count at statements where the value may change.
+  struct ComputeRegionCounts : public ConstStmtVisitor<ComputeRegionCounts> {
+    /// PGO state.
+    CodeGenPGO &PGO;
+
+    /// A flag that is set when the current count should be recorded on the
+    /// next statement, such as at the exit of a loop.
+    bool RecordNextStmtCount;
+
+    /// The map of statements to count values.
+    llvm::DenseMap<const Stmt *, uint64_t> &CountMap;
+
+    /// BreakContinueStack - Keep counts of breaks and continues inside loops.
+    struct BreakContinue {
+      uint64_t BreakCount;
+      uint64_t ContinueCount;
+      BreakContinue() : BreakCount(0), ContinueCount(0) {}
+    };
+    SmallVector<BreakContinue, 8> BreakContinueStack;
+
+    ComputeRegionCounts(llvm::DenseMap<const Stmt *, uint64_t> &CountMap,
+                        CodeGenPGO &PGO)
+        : PGO(PGO), RecordNextStmtCount(false), CountMap(CountMap) {}
+
+    void RecordStmtCount(const Stmt *S) {
+      if (RecordNextStmtCount) {
+        CountMap[S] = PGO.getCurrentRegionCount();
+        RecordNextStmtCount = false;
+      }
+    }
+
+    void VisitStmt(const Stmt *S) {
+      RecordStmtCount(S);
+      for (Stmt::const_child_range I = S->children(); I; ++I) {
+        if (*I)
+          this->Visit(*I);
+      }
+    }
+
+    void VisitFunctionDecl(const FunctionDecl *S) {
+      RegionCounter Cnt(PGO, S->getBody());
+      Cnt.beginRegion();
+      CountMap[S->getBody()] = PGO.getCurrentRegionCount();
+      Visit(S->getBody());
+    }
+
+    void VisitObjCMethodDecl(const ObjCMethodDecl *S) {
+      RegionCounter Cnt(PGO, S->getBody());
+      Cnt.beginRegion();
+      CountMap[S->getBody()] = PGO.getCurrentRegionCount();
+      Visit(S->getBody());
+    }
+
+    void VisitBlockDecl(const BlockDecl *S) {
+      RegionCounter Cnt(PGO, S->getBody());
+      Cnt.beginRegion();
+      CountMap[S->getBody()] = PGO.getCurrentRegionCount();
+      Visit(S->getBody());
+    }
+
+    void VisitReturnStmt(const ReturnStmt *S) {
+      RecordStmtCount(S);
+      if (S->getRetValue())
+        Visit(S->getRetValue());
+      PGO.setCurrentRegionUnreachable();
+      RecordNextStmtCount = true;
+    }
+
+    void VisitGotoStmt(const GotoStmt *S) {
+      RecordStmtCount(S);
+      PGO.setCurrentRegionUnreachable();
+      RecordNextStmtCount = true;
+    }
+
+    void VisitLabelStmt(const LabelStmt *S) {
+      RecordNextStmtCount = false;
+      RegionCounter Cnt(PGO, S);
+      Cnt.beginRegion();
+      CountMap[S] = PGO.getCurrentRegionCount();
+      Visit(S->getSubStmt());
+    }
+
+    void VisitBreakStmt(const BreakStmt *S) {
+      RecordStmtCount(S);
+      assert(!BreakContinueStack.empty() && "break not in a loop or switch!");
+      BreakContinueStack.back().BreakCount += PGO.getCurrentRegionCount();
+      PGO.setCurrentRegionUnreachable();
+      RecordNextStmtCount = true;
+    }
+
+    void VisitContinueStmt(const ContinueStmt *S) {
+      RecordStmtCount(S);
+      assert(!BreakContinueStack.empty() && "continue stmt not in a loop!");
+      BreakContinueStack.back().ContinueCount += PGO.getCurrentRegionCount();
+      PGO.setCurrentRegionUnreachable();
+      RecordNextStmtCount = true;
+    }
+
+    void VisitWhileStmt(const WhileStmt *S) {
+      RecordStmtCount(S);
+      RegionCounter Cnt(PGO, S);
+      BreakContinueStack.push_back(BreakContinue());
+      // Visit the body region first so the break/continue adjustments can be
+      // included when visiting the condition.
+      Cnt.beginRegion();
+      CountMap[S->getBody()] = PGO.getCurrentRegionCount();
+      Visit(S->getBody());
+      Cnt.adjustForControlFlow();
+
+      // ...then go back and propagate counts through the condition. The count
+      // at the start of the condition is the sum of the incoming edges,
+      // the backedge from the end of the loop body, and the edges from
+      // continue statements.
+      BreakContinue BC = BreakContinueStack.pop_back_val();
+      Cnt.setCurrentRegionCount(Cnt.getParentCount() +
+                                Cnt.getAdjustedCount() + BC.ContinueCount);
+      CountMap[S->getCond()] = PGO.getCurrentRegionCount();
+      Visit(S->getCond());
+      Cnt.adjustForControlFlow();
+      Cnt.applyAdjustmentsToRegion(BC.BreakCount + BC.ContinueCount);
+      RecordNextStmtCount = true;
+    }
+
+    void VisitDoStmt(const DoStmt *S) {
+      RecordStmtCount(S);
+      RegionCounter Cnt(PGO, S);
+      BreakContinueStack.push_back(BreakContinue());
+      Cnt.beginRegion(/*AddIncomingFallThrough=*/true);
+      CountMap[S->getBody()] = PGO.getCurrentRegionCount();
+      Visit(S->getBody());
+      Cnt.adjustForControlFlow();
+
+      BreakContinue BC = BreakContinueStack.pop_back_val();
+      // The count at the start of the condition is equal to the count at the
+      // end of the body. The adjusted count does not include either the
+      // fall-through count coming into the loop or the continue count, so add
+      // both of those separately. This is coincidentally the same equation as
+      // with while loops but for different reasons.
+      Cnt.setCurrentRegionCount(Cnt.getParentCount() +
+                                Cnt.getAdjustedCount() + BC.ContinueCount);
+      CountMap[S->getCond()] = PGO.getCurrentRegionCount();
+      Visit(S->getCond());
+      Cnt.adjustForControlFlow();
+      Cnt.applyAdjustmentsToRegion(BC.BreakCount + BC.ContinueCount);
+      RecordNextStmtCount = true;
+    }
+
+    void VisitForStmt(const ForStmt *S) {
+      RecordStmtCount(S);
+      if (S->getInit())
+        Visit(S->getInit());
+      RegionCounter Cnt(PGO, S);
+      BreakContinueStack.push_back(BreakContinue());
+      // Visit the body region first. (This is basically the same as a while
+      // loop; see further comments in VisitWhileStmt.)
+      Cnt.beginRegion();
+      CountMap[S->getBody()] = PGO.getCurrentRegionCount();
+      Visit(S->getBody());
+      Cnt.adjustForControlFlow();
+
+      // The increment is essentially part of the body but it needs to include
+      // the count for all the continue statements.
+      if (S->getInc()) {
+        Cnt.setCurrentRegionCount(PGO.getCurrentRegionCount() +
+                                  BreakContinueStack.back().ContinueCount);
+        CountMap[S->getInc()] = PGO.getCurrentRegionCount();
+        Visit(S->getInc());
+        Cnt.adjustForControlFlow();
+      }
+
+      BreakContinue BC = BreakContinueStack.pop_back_val();
+
+      // ...then go back and propagate counts through the condition.
+      if (S->getCond()) {
+        Cnt.setCurrentRegionCount(Cnt.getParentCount() +
+                                  Cnt.getAdjustedCount() +
+                                  BC.ContinueCount);
+        CountMap[S->getCond()] = PGO.getCurrentRegionCount();
+        Visit(S->getCond());
+        Cnt.adjustForControlFlow();
+      }
+      Cnt.applyAdjustmentsToRegion(BC.BreakCount + BC.ContinueCount);
+      RecordNextStmtCount = true;
+    }
+
+    void VisitCXXForRangeStmt(const CXXForRangeStmt *S) {
+      RecordStmtCount(S);
+      Visit(S->getRangeStmt());
+      Visit(S->getBeginEndStmt());
+      RegionCounter Cnt(PGO, S);
+      BreakContinueStack.push_back(BreakContinue());
+      // Visit the body region first. (This is basically the same as a while
+      // loop; see further comments in VisitWhileStmt.)
+      Cnt.beginRegion();
+      CountMap[S->getLoopVarStmt()] = PGO.getCurrentRegionCount();
+      Visit(S->getLoopVarStmt());
+      Visit(S->getBody());
+      Cnt.adjustForControlFlow();
+
+      // The increment is essentially part of the body but it needs to include
+      // the count for all the continue statements.
+      Cnt.setCurrentRegionCount(PGO.getCurrentRegionCount() +
+                                BreakContinueStack.back().ContinueCount);
+      CountMap[S->getInc()] = PGO.getCurrentRegionCount();
+      Visit(S->getInc());
+      Cnt.adjustForControlFlow();
+
+      BreakContinue BC = BreakContinueStack.pop_back_val();
+
+      // ...then go back and propagate counts through the condition.
+      Cnt.setCurrentRegionCount(Cnt.getParentCount() +
+                                Cnt.getAdjustedCount() +
+                                BC.ContinueCount);
+      CountMap[S->getCond()] = PGO.getCurrentRegionCount();
+      Visit(S->getCond());
+      Cnt.adjustForControlFlow();
+      Cnt.applyAdjustmentsToRegion(BC.BreakCount + BC.ContinueCount);
+      RecordNextStmtCount = true;
+    }
+
+    void VisitObjCForCollectionStmt(const ObjCForCollectionStmt *S) {
+      RecordStmtCount(S);
+      Visit(S->getElement());
+      RegionCounter Cnt(PGO, S);
+      BreakContinueStack.push_back(BreakContinue());
+      Cnt.beginRegion();
+      CountMap[S->getBody()] = PGO.getCurrentRegionCount();
+      Visit(S->getBody());
+      BreakContinue BC = BreakContinueStack.pop_back_val();
+      Cnt.adjustForControlFlow();
+      Cnt.applyAdjustmentsToRegion(BC.BreakCount + BC.ContinueCount);
+      RecordNextStmtCount = true;
+    }
+
+    void VisitSwitchStmt(const SwitchStmt *S) {
+      RecordStmtCount(S);
+      Visit(S->getCond());
+      PGO.setCurrentRegionUnreachable();
+      BreakContinueStack.push_back(BreakContinue());
+      Visit(S->getBody());
+      // If the switch is inside a loop, add the continue counts.
+      BreakContinue BC = BreakContinueStack.pop_back_val();
+      if (!BreakContinueStack.empty())
+        BreakContinueStack.back().ContinueCount += BC.ContinueCount;
+      RegionCounter ExitCnt(PGO, S);
+      ExitCnt.beginRegion();
+      RecordNextStmtCount = true;
+    }
+
+    void VisitCaseStmt(const CaseStmt *S) {
+      RecordNextStmtCount = false;
+      RegionCounter Cnt(PGO, S);
+      Cnt.beginRegion(/*AddIncomingFallThrough=*/true);
+      CountMap[S] = Cnt.getCount();
+      RecordNextStmtCount = true;
+      Visit(S->getSubStmt());
+    }
+
+    void VisitDefaultStmt(const DefaultStmt *S) {
+      RecordNextStmtCount = false;
+      RegionCounter Cnt(PGO, S);
+      Cnt.beginRegion(/*AddIncomingFallThrough=*/true);
+      CountMap[S] = Cnt.getCount();
+      RecordNextStmtCount = true;
+      Visit(S->getSubStmt());
+    }
+
+    void VisitIfStmt(const IfStmt *S) {
+      RecordStmtCount(S);
+      RegionCounter Cnt(PGO, S);
+      Visit(S->getCond());
+
+      Cnt.beginRegion();
+      CountMap[S->getThen()] = PGO.getCurrentRegionCount();
+      Visit(S->getThen());
+      Cnt.adjustForControlFlow();
+
+      if (S->getElse()) {
+        Cnt.beginElseRegion();
+        CountMap[S->getElse()] = PGO.getCurrentRegionCount();
+        Visit(S->getElse());
+        Cnt.adjustForControlFlow();
+      }
+      Cnt.applyAdjustmentsToRegion(0);
+      RecordNextStmtCount = true;
+    }
+
+    void VisitCXXTryStmt(const CXXTryStmt *S) {
+      RecordStmtCount(S);
+      Visit(S->getTryBlock());
+      for (unsigned I = 0, E = S->getNumHandlers(); I < E; ++I)
+        Visit(S->getHandler(I));
+      RegionCounter Cnt(PGO, S);
+      Cnt.beginRegion();
+      RecordNextStmtCount = true;
+    }
+
+    void VisitCXXCatchStmt(const CXXCatchStmt *S) {
+      RecordNextStmtCount = false;
+      RegionCounter Cnt(PGO, S);
+      Cnt.beginRegion();
+      CountMap[S] = PGO.getCurrentRegionCount();
+      Visit(S->getHandlerBlock());
+    }
+
+    void VisitConditionalOperator(const ConditionalOperator *E) {
+      RecordStmtCount(E);
+      RegionCounter Cnt(PGO, E);
+      Visit(E->getCond());
+
+      Cnt.beginRegion();
+      CountMap[E->getTrueExpr()] = PGO.getCurrentRegionCount();
+      Visit(E->getTrueExpr());
+      Cnt.adjustForControlFlow();
+
+      Cnt.beginElseRegion();
+      CountMap[E->getFalseExpr()] = PGO.getCurrentRegionCount();
+      Visit(E->getFalseExpr());
+      Cnt.adjustForControlFlow();
+
+      Cnt.applyAdjustmentsToRegion(0);
+      RecordNextStmtCount = true;
+    }
+
+    void VisitBinLAnd(const BinaryOperator *E) {
+      RecordStmtCount(E);
+      RegionCounter Cnt(PGO, E);
+      Visit(E->getLHS());
+      Cnt.beginRegion();
+      CountMap[E->getRHS()] = PGO.getCurrentRegionCount();
+      Visit(E->getRHS());
+      Cnt.adjustForControlFlow();
+      Cnt.applyAdjustmentsToRegion(0);
+      RecordNextStmtCount = true;
+    }
+
+    void VisitBinLOr(const BinaryOperator *E) {
+      RecordStmtCount(E);
+      RegionCounter Cnt(PGO, E);
+      Visit(E->getLHS());
+      Cnt.beginRegion();
+      CountMap[E->getRHS()] = PGO.getCurrentRegionCount();
+      Visit(E->getRHS());
+      Cnt.adjustForControlFlow();
+      Cnt.applyAdjustmentsToRegion(0);
+      RecordNextStmtCount = true;
+    }
+  };
+}
+
+static void emitRuntimeHook(CodeGenModule &CGM) {
+  LLVM_CONSTEXPR const char *RuntimeVarName = "__llvm_profile_runtime";
+  LLVM_CONSTEXPR const char *RuntimeUserName = "__llvm_profile_runtime_user";
+  if (CGM.getModule().getGlobalVariable(RuntimeVarName))
+    return;
+
+  // Declare the runtime hook.
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+  auto *Int32Ty = llvm::Type::getInt32Ty(Ctx);
+  auto *Var = new llvm::GlobalVariable(CGM.getModule(), Int32Ty, false,
+                                       llvm::GlobalValue::ExternalLinkage,
+                                       nullptr, RuntimeVarName);
+
+  // Make a function that uses it.
+  auto *User = llvm::Function::Create(llvm::FunctionType::get(Int32Ty, false),
+                                      llvm::GlobalValue::LinkOnceODRLinkage,
+                                      RuntimeUserName, &CGM.getModule());
+  User->addFnAttr(llvm::Attribute::NoInline);
+  if (CGM.getCodeGenOpts().DisableRedZone)
+    User->addFnAttr(llvm::Attribute::NoRedZone);
+  CGBuilderTy Builder(llvm::BasicBlock::Create(CGM.getLLVMContext(), "", User));
+  auto *Load = Builder.CreateLoad(Var);
+  Builder.CreateRet(Load);
+
+  // Create a use of the function.  Now the definition of the runtime variable
+  // should get pulled in, along with any static initializers.
+  CGM.addUsedGlobal(User);
+}
+
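+// The net effect is roughly the following (hypothetical IR in 3.5-era
+// syntax):
+//   @__llvm_profile_runtime = external global i32
+//   define linkonce_odr i32 @__llvm_profile_runtime_user() noinline {
+//     %1 = load i32* @__llvm_profile_runtime
+//     ret i32 %1
+//   }
+// Keeping the user function alive via llvm.used forces the definition of the
+// runtime variable, and with it the profiling runtime, to be linked in.
+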
+void CodeGenPGO::assignRegionCounters(const Decl *D, llvm::Function *Fn) {
+  bool InstrumentRegions = CGM.getCodeGenOpts().ProfileInstrGenerate;
+  PGOProfileData *PGOData = CGM.getPGOData();
+  if (!InstrumentRegions && !PGOData)
+    return;
+  if (!D)
+    return;
+  setFuncName(Fn);
+
+  // Set the linkage for variables based on the function linkage.  Usually, we
+  // want to match it, but available_externally and extern_weak both have the
+  // wrong semantics.
+  VarLinkage = Fn->getLinkage();
+  switch (VarLinkage) {
+  case llvm::GlobalValue::ExternalWeakLinkage:
+    VarLinkage = llvm::GlobalValue::LinkOnceAnyLinkage;
+    break;
+  case llvm::GlobalValue::AvailableExternallyLinkage:
+    VarLinkage = llvm::GlobalValue::LinkOnceODRLinkage;
+    break;
+  default:
+    break;
+  }
+
+  mapRegionCounters(D);
+  if (InstrumentRegions) {
+    emitRuntimeHook(CGM);
+    emitCounterVariables();
+  }
+  if (PGOData) {
+    loadRegionCounts(PGOData);
+    computeRegionCounts(D);
+    applyFunctionAttributes(PGOData, Fn);
+  }
+}
+
+void CodeGenPGO::mapRegionCounters(const Decl *D) {
+  RegionCounterMap.reset(new llvm::DenseMap<const Stmt *, unsigned>);
+  MapRegionCounters Walker(*RegionCounterMap);
+  if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D))
+    Walker.VisitFunctionDecl(FD);
+  else if (const ObjCMethodDecl *MD = dyn_cast_or_null<ObjCMethodDecl>(D))
+    Walker.VisitObjCMethodDecl(MD);
+  else if (const BlockDecl *BD = dyn_cast_or_null<BlockDecl>(D))
+    Walker.VisitBlockDecl(BD);
+  NumRegionCounters = Walker.NextCounter;
+  // FIXME: The number of counters isn't sufficient for the hash
+  FunctionHash = NumRegionCounters;
+}
+
+void CodeGenPGO::computeRegionCounts(const Decl *D) {
+  StmtCountMap.reset(new llvm::DenseMap<const Stmt *, uint64_t>);
+  ComputeRegionCounts Walker(*StmtCountMap, *this);
+  if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D))
+    Walker.VisitFunctionDecl(FD);
+  else if (const ObjCMethodDecl *MD = dyn_cast_or_null<ObjCMethodDecl>(D))
+    Walker.VisitObjCMethodDecl(MD);
+  else if (const BlockDecl *BD = dyn_cast_or_null<BlockDecl>(D))
+    Walker.VisitBlockDecl(BD);
+}
+
+void CodeGenPGO::applyFunctionAttributes(PGOProfileData *PGOData,
+                                         llvm::Function *Fn) {
+  if (!haveRegionCounts())
+    return;
+
+  uint64_t MaxFunctionCount = PGOData->getMaximumFunctionCount();
+  uint64_t FunctionCount = getRegionCount(0);
+  if (FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount))
+    // Turn on InlineHint attribute for hot functions.
+    // FIXME: 30% is from preliminary tuning on SPEC, it may not be optimal.
+    Fn->addFnAttr(llvm::Attribute::InlineHint);
+  else if (FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount))
+    // Turn on Cold attribute for cold functions.
+    // FIXME: 1% is from preliminary tuning on SPEC, it may not be optimal.
+    Fn->addFnAttr(llvm::Attribute::Cold);
+}
+
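+// A worked example with illustrative numbers: if the maximum function count
+// in the profile is 10000, a function entered 3000 or more times is marked
+// InlineHint (3000 >= 0.3 * 10000), one entered 100 or fewer times is marked
+// Cold (100 <= 0.01 * 10000), and anything in between is left alone.
+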
+void CodeGenPGO::emitCounterVariables() {
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+  llvm::ArrayType *CounterTy = llvm::ArrayType::get(llvm::Type::getInt64Ty(Ctx),
+                                                    NumRegionCounters);
+  RegionCounters =
+    new llvm::GlobalVariable(CGM.getModule(), CounterTy, false, VarLinkage,
+                             llvm::Constant::getNullValue(CounterTy),
+                             getFuncVarName("counters"));
+  RegionCounters->setAlignment(8);
+  RegionCounters->setSection(getCountersSection(CGM));
+}
+
+void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, unsigned Counter) {
+  if (!RegionCounters)
+    return;
+  llvm::Value *Addr =
+    Builder.CreateConstInBoundsGEP2_64(RegionCounters, 0, Counter);
+  llvm::Value *Count = Builder.CreateLoad(Addr, "pgocount");
+  Count = Builder.CreateAdd(Count, Builder.getInt64(1));
+  Builder.CreateStore(Count, Addr);
+}
+
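+// For counter 1 of a function "foo", the emitted increment looks roughly like
+// this (hypothetical IR in 3.5-era syntax, with N region counters):
+//   %pgocount = load i64* getelementptr inbounds
+//       ([N x i64]* @__llvm_profile_counters_foo, i64 0, i64 1)
+//   %inc = add i64 %pgocount, 1
+//   store i64 %inc, i64* getelementptr inbounds
+//       ([N x i64]* @__llvm_profile_counters_foo, i64 0, i64 1)
+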
+void CodeGenPGO::loadRegionCounts(PGOProfileData *PGOData) {
+  // For now, ignore the counts from the PGO data file if the function hash or
+  // the number of counters does not match (the hash is currently derived from
+  // the number of counters). This could be tightened in the future to detect
+  // other changes to the input, e.g., by hashing more of its characteristics.
+  RegionCounts.reset(new std::vector<uint64_t>);
+  uint64_t Hash;
+  if (PGOData->getFunctionCounts(getFuncName(), Hash, *RegionCounts) ||
+      Hash != FunctionHash || RegionCounts->size() != NumRegionCounters)
+    RegionCounts.reset();
+}
+
+void CodeGenPGO::destroyRegionCounters() {
+  RegionCounterMap.reset();
+  StmtCountMap.reset();
+  RegionCounts.reset();
+}
+
+/// \brief Calculate what to divide by to scale weights.
+///
+/// Given the maximum weight, calculate a divisor that will scale all the
+/// weights to strictly less than UINT32_MAX.
+static uint64_t calculateWeightScale(uint64_t MaxWeight) {
+  return MaxWeight < UINT32_MAX ? 1 : MaxWeight / UINT32_MAX + 1;
+}
+
+/// \brief Scale an individual branch weight (and add 1).
+///
+/// Scale a 64-bit weight down to 32-bits using \c Scale.
+///
+/// According to Laplace's Rule of Succession, it is better to compute the
+/// weight based on the count plus 1, so universally add 1 to the value.
+///
+/// \pre \c Scale was calculated by \a calculateWeightScale() with a weight no
+/// greater than \c Weight.
+static uint32_t scaleBranchWeight(uint64_t Weight, uint64_t Scale) {
+  assert(Scale && "scale by 0?");
+  uint64_t Scaled = Weight / Scale + 1;
+  assert(Scaled <= UINT32_MAX && "overflow 32-bits");
+  return Scaled;
+}
+
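+// A sketch with illustrative numbers: for MaxWeight = 2^33,
+// calculateWeightScale() returns 2^33 / (2^32 - 1) + 1 == 3, so a weight of
+// 2^33 scales to 2^33 / 3 + 1 (about 2.86e9, below UINT32_MAX), and a weight
+// of 0 maps to 1, per Laplace's rule.
+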
+llvm::MDNode *CodeGenPGO::createBranchWeights(uint64_t TrueCount,
+                                              uint64_t FalseCount) {
+  // Check for empty weights.
+  if (!TrueCount && !FalseCount)
+    return nullptr;
+
+  // Calculate how to scale down to 32-bits.
+  uint64_t Scale = calculateWeightScale(std::max(TrueCount, FalseCount));
+
+  llvm::MDBuilder MDHelper(CGM.getLLVMContext());
+  return MDHelper.createBranchWeights(scaleBranchWeight(TrueCount, Scale),
+                                      scaleBranchWeight(FalseCount, Scale));
+}
+
+llvm::MDNode *CodeGenPGO::createBranchWeights(ArrayRef<uint64_t> Weights) {
+  // We need at least two elements to create meaningful weights.
+  if (Weights.size() < 2)
+    return nullptr;
+
+  // Calculate how to scale down to 32-bits.
+  uint64_t Scale = calculateWeightScale(*std::max_element(Weights.begin(),
+                                                          Weights.end()));
+
+  SmallVector<uint32_t, 16> ScaledWeights;
+  ScaledWeights.reserve(Weights.size());
+  for (uint64_t W : Weights)
+    ScaledWeights.push_back(scaleBranchWeight(W, Scale));
+
+  llvm::MDBuilder MDHelper(CGM.getLLVMContext());
+  return MDHelper.createBranchWeights(ScaledWeights);
+}
+
+llvm::MDNode *CodeGenPGO::createLoopWeights(const Stmt *Cond,
+                                            RegionCounter &Cnt) {
+  if (!haveRegionCounts())
+    return nullptr;
+  uint64_t LoopCount = Cnt.getCount();
+  uint64_t CondCount = 0;
+  bool Found = getStmtCount(Cond, CondCount);
+  assert(Found && "missing expected loop condition count");
+  (void)Found;
+  if (CondCount == 0)
+    return nullptr;
+  return createBranchWeights(LoopCount,
+                             std::max(CondCount, LoopCount) - LoopCount);
+}
diff --git a/lib/CodeGen/CodeGenPGO.h b/lib/CodeGen/CodeGenPGO.h
new file mode 100644
index 0000000..c59a58e
--- /dev/null
+++ b/lib/CodeGen/CodeGenPGO.h
@@ -0,0 +1,256 @@
+//===--- CodeGenPGO.h - PGO Instrumentation for LLVM CodeGen ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Instrumentation-based profile-guided optimization
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_CODEGEN_CODEGENPGO_H
+#define CLANG_CODEGEN_CODEGENPGO_H
+
+#include "CGBuilder.h"
+#include "CodeGenModule.h"
+#include "CodeGenTypes.h"
+#include "clang/Frontend/CodeGenOptions.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <memory>
+
+namespace clang {
+namespace CodeGen {
+class RegionCounter;
+
+/// The raw counter data from an instrumented PGO binary
+class PGOProfileData {
+private:
+  /// The PGO data
+  std::unique_ptr<llvm::MemoryBuffer> DataBuffer;
+  /// Offsets into DataBuffer for each function's counters
+  llvm::StringMap<unsigned> DataOffsets;
+  /// Execution counts for each function.
+  llvm::StringMap<uint64_t> FunctionCounts;
+  /// The maximal execution count among all functions.
+  uint64_t MaxFunctionCount;
+  CodeGenModule &CGM;
+public:
+  PGOProfileData(CodeGenModule &CGM, std::string Path);
+  /// Fill Counts with the profile data for the given function name. Returns
+  /// false on success, or true if the data is missing or malformed.
+  bool getFunctionCounts(StringRef FuncName, uint64_t &FuncHash,
+                         std::vector<uint64_t> &Counts);
+  /// Return the maximum of all known function counts.
+  uint64_t getMaximumFunctionCount() { return MaxFunctionCount; }
+};
+
+/// Per-function PGO state. This class should generally not be used directly,
+/// but instead through the CodeGenFunction and RegionCounter types.
+class CodeGenPGO {
+private:
+  CodeGenModule &CGM;
+  std::unique_ptr<std::string> PrefixedFuncName;
+  StringRef RawFuncName;
+  llvm::GlobalValue::LinkageTypes VarLinkage;
+
+  unsigned NumRegionCounters;
+  uint64_t FunctionHash;
+  llvm::GlobalVariable *RegionCounters;
+  std::unique_ptr<llvm::DenseMap<const Stmt *, unsigned>> RegionCounterMap;
+  std::unique_ptr<llvm::DenseMap<const Stmt *, uint64_t>> StmtCountMap;
+  std::unique_ptr<std::vector<uint64_t>> RegionCounts;
+  uint64_t CurrentRegionCount;
+
+public:
+  CodeGenPGO(CodeGenModule &CGM)
+      : CGM(CGM), NumRegionCounters(0), FunctionHash(0), RegionCounters(0),
+        CurrentRegionCount(0) {}
+
+  /// Whether or not we have PGO region data for the current function. This is
+  /// false both when we have no data at all and when our data has been
+  /// discarded.
+  bool haveRegionCounts() const { return RegionCounts != 0; }
+
+  /// Get the string used to identify this function in the profile data.
+  /// For functions with local linkage, this includes the main file name.
+  StringRef getFuncName() const { return StringRef(*PrefixedFuncName); }
+  std::string getFuncVarName(StringRef VarName) const {
+    return ("__llvm_profile_" + VarName + "_" + RawFuncName).str();
+  }
+
+  /// Return the counter value of the current region.
+  uint64_t getCurrentRegionCount() const { return CurrentRegionCount; }
+
+  /// Set the counter value for the current region. This is used to keep track
+  /// of changes to the most recent counter from control flow and non-local
+  /// exits.
+  void setCurrentRegionCount(uint64_t Count) { CurrentRegionCount = Count; }
+
+  /// Indicate that the current region is never reached, and thus should have a
+  /// counter value of zero. This is important so that subsequent regions can
+  /// correctly track their parent counts.
+  void setCurrentRegionUnreachable() { setCurrentRegionCount(0); }
+
+  /// Check if an execution count is known for a given statement. If so, return
+  /// true and put the value in Count; else return false.
+  bool getStmtCount(const Stmt *S, uint64_t &Count) {
+    if (!StmtCountMap)
+      return false;
+    llvm::DenseMap<const Stmt*, uint64_t>::const_iterator
+      I = StmtCountMap->find(S);
+    if (I == StmtCountMap->end())
+      return false;
+    Count = I->second;
+    return true;
+  }
+
+  /// If the execution count for the current statement is known, record that
+  /// as the current count.
+  void setCurrentStmt(const Stmt *S) {
+    uint64_t Count;
+    if (getStmtCount(S, Count))
+      setCurrentRegionCount(Count);
+  }
+
+  /// Calculate branch weights appropriate for PGO data
+  llvm::MDNode *createBranchWeights(uint64_t TrueCount, uint64_t FalseCount);
+  llvm::MDNode *createBranchWeights(ArrayRef<uint64_t> Weights);
+  llvm::MDNode *createLoopWeights(const Stmt *Cond, RegionCounter &Cnt);
+
+  /// Assign counters to regions and configure them for PGO of a given
+  /// function. Does nothing if instrumentation is not enabled and either
+  /// generates global variables or associates PGO data with each of the
+  /// counters depending on whether we are generating or using instrumentation.
+  void assignRegionCounters(const Decl *D, llvm::Function *Fn);
+  /// Emit static data structures for instrumentation data.
+  void emitInstrumentationData();
+  /// Clean up region counter state. Must be called if assignRegionCounters is
+  /// used.
+  void destroyRegionCounters();
+  /// Emit static initialization code, if any.
+  static llvm::Function *emitInitialization(CodeGenModule &CGM);
+
+private:
+  void setFuncName(llvm::Function *Fn);
+  void mapRegionCounters(const Decl *D);
+  void computeRegionCounts(const Decl *D);
+  void applyFunctionAttributes(PGOProfileData *PGOData, llvm::Function *Fn);
+  void loadRegionCounts(PGOProfileData *PGOData);
+  void emitCounterVariables();
+  llvm::GlobalVariable *buildDataVar();
+
+  /// Emit code to increment the counter at the given index.
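+  /// (When instrumenting, this is expected to lower to loading the
+  /// RegionCounters element at that index, adding one, and storing it back.)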
+  void emitCounterIncrement(CGBuilderTy &Builder, unsigned Counter);
+
+  /// Return the region counter for the given statement. This should only be
+  /// called on statements that have a dedicated counter.
+  unsigned getRegionCounter(const Stmt *S) {
+    if (RegionCounterMap == 0)
+      return 0;
+    return (*RegionCounterMap)[S];
+  }
+
+  /// Return the region count for the counter at the given index.
+  uint64_t getRegionCount(unsigned Counter) {
+    if (!haveRegionCounts())
+      return 0;
+    return (*RegionCounts)[Counter];
+  }
+
+  friend class RegionCounter;
+};
+
+/// A counter for a particular region. This is the primary interface through
+/// which clients manage PGO counters and their values.
+class RegionCounter {
+  CodeGenPGO *PGO;
+  unsigned Counter;
+  uint64_t Count;
+  uint64_t ParentCount;
+  uint64_t RegionCount;
+  int64_t Adjust;
+
+  RegionCounter(CodeGenPGO &PGO, unsigned CounterIndex)
+    : PGO(&PGO), Counter(CounterIndex), Count(PGO.getRegionCount(Counter)),
+      ParentCount(PGO.getCurrentRegionCount()), Adjust(0) {}
+
+public:
+  RegionCounter(CodeGenPGO &PGO, const Stmt *S)
+    : PGO(&PGO), Counter(PGO.getRegionCounter(S)),
+      Count(PGO.getRegionCount(Counter)),
+      ParentCount(PGO.getCurrentRegionCount()), Adjust(0) {}
+
+  /// Get the value of the counter. In most cases this is the number of times
+  /// the region of the counter was entered, but for switch labels it's the
+  /// number of direct jumps to that label.
+  uint64_t getCount() const { return Count; }
+
+  /// Get the value of the counter with adjustments applied. Adjustments occur
+  /// when control enters or leaves the region abnormally; i.e., if there is a
+  /// jump to a label within the region, or if the function can return from
+  /// within the region. The adjusted count, then, is the value of the counter
+  /// at the end of the region.
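+  /// For example, a region entered 5 times containing a return that executes
+  /// twice ends with an adjusted count of 3.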
+  uint64_t getAdjustedCount() const {
+    return Count + Adjust;
+  }
+
+  /// Get the value of the counter in this region's parent, i.e., the region
+  /// that was active when this region began. This is useful for deriving
+  /// counts in implicitly counted regions, like the false case of a condition
+  /// or the normal exits of a loop.
+  uint64_t getParentCount() const { return ParentCount; }
+
+  /// Activate the counter by emitting an increment and starting to track
+  /// adjustments. If AddIncomingFallThrough is true, the current region count
+  /// will be added to the counter for the purposes of tracking the region.
+  void beginRegion(CGBuilderTy &Builder, bool AddIncomingFallThrough=false) {
+    beginRegion(AddIncomingFallThrough);
+    PGO->emitCounterIncrement(Builder, Counter);
+  }
+  void beginRegion(bool AddIncomingFallThrough=false) {
+    RegionCount = Count;
+    if (AddIncomingFallThrough)
+      RegionCount += PGO->getCurrentRegionCount();
+    PGO->setCurrentRegionCount(RegionCount);
+  }
+
+  /// For counters on boolean branches, begins tracking adjustments for the
+  /// uncounted path.
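+  /// For example, if the parent region was entered 10 times and the counted
+  /// branch was taken 7 times, the else path starts at 10 - 7 = 3.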
+  void beginElseRegion() {
+    RegionCount = ParentCount - Count;
+    PGO->setCurrentRegionCount(RegionCount);
+  }
+
+  /// Reset the current region count.
+  void setCurrentRegionCount(uint64_t CurrentCount) {
+    RegionCount = CurrentCount;
+    PGO->setCurrentRegionCount(RegionCount);
+  }
+
+  /// Adjust for non-local control flow after emitting a subexpression or
+  /// substatement. This must be called to account for constructs such as gotos,
+  /// labels, and returns, so that we can ensure that our region's count is
+  /// correct in the code that follows.
+  void adjustForControlFlow() {
+    Adjust += PGO->getCurrentRegionCount() - RegionCount;
+    // Reset the region count in case this is called again later.
+    RegionCount = PGO->getCurrentRegionCount();
+  }
+
+  /// Commit all adjustments to the current region. If the region is a loop,
+  /// the LoopAdjust value should be the count of all the breaks and continues
+  /// from the loop, to compensate for those counts being deducted from the
+  /// adjustments for the body of the loop.
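+  /// For example, if the breaks and continues in a loop body execute 4 times
+  /// in total, LoopAdjust should be 4 so that those exits still count toward
+  /// the code following the loop.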
+  void applyAdjustmentsToRegion(uint64_t LoopAdjust) {
+    PGO->setCurrentRegionCount(ParentCount + Adjust + LoopAdjust);
+  }
+};
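+
+// Illustrative only: a typical client pairs a RegionCounter with a counted
+// statement roughly as follows (the surrounding codegen objects are assumed
+// for the sketch and are not part of this interface):
+//
+//   RegionCounter Cnt(PGO, S);        // S is the counted Stmt
+//   Cnt.beginRegion(Builder);         // increment counter, start tracking
+//   /* ...emit the region's body... */
+//   Cnt.adjustForControlFlow();       // account for gotos/returns inside
+//   Cnt.applyAdjustmentsToRegion(0);  // restore the parent's running count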
+
+}  // end namespace CodeGen
+}  // end namespace clang
+
+#endif
diff --git a/lib/CodeGen/CodeGenTypes.cpp b/lib/CodeGen/CodeGenTypes.cpp
index 5f3c59c..b7f3746 100644
--- a/lib/CodeGen/CodeGenTypes.cpp
+++ b/lib/CodeGen/CodeGenTypes.cpp
@@ -38,10 +38,7 @@
 }
 
 CodeGenTypes::~CodeGenTypes() {
-  for (llvm::DenseMap<const Type *, CGRecordLayout *>::iterator
-         I = CGRecordLayouts.begin(), E = CGRecordLayouts.end();
-      I != E; ++I)
-    delete I->second;
+  llvm::DeleteContainerSeconds(CGRecordLayouts);
 
   for (llvm::FoldingSet<CGFunctionInfo>::iterator
        I = FunctionInfos.begin(), E = FunctionInfos.end(); I != E; )
@@ -134,17 +131,15 @@
   // when a class is translated, even though they aren't embedded by-value into
   // the class.
   if (const CXXRecordDecl *CRD = dyn_cast<CXXRecordDecl>(RD)) {
-    for (CXXRecordDecl::base_class_const_iterator I = CRD->bases_begin(),
-         E = CRD->bases_end(); I != E; ++I)
-      if (!isSafeToConvert(I->getType()->getAs<RecordType>()->getDecl(),
+    for (const auto &I : CRD->bases())
+      if (!isSafeToConvert(I.getType()->getAs<RecordType>()->getDecl(),
                            CGT, AlreadyChecked))
         return false;
   }
   
   // If this type would require laying out members that are currently being laid
   // out, don't do it.
-  for (RecordDecl::field_iterator I = RD->field_begin(),
-       E = RD->field_end(); I != E; ++I)
+  for (const auto *I : RD->fields())
     if (!isSafeToConvert(I->getType(), CGT, AlreadyChecked))
       return false;
   
@@ -186,13 +181,12 @@
   return isSafeToConvert(RD, CGT, AlreadyChecked);
 }
 
-
-/// isFuncTypeArgumentConvertible - Return true if the specified type in a 
-/// function argument or result position can be converted to an IR type at this
+/// isFuncParamTypeConvertible - Return true if the specified type in a
+/// function parameter or result position can be converted to an IR type at this
 /// point.  This boils down to being whether it is complete, as well as whether
 /// we've temporarily deferred expanding the type because we're in a recursive
 /// context.
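+/// For example, a parameter whose type is a struct that has only been
+/// forward-declared cannot be converted yet.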
-bool CodeGenTypes::isFuncTypeArgumentConvertible(QualType Ty) {
+bool CodeGenTypes::isFuncParamTypeConvertible(QualType Ty) {
   // If this isn't a tagged type, we can convert it!
   const TagType *TT = Ty->getAs<TagType>();
   if (TT == 0) return true;
@@ -217,17 +211,17 @@
 
 
 /// Code to verify a given function type is complete, i.e. the return type
-/// and all of the argument types are complete.  Also check to see if we are in
+/// and all of the parameter types are complete.  Also check to see if we are in
 /// a RS_StructPointer context, and if so whether any struct types have been
 /// pended.  If so, we don't want to ask the ABI lowering code to handle a type
 /// that cannot be converted to an IR type.
 bool CodeGenTypes::isFuncTypeConvertible(const FunctionType *FT) {
-  if (!isFuncTypeArgumentConvertible(FT->getResultType()))
+  if (!isFuncParamTypeConvertible(FT->getReturnType()))
     return false;
   
   if (const FunctionProtoType *FPT = dyn_cast<FunctionProtoType>(FT))
-    for (unsigned i = 0, e = FPT->getNumArgs(); i != e; i++)
-      if (!isFuncTypeArgumentConvertible(FPT->getArgType(i)))
+    for (unsigned i = 0, e = FPT->getNumParams(); i != e; i++)
+      if (!isFuncParamTypeConvertible(FPT->getParamType(i)))
         return false;
 
   return true;
@@ -479,11 +473,11 @@
 
       // Force conversion of all the relevant record types, to make sure
       // we re-convert the FunctionType when appropriate.
-      if (const RecordType *RT = FT->getResultType()->getAs<RecordType>())
+      if (const RecordType *RT = FT->getReturnType()->getAs<RecordType>())
         ConvertRecordDeclType(RT->getDecl());
       if (const FunctionProtoType *FPT = dyn_cast<FunctionProtoType>(FT))
-        for (unsigned i = 0, e = FPT->getNumArgs(); i != e; i++)
-          if (const RecordType *RT = FPT->getArgType(i)->getAs<RecordType>())
+        for (unsigned i = 0, e = FPT->getNumParams(); i != e; i++)
+          if (const RecordType *RT = FPT->getParamType(i)->getAs<RecordType>())
             ConvertRecordDeclType(RT->getDecl());
 
       // Return a placeholder type.
@@ -493,7 +487,7 @@
       break;
     }
 
-    // While we're converting the argument types for a function, we don't want
+    // While we're converting the parameter types for a function, we don't want
     // to recursively convert any pointed-to structs.  Converting directly-used
     // structs is ok though.
     if (!RecordsBeingLaidOut.insert(Ty)) {
@@ -655,11 +649,10 @@
   
   // Force conversion of non-virtual base classes recursively.
   if (const CXXRecordDecl *CRD = dyn_cast<CXXRecordDecl>(RD)) {
-    for (CXXRecordDecl::base_class_const_iterator i = CRD->bases_begin(),
-         e = CRD->bases_end(); i != e; ++i) {
-      if (i->isVirtual()) continue;
+    for (const auto &I : CRD->bases()) {
+      if (I.isVirtual()) continue;
       
-      ConvertRecordDeclType(i->getType()->getAs<RecordType>()->getDecl());
+      ConvertRecordDeclType(I.getType()->getAs<RecordType>()->getDecl());
     }
   }
 
diff --git a/lib/CodeGen/CodeGenTypes.h b/lib/CodeGen/CodeGenTypes.h
index 94ca9e2..59e3089 100644
--- a/lib/CodeGen/CodeGenTypes.h
+++ b/lib/CodeGen/CodeGenTypes.h
@@ -136,8 +136,8 @@
   /// be converted to an LLVM type (i.e. doesn't depend on an incomplete tag
   /// type).
   bool isFuncTypeConvertible(const FunctionType *FT);
-  bool isFuncTypeArgumentConvertible(QualType Ty);
-  
+  bool isFuncParamTypeConvertible(QualType Ty);
+
   /// GetFunctionTypeForVTable - Get the LLVM function type for use in a vtable,
   /// given a CXXMethodDecl. If the method to has an incomplete return type,
   /// and/or incomplete argument types, this will return the opaque type.
@@ -175,10 +175,10 @@
 
   const CGFunctionInfo &arrangeGlobalDeclaration(GlobalDecl GD);
   const CGFunctionInfo &arrangeFunctionDeclaration(const FunctionDecl *FD);
-  const CGFunctionInfo &arrangeFunctionDeclaration(QualType ResTy,
-                                                   const FunctionArgList &Args,
-                                             const FunctionType::ExtInfo &Info,
-                                                   bool isVariadic);
+  const CGFunctionInfo &
+  arrangeFreeFunctionDeclaration(QualType ResTy, const FunctionArgList &Args,
+                                 const FunctionType::ExtInfo &Info,
+                                 bool isVariadic);
 
   const CGFunctionInfo &arrangeObjCMethodDeclaration(const ObjCMethodDecl *MD);
   const CGFunctionInfo &arrangeObjCMessageSendSignature(const ObjCMethodDecl *MD,
@@ -188,6 +188,10 @@
   const CGFunctionInfo &arrangeCXXConstructorDeclaration(
                                                     const CXXConstructorDecl *D,
                                                     CXXCtorType Type);
+  const CGFunctionInfo &arrangeCXXConstructorCall(const CallArgList &Args,
+                                                  const CXXConstructorDecl *D,
+                                                  CXXCtorType CtorKind,
+                                                  unsigned ExtraArgs);
   const CGFunctionInfo &arrangeCXXDestructor(const CXXDestructorDecl *D,
                                              CXXDtorType Type);
 
@@ -216,6 +220,7 @@
   ///
   /// \param argTypes - must all actually be canonical as params
   const CGFunctionInfo &arrangeLLVMFunctionInfo(CanQualType returnType,
+                                                bool IsInstanceMethod,
                                                 ArrayRef<CanQualType> argTypes,
                                                 FunctionType::ExtInfo info,
                                                 RequiredArgs args);
diff --git a/lib/CodeGen/EHScopeStack.h b/lib/CodeGen/EHScopeStack.h
index e9d9a33..166d420 100644
--- a/lib/CodeGen/EHScopeStack.h
+++ b/lib/CodeGen/EHScopeStack.h
@@ -19,8 +19,8 @@
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Value.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
 
 namespace clang {
 namespace CodeGen {
@@ -65,9 +65,9 @@
 template <class T> struct DominatingValue : InvariantValue<T> {};
 
 template <class T, bool mightBeInstruction =
-            llvm::is_base_of<llvm::Value, T>::value &&
-            !llvm::is_base_of<llvm::Constant, T>::value &&
-            !llvm::is_base_of<llvm::BasicBlock, T>::value>
+            std::is_base_of<llvm::Value, T>::value &&
+            !std::is_base_of<llvm::Constant, T>::value &&
+            !std::is_base_of<llvm::BasicBlock, T>::value>
 struct DominatingPointer;
 template <class T> struct DominatingPointer<T,false> : InvariantValue<T*> {};
 // template <class T> struct DominatingPointer<T,true> at end of file
@@ -182,7 +182,7 @@
     typedef typename DominatingValue<A0>::saved_type A0_saved;
     A0_saved a0_saved;
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       A0 a0 = DominatingValue<A0>::restore(CGF, a0_saved);
       T(a0).Emit(CGF, flags);
     }
@@ -199,7 +199,7 @@
     A0_saved a0_saved;
     A1_saved a1_saved;
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       A0 a0 = DominatingValue<A0>::restore(CGF, a0_saved);
       A1 a1 = DominatingValue<A1>::restore(CGF, a1_saved);
       T(a0, a1).Emit(CGF, flags);
@@ -219,7 +219,7 @@
     A1_saved a1_saved;
     A2_saved a2_saved;
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       A0 a0 = DominatingValue<A0>::restore(CGF, a0_saved);
       A1 a1 = DominatingValue<A1>::restore(CGF, a1_saved);
       A2 a2 = DominatingValue<A2>::restore(CGF, a2_saved);
@@ -242,7 +242,7 @@
     A2_saved a2_saved;
     A3_saved a3_saved;
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       A0 a0 = DominatingValue<A0>::restore(CGF, a0_saved);
       A1 a1 = DominatingValue<A1>::restore(CGF, a1_saved);
       A2 a2 = DominatingValue<A2>::restore(CGF, a2_saved);
diff --git a/lib/CodeGen/ItaniumCXXABI.cpp b/lib/CodeGen/ItaniumCXXABI.cpp
index 0e8f31a..27825ab 100644
--- a/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/lib/CodeGen/ItaniumCXXABI.cpp
@@ -52,13 +52,13 @@
     CGCXXABI(CGM), UseARMMethodPtrABI(UseARMMethodPtrABI),
     UseARMGuardVarABI(UseARMGuardVarABI) { }
 
-  bool isReturnTypeIndirect(const CXXRecordDecl *RD) const {
+  bool isReturnTypeIndirect(const CXXRecordDecl *RD) const override {
     // Structures with either a non-trivial destructor or a non-trivial
     // copy constructor are always indirect.
     return !RD->hasTrivialDestructor() || RD->hasNonTrivialCopyConstructor();
   }
 
-  RecordArgABI getRecordArgABI(const CXXRecordDecl *RD) const {
+  RecordArgABI getRecordArgABI(const CXXRecordDecl *RD) const override {
     // Structures with either a non-trivial destructor or a non-trivial
     // copy constructor are always indirect.
     if (!RD->hasTrivialDestructor() || RD->hasNonTrivialCopyConstructor())
@@ -66,114 +66,117 @@
     return RAA_Default;
   }
 
-  bool isZeroInitializable(const MemberPointerType *MPT);
+  bool isZeroInitializable(const MemberPointerType *MPT) override;
 
-  llvm::Type *ConvertMemberPointerType(const MemberPointerType *MPT);
+  llvm::Type *ConvertMemberPointerType(const MemberPointerType *MPT) override;
 
-  llvm::Value *EmitLoadOfMemberFunctionPointer(CodeGenFunction &CGF,
-                                               llvm::Value *&This,
-                                               llvm::Value *MemFnPtr,
-                                               const MemberPointerType *MPT);
+  llvm::Value *
+    EmitLoadOfMemberFunctionPointer(CodeGenFunction &CGF,
+                                    const Expr *E,
+                                    llvm::Value *&This,
+                                    llvm::Value *MemFnPtr,
+                                    const MemberPointerType *MPT) override;
 
-  llvm::Value *EmitMemberDataPointerAddress(CodeGenFunction &CGF,
-                                            llvm::Value *Base,
-                                            llvm::Value *MemPtr,
-                                            const MemberPointerType *MPT);
+  llvm::Value *
+    EmitMemberDataPointerAddress(CodeGenFunction &CGF, const Expr *E,
+                                 llvm::Value *Base,
+                                 llvm::Value *MemPtr,
+                                 const MemberPointerType *MPT) override;
 
   llvm::Value *EmitMemberPointerConversion(CodeGenFunction &CGF,
                                            const CastExpr *E,
-                                           llvm::Value *Src);
+                                           llvm::Value *Src) override;
   llvm::Constant *EmitMemberPointerConversion(const CastExpr *E,
-                                              llvm::Constant *Src);
+                                              llvm::Constant *Src) override;
 
-  llvm::Constant *EmitNullMemberPointer(const MemberPointerType *MPT);
+  llvm::Constant *EmitNullMemberPointer(const MemberPointerType *MPT) override;
 
-  llvm::Constant *EmitMemberPointer(const CXXMethodDecl *MD);
+  llvm::Constant *EmitMemberPointer(const CXXMethodDecl *MD) override;
   llvm::Constant *EmitMemberDataPointer(const MemberPointerType *MPT,
-                                        CharUnits offset);
-  llvm::Constant *EmitMemberPointer(const APValue &MP, QualType MPT);
+                                        CharUnits offset) override;
+  llvm::Constant *EmitMemberPointer(const APValue &MP, QualType MPT) override;
   llvm::Constant *BuildMemberPointer(const CXXMethodDecl *MD,
                                      CharUnits ThisAdjustment);
 
   llvm::Value *EmitMemberPointerComparison(CodeGenFunction &CGF,
-                                           llvm::Value *L,
-                                           llvm::Value *R,
+                                           llvm::Value *L, llvm::Value *R,
                                            const MemberPointerType *MPT,
-                                           bool Inequality);
+                                           bool Inequality) override;
 
   llvm::Value *EmitMemberPointerIsNotNull(CodeGenFunction &CGF,
-                                          llvm::Value *Addr,
-                                          const MemberPointerType *MPT);
+                                         llvm::Value *Addr,
+                                         const MemberPointerType *MPT) override;
 
-  llvm::Value *adjustToCompleteObject(CodeGenFunction &CGF,
-                                      llvm::Value *ptr,
-                                      QualType type);
+  llvm::Value *adjustToCompleteObject(CodeGenFunction &CGF, llvm::Value *ptr,
+                                      QualType type) override;
 
-  llvm::Value *GetVirtualBaseClassOffset(CodeGenFunction &CGF,
-                                         llvm::Value *This,
-                                         const CXXRecordDecl *ClassDecl,
-                                         const CXXRecordDecl *BaseClassDecl);
+  llvm::Value *
+    GetVirtualBaseClassOffset(CodeGenFunction &CGF, llvm::Value *This,
+                              const CXXRecordDecl *ClassDecl,
+                              const CXXRecordDecl *BaseClassDecl) override;
 
   void BuildConstructorSignature(const CXXConstructorDecl *Ctor,
-                                 CXXCtorType T,
-                                 CanQualType &ResTy,
-                                 SmallVectorImpl<CanQualType> &ArgTys);
+                                 CXXCtorType T, CanQualType &ResTy,
+                                 SmallVectorImpl<CanQualType> &ArgTys) override;
 
-  void EmitCXXConstructors(const CXXConstructorDecl *D);
+  void EmitCXXConstructors(const CXXConstructorDecl *D) override;
 
   void BuildDestructorSignature(const CXXDestructorDecl *Dtor,
-                                CXXDtorType T,
-                                CanQualType &ResTy,
-                                SmallVectorImpl<CanQualType> &ArgTys);
+                                CXXDtorType T, CanQualType &ResTy,
+                                SmallVectorImpl<CanQualType> &ArgTys) override;
 
   bool useThunkForDtorVariant(const CXXDestructorDecl *Dtor,
-                              CXXDtorType DT) const {
+                              CXXDtorType DT) const override {
     // Itanium does not emit any destructor variant as an inline thunk.
     // Delegating may occur as an optimization, but all variants are either
     // emitted with external linkage or as linkonce if they are inline and used.
     return false;
   }
 
-  void EmitCXXDestructors(const CXXDestructorDecl *D);
+  void EmitCXXDestructors(const CXXDestructorDecl *D) override;
 
-  void BuildInstanceFunctionParams(CodeGenFunction &CGF,
-                                   QualType &ResTy,
-                                   FunctionArgList &Params);
+  void addImplicitStructorParams(CodeGenFunction &CGF, QualType &ResTy,
+                                 FunctionArgList &Params) override;
 
-  void EmitInstanceFunctionProlog(CodeGenFunction &CGF);
+  void EmitInstanceFunctionProlog(CodeGenFunction &CGF) override;
 
-  void EmitConstructorCall(CodeGenFunction &CGF,
-                           const CXXConstructorDecl *D, CXXCtorType Type,
-                           bool ForVirtualBase, bool Delegating,
-                           llvm::Value *This,
-                           CallExpr::const_arg_iterator ArgBeg,
-                           CallExpr::const_arg_iterator ArgEnd);
+  unsigned addImplicitConstructorArgs(CodeGenFunction &CGF,
+                                      const CXXConstructorDecl *D,
+                                      CXXCtorType Type, bool ForVirtualBase,
+                                      bool Delegating,
+                                      CallArgList &Args) override;
 
-  void emitVTableDefinitions(CodeGenVTables &CGVT, const CXXRecordDecl *RD);
+  void EmitDestructorCall(CodeGenFunction &CGF, const CXXDestructorDecl *DD,
+                          CXXDtorType Type, bool ForVirtualBase,
+                          bool Delegating, llvm::Value *This) override;
+
+  void emitVTableDefinitions(CodeGenVTables &CGVT,
+                             const CXXRecordDecl *RD) override;
 
   llvm::Value *getVTableAddressPointInStructor(
       CodeGenFunction &CGF, const CXXRecordDecl *VTableClass,
       BaseSubobject Base, const CXXRecordDecl *NearestVBase,
-      bool &NeedsVirtualOffset);
+      bool &NeedsVirtualOffset) override;
 
   llvm::Constant *
   getVTableAddressPointForConstExpr(BaseSubobject Base,
-                                    const CXXRecordDecl *VTableClass);
+                                    const CXXRecordDecl *VTableClass) override;
 
   llvm::GlobalVariable *getAddrOfVTable(const CXXRecordDecl *RD,
-                                        CharUnits VPtrOffset);
+                                        CharUnits VPtrOffset) override;
 
   llvm::Value *getVirtualFunctionPointer(CodeGenFunction &CGF, GlobalDecl GD,
-                                         llvm::Value *This, llvm::Type *Ty);
+                                         llvm::Value *This,
+                                         llvm::Type *Ty) override;
 
   void EmitVirtualDestructorCall(CodeGenFunction &CGF,
                                  const CXXDestructorDecl *Dtor,
                                  CXXDtorType DtorType, SourceLocation CallLoc,
-                                 llvm::Value *This);
+                                 llvm::Value *This) override;
 
-  void emitVirtualInheritanceTables(const CXXRecordDecl *RD);
+  void emitVirtualInheritanceTables(const CXXRecordDecl *RD) override;
 
-  void setThunkLinkage(llvm::Function *Thunk, bool ForVTable) {
+  void setThunkLinkage(llvm::Function *Thunk, bool ForVTable) override {
     // Allow inlining of thunks by emitting them with available_externally
     // linkage together with vtables when needed.
     if (ForVTable)
@@ -181,38 +184,40 @@
   }
 
   llvm::Value *performThisAdjustment(CodeGenFunction &CGF, llvm::Value *This,
-                                     const ThisAdjustment &TA);
+                                     const ThisAdjustment &TA) override;
 
   llvm::Value *performReturnAdjustment(CodeGenFunction &CGF, llvm::Value *Ret,
-                                       const ReturnAdjustment &RA);
+                                       const ReturnAdjustment &RA) override;
 
-  StringRef GetPureVirtualCallName() { return "__cxa_pure_virtual"; }
-  StringRef GetDeletedVirtualCallName() { return "__cxa_deleted_virtual"; }
+  StringRef GetPureVirtualCallName() override { return "__cxa_pure_virtual"; }
+  StringRef GetDeletedVirtualCallName() override
+    { return "__cxa_deleted_virtual"; }
 
-  CharUnits getArrayCookieSizeImpl(QualType elementType);
+  CharUnits getArrayCookieSizeImpl(QualType elementType) override;
   llvm::Value *InitializeArrayCookie(CodeGenFunction &CGF,
                                      llvm::Value *NewPtr,
                                      llvm::Value *NumElements,
                                      const CXXNewExpr *expr,
-                                     QualType ElementType);
+                                     QualType ElementType) override;
   llvm::Value *readArrayCookieImpl(CodeGenFunction &CGF,
                                    llvm::Value *allocPtr,
-                                   CharUnits cookieSize);
+                                   CharUnits cookieSize) override;
 
   void EmitGuardedInit(CodeGenFunction &CGF, const VarDecl &D,
-                       llvm::GlobalVariable *DeclPtr, bool PerformInit);
+                       llvm::GlobalVariable *DeclPtr,
+                       bool PerformInit) override;
   void registerGlobalDtor(CodeGenFunction &CGF, const VarDecl &D,
-                          llvm::Constant *dtor, llvm::Constant *addr);
+                          llvm::Constant *dtor, llvm::Constant *addr) override;
 
   llvm::Function *getOrCreateThreadLocalWrapper(const VarDecl *VD,
                                                 llvm::GlobalVariable *Var);
   void EmitThreadLocalInitFuncs(
       llvm::ArrayRef<std::pair<const VarDecl *, llvm::GlobalVariable *> > Decls,
-      llvm::Function *InitFunc);
-  LValue EmitThreadLocalDeclRefExpr(CodeGenFunction &CGF,
-                                    const DeclRefExpr *DRE);
+      llvm::Function *InitFunc) override;
+  LValue EmitThreadLocalVarDeclLValue(CodeGenFunction &CGF, const VarDecl *VD,
+                                      QualType LValType) override;
 
-  bool NeedsVTTParameter(GlobalDecl GD);
+  bool NeedsVTTParameter(GlobalDecl GD) override;
 };
 
 class ARMCXXABI : public ItaniumCXXABI {
@@ -221,22 +226,31 @@
     ItaniumCXXABI(CGM, /* UseARMMethodPtrABI = */ true,
                   /* UseARMGuardVarABI = */ true) {}
 
-  bool HasThisReturn(GlobalDecl GD) const {
+  bool HasThisReturn(GlobalDecl GD) const override {
     return (isa<CXXConstructorDecl>(GD.getDecl()) || (
               isa<CXXDestructorDecl>(GD.getDecl()) &&
               GD.getDtorType() != Dtor_Deleting));
   }
 
-  void EmitReturnFromThunk(CodeGenFunction &CGF, RValue RV, QualType ResTy);
+  void EmitReturnFromThunk(CodeGenFunction &CGF, RValue RV,
+                           QualType ResTy) override;
 
-  CharUnits getArrayCookieSizeImpl(QualType elementType);
+  CharUnits getArrayCookieSizeImpl(QualType elementType) override;
   llvm::Value *InitializeArrayCookie(CodeGenFunction &CGF,
                                      llvm::Value *NewPtr,
                                      llvm::Value *NumElements,
                                      const CXXNewExpr *expr,
-                                     QualType ElementType);
+                                     QualType ElementType) override;
   llvm::Value *readArrayCookieImpl(CodeGenFunction &CGF, llvm::Value *allocPtr,
-                                   CharUnits cookieSize);
+                                   CharUnits cookieSize) override;
+};
+
+class iOS64CXXABI : public ARMCXXABI {
+public:
+  iOS64CXXABI(CodeGen::CodeGenModule &CGM) : ARMCXXABI(CGM) {}
+
+  // ARM64 libraries are prepared for non-unique RTTI.
+  bool shouldRTTIBeUnique() override { return false; }
 };
 }
 
@@ -248,6 +262,9 @@
   case TargetCXXABI::iOS:
     return new ARMCXXABI(CGM);
 
+  case TargetCXXABI::iOS64:
+    return new iOS64CXXABI(CGM);
+
   // Note that AArch64 uses the generic ItaniumCXXABI class since it doesn't
   // include the other 32-bit ARM oddities: constructor/destructor return values
   // and array cookies.
@@ -299,11 +316,9 @@
 ///
 /// If the member is non-virtual, memptr.ptr is the address of
 /// the function to call.
-llvm::Value *
-ItaniumCXXABI::EmitLoadOfMemberFunctionPointer(CodeGenFunction &CGF,
-                                               llvm::Value *&This,
-                                               llvm::Value *MemFnPtr,
-                                               const MemberPointerType *MPT) {
+llvm::Value *ItaniumCXXABI::EmitLoadOfMemberFunctionPointer(
+    CodeGenFunction &CGF, const Expr *E, llvm::Value *&This,
+    llvm::Value *MemFnPtr, const MemberPointerType *MPT) {
   CGBuilderTy &Builder = CGF.Builder;
 
   const FunctionProtoType *FPT = 
@@ -355,8 +370,7 @@
 
   // Cast the adjusted this to a pointer to vtable pointer and load.
   llvm::Type *VTableTy = Builder.getInt8PtrTy();
-  llvm::Value *VTable = Builder.CreateBitCast(This, VTableTy->getPointerTo());
-  VTable = Builder.CreateLoad(VTable, "memptr.vtable");
+  llvm::Value *VTable = CGF.GetVTablePtr(This, VTableTy);
 
   // Apply the offset.
   llvm::Value *VTableOffset = FnAsInt;
@@ -385,10 +399,9 @@
 
 /// Compute an l-value by applying the given pointer-to-member to a
 /// base object.
-llvm::Value *ItaniumCXXABI::EmitMemberDataPointerAddress(CodeGenFunction &CGF,
-                                                         llvm::Value *Base,
-                                                         llvm::Value *MemPtr,
-                                           const MemberPointerType *MPT) {
+llvm::Value *ItaniumCXXABI::EmitMemberDataPointerAddress(
+    CodeGenFunction &CGF, const Expr *E, llvm::Value *Base, llvm::Value *MemPtr,
+    const MemberPointerType *MPT) {
   assert(MemPtr->getType() == CGM.PtrDiffTy);
 
   CGBuilderTy &Builder = CGF.Builder;
@@ -797,34 +810,35 @@
 
 /// The generic ABI passes 'this', plus a VTT if it's initializing a
 /// base subobject.
-void ItaniumCXXABI::BuildConstructorSignature(const CXXConstructorDecl *Ctor,
-                                              CXXCtorType Type,
-                                              CanQualType &ResTy,
-                                SmallVectorImpl<CanQualType> &ArgTys) {
+void
+ItaniumCXXABI::BuildConstructorSignature(const CXXConstructorDecl *Ctor,
+                                         CXXCtorType Type, CanQualType &ResTy,
+                                         SmallVectorImpl<CanQualType> &ArgTys) {
   ASTContext &Context = getContext();
 
-  // 'this' parameter is already there, as well as 'this' return if
-  // HasThisReturn(GlobalDecl(Ctor, Type)) is true
+  // All parameters are already in place except VTT, which goes after 'this'.
+  // These are Clang types, so we don't need to worry about sret yet.
 
   // Check if we need to add a VTT parameter (which has type void **).
   if (Type == Ctor_Base && Ctor->getParent()->getNumVBases() != 0)
-    ArgTys.push_back(Context.getPointerType(Context.VoidPtrTy));
+    ArgTys.insert(ArgTys.begin() + 1,
+                  Context.getPointerType(Context.VoidPtrTy));
 }
 
 void ItaniumCXXABI::EmitCXXConstructors(const CXXConstructorDecl *D) {
   // Just make sure we're in sync with TargetCXXABI.
   assert(CGM.getTarget().getCXXABI().hasConstructorVariants());
 
+  // The constructor used for constructing this as a base class;
+  // ignores virtual bases.
+  CGM.EmitGlobal(GlobalDecl(D, Ctor_Base));
+
   // The constructor used for constructing this as a complete class;
   // constructs the virtual bases, then calls the base constructor.
   if (!D->getParent()->isAbstract()) {
     // We don't need to emit the complete ctor if the class is abstract.
     CGM.EmitGlobal(GlobalDecl(D, Ctor_Complete));
   }
-
-  // The constructor used for constructing this as a base class;
-  // ignores virtual bases.
-  CGM.EmitGlobal(GlobalDecl(D, Ctor_Base));
 }
 
 /// The generic ABI passes 'this', plus a VTT if it's destroying a
@@ -844,29 +858,26 @@
 }
 
 void ItaniumCXXABI::EmitCXXDestructors(const CXXDestructorDecl *D) {
-  // The destructor in a virtual table is always a 'deleting'
-  // destructor, which calls the complete destructor and then uses the
-  // appropriate operator delete.
-  if (D->isVirtual())
-    CGM.EmitGlobal(GlobalDecl(D, Dtor_Deleting));
+  // The destructor used for destructing this as a base class; ignores
+  // virtual bases.
+  CGM.EmitGlobal(GlobalDecl(D, Dtor_Base));
 
   // The destructor used for destructing this as a most-derived class;
   // calls the base destructor and then destructs any virtual bases.
   CGM.EmitGlobal(GlobalDecl(D, Dtor_Complete));
 
-  // The destructor used for destructing this as a base class; ignores
-  // virtual bases.
-  CGM.EmitGlobal(GlobalDecl(D, Dtor_Base));
+  // The destructor in a virtual table is always a 'deleting'
+  // destructor, which calls the complete destructor and then uses the
+  // appropriate operator delete.
+  if (D->isVirtual())
+    CGM.EmitGlobal(GlobalDecl(D, Dtor_Deleting));
 }
 
-void ItaniumCXXABI::BuildInstanceFunctionParams(CodeGenFunction &CGF,
-                                                QualType &ResTy,
-                                                FunctionArgList &Params) {
-  /// Create the 'this' variable.
-  BuildThisParam(CGF, Params);
-
+void ItaniumCXXABI::addImplicitStructorParams(CodeGenFunction &CGF,
+                                              QualType &ResTy,
+                                              FunctionArgList &Params) {
   const CXXMethodDecl *MD = cast<CXXMethodDecl>(CGF.CurGD.getDecl());
-  assert(MD->isInstance());
+  assert(isa<CXXConstructorDecl>(MD) || isa<CXXDestructorDecl>(MD));
 
   // Check if we need a VTT parameter as well.
   if (NeedsVTTParameter(CGF.CurGD)) {
@@ -877,8 +888,8 @@
     ImplicitParamDecl *VTTDecl
       = ImplicitParamDecl::Create(Context, 0, MD->getLocation(),
                                   &Context.Idents.get("vtt"), T);
-    Params.push_back(VTTDecl);
-    getVTTDecl(CGF) = VTTDecl;
+    Params.insert(Params.begin() + 1, VTTDecl);
+    getStructorImplicitParamDecl(CGF) = VTTDecl;
   }
 }
 
@@ -887,10 +898,9 @@
   EmitThisParam(CGF);
 
   /// Initialize the 'vtt' slot if needed.
-  if (getVTTDecl(CGF)) {
-    getVTTValue(CGF)
-      = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(getVTTDecl(CGF)),
-                               "vtt");
+  if (getStructorImplicitParamDecl(CGF)) {
+    getStructorImplicitParamValue(CGF) = CGF.Builder.CreateLoad(
+        CGF.GetAddrOfLocalVar(getStructorImplicitParamDecl(CGF)), "vtt");
   }
 
   /// If this is a function that the ABI specifies returns 'this', initialize
@@ -905,21 +915,39 @@
     CGF.Builder.CreateStore(getThisValue(CGF), CGF.ReturnValue);
 }
 
-void ItaniumCXXABI::EmitConstructorCall(CodeGenFunction &CGF,
-                                        const CXXConstructorDecl *D,
-                                        CXXCtorType Type,
-                                        bool ForVirtualBase, bool Delegating,
-                                        llvm::Value *This,
-                                        CallExpr::const_arg_iterator ArgBeg,
-                                        CallExpr::const_arg_iterator ArgEnd) {
-  llvm::Value *VTT = CGF.GetVTTParameter(GlobalDecl(D, Type), ForVirtualBase,
-                                         Delegating);
+unsigned ItaniumCXXABI::addImplicitConstructorArgs(
+    CodeGenFunction &CGF, const CXXConstructorDecl *D, CXXCtorType Type,
+    bool ForVirtualBase, bool Delegating, CallArgList &Args) {
+  if (!NeedsVTTParameter(GlobalDecl(D, Type)))
+    return 0;
+
+  // Insert the implicit 'vtt' argument as the second argument.
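+  // The resulting constructor call then takes the form
+  // C(this, vtt, <explicit args>...).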
+  llvm::Value *VTT =
+      CGF.GetVTTParameter(GlobalDecl(D, Type), ForVirtualBase, Delegating);
   QualType VTTTy = getContext().getPointerType(getContext().VoidPtrTy);
-  llvm::Value *Callee = CGM.GetAddrOfCXXConstructor(D, Type);
+  Args.insert(Args.begin() + 1,
+              CallArg(RValue::get(VTT), VTTTy, /*needscopy=*/false));
+  return 1;  // Added one arg.
+}
+
+void ItaniumCXXABI::EmitDestructorCall(CodeGenFunction &CGF,
+                                       const CXXDestructorDecl *DD,
+                                       CXXDtorType Type, bool ForVirtualBase,
+                                       bool Delegating, llvm::Value *This) {
+  GlobalDecl GD(DD, Type);
+  llvm::Value *VTT = CGF.GetVTTParameter(GD, ForVirtualBase, Delegating);
+  QualType VTTTy = getContext().getPointerType(getContext().VoidPtrTy);
+
+  llvm::Value *Callee = 0;
+  if (getContext().getLangOpts().AppleKext)
+    Callee = CGF.BuildAppleKextVirtualDestructorCall(DD, Type, DD->getParent());
+
+  if (!Callee)
+    Callee = CGM.GetAddrOfCXXDestructor(DD, Type);
 
   // FIXME: Provide a source location here.
-  CGF.EmitCXXMemberCall(D, SourceLocation(), Callee, ReturnValueSlot(),
-                        This, VTT, VTTTy, ArgBeg, ArgEnd);
+  CGF.EmitCXXMemberCall(DD, SourceLocation(), Callee, ReturnValueSlot(), This,
+                        VTT, VTTTy, 0, 0);
 }
 
 void ItaniumCXXABI::emitVTableDefinitions(CodeGenVTables &CGVT,
@@ -942,7 +970,7 @@
   VTable->setLinkage(Linkage);
 
   // Set the right visibility.
-  CGM.setTypeVisibility(VTable, RD, CodeGenModule::TVK_ForVTable);
+  CGM.setGlobalVisibility(VTable, RD);
 
   // If this is the magic class __cxxabiv1::__fundamental_type_info,
   // we will emit the typeinfo for the fundamental types. This is the
@@ -1305,7 +1333,7 @@
     llvm::GlobalVariable *Guard;
     CallGuardAbort(llvm::GlobalVariable *Guard) : Guard(Guard) {}
 
-    void Emit(CodeGenFunction &CGF, Flags flags) {
+    void Emit(CodeGenFunction &CGF, Flags flags) override {
       CGF.EmitNounwindRuntimeCall(getGuardAbortFn(CGF.CGM, Guard->getType()),
                                   Guard);
     }
@@ -1398,6 +1426,13 @@
   //         __cxa_guard_release (&obj_guard);
   //       }
   //     }
+
+    // ARM64 C++ ABI 3.2.2:
+    // This ABI instead only specifies the value bit 0 of the static guard
+    // variable; all other bits are platform defined. Bit 0 shall be 0 when the
+    // variable is not initialized and 1 when it is.
+    // FIXME: Reading one bit is no more efficient than reading one byte, so
+    // the codegen is the same as for the generic Itanium ABI.
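+    // Illustratively, the fast path in both cases loads the first byte of
+    // the guard and compares it with zero, roughly:
+    //   %g = load i8* @guard
+    //   %uninit = icmp eq i8 %g, 0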
   } else {
     // Load the first byte of the guard variable.
     llvm::LoadInst *LI = 
@@ -1523,8 +1558,6 @@
 /// the wrapper emits a copy, and we want the linker to merge them.
 static llvm::GlobalValue::LinkageTypes getThreadLocalWrapperLinkage(
     llvm::GlobalValue::LinkageTypes VarLinkage) {
-  if (llvm::GlobalValue::isLinkerPrivateLinkage(VarLinkage))
-    return llvm::GlobalValue::LinkerPrivateWeakLinkage;
   // For internal linkage variables, we don't need an external or weak wrapper.
   if (llvm::GlobalValue::isLocalLinkage(VarLinkage))
     return VarLinkage;
@@ -1632,9 +1665,9 @@
   }
 }
 
-LValue ItaniumCXXABI::EmitThreadLocalDeclRefExpr(CodeGenFunction &CGF,
-                                                 const DeclRefExpr *DRE) {
-  const VarDecl *VD = cast<VarDecl>(DRE->getDecl());
+LValue ItaniumCXXABI::EmitThreadLocalVarDeclLValue(CodeGenFunction &CGF,
+                                                   const VarDecl *VD,
+                                                   QualType LValType) {
   QualType T = VD->getType();
   llvm::Type *Ty = CGF.getTypes().ConvertTypeForMem(T);
   llvm::Value *Val = CGF.CGM.GetAddrOfGlobalVar(VD, Ty);
@@ -1645,10 +1678,9 @@
 
   LValue LV;
   if (VD->getType()->isReferenceType())
-    LV = CGF.MakeNaturalAlignAddrLValue(Val, T);
+    LV = CGF.MakeNaturalAlignAddrLValue(Val, LValType);
   else
-    LV = CGF.MakeAddrLValue(Val, DRE->getType(),
-                            CGF.getContext().getDeclAlign(VD));
+    LV = CGF.MakeAddrLValue(Val, LValType, CGF.getContext().getDeclAlign(VD));
   // FIXME: need setObjCGCLValueClass?
   return LV;
 }
diff --git a/lib/CodeGen/MicrosoftCXXABI.cpp b/lib/CodeGen/MicrosoftCXXABI.cpp
index 7318fe7..9832969 100644
--- a/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -15,9 +15,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGCXXABI.h"
-#include "CodeGenModule.h"
 #include "CGVTables.h"
-#include "MicrosoftVBTables.h"
+#include "CodeGenModule.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/VTableBuilder.h"
@@ -28,51 +27,63 @@
 
 namespace {
 
+/// Holds all the vbtable globals for a given class.
+struct VBTableGlobals {
+  const VPtrInfoVector *VBTables;
+  SmallVector<llvm::GlobalVariable *, 2> Globals;
+};
+
 class MicrosoftCXXABI : public CGCXXABI {
 public:
   MicrosoftCXXABI(CodeGenModule &CGM) : CGCXXABI(CGM) {}
 
-  bool HasThisReturn(GlobalDecl GD) const;
+  bool HasThisReturn(GlobalDecl GD) const override;
 
-  bool isReturnTypeIndirect(const CXXRecordDecl *RD) const {
+  bool isReturnTypeIndirect(const CXXRecordDecl *RD) const override {
     // Structures that are not C++03 PODs are always indirect.
     return !RD->isPOD();
   }
 
-  RecordArgABI getRecordArgABI(const CXXRecordDecl *RD) const {
-    if (RD->hasNonTrivialCopyConstructor() || RD->hasNonTrivialDestructor())
-      return RAA_DirectInMemory;
+  RecordArgABI getRecordArgABI(const CXXRecordDecl *RD) const override {
+    if (RD->hasNonTrivialCopyConstructor() || RD->hasNonTrivialDestructor()) {
+      llvm::Triple::ArchType Arch = CGM.getTarget().getTriple().getArch();
+      if (Arch == llvm::Triple::x86)
+        return RAA_DirectInMemory;
+      // On x64, pass non-trivial records indirectly.
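+      // (i.e., the caller passes the address of a temporary instead of the
+      // record's bytes).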
+      // FIXME: Test other Windows architectures.
+      return RAA_Indirect;
+    }
     return RAA_Default;
   }
 
-  StringRef GetPureVirtualCallName() { return "_purecall"; }
+  StringRef GetPureVirtualCallName() override { return "_purecall"; }
   // No known support for deleted functions in MSVC yet, so this choice is
   // arbitrary.
-  StringRef GetDeletedVirtualCallName() { return "_purecall"; }
+  StringRef GetDeletedVirtualCallName() override { return "_purecall"; }
 
-  bool isInlineInitializedStaticDataMemberLinkOnce() { return true; }
+  bool isInlineInitializedStaticDataMemberLinkOnce() override { return true; }
 
   llvm::Value *adjustToCompleteObject(CodeGenFunction &CGF,
                                       llvm::Value *ptr,
-                                      QualType type);
+                                      QualType type) override;
 
-  llvm::Value *GetVirtualBaseClassOffset(CodeGenFunction &CGF,
-                                         llvm::Value *This,
-                                         const CXXRecordDecl *ClassDecl,
-                                         const CXXRecordDecl *BaseClassDecl);
+  llvm::Value *
+  GetVirtualBaseClassOffset(CodeGenFunction &CGF, llvm::Value *This,
+                            const CXXRecordDecl *ClassDecl,
+                            const CXXRecordDecl *BaseClassDecl) override;
 
   void BuildConstructorSignature(const CXXConstructorDecl *Ctor,
-                                 CXXCtorType Type,
-                                 CanQualType &ResTy,
-                                 SmallVectorImpl<CanQualType> &ArgTys);
+                                 CXXCtorType Type, CanQualType &ResTy,
+                                 SmallVectorImpl<CanQualType> &ArgTys) override;
 
-  llvm::BasicBlock *EmitCtorCompleteObjectHandler(CodeGenFunction &CGF,
-                                                  const CXXRecordDecl *RD);
+  llvm::BasicBlock *
+  EmitCtorCompleteObjectHandler(CodeGenFunction &CGF,
+                                const CXXRecordDecl *RD) override;
 
   void initializeHiddenVirtualInheritanceMembers(CodeGenFunction &CGF,
-                                                 const CXXRecordDecl *RD);
+                                              const CXXRecordDecl *RD) override;
 
-  void EmitCXXConstructors(const CXXConstructorDecl *D);
+  void EmitCXXConstructors(const CXXConstructorDecl *D) override;
 
   // Background on MSVC destructors
   // ==============================
@@ -109,17 +120,18 @@
   void BuildDestructorSignature(const CXXDestructorDecl *Dtor,
                                 CXXDtorType Type,
                                 CanQualType &ResTy,
-                                SmallVectorImpl<CanQualType> &ArgTys);
+                                SmallVectorImpl<CanQualType> &ArgTys) override;
 
   /// Non-base dtors should be emitted as delegating thunks in this ABI.
   bool useThunkForDtorVariant(const CXXDestructorDecl *Dtor,
-                              CXXDtorType DT) const {
+                              CXXDtorType DT) const override {
     return DT != Dtor_Base;
   }
 
-  void EmitCXXDestructors(const CXXDestructorDecl *D);
+  void EmitCXXDestructors(const CXXDestructorDecl *D) override;
 
-  const CXXRecordDecl *getThisArgumentTypeForMethod(const CXXMethodDecl *MD) {
+  const CXXRecordDecl *
+  getThisArgumentTypeForMethod(const CXXMethodDecl *MD) override {
     MD = MD->getCanonicalDecl();
     if (MD->isVirtual() && !isa<CXXDestructorDecl>(MD)) {
       MicrosoftVTableContext::MethodVFTableLocation ML =
@@ -136,71 +148,83 @@
     return MD->getParent();
   }
 
-  llvm::Value *adjustThisArgumentForVirtualCall(CodeGenFunction &CGF,
-                                                GlobalDecl GD,
-                                                llvm::Value *This);
+  llvm::Value *
+  adjustThisArgumentForVirtualFunctionCall(CodeGenFunction &CGF, GlobalDecl GD,
+                                           llvm::Value *This,
+                                           bool VirtualCall) override;
 
-  void BuildInstanceFunctionParams(CodeGenFunction &CGF,
-                                   QualType &ResTy,
-                                   FunctionArgList &Params);
+  void addImplicitStructorParams(CodeGenFunction &CGF, QualType &ResTy,
+                                 FunctionArgList &Params) override;
 
   llvm::Value *adjustThisParameterInVirtualFunctionPrologue(
-      CodeGenFunction &CGF, GlobalDecl GD, llvm::Value *This);
+      CodeGenFunction &CGF, GlobalDecl GD, llvm::Value *This) override;
 
-  void EmitInstanceFunctionProlog(CodeGenFunction &CGF);
+  void EmitInstanceFunctionProlog(CodeGenFunction &CGF) override;
 
-  void EmitConstructorCall(CodeGenFunction &CGF,
-                           const CXXConstructorDecl *D, CXXCtorType Type,
-                           bool ForVirtualBase, bool Delegating,
-                           llvm::Value *This,
-                           CallExpr::const_arg_iterator ArgBeg,
-                           CallExpr::const_arg_iterator ArgEnd);
+  unsigned addImplicitConstructorArgs(CodeGenFunction &CGF,
+                                      const CXXConstructorDecl *D,
+                                      CXXCtorType Type, bool ForVirtualBase,
+                                      bool Delegating,
+                                      CallArgList &Args) override;
 
-  void emitVTableDefinitions(CodeGenVTables &CGVT, const CXXRecordDecl *RD);
+  void EmitDestructorCall(CodeGenFunction &CGF, const CXXDestructorDecl *DD,
+                          CXXDtorType Type, bool ForVirtualBase,
+                          bool Delegating, llvm::Value *This) override;
+
+  void emitVTableDefinitions(CodeGenVTables &CGVT,
+                             const CXXRecordDecl *RD) override;
 
   llvm::Value *getVTableAddressPointInStructor(
       CodeGenFunction &CGF, const CXXRecordDecl *VTableClass,
       BaseSubobject Base, const CXXRecordDecl *NearestVBase,
-      bool &NeedsVirtualOffset);
+      bool &NeedsVirtualOffset) override;
 
   llvm::Constant *
   getVTableAddressPointForConstExpr(BaseSubobject Base,
-                                    const CXXRecordDecl *VTableClass);
+                                    const CXXRecordDecl *VTableClass) override;
 
   llvm::GlobalVariable *getAddrOfVTable(const CXXRecordDecl *RD,
-                                        CharUnits VPtrOffset);
+                                        CharUnits VPtrOffset) override;
 
   llvm::Value *getVirtualFunctionPointer(CodeGenFunction &CGF, GlobalDecl GD,
-                                         llvm::Value *This, llvm::Type *Ty);
+                                         llvm::Value *This,
+                                         llvm::Type *Ty) override;
 
   void EmitVirtualDestructorCall(CodeGenFunction &CGF,
                                  const CXXDestructorDecl *Dtor,
                                  CXXDtorType DtorType, SourceLocation CallLoc,
-                                 llvm::Value *This);
+                                 llvm::Value *This) override;
 
   void adjustCallArgsForDestructorThunk(CodeGenFunction &CGF, GlobalDecl GD,
-                                        CallArgList &CallArgs) {
+                                        CallArgList &CallArgs) override {
     assert(GD.getDtorType() == Dtor_Deleting &&
            "Only deleting destructor thunks are available in this ABI");
     CallArgs.add(RValue::get(getStructorImplicitParamValue(CGF)),
                              CGM.getContext().IntTy);
   }
 
-  void emitVirtualInheritanceTables(const CXXRecordDecl *RD);
+  void emitVirtualInheritanceTables(const CXXRecordDecl *RD) override;
 
-  void setThunkLinkage(llvm::Function *Thunk, bool ForVTable) {
+  llvm::GlobalVariable *
+  getAddrOfVBTable(const VPtrInfo &VBT, const CXXRecordDecl *RD,
+                   llvm::GlobalVariable::LinkageTypes Linkage);
+
+  void emitVBTableDefinition(const VPtrInfo &VBT, const CXXRecordDecl *RD,
+                             llvm::GlobalVariable *GV) const;
+
+  void setThunkLinkage(llvm::Function *Thunk, bool ForVTable) override {
     Thunk->setLinkage(llvm::GlobalValue::WeakAnyLinkage);
   }
 
   llvm::Value *performThisAdjustment(CodeGenFunction &CGF, llvm::Value *This,
-                                     const ThisAdjustment &TA);
+                                     const ThisAdjustment &TA) override;
 
   llvm::Value *performReturnAdjustment(CodeGenFunction &CGF, llvm::Value *Ret,
-                                       const ReturnAdjustment &RA);
+                                       const ReturnAdjustment &RA) override;
 
   void EmitGuardedInit(CodeGenFunction &CGF, const VarDecl &D,
                        llvm::GlobalVariable *DeclPtr,
-                       bool PerformInit);
+                       bool PerformInit) override;
 
   // ==== Notes on array cookies =========
   //
@@ -225,17 +249,18 @@
   //   }
   // Whereas it prints "104" and "104" if you give A a destructor.
 
-  bool requiresArrayCookie(const CXXDeleteExpr *expr, QualType elementType);
-  bool requiresArrayCookie(const CXXNewExpr *expr);
-  CharUnits getArrayCookieSizeImpl(QualType type);
+  bool requiresArrayCookie(const CXXDeleteExpr *expr,
+                           QualType elementType) override;
+  bool requiresArrayCookie(const CXXNewExpr *expr) override;
+  CharUnits getArrayCookieSizeImpl(QualType type) override;
   llvm::Value *InitializeArrayCookie(CodeGenFunction &CGF,
                                      llvm::Value *NewPtr,
                                      llvm::Value *NumElements,
                                      const CXXNewExpr *expr,
-                                     QualType ElementType);
+                                     QualType ElementType) override;
   llvm::Value *readArrayCookieImpl(CodeGenFunction &CGF,
                                    llvm::Value *allocPtr,
-                                   CharUnits cookieSize);
+                                   CharUnits cookieSize) override;
 
 private:
   MicrosoftMangleContext &getMangleContext() {
@@ -258,15 +283,12 @@
     return C ? C : getZeroInt();
   }
 
+  CharUnits getVirtualFunctionPrologueThisAdjustment(GlobalDecl GD);
+
   void
   GetNullMemberPointerFields(const MemberPointerType *MPT,
                              llvm::SmallVectorImpl<llvm::Constant *> &fields);
 
-  /// \brief Finds the offset from the base of RD to the vbptr it uses, even if
-  /// it is reusing a vbptr from a non-virtual base.  RD must have morally
-  /// virtual bases.
-  CharUnits GetVBPtrOffsetFromBases(const CXXRecordDecl *RD);
-
   /// \brief Shared code for virtual base adjustment.  Returns the offset from
   /// the vbptr to the virtual base.  Optionally returns the address of the
   /// vbptr itself.
@@ -288,8 +310,8 @@
 
   /// \brief Performs a full virtual base adjustment.  Used to dereference
   /// pointers to members of virtual bases.
-  llvm::Value *AdjustVirtualBase(CodeGenFunction &CGF, const CXXRecordDecl *RD,
-                                 llvm::Value *Base,
+  llvm::Value *AdjustVirtualBase(CodeGenFunction &CGF, const Expr *E,
+                                 const CXXRecordDecl *RD, llvm::Value *Base,
                                  llvm::Value *VirtualBaseAdjustmentOffset,
                                  llvm::Value *VBPtrOffset /* optional */);
 
@@ -311,51 +333,51 @@
   void EmitVBPtrStores(CodeGenFunction &CGF, const CXXRecordDecl *RD);
 
   /// \brief Caching wrapper around VBTableBuilder::enumerateVBTables().
-  const VBTableVector &EnumerateVBTables(const CXXRecordDecl *RD);
+  const VBTableGlobals &enumerateVBTables(const CXXRecordDecl *RD);
 
   /// \brief Generate a thunk for calling a virtual member function MD.
-  llvm::Function *EmitVirtualMemPtrThunk(const CXXMethodDecl *MD,
-                                         StringRef ThunkName);
+  llvm::Function *EmitVirtualMemPtrThunk(
+      const CXXMethodDecl *MD,
+      const MicrosoftVTableContext::MethodVFTableLocation &ML);
 
 public:
-  virtual llvm::Type *ConvertMemberPointerType(const MemberPointerType *MPT);
+  llvm::Type *ConvertMemberPointerType(const MemberPointerType *MPT) override;
 
-  virtual bool isZeroInitializable(const MemberPointerType *MPT);
+  bool isZeroInitializable(const MemberPointerType *MPT) override;
 
-  virtual llvm::Constant *EmitNullMemberPointer(const MemberPointerType *MPT);
+  llvm::Constant *EmitNullMemberPointer(const MemberPointerType *MPT) override;
 
-  virtual llvm::Constant *EmitMemberDataPointer(const MemberPointerType *MPT,
-                                                CharUnits offset);
-  virtual llvm::Constant *EmitMemberPointer(const CXXMethodDecl *MD);
-  virtual llvm::Constant *EmitMemberPointer(const APValue &MP, QualType MPT);
+  llvm::Constant *EmitMemberDataPointer(const MemberPointerType *MPT,
+                                        CharUnits offset) override;
+  llvm::Constant *EmitMemberPointer(const CXXMethodDecl *MD) override;
+  llvm::Constant *EmitMemberPointer(const APValue &MP, QualType MPT) override;
 
-  virtual llvm::Value *EmitMemberPointerComparison(CodeGenFunction &CGF,
-                                                   llvm::Value *L,
-                                                   llvm::Value *R,
-                                                   const MemberPointerType *MPT,
-                                                   bool Inequality);
+  llvm::Value *EmitMemberPointerComparison(CodeGenFunction &CGF,
+                                           llvm::Value *L,
+                                           llvm::Value *R,
+                                           const MemberPointerType *MPT,
+                                           bool Inequality) override;
 
-  virtual llvm::Value *EmitMemberPointerIsNotNull(CodeGenFunction &CGF,
-                                                  llvm::Value *MemPtr,
-                                                  const MemberPointerType *MPT);
+  llvm::Value *EmitMemberPointerIsNotNull(CodeGenFunction &CGF,
+                                          llvm::Value *MemPtr,
+                                          const MemberPointerType *MPT) override;
 
-  virtual llvm::Value *EmitMemberDataPointerAddress(CodeGenFunction &CGF,
-                                                    llvm::Value *Base,
-                                                    llvm::Value *MemPtr,
-                                                  const MemberPointerType *MPT);
+  llvm::Value *
+  EmitMemberDataPointerAddress(CodeGenFunction &CGF, const Expr *E,
+                               llvm::Value *Base, llvm::Value *MemPtr,
+                               const MemberPointerType *MPT) override;
 
-  virtual llvm::Value *EmitMemberPointerConversion(CodeGenFunction &CGF,
-                                                   const CastExpr *E,
-                                                   llvm::Value *Src);
+  llvm::Value *EmitMemberPointerConversion(CodeGenFunction &CGF,
+                                           const CastExpr *E,
+                                           llvm::Value *Src) override;
 
-  virtual llvm::Constant *EmitMemberPointerConversion(const CastExpr *E,
-                                                      llvm::Constant *Src);
+  llvm::Constant *EmitMemberPointerConversion(const CastExpr *E,
+                                              llvm::Constant *Src) override;
 
-  virtual llvm::Value *
-  EmitLoadOfMemberFunctionPointer(CodeGenFunction &CGF,
-                                  llvm::Value *&This,
-                                  llvm::Value *MemPtr,
-                                  const MemberPointerType *MPT);
+  llvm::Value *
+  EmitLoadOfMemberFunctionPointer(CodeGenFunction &CGF, const Expr *E,
+                                  llvm::Value *&This, llvm::Value *MemPtr,
+                                  const MemberPointerType *MPT) override;
 
 private:
   typedef std::pair<const CXXRecordDecl *, CharUnits> VFTableIdTy;
@@ -368,7 +390,7 @@
 
 
   /// \brief All the vbtables which have been referenced.
-  llvm::DenseMap<const CXXRecordDecl *, VBTableVector> VBTablesMap;
+  llvm::DenseMap<const CXXRecordDecl *, VBTableGlobals> VBTablesMap;
 
   /// Info on the global variable used to guard initialization of static locals.
   /// The BitIndex field is only used for externally invisible declarations.
@@ -392,41 +414,13 @@
   return ptr;
 }
 
-/// \brief Finds the first non-virtual base of RD that has virtual bases.  If RD
-/// doesn't have a vbptr, it will reuse the vbptr of the returned class.
-static const CXXRecordDecl *FindFirstNVBaseWithVBases(const CXXRecordDecl *RD) {
-  for (CXXRecordDecl::base_class_const_iterator I = RD->bases_begin(),
-       E = RD->bases_end(); I != E; ++I) {
-    const CXXRecordDecl *Base = I->getType()->getAsCXXRecordDecl();
-    if (!I->isVirtual() && Base->getNumVBases() > 0)
-      return Base;
-  }
-  llvm_unreachable("RD must have an nv base with vbases");
-}
-
-CharUnits MicrosoftCXXABI::GetVBPtrOffsetFromBases(const CXXRecordDecl *RD) {
-  assert(RD->getNumVBases());
-  CharUnits Total = CharUnits::Zero();
-  while (RD) {
-    const ASTRecordLayout &RDLayout = getContext().getASTRecordLayout(RD);
-    CharUnits VBPtrOffset = RDLayout.getVBPtrOffset();
-    // -1 is the sentinel for no vbptr.
-    if (VBPtrOffset != CharUnits::fromQuantity(-1)) {
-      Total += VBPtrOffset;
-      break;
-    }
-    RD = FindFirstNVBaseWithVBases(RD);
-    Total += RDLayout.getBaseClassOffset(RD);
-  }
-  return Total;
-}
-
 llvm::Value *
 MicrosoftCXXABI::GetVirtualBaseClassOffset(CodeGenFunction &CGF,
                                            llvm::Value *This,
                                            const CXXRecordDecl *ClassDecl,
                                            const CXXRecordDecl *BaseClassDecl) {
-  int64_t VBPtrChars = GetVBPtrOffsetFromBases(ClassDecl).getQuantity();
+  int64_t VBPtrChars =
+      getContext().getASTRecordLayout(ClassDecl).getVBPtrOffset().getQuantity();
   llvm::Value *VBPtrOffset = llvm::ConstantInt::get(CGM.PtrDiffTy, VBPtrChars);
   CharUnits IntSize = getContext().getTypeSizeInChars(getContext().IntTy);
   CharUnits VBTableChars =
@@ -446,16 +440,20 @@
   return isa<CXXConstructorDecl>(GD.getDecl());
 }
 
-void MicrosoftCXXABI::BuildConstructorSignature(const CXXConstructorDecl *Ctor,
-                                 CXXCtorType Type,
-                                 CanQualType &ResTy,
-                                 SmallVectorImpl<CanQualType> &ArgTys) {
-  // 'this' parameter and 'this' return are already in place
+void MicrosoftCXXABI::BuildConstructorSignature(
+    const CXXConstructorDecl *Ctor, CXXCtorType Type, CanQualType &ResTy,
+    SmallVectorImpl<CanQualType> &ArgTys) {
+
+  // All parameters are already in place except is_most_derived, which goes
+  // after 'this' if the constructor is variadic and last if it's not.
 
   const CXXRecordDecl *Class = Ctor->getParent();
+  const FunctionProtoType *FPT = Ctor->getType()->castAs<FunctionProtoType>();
   if (Class->getNumVBases()) {
-    // Constructors of classes with virtual bases take an implicit parameter.
-    ArgTys.push_back(CGM.getContext().IntTy);
+    if (FPT->isVariadic())
+      ArgTys.insert(ArgTys.begin() + 1, CGM.getContext().IntTy);
+    else
+      ArgTys.push_back(CGM.getContext().IntTy);
   }
 }
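
Illustration (not part of the patch): for a hypothetical class with a virtual
base, the implicit is_most_derived flag slots into the effective constructor
signature as sketched below; all names are invented.

    // Sketch only; the flag is implicit and never written by users.
    struct VBase { int v; };
    struct Derived : virtual VBase {
      Derived(int x);                // effectively (this, int x, int is_most_derived)
      Derived(const char *fmt, ...); // effectively (this, int is_most_derived,
                                     //              const char *fmt, ...)
    };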
 
@@ -547,19 +545,23 @@
                                       const CXXRecordDecl *RD) {
   llvm::Value *ThisInt8Ptr =
     CGF.Builder.CreateBitCast(getThisValue(CGF), CGM.Int8PtrTy, "this.int8");
+  const ASTRecordLayout &Layout = CGM.getContext().getASTRecordLayout(RD);
 
-  const VBTableVector &VBTables = EnumerateVBTables(RD);
-  for (VBTableVector::const_iterator I = VBTables.begin(), E = VBTables.end();
-       I != E; ++I) {
+  const VBTableGlobals &VBGlobals = enumerateVBTables(RD);
+  for (unsigned I = 0, E = VBGlobals.VBTables->size(); I != E; ++I) {
+    const VPtrInfo *VBT = (*VBGlobals.VBTables)[I];
+    llvm::GlobalVariable *GV = VBGlobals.Globals[I];
     const ASTRecordLayout &SubobjectLayout =
-      CGM.getContext().getASTRecordLayout(I->VBPtrSubobject.getBase());
-    uint64_t Offs = (I->VBPtrSubobject.getBaseOffset() +
-                     SubobjectLayout.getVBPtrOffset()).getQuantity();
+        CGM.getContext().getASTRecordLayout(VBT->BaseWithVPtr);
+    CharUnits Offs = VBT->NonVirtualOffset;
+    Offs += SubobjectLayout.getVBPtrOffset();
+    if (VBT->getVBaseWithVPtr())
+      Offs += Layout.getVBaseClassOffset(VBT->getVBaseWithVPtr());
     llvm::Value *VBPtr =
-        CGF.Builder.CreateConstInBoundsGEP1_64(ThisInt8Ptr, Offs);
-    VBPtr = CGF.Builder.CreateBitCast(VBPtr, I->GV->getType()->getPointerTo(0),
-                                      "vbptr." + I->ReusingBase->getName());
-    CGF.Builder.CreateStore(I->GV, VBPtr);
+        CGF.Builder.CreateConstInBoundsGEP1_64(ThisInt8Ptr, Offs.getQuantity());
+    VBPtr = CGF.Builder.CreateBitCast(VBPtr, GV->getType()->getPointerTo(0),
+                                      "vbptr." + VBT->ReusingBase->getName());
+    CGF.Builder.CreateStore(GV, VBPtr);
   }
 }
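
A layout sketch of what the loop above computes for each vbptr store
(hypothetical hierarchy; concrete offsets depend on the target):

    // Sketch only.  B has its own vbptr; C reuses it, so the store offset is
    // C's non-virtual offset to B plus B's internal vbptr offset (plus the
    // virtual-base offset from the derived layout when the vbptr lives in a
    // virtual base).
    struct A { int a; };             // no virtual bases, no vbptr
    struct B : virtual A { int b; }; // vbptr at some offset inside B
    struct C : B { int c; };         // Offs = off(B in C) + off(vbptr in B)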
 
@@ -583,12 +585,61 @@
   CGM.EmitGlobal(GlobalDecl(D, Dtor_Base));
 }
 
-llvm::Value *MicrosoftCXXABI::adjustThisArgumentForVirtualCall(
-    CodeGenFunction &CGF, GlobalDecl GD, llvm::Value *This) {
+CharUnits
+MicrosoftCXXABI::getVirtualFunctionPrologueThisAdjustment(GlobalDecl GD) {
   GD = GD.getCanonicalDecl();
   const CXXMethodDecl *MD = cast<CXXMethodDecl>(GD.getDecl());
-  // FIXME: consider splitting the vdtor vs regular method code into two
-  // functions.
+
+  GlobalDecl LookupGD = GD;
+  if (const CXXDestructorDecl *DD = dyn_cast<CXXDestructorDecl>(MD)) {
+    // Complete destructors take a pointer to the complete object as a
+    // parameter and thus don't need this adjustment.
+    if (GD.getDtorType() == Dtor_Complete)
+      return CharUnits();
+
+    // There's no Dtor_Base in the vftable, but it shares the this adjustment
+    // with the deleting one, so look that one up instead.
+    LookupGD = GlobalDecl(DD, Dtor_Deleting);
+  }
+
+  MicrosoftVTableContext::MethodVFTableLocation ML =
+      CGM.getMicrosoftVTableContext().getMethodVFTableLocation(LookupGD);
+  CharUnits Adjustment = ML.VFPtrOffset;
+
+  // Normal virtual instance methods need to adjust from the vfptr that first
+  // defined the virtual method to the virtual base subobject, but destructors
+  // do not.  The vector deleting destructor thunk applies this adjustment for
+  // us if necessary.
+  if (isa<CXXDestructorDecl>(MD))
+    Adjustment = CharUnits::Zero();
+
+  if (ML.VBase) {
+    const ASTRecordLayout &DerivedLayout =
+        CGM.getContext().getASTRecordLayout(MD->getParent());
+    Adjustment += DerivedLayout.getVBaseClassOffset(ML.VBase);
+  }
+
+  return Adjustment;
+}
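
A worked instance of the computation above, with assumed offsets: if the
method's vfptr sits at offset 8 (ML.VFPtrOffset) and ML.VBase is placed at
offset 16 in the derived layout, a normal virtual method gets a prologue
adjustment of 8 + 16 = 24 bytes, while a destructor keeps only the
virtual-base part, 16 bytes.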
+
+llvm::Value *MicrosoftCXXABI::adjustThisArgumentForVirtualFunctionCall(
+    CodeGenFunction &CGF, GlobalDecl GD, llvm::Value *This, bool VirtualCall) {
+  if (!VirtualCall) {
+    // If a call to a virtual function is not itself virtual, we just have to
+    // compensate for the adjustment the virtual function does in its prologue.
+    CharUnits Adjustment = getVirtualFunctionPrologueThisAdjustment(GD);
+    if (Adjustment.isZero())
+      return This;
+
+    unsigned AS = cast<llvm::PointerType>(This->getType())->getAddressSpace();
+    llvm::Type *charPtrTy = CGF.Int8Ty->getPointerTo(AS);
+    This = CGF.Builder.CreateBitCast(This, charPtrTy);
+    assert(Adjustment.isPositive());
+    return CGF.Builder.CreateConstGEP1_32(This, Adjustment.getQuantity());
+  }
+
+  GD = GD.getCanonicalDecl();
+  const CXXMethodDecl *MD = cast<CXXMethodDecl>(GD.getDecl());
 
   GlobalDecl LookupGD = GD;
   if (const CXXDestructorDecl *DD = dyn_cast<CXXDestructorDecl>(MD)) {
@@ -607,51 +658,18 @@
   unsigned AS = cast<llvm::PointerType>(This->getType())->getAddressSpace();
   llvm::Type *charPtrTy = CGF.Int8Ty->getPointerTo(AS);
   CharUnits StaticOffset = ML.VFPtrOffset;
+
+  // Base destructors expect 'this' to point to the beginning of the base
+  // subobject, not the first vfptr that happens to contain the virtual dtor.
+  // However, we still need to apply the virtual base adjustment.
+  if (isa<CXXDestructorDecl>(MD) && GD.getDtorType() == Dtor_Base)
+    StaticOffset = CharUnits::Zero();
+
   if (ML.VBase) {
-    bool AvoidVirtualOffset = false;
-    if (isa<CXXDestructorDecl>(MD) && GD.getDtorType() == Dtor_Base) {
-      // A base destructor can only be called from a complete destructor of the
-      // same record type or another destructor of a more derived type;
-      // or a constructor of the same record type if an exception is thrown.
-      assert(isa<CXXDestructorDecl>(CGF.CurGD.getDecl()) ||
-             isa<CXXConstructorDecl>(CGF.CurGD.getDecl()));
-      const CXXRecordDecl *CurRD =
-          cast<CXXMethodDecl>(CGF.CurGD.getDecl())->getParent();
-
-      if (MD->getParent() == CurRD) {
-        if (isa<CXXDestructorDecl>(CGF.CurGD.getDecl()))
-          assert(CGF.CurGD.getDtorType() == Dtor_Complete);
-        if (isa<CXXConstructorDecl>(CGF.CurGD.getDecl()))
-          assert(CGF.CurGD.getCtorType() == Ctor_Complete);
-        // We're calling the main base dtor from a complete structor,
-        // so we know the "this" offset statically.
-        AvoidVirtualOffset = true;
-      } else {
-        // Let's see if we try to call a destructor of a non-virtual base.
-        for (CXXRecordDecl::base_class_const_iterator I = CurRD->bases_begin(),
-             E = CurRD->bases_end(); I != E; ++I) {
-          if (I->getType()->getAsCXXRecordDecl() != MD->getParent())
-            continue;
-          // If we call a base destructor for a non-virtual base, we statically
-          // know where it expects the vfptr and "this" to be.
-          // The total offset should reflect the adjustment done by
-          // adjustThisParameterInVirtualFunctionPrologue().
-          AvoidVirtualOffset = true;
-          break;
-        }
-      }
-    }
-
-    if (AvoidVirtualOffset) {
-      const ASTRecordLayout &Layout =
-          CGF.getContext().getASTRecordLayout(MD->getParent());
-      StaticOffset += Layout.getVBaseClassOffset(ML.VBase);
-    } else {
-      This = CGF.Builder.CreateBitCast(This, charPtrTy);
-      llvm::Value *VBaseOffset =
-          GetVirtualBaseClassOffset(CGF, This, MD->getParent(), ML.VBase);
-      This = CGF.Builder.CreateInBoundsGEP(This, VBaseOffset);
-    }
+    This = CGF.Builder.CreateBitCast(This, charPtrTy);
+    llvm::Value *VBaseOffset =
+        GetVirtualBaseClassOffset(CGF, This, MD->getParent(), ML.VBase);
+    This = CGF.Builder.CreateInBoundsGEP(This, VBaseOffset);
   }
   if (!StaticOffset.isZero()) {
     assert(StaticOffset.isPositive());
@@ -678,20 +696,25 @@
   return false;
 }
 
-void MicrosoftCXXABI::BuildInstanceFunctionParams(CodeGenFunction &CGF,
-                                                  QualType &ResTy,
-                                                  FunctionArgList &Params) {
-  BuildThisParam(CGF, Params);
-
+void MicrosoftCXXABI::addImplicitStructorParams(CodeGenFunction &CGF,
+                                                QualType &ResTy,
+                                                FunctionArgList &Params) {
   ASTContext &Context = getContext();
   const CXXMethodDecl *MD = cast<CXXMethodDecl>(CGF.CurGD.getDecl());
+  assert(isa<CXXConstructorDecl>(MD) || isa<CXXDestructorDecl>(MD));
   if (isa<CXXConstructorDecl>(MD) && MD->getParent()->getNumVBases()) {
     ImplicitParamDecl *IsMostDerived
       = ImplicitParamDecl::Create(Context, 0,
                                   CGF.CurGD.getDecl()->getLocation(),
                                   &Context.Idents.get("is_most_derived"),
                                   Context.IntTy);
-    Params.push_back(IsMostDerived);
+    // The 'is_most_derived' parameter goes second if the constructor is
+    // variadic and last if it's not.  Dtors can't be variadic.
+    const FunctionProtoType *FPT = MD->getType()->castAs<FunctionProtoType>();
+    if (FPT->isVariadic())
+      Params.insert(Params.begin() + 1, IsMostDerived);
+    else
+      Params.push_back(IsMostDerived);
     getStructorImplicitParamDecl(CGF) = IsMostDerived;
   } else if (IsDeletingDtor(CGF.CurGD)) {
     ImplicitParamDecl *ShouldDelete
@@ -706,36 +729,12 @@
 
 llvm::Value *MicrosoftCXXABI::adjustThisParameterInVirtualFunctionPrologue(
     CodeGenFunction &CGF, GlobalDecl GD, llvm::Value *This) {
-  GD = GD.getCanonicalDecl();
-  const CXXMethodDecl *MD = cast<CXXMethodDecl>(GD.getDecl());
-
-  GlobalDecl LookupGD = GD;
-  if (const CXXDestructorDecl *DD = dyn_cast<CXXDestructorDecl>(MD)) {
-    // Complete destructors take a pointer to the complete object as a
-    // parameter, thus don't need this adjustment.
-    if (GD.getDtorType() == Dtor_Complete)
-      return This;
-
-    // There's no Dtor_Base in vftable but it shares the this adjustment with
-    // the deleting one, so look it up instead.
-    LookupGD = GlobalDecl(DD, Dtor_Deleting);
-  }
-
   // In this ABI, every virtual function takes a pointer to one of the
   // subobjects that first defines it as the 'this' parameter, rather than a
-  // pointer to ther final overrider subobject. Thus, we need to adjust it back
+  // pointer to the final overrider subobject. Thus, we need to adjust it back
   // to the final overrider subobject before use.
   // See comments in the MicrosoftVFTableContext implementation for the details.
-
-  MicrosoftVTableContext::MethodVFTableLocation ML =
-      CGM.getMicrosoftVTableContext().getMethodVFTableLocation(LookupGD);
-  CharUnits Adjustment = ML.VFPtrOffset;
-  if (ML.VBase) {
-    const ASTRecordLayout &DerivedLayout =
-        CGF.getContext().getASTRecordLayout(MD->getParent());
-    Adjustment += DerivedLayout.getVBaseClassOffset(ML.VBase);
-  }
-
+  CharUnits Adjustment = getVirtualFunctionPrologueThisAdjustment(GD);
   if (Adjustment.isZero())
     return This;
 
@@ -784,43 +783,63 @@
   }
 }
 
-void MicrosoftCXXABI::EmitConstructorCall(CodeGenFunction &CGF,
-                                          const CXXConstructorDecl *D,
-                                          CXXCtorType Type, 
-                                          bool ForVirtualBase,
-                                          bool Delegating,
-                                          llvm::Value *This,
-                                          CallExpr::const_arg_iterator ArgBeg,
-                                          CallExpr::const_arg_iterator ArgEnd) {
+unsigned MicrosoftCXXABI::addImplicitConstructorArgs(
+    CodeGenFunction &CGF, const CXXConstructorDecl *D, CXXCtorType Type,
+    bool ForVirtualBase, bool Delegating, CallArgList &Args) {
   assert(Type == Ctor_Complete || Type == Ctor_Base);
-  llvm::Value *Callee = CGM.GetAddrOfCXXConstructor(D, Ctor_Complete);
 
-  llvm::Value *ImplicitParam = 0;
-  QualType ImplicitParamTy;
-  if (D->getParent()->getNumVBases()) {
-    ImplicitParam = llvm::ConstantInt::get(CGM.Int32Ty, Type == Ctor_Complete);
-    ImplicitParamTy = getContext().IntTy;
+  // Check if we need an 'is_most_derived' parameter.
+  if (!D->getParent()->getNumVBases())
+    return 0;
+
+  // Add the 'is_most_derived' argument second if the constructor is variadic,
+  // or last if it is not.
+  const FunctionProtoType *FPT = D->getType()->castAs<FunctionProtoType>();
+  llvm::Value *MostDerivedArg =
+      llvm::ConstantInt::get(CGM.Int32Ty, Type == Ctor_Complete);
+  RValue RV = RValue::get(MostDerivedArg);
+  if (MostDerivedArg) {
+    if (FPT->isVariadic())
+      Args.insert(Args.begin() + 1,
+                  CallArg(RV, getContext().IntTy, /*needscopy=*/false));
+    else
+      Args.add(RV, getContext().IntTy);
+  }
+
+  return 1;  // Added one arg.
+}
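
Resulting argument orders, for illustration: a non-variadic constructor sees
[this, user args..., is_most_derived], a variadic one sees
[this, is_most_derived, user args...]; the flag value is 1 for Ctor_Complete
and 0 for Ctor_Base.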
+
+void MicrosoftCXXABI::EmitDestructorCall(CodeGenFunction &CGF,
+                                         const CXXDestructorDecl *DD,
+                                         CXXDtorType Type, bool ForVirtualBase,
+                                         bool Delegating, llvm::Value *This) {
+  llvm::Value *Callee = CGM.GetAddrOfCXXDestructor(DD, Type);
+
+  if (DD->isVirtual()) {
+    assert(Type != CXXDtorType::Dtor_Deleting &&
+           "The deleting destructor should only be called via a virtual call");
+    This = adjustThisArgumentForVirtualFunctionCall(CGF, GlobalDecl(DD, Type),
+                                                    This, false);
   }
 
   // FIXME: Provide a source location here.
-  CGF.EmitCXXMemberCall(D, SourceLocation(), Callee, ReturnValueSlot(), This,
-                        ImplicitParam, ImplicitParamTy, ArgBeg, ArgEnd);
+  CGF.EmitCXXMemberCall(DD, SourceLocation(), Callee, ReturnValueSlot(), This,
+                        /*ImplicitParam=*/0, /*ImplicitParamTy=*/QualType(), 0, 0);
 }
 
 void MicrosoftCXXABI::emitVTableDefinitions(CodeGenVTables &CGVT,
                                             const CXXRecordDecl *RD) {
   MicrosoftVTableContext &VFTContext = CGM.getMicrosoftVTableContext();
-  MicrosoftVTableContext::VFPtrListTy VFPtrs = VFTContext.getVFPtrOffsets(RD);
+  VPtrInfoVector VFPtrs = VFTContext.getVFPtrOffsets(RD);
   llvm::GlobalVariable::LinkageTypes Linkage = CGM.getVTableLinkage(RD);
 
-  for (MicrosoftVTableContext::VFPtrListTy::iterator I = VFPtrs.begin(),
-       E = VFPtrs.end(); I != E; ++I) {
-    llvm::GlobalVariable *VTable = getAddrOfVTable(RD, I->VFPtrFullOffset);
+  for (VPtrInfoVector::iterator I = VFPtrs.begin(), E = VFPtrs.end(); I != E;
+       ++I) {
+    llvm::GlobalVariable *VTable = getAddrOfVTable(RD, (*I)->FullOffsetInMDC);
     if (VTable->hasInitializer())
       continue;
 
     const VTableLayout &VTLayout =
-        VFTContext.getVFTableLayout(RD, I->VFPtrFullOffset);
+        VFTContext.getVFTableLayout(RD, (*I)->FullOffsetInMDC);
     llvm::Constant *Init = CGVT.CreateVTableInitializer(
         RD, VTLayout.vtable_component_begin(),
         VTLayout.getNumVTableComponents(), VTLayout.vtable_thunk_begin(),
@@ -828,7 +847,7 @@
     VTable->setInitializer(Init);
 
     VTable->setLinkage(Linkage);
-    CGM.setTypeVisibility(VTable, RD, CodeGenModule::TVK_ForVTable);
+    CGM.setGlobalVisibility(VTable, RD);
   }
 }
 
@@ -847,10 +866,10 @@
 }
 
 static void mangleVFTableName(MicrosoftMangleContext &MangleContext,
-                              const CXXRecordDecl *RD, const VFPtrInfo &VFPtr,
+                              const CXXRecordDecl *RD, const VPtrInfo *VFPtr,
                               SmallString<256> &Name) {
   llvm::raw_svector_ostream Out(Name);
-  MangleContext.mangleCXXVFTable(RD, VFPtr.PathToMangle, Out);
+  MangleContext.mangleCXXVFTable(RD, VFPtr->MangledPath, Out);
 }
 
 llvm::Constant *MicrosoftCXXABI::getVTableAddressPointForConstExpr(
@@ -868,7 +887,7 @@
   VFTableIdTy ID(RD, VPtrOffset);
   VFTablesMapTy::iterator I;
   bool Inserted;
-  llvm::tie(I, Inserted) = VFTablesMap.insert(
+  std::tie(I, Inserted) = VFTablesMap.insert(
       std::make_pair(ID, static_cast<llvm::GlobalVariable *>(0)));
   if (!Inserted)
     return I->second;
@@ -876,8 +895,7 @@
   llvm::GlobalVariable *&VTable = I->second;
 
   MicrosoftVTableContext &VTContext = CGM.getMicrosoftVTableContext();
-  const MicrosoftVTableContext::VFPtrListTy &VFPtrs =
-      VTContext.getVFPtrOffsets(RD);
+  const VPtrInfoVector &VFPtrs = VTContext.getVFPtrOffsets(RD);
 
   if (DeferredVFTables.insert(RD)) {
     // We haven't processed this record type before.
@@ -898,12 +916,12 @@
   }
 
   for (size_t J = 0, F = VFPtrs.size(); J != F; ++J) {
-    if (VFPtrs[J].VFPtrFullOffset != VPtrOffset)
+    if (VFPtrs[J]->FullOffsetInMDC != VPtrOffset)
       continue;
 
     llvm::ArrayType *ArrayType = llvm::ArrayType::get(
         CGM.Int8PtrTy,
-        VTContext.getVFTableLayout(RD, VFPtrs[J].VFPtrFullOffset)
+        VTContext.getVFTableLayout(RD, VFPtrs[J]->FullOffsetInMDC)
             .getNumVTableComponents());
 
     SmallString<256> Name;
@@ -925,7 +943,8 @@
   CGBuilderTy &Builder = CGF.Builder;
 
   Ty = Ty->getPointerTo()->getPointerTo();
-  llvm::Value *VPtr = adjustThisArgumentForVirtualCall(CGF, GD, This);
+  llvm::Value *VPtr =
+      adjustThisArgumentForVirtualFunctionCall(CGF, GD, This, true);
   llvm::Value *VTable = CGF.GetVTablePtr(VPtr, Ty);
 
   MicrosoftVTableContext::MethodVFTableLocation ML =
@@ -955,30 +974,47 @@
       llvm::ConstantInt::get(llvm::IntegerType::getInt32Ty(CGF.getLLVMContext()),
                              DtorType == Dtor_Deleting);
 
-  This = adjustThisArgumentForVirtualCall(CGF, GD, This);
+  This = adjustThisArgumentForVirtualFunctionCall(CGF, GD, This, true);
   CGF.EmitCXXMemberCall(Dtor, CallLoc, Callee, ReturnValueSlot(), This,
                         ImplicitParam, Context.IntTy, 0, 0);
 }
 
-const VBTableVector &
-MicrosoftCXXABI::EnumerateVBTables(const CXXRecordDecl *RD) {
+const VBTableGlobals &
+MicrosoftCXXABI::enumerateVBTables(const CXXRecordDecl *RD) {
   // At this layer, we can key the cache off of a single class, which is much
-  // easier than caching at the GlobalVariable layer.
-  llvm::DenseMap<const CXXRecordDecl*, VBTableVector>::iterator I;
-  bool added;
-  llvm::tie(I, added) = VBTablesMap.insert(std::make_pair(RD, VBTableVector()));
-  VBTableVector &VBTables = I->second;
-  if (!added)
-    return VBTables;
+  // easier than caching each vbtable individually.
+  llvm::DenseMap<const CXXRecordDecl*, VBTableGlobals>::iterator Entry;
+  bool Added;
+  std::tie(Entry, Added) =
+      VBTablesMap.insert(std::make_pair(RD, VBTableGlobals()));
+  VBTableGlobals &VBGlobals = Entry->second;
+  if (!Added)
+    return VBGlobals;
 
-  VBTableBuilder(CGM, RD).enumerateVBTables(VBTables);
+  MicrosoftVTableContext &Context = CGM.getMicrosoftVTableContext();
+  VBGlobals.VBTables = &Context.enumerateVBTables(RD);
 
-  return VBTables;
+  // Cache the globals for all vbtables so we don't have to recompute the
+  // mangled names.
+  llvm::GlobalVariable::LinkageTypes Linkage = CGM.getVTableLinkage(RD);
+  for (VPtrInfoVector::const_iterator I = VBGlobals.VBTables->begin(),
+                                      E = VBGlobals.VBTables->end();
+       I != E; ++I) {
+    VBGlobals.Globals.push_back(getAddrOfVBTable(**I, RD, Linkage));
+  }
+
+  return VBGlobals;
 }
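
The insert-then-check caching idiom used above, as a self-contained sketch
(std::map stands in for llvm::DenseMap; all names are invented):

    #include <map>
    #include <tuple>

    static std::map<int, int> Cache;
    static int computeExpensive(int K) { return K * K; } // stand-in work

    int getCached(int K) {
      std::map<int, int>::iterator It;
      bool Added;
      std::tie(It, Added) = Cache.insert(std::make_pair(K, 0));
      if (!Added)
        return It->second;              // hit: reuse the cached value
      It->second = computeExpensive(K); // miss: compute once, store
      return It->second;
    }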
 
-llvm::Function *
-MicrosoftCXXABI::EmitVirtualMemPtrThunk(const CXXMethodDecl *MD,
-                                        StringRef ThunkName) {
+llvm::Function *MicrosoftCXXABI::EmitVirtualMemPtrThunk(
+    const CXXMethodDecl *MD,
+    const MicrosoftVTableContext::MethodVFTableLocation &ML) {
+  // Calculate the mangled name.
+  SmallString<256> ThunkName;
+  llvm::raw_svector_ostream Out(ThunkName);
+  getMangleContext().mangleVirtualMemPtrThunk(MD, Out);
+  Out.flush();
+
   // If the thunk has been generated previously, just return it.
   if (llvm::GlobalValue *GV = CGM.getModule().getNamedValue(ThunkName))
     return cast<llvm::Function>(GV);
@@ -1002,9 +1038,14 @@
   CodeGenFunction CGF(CGM);
   CGF.StartThunk(ThunkFn, MD, FnInfo);
 
-  // Get to the Callee.
+  // Load the vfptr and then the callee from the vftable.  The caller should
+  // have adjusted 'this' so that the vfptr is at offset zero.
   llvm::Value *This = CGF.LoadCXXThis();
-  llvm::Value *Callee = getVirtualFunctionPointer(CGF, MD, This, ThunkTy);
+  llvm::Value *VTable =
+      CGF.GetVTablePtr(This, ThunkTy->getPointerTo()->getPointerTo());
+  llvm::Value *VFuncPtr =
+      CGF.Builder.CreateConstInBoundsGEP1_64(VTable, ML.Index, "vfn");
+  llvm::Value *Callee = CGF.Builder.CreateLoad(VFuncPtr);
 
   // Make the call and return the result.
   CGF.EmitCallAndReturnForThunk(MD, Callee, 0);
@@ -1013,15 +1054,85 @@
 }
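
Conceptually, the emitted thunk behaves like this hand-written sketch
(illustrative only; the index 2 stands in for ML.Index and the real thunk
forwards all arguments and the return value):

    using Fn = void (*)(void *);
    void thunk(void *thisPtr) {
      // 'this' already points at the subobject whose vfptr is at offset zero.
      Fn *vftable = *reinterpret_cast<Fn **>(thisPtr);
      vftable[2](thisPtr); // load the callee from the vftable and call it
    }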
 
 void MicrosoftCXXABI::emitVirtualInheritanceTables(const CXXRecordDecl *RD) {
-  const VBTableVector &VBTables = EnumerateVBTables(RD);
-  llvm::GlobalVariable::LinkageTypes Linkage = CGM.getVTableLinkage(RD);
-
-  for (VBTableVector::const_iterator I = VBTables.begin(), E = VBTables.end();
-       I != E; ++I) {
-    I->EmitVBTableDefinition(CGM, RD, Linkage);
+  const VBTableGlobals &VBGlobals = enumerateVBTables(RD);
+  for (unsigned I = 0, E = VBGlobals.VBTables->size(); I != E; ++I) {
+    const VPtrInfo *VBT = (*VBGlobals.VBTables)[I];
+    llvm::GlobalVariable *GV = VBGlobals.Globals[I];
+    emitVBTableDefinition(*VBT, RD, GV);
   }
 }
 
+llvm::GlobalVariable *
+MicrosoftCXXABI::getAddrOfVBTable(const VPtrInfo &VBT, const CXXRecordDecl *RD,
+                                  llvm::GlobalVariable::LinkageTypes Linkage) {
+  SmallString<256> OutName;
+  llvm::raw_svector_ostream Out(OutName);
+  MicrosoftMangleContext &Mangler =
+      cast<MicrosoftMangleContext>(CGM.getCXXABI().getMangleContext());
+  Mangler.mangleCXXVBTable(RD, VBT.MangledPath, Out);
+  Out.flush();
+  StringRef Name = OutName.str();
+
+  llvm::ArrayType *VBTableType =
+      llvm::ArrayType::get(CGM.IntTy, 1 + VBT.ReusingBase->getNumVBases());
+
+  assert(!CGM.getModule().getNamedGlobal(Name) &&
+         "vbtable with this name already exists: mangling bug?");
+  llvm::GlobalVariable *GV =
+      CGM.CreateOrReplaceCXXRuntimeVariable(Name, VBTableType, Linkage);
+  GV->setUnnamedAddr(true);
+  return GV;
+}
+
+void MicrosoftCXXABI::emitVBTableDefinition(const VPtrInfo &VBT,
+                                            const CXXRecordDecl *RD,
+                                            llvm::GlobalVariable *GV) const {
+  const CXXRecordDecl *ReusingBase = VBT.ReusingBase;
+
+  assert(RD->getNumVBases() && ReusingBase->getNumVBases() &&
+         "should only emit vbtables for classes with vbtables");
+
+  const ASTRecordLayout &BaseLayout =
+      CGM.getContext().getASTRecordLayout(VBT.BaseWithVPtr);
+  const ASTRecordLayout &DerivedLayout =
+    CGM.getContext().getASTRecordLayout(RD);
+
+  SmallVector<llvm::Constant *, 4> Offsets(1 + ReusingBase->getNumVBases(), 0);
+
+  // The offset from ReusingBase's vbptr to itself always leads.
+  CharUnits VBPtrOffset = BaseLayout.getVBPtrOffset();
+  Offsets[0] = llvm::ConstantInt::get(CGM.IntTy, -VBPtrOffset.getQuantity());
+
+  MicrosoftVTableContext &Context = CGM.getMicrosoftVTableContext();
+  for (const auto &I : ReusingBase->vbases()) {
+    const CXXRecordDecl *VBase = I.getType()->getAsCXXRecordDecl();
+    CharUnits Offset = DerivedLayout.getVBaseClassOffset(VBase);
+    assert(!Offset.isNegative());
+
+    // Make it relative to the subobject vbptr.
+    CharUnits CompleteVBPtrOffset = VBT.NonVirtualOffset + VBPtrOffset;
+    if (VBT.getVBaseWithVPtr())
+      CompleteVBPtrOffset +=
+          DerivedLayout.getVBaseClassOffset(VBT.getVBaseWithVPtr());
+    Offset -= CompleteVBPtrOffset;
+
+    unsigned VBIndex = Context.getVBTableIndex(ReusingBase, VBase);
+    assert(Offsets[VBIndex] == 0 && "The same vbindex seen twice?");
+    Offsets[VBIndex] = llvm::ConstantInt::get(CGM.IntTy, Offset.getQuantity());
+  }
+
+  assert(Offsets.size() ==
+         cast<llvm::ArrayType>(cast<llvm::PointerType>(GV->getType())
+                               ->getElementType())->getNumElements());
+  llvm::ArrayType *VBTableType =
+    llvm::ArrayType::get(CGM.IntTy, Offsets.size());
+  llvm::Constant *Init = llvm::ConstantArray::get(VBTableType, Offsets);
+  GV->setInitializer(Init);
+
+  // Set the right visibility.
+  CGM.setGlobalVisibility(GV, RD);
+}
+
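
For a concrete (hypothetical) hierarchy, the initializer built above looks
like this:

    // Sketch only; offsets assume B's vbptr at offset 0 and A placed at
    // offset 8 within the complete object.
    struct A { int a; };
    struct B : virtual A { int b; };
    // vbtable for B: { 0 - vbptr_offset, offset from vbptr to A } == { 0, 8 }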
 llvm::Value *MicrosoftCXXABI::performThisAdjustment(CodeGenFunction &CGF,
                                                     llvm::Value *This,
                                                     const ThisAdjustment &TA) {
@@ -1168,7 +1279,7 @@
   if (D.isExternallyVisible()) {
     // Externally visible variables have to be numbered in Sema to properly
     // handle unreachable VarDecls.
-    BitIndex = getContext().getManglingNumber(&D);
+    BitIndex = getContext().getStaticLocalNumber(&D);
     assert(BitIndex > 0);
     BitIndex--;
   } else {
@@ -1229,38 +1340,6 @@
   CGF.EmitBlock(EndBlock);
 }
 
-// Member pointer helpers.
-static bool hasVBPtrOffsetField(MSInheritanceModel Inheritance) {
-  return Inheritance == MSIM_Unspecified;
-}
-
-static bool hasOnlyOneField(bool IsMemberFunction,
-                            MSInheritanceModel Inheritance) {
-  return Inheritance <= MSIM_SinglePolymorphic ||
-      (!IsMemberFunction && Inheritance <= MSIM_MultiplePolymorphic);
-}
-
-// Only member pointers to functions need a this adjustment, since it can be
-// combined with the field offset for data pointers.
-static bool hasNonVirtualBaseAdjustmentField(bool IsMemberFunction,
-                                             MSInheritanceModel Inheritance) {
-  return (IsMemberFunction && Inheritance >= MSIM_Multiple);
-}
-
-static bool hasVirtualBaseAdjustmentField(MSInheritanceModel Inheritance) {
-  return Inheritance >= MSIM_Virtual;
-}
-
-// Use zero for the field offset of a null data member pointer if we can
-// guarantee that zero is not a valid field offset, or if the member pointer has
-// multiple fields.  Polymorphic classes have a vfptr at offset zero, so we can
-// use zero for null.  If there are multiple fields, we can use zero even if it
-// is a valid field offset because null-ness testing will check the other
-// fields.
-static bool nullFieldOffsetIsZero(MSInheritanceModel Inheritance) {
-  return Inheritance != MSIM_Multiple && Inheritance != MSIM_Single;
-}
-
 bool MicrosoftCXXABI::isZeroInitializable(const MemberPointerType *MPT) {
   // Null-ness for function memptrs only depends on the first field, which is
   // the function pointer.  The rest don't matter, so we can zero initialize.
@@ -1270,28 +1349,28 @@
   // The virtual base adjustment field is always -1 for null, so if we have one
   // we can't zero initialize.  The field offset is sometimes also -1 if 0 is a
   // valid field offset.
-  const CXXRecordDecl *RD = MPT->getClass()->getAsCXXRecordDecl();
-  MSInheritanceModel Inheritance = RD->getMSInheritanceModel();
-  return (!hasVirtualBaseAdjustmentField(Inheritance) &&
-          nullFieldOffsetIsZero(Inheritance));
+  const CXXRecordDecl *RD = MPT->getMostRecentCXXRecordDecl();
+  MSInheritanceAttr::Spelling Inheritance = RD->getMSInheritanceModel();
+  return (!MSInheritanceAttr::hasVBTableOffsetField(Inheritance) &&
+          RD->nullFieldOffsetIsZero());
 }
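
For reference, the fields a Microsoft member pointer can carry, in the order
the code in this file checks them (a summary sketch; which fields exist
depends on the inheritance model and on whether the pointee is a function):

    // Illustrative aggregate; the real representation is built as LLVM types.
    struct MSMemberPointerRep {
      void *FunctionPointerOrFieldOffset; // always present (an int for data)
      int NonVirtualBaseAdjustment;       // function ptrs, multiple inheritance
      int VBPtrOffset;                    // unspecified inheritance model only
      int VirtualBaseAdjustmentOffset;    // models with virtual inheritance
    };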
 
 llvm::Type *
 MicrosoftCXXABI::ConvertMemberPointerType(const MemberPointerType *MPT) {
-  const CXXRecordDecl *RD = MPT->getClass()->getAsCXXRecordDecl();
-  MSInheritanceModel Inheritance = RD->getMSInheritanceModel();
+  const CXXRecordDecl *RD = MPT->getMostRecentCXXRecordDecl();
+  MSInheritanceAttr::Spelling Inheritance = RD->getMSInheritanceModel();
   llvm::SmallVector<llvm::Type *, 4> fields;
   if (MPT->isMemberFunctionPointer())
     fields.push_back(CGM.VoidPtrTy);  // FunctionPointerOrVirtualThunk
   else
     fields.push_back(CGM.IntTy);  // FieldOffset
 
-  if (hasNonVirtualBaseAdjustmentField(MPT->isMemberFunctionPointer(),
-                                       Inheritance))
+  if (MSInheritanceAttr::hasNVOffsetField(MPT->isMemberFunctionPointer(),
+                                          Inheritance))
     fields.push_back(CGM.IntTy);
-  if (hasVBPtrOffsetField(Inheritance))
+  if (MSInheritanceAttr::hasVBPtrOffsetField(Inheritance))
     fields.push_back(CGM.IntTy);
-  if (hasVirtualBaseAdjustmentField(Inheritance))
+  if (MSInheritanceAttr::hasVBTableOffsetField(Inheritance))
     fields.push_back(CGM.IntTy);  // VirtualBaseAdjustmentOffset
 
   if (fields.size() == 1)
@@ -1303,24 +1382,24 @@
 GetNullMemberPointerFields(const MemberPointerType *MPT,
                            llvm::SmallVectorImpl<llvm::Constant *> &fields) {
   assert(fields.empty());
-  const CXXRecordDecl *RD = MPT->getClass()->getAsCXXRecordDecl();
-  MSInheritanceModel Inheritance = RD->getMSInheritanceModel();
+  const CXXRecordDecl *RD = MPT->getMostRecentCXXRecordDecl();
+  MSInheritanceAttr::Spelling Inheritance = RD->getMSInheritanceModel();
   if (MPT->isMemberFunctionPointer()) {
     // FunctionPointerOrVirtualThunk
     fields.push_back(llvm::Constant::getNullValue(CGM.VoidPtrTy));
   } else {
-    if (nullFieldOffsetIsZero(Inheritance))
+    if (RD->nullFieldOffsetIsZero())
       fields.push_back(getZeroInt());  // FieldOffset
     else
       fields.push_back(getAllOnesInt());  // FieldOffset
   }
 
-  if (hasNonVirtualBaseAdjustmentField(MPT->isMemberFunctionPointer(),
-                                       Inheritance))
+  if (MSInheritanceAttr::hasNVOffsetField(MPT->isMemberFunctionPointer(),
+                                          Inheritance))
     fields.push_back(getZeroInt());
-  if (hasVBPtrOffsetField(Inheritance))
+  if (MSInheritanceAttr::hasVBPtrOffsetField(Inheritance))
     fields.push_back(getZeroInt());
-  if (hasVirtualBaseAdjustmentField(Inheritance))
+  if (MSInheritanceAttr::hasVBTableOffsetField(Inheritance))
     fields.push_back(getAllOnesInt());
 }
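
So, for illustration: a null function member pointer is { nullptr, 0, 0, -1 }
(present fields only), while a null data member pointer starts with 0 when
zero cannot be a valid field offset for the class and -1 otherwise, followed
by 0 and -1 for the vbptr and vbtable fields when present.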
 
@@ -1341,29 +1420,29 @@
                                        const CXXRecordDecl *RD,
                                        CharUnits NonVirtualBaseAdjustment)
 {
-  MSInheritanceModel Inheritance = RD->getMSInheritanceModel();
+  MSInheritanceAttr::Spelling Inheritance = RD->getMSInheritanceModel();
 
   // Single inheritance class member pointer are represented as scalars instead
   // of aggregates.
-  if (hasOnlyOneField(IsMemberFunction, Inheritance))
+  if (MSInheritanceAttr::hasOnlyOneField(IsMemberFunction, Inheritance))
     return FirstField;
 
   llvm::SmallVector<llvm::Constant *, 4> fields;
   fields.push_back(FirstField);
 
-  if (hasNonVirtualBaseAdjustmentField(IsMemberFunction, Inheritance))
+  if (MSInheritanceAttr::hasNVOffsetField(IsMemberFunction, Inheritance))
     fields.push_back(llvm::ConstantInt::get(
       CGM.IntTy, NonVirtualBaseAdjustment.getQuantity()));
 
-  if (hasVBPtrOffsetField(Inheritance)) {
+  if (MSInheritanceAttr::hasVBPtrOffsetField(Inheritance)) {
     CharUnits Offs = CharUnits::Zero();
     if (RD->getNumVBases())
-      Offs = GetVBPtrOffsetFromBases(RD);
+      Offs = getContext().getASTRecordLayout(RD).getVBPtrOffset();
     fields.push_back(llvm::ConstantInt::get(CGM.IntTy, Offs.getQuantity()));
   }
 
   // The rest of the fields are adjusted by conversions to a more derived class.
-  if (hasVirtualBaseAdjustmentField(Inheritance))
+  if (MSInheritanceAttr::hasVBTableOffsetField(Inheritance))
     fields.push_back(getZeroInt());
 
   return llvm::ConstantStruct::getAnon(fields);
@@ -1372,7 +1451,7 @@
 llvm::Constant *
 MicrosoftCXXABI::EmitMemberDataPointer(const MemberPointerType *MPT,
                                        CharUnits offset) {
-  const CXXRecordDecl *RD = MPT->getClass()->getAsCXXRecordDecl();
+  const CXXRecordDecl *RD = MPT->getMostRecentCXXRecordDecl();
   llvm::Constant *FirstField =
     llvm::ConstantInt::get(CGM.IntTy, offset.getQuantity());
   return EmitFullMemberPointer(FirstField, /*IsMemberFunction=*/false, RD,
@@ -1395,8 +1474,8 @@
   // FIXME PR15713: Support virtual inheritance paths.
 
   if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(MPD))
-    return BuildMemberPointer(MPT->getClass()->getAsCXXRecordDecl(),
-                              MD, ThisAdjustment);
+    return BuildMemberPointer(MPT->getMostRecentCXXRecordDecl(), MD,
+                              ThisAdjustment);
 
   CharUnits FieldOffset =
     getContext().toCharUnitsFromBits(getContext().getFieldOffset(MPD));
@@ -1409,6 +1488,7 @@
                                     CharUnits NonVirtualBaseAdjustment) {
   assert(MD->isInstance() && "Member function must not be static!");
   MD = MD->getCanonicalDecl();
+  RD = RD->getMostRecentDecl();
   CodeGenTypes &Types = CGM.getTypes();
 
   llvm::Constant *FirstField;
@@ -1442,16 +1522,10 @@
                                "member function in virtual base class");
       FirstField = llvm::Constant::getNullValue(CGM.VoidPtrTy);
     } else {
-      SmallString<256> ThunkName;
-      CharUnits PointerWidth = getContext().toCharUnitsFromBits(
-          getContext().getTargetInfo().getPointerWidth(0));
-      uint64_t OffsetInVFTable = ML.Index * PointerWidth.getQuantity();
-      llvm::raw_svector_ostream Out(ThunkName);
-      getMangleContext().mangleVirtualMemPtrThunk(MD, OffsetInVFTable, Out);
-      Out.flush();
-
-      llvm::Function *Thunk = EmitVirtualMemPtrThunk(MD, ThunkName.str());
+      llvm::Function *Thunk = EmitVirtualMemPtrThunk(MD, ML);
       FirstField = llvm::ConstantExpr::getBitCast(Thunk, CGM.VoidPtrTy);
+      // Include the vfptr adjustment if the method is in a non-primary vftable.
+      NonVirtualBaseAdjustment += ML.VFPtrOffset;
     }
   }
 
@@ -1486,9 +1560,10 @@
 
   // If this is a single field member pointer (single inheritance), this is a
   // single icmp.
-  const CXXRecordDecl *RD = MPT->getClass()->getAsCXXRecordDecl();
-  MSInheritanceModel Inheritance = RD->getMSInheritanceModel();
-  if (hasOnlyOneField(MPT->isMemberFunctionPointer(), Inheritance))
+  const CXXRecordDecl *RD = MPT->getMostRecentCXXRecordDecl();
+  MSInheritanceAttr::Spelling Inheritance = RD->getMSInheritanceModel();
+  if (MSInheritanceAttr::hasOnlyOneField(MPT->isMemberFunctionPointer(),
+                                         Inheritance))
     return Builder.CreateICmp(Eq, L, R);
 
   // Compare the first field.
@@ -1607,11 +1682,9 @@
 
 // Returns an adjusted base cast to i8*, since we do more address arithmetic on
 // it.
-llvm::Value *
-MicrosoftCXXABI::AdjustVirtualBase(CodeGenFunction &CGF,
-                                   const CXXRecordDecl *RD, llvm::Value *Base,
-                                   llvm::Value *VBTableOffset,
-                                   llvm::Value *VBPtrOffset) {
+llvm::Value *MicrosoftCXXABI::AdjustVirtualBase(
+    CodeGenFunction &CGF, const Expr *E, const CXXRecordDecl *RD,
+    llvm::Value *Base, llvm::Value *VBTableOffset, llvm::Value *VBPtrOffset) {
   CGBuilderTy &Builder = CGF.Builder;
   Base = Builder.CreateBitCast(Base, CGM.Int8PtrTy);
   llvm::BasicBlock *OriginalBB = 0;
@@ -1637,9 +1710,15 @@
   // know the vbptr offset.
   if (!VBPtrOffset) {
     CharUnits offs = CharUnits::Zero();
-    if (RD->getNumVBases()) {
-      offs = GetVBPtrOffsetFromBases(RD);
-    }
+    if (!RD->hasDefinition()) {
+      DiagnosticsEngine &Diags = CGF.CGM.getDiags();
+      unsigned DiagID = Diags.getCustomDiagID(
+          DiagnosticsEngine::Error,
+          "member pointer representation requires a "
+          "complete class type for %0 to perform this expression");
+      Diags.Report(E->getExprLoc(), DiagID) << RD << E->getSourceRange();
+    } else if (RD->getNumVBases())
+      offs = getContext().getASTRecordLayout(RD).getVBPtrOffset();
     VBPtrOffset = llvm::ConstantInt::get(CGM.IntTy, offs.getQuantity());
   }
   llvm::Value *VBPtr = 0;
@@ -1659,18 +1738,16 @@
   return AdjustedBase;
 }
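
The address arithmetic above, written out as a plain C++ sketch (assumed MSVC
layout; the parameters mirror the IR values):

    // Illustrative only; vbtableOffset is a byte offset into the vbtable.
    char *adjustVirtualBase(char *base, int vbptrOffset, int vbtableOffset) {
      char *vbptr = base + vbptrOffset; // address of the vbptr
      int *vbtable = *reinterpret_cast<int **>(vbptr);
      int vbaseOffs = *reinterpret_cast<int *>(
          reinterpret_cast<char *>(vbtable) + vbtableOffset);
      return vbptr + vbaseOffs;         // the entry is relative to the vbptr
    }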
 
-llvm::Value *
-MicrosoftCXXABI::EmitMemberDataPointerAddress(CodeGenFunction &CGF,
-                                              llvm::Value *Base,
-                                              llvm::Value *MemPtr,
-                                              const MemberPointerType *MPT) {
+llvm::Value *MicrosoftCXXABI::EmitMemberDataPointerAddress(
+    CodeGenFunction &CGF, const Expr *E, llvm::Value *Base, llvm::Value *MemPtr,
+    const MemberPointerType *MPT) {
   assert(MPT->isMemberDataPointer());
   unsigned AS = Base->getType()->getPointerAddressSpace();
   llvm::Type *PType =
       CGF.ConvertTypeForMem(MPT->getPointeeType())->getPointerTo(AS);
   CGBuilderTy &Builder = CGF.Builder;
-  const CXXRecordDecl *RD = MPT->getClass()->getAsCXXRecordDecl();
-  MSInheritanceModel Inheritance = RD->getMSInheritanceModel();
+  const CXXRecordDecl *RD = MPT->getMostRecentCXXRecordDecl();
+  MSInheritanceAttr::Spelling Inheritance = RD->getMSInheritanceModel();
 
   // Extract the fields we need, regardless of model.  We'll apply them if we
   // have them.
@@ -1681,16 +1758,21 @@
     // We need to extract values.
     unsigned I = 0;
     FieldOffset = Builder.CreateExtractValue(MemPtr, I++);
-    if (hasVBPtrOffsetField(Inheritance))
+    if (MSInheritanceAttr::hasVBPtrOffsetField(Inheritance))
       VBPtrOffset = Builder.CreateExtractValue(MemPtr, I++);
-    if (hasVirtualBaseAdjustmentField(Inheritance))
+    if (MSInheritanceAttr::hasVBTableOffsetField(Inheritance))
       VirtualBaseAdjustmentOffset = Builder.CreateExtractValue(MemPtr, I++);
   }
 
   if (VirtualBaseAdjustmentOffset) {
-    Base = AdjustVirtualBase(CGF, RD, Base, VirtualBaseAdjustmentOffset,
+    Base = AdjustVirtualBase(CGF, E, RD, Base, VirtualBaseAdjustmentOffset,
                              VBPtrOffset);
   }
+
+  // Cast to char*.
+  Base = Builder.CreateBitCast(Base, Builder.getInt8Ty()->getPointerTo(AS));
+
+  // Apply the offset; the base pointer is assumed to be non-null here.
   llvm::Value *Addr =
     Builder.CreateInBoundsGEP(Base, FieldOffset, "memptr.offset");
 
@@ -1699,9 +1781,9 @@
   return Builder.CreateBitCast(Addr, PType);
 }
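
Net effect for a data member dereference, as a sketch (any virtual-base
adjustment has already been applied by AdjustVirtualBase above):

    // Illustrative lowering of 'base .* memptr' for a data member.
    template <typename T>
    T *memberDataAddress(char *base, int fieldOffset) {
      return reinterpret_cast<T *>(base + fieldOffset);
    }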
 
-static MSInheritanceModel
+static MSInheritanceAttr::Spelling
 getInheritanceFromMemptr(const MemberPointerType *MPT) {
-  return MPT->getClass()->getAsCXXRecordDecl()->getMSInheritanceModel();
+  return MPT->getMostRecentCXXRecordDecl()->getMSInheritanceModel();
 }
 
 llvm::Value *
@@ -1721,15 +1803,17 @@
   const MemberPointerType *SrcTy =
     E->getSubExpr()->getType()->castAs<MemberPointerType>();
   const MemberPointerType *DstTy = E->getType()->castAs<MemberPointerType>();
-  MSInheritanceModel SrcInheritance = getInheritanceFromMemptr(SrcTy);
-  MSInheritanceModel DstInheritance = getInheritanceFromMemptr(DstTy);
   bool IsFunc = SrcTy->isMemberFunctionPointer();
 
   // If the classes use the same null representation, reinterpret_cast is a nop.
   bool IsReinterpret = E->getCastKind() == CK_ReinterpretMemberPointer;
-  if (IsReinterpret && (IsFunc ||
-                        nullFieldOffsetIsZero(SrcInheritance) ==
-                        nullFieldOffsetIsZero(DstInheritance)))
+  if (IsReinterpret && IsFunc)
+    return Src;
+
+  CXXRecordDecl *SrcRD = SrcTy->getMostRecentCXXRecordDecl();
+  CXXRecordDecl *DstRD = DstTy->getMostRecentCXXRecordDecl();
+  if (IsReinterpret &&
+      SrcRD->nullFieldOffsetIsZero() == DstRD->nullFieldOffsetIsZero())
     return Src;
 
   CGBuilderTy &Builder = CGF.Builder;
@@ -1758,15 +1842,16 @@
   llvm::Value *NonVirtualBaseAdjustment = 0;
   llvm::Value *VirtualBaseAdjustmentOffset = 0;
   llvm::Value *VBPtrOffset = 0;
-  if (!hasOnlyOneField(IsFunc, SrcInheritance)) {
+  MSInheritanceAttr::Spelling SrcInheritance = SrcRD->getMSInheritanceModel();
+  if (!MSInheritanceAttr::hasOnlyOneField(IsFunc, SrcInheritance)) {
     // We need to extract values.
     unsigned I = 0;
     FirstField = Builder.CreateExtractValue(Src, I++);
-    if (hasNonVirtualBaseAdjustmentField(IsFunc, SrcInheritance))
+    if (MSInheritanceAttr::hasNVOffsetField(IsFunc, SrcInheritance))
       NonVirtualBaseAdjustment = Builder.CreateExtractValue(Src, I++);
-    if (hasVBPtrOffsetField(SrcInheritance))
+    if (MSInheritanceAttr::hasVBPtrOffsetField(SrcInheritance))
       VBPtrOffset = Builder.CreateExtractValue(Src, I++);
-    if (hasVirtualBaseAdjustmentField(SrcInheritance))
+    if (MSInheritanceAttr::hasVBTableOffsetField(SrcInheritance))
       VirtualBaseAdjustmentOffset = Builder.CreateExtractValue(Src, I++);
   }
 
@@ -1788,20 +1873,21 @@
   // FIXME PR15713: Support conversions through virtually derived classes.
 
   // Recompose dst from the null struct and the adjusted fields from src.
+  MSInheritanceAttr::Spelling DstInheritance = DstRD->getMSInheritanceModel();
   llvm::Value *Dst;
-  if (hasOnlyOneField(IsFunc, DstInheritance)) {
+  if (MSInheritanceAttr::hasOnlyOneField(IsFunc, DstInheritance)) {
     Dst = FirstField;
   } else {
     Dst = llvm::UndefValue::get(DstNull->getType());
     unsigned Idx = 0;
     Dst = Builder.CreateInsertValue(Dst, FirstField, Idx++);
-    if (hasNonVirtualBaseAdjustmentField(IsFunc, DstInheritance))
+    if (MSInheritanceAttr::hasNVOffsetField(IsFunc, DstInheritance))
       Dst = Builder.CreateInsertValue(
         Dst, getValueOrZeroInt(NonVirtualBaseAdjustment), Idx++);
-    if (hasVBPtrOffsetField(DstInheritance))
+    if (MSInheritanceAttr::hasVBPtrOffsetField(DstInheritance))
       Dst = Builder.CreateInsertValue(
         Dst, getValueOrZeroInt(VBPtrOffset), Idx++);
-    if (hasVirtualBaseAdjustmentField(DstInheritance))
+    if (MSInheritanceAttr::hasVBTableOffsetField(DstInheritance))
       Dst = Builder.CreateInsertValue(
         Dst, getValueOrZeroInt(VirtualBaseAdjustmentOffset), Idx++);
   }
@@ -1833,8 +1919,8 @@
   if (E->getCastKind() == CK_ReinterpretMemberPointer)
     return Src;
 
-  MSInheritanceModel SrcInheritance = getInheritanceFromMemptr(SrcTy);
-  MSInheritanceModel DstInheritance = getInheritanceFromMemptr(DstTy);
+  MSInheritanceAttr::Spelling SrcInheritance = getInheritanceFromMemptr(SrcTy);
+  MSInheritanceAttr::Spelling DstInheritance = getInheritanceFromMemptr(DstTy);
 
   // Decompose src.
   llvm::Constant *FirstField = Src;
@@ -1842,15 +1928,15 @@
   llvm::Constant *VirtualBaseAdjustmentOffset = 0;
   llvm::Constant *VBPtrOffset = 0;
   bool IsFunc = SrcTy->isMemberFunctionPointer();
-  if (!hasOnlyOneField(IsFunc, SrcInheritance)) {
+  if (!MSInheritanceAttr::hasOnlyOneField(IsFunc, SrcInheritance)) {
     // We need to extract values.
     unsigned I = 0;
     FirstField = Src->getAggregateElement(I++);
-    if (hasNonVirtualBaseAdjustmentField(IsFunc, SrcInheritance))
+    if (MSInheritanceAttr::hasNVOffsetField(IsFunc, SrcInheritance))
       NonVirtualBaseAdjustment = Src->getAggregateElement(I++);
-    if (hasVBPtrOffsetField(SrcInheritance))
+    if (MSInheritanceAttr::hasVBPtrOffsetField(SrcInheritance))
       VBPtrOffset = Src->getAggregateElement(I++);
-    if (hasVirtualBaseAdjustmentField(SrcInheritance))
+    if (MSInheritanceAttr::hasVBTableOffsetField(SrcInheritance))
       VirtualBaseAdjustmentOffset = Src->getAggregateElement(I++);
   }
 
@@ -1873,35 +1959,33 @@
   // FIXME PR15713: Support conversions through virtually derived classes.
 
   // Recompose dst from the null struct and the adjusted fields from src.
-  if (hasOnlyOneField(IsFunc, DstInheritance))
+  if (MSInheritanceAttr::hasOnlyOneField(IsFunc, DstInheritance))
     return FirstField;
 
   llvm::SmallVector<llvm::Constant *, 4> Fields;
   Fields.push_back(FirstField);
-  if (hasNonVirtualBaseAdjustmentField(IsFunc, DstInheritance))
+  if (MSInheritanceAttr::hasNVOffsetField(IsFunc, DstInheritance))
     Fields.push_back(getConstantOrZeroInt(NonVirtualBaseAdjustment));
-  if (hasVBPtrOffsetField(DstInheritance))
+  if (MSInheritanceAttr::hasVBPtrOffsetField(DstInheritance))
     Fields.push_back(getConstantOrZeroInt(VBPtrOffset));
-  if (hasVirtualBaseAdjustmentField(DstInheritance))
+  if (MSInheritanceAttr::hasVBTableOffsetField(DstInheritance))
     Fields.push_back(getConstantOrZeroInt(VirtualBaseAdjustmentOffset));
   return llvm::ConstantStruct::getAnon(Fields);
 }
 
-llvm::Value *
-MicrosoftCXXABI::EmitLoadOfMemberFunctionPointer(CodeGenFunction &CGF,
-                                                 llvm::Value *&This,
-                                                 llvm::Value *MemPtr,
-                                                 const MemberPointerType *MPT) {
+llvm::Value *MicrosoftCXXABI::EmitLoadOfMemberFunctionPointer(
+    CodeGenFunction &CGF, const Expr *E, llvm::Value *&This,
+    llvm::Value *MemPtr, const MemberPointerType *MPT) {
   assert(MPT->isMemberFunctionPointer());
   const FunctionProtoType *FPT =
     MPT->getPointeeType()->castAs<FunctionProtoType>();
-  const CXXRecordDecl *RD = MPT->getClass()->getAsCXXRecordDecl();
+  const CXXRecordDecl *RD = MPT->getMostRecentCXXRecordDecl();
   llvm::FunctionType *FTy =
     CGM.getTypes().GetFunctionType(
       CGM.getTypes().arrangeCXXMethodType(RD, FPT));
   CGBuilderTy &Builder = CGF.Builder;
 
-  MSInheritanceModel Inheritance = RD->getMSInheritanceModel();
+  MSInheritanceAttr::Spelling Inheritance = RD->getMSInheritanceModel();
 
   // Extract the fields we need, regardless of model.  We'll apply them if we
   // have them.
@@ -1913,16 +1997,16 @@
     // We need to extract values.
     unsigned I = 0;
     FunctionPointer = Builder.CreateExtractValue(MemPtr, I++);
-    if (hasNonVirtualBaseAdjustmentField(MPT, Inheritance))
+    if (MSInheritanceAttr::hasNVOffsetField(MPT, Inheritance))
       NonVirtualBaseAdjustment = Builder.CreateExtractValue(MemPtr, I++);
-    if (hasVBPtrOffsetField(Inheritance))
+    if (MSInheritanceAttr::hasVBPtrOffsetField(Inheritance))
       VBPtrOffset = Builder.CreateExtractValue(MemPtr, I++);
-    if (hasVirtualBaseAdjustmentField(Inheritance))
+    if (MSInheritanceAttr::hasVBTableOffsetField(Inheritance))
       VirtualBaseAdjustmentOffset = Builder.CreateExtractValue(MemPtr, I++);
   }
 
   if (VirtualBaseAdjustmentOffset) {
-    This = AdjustVirtualBase(CGF, RD, This, VirtualBaseAdjustmentOffset,
+    This = AdjustVirtualBase(CGF, E, RD, This, VirtualBaseAdjustmentOffset,
                              VBPtrOffset);
   }
 
diff --git a/lib/CodeGen/MicrosoftVBTables.cpp b/lib/CodeGen/MicrosoftVBTables.cpp
deleted file mode 100644
index dabf52c..0000000
--- a/lib/CodeGen/MicrosoftVBTables.cpp
+++ /dev/null
@@ -1,233 +0,0 @@
-//===--- MicrosoftVBTables.cpp - Virtual Base Table Emission --------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class generates data about MSVC virtual base tables.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MicrosoftVBTables.h"
-#include "CodeGenModule.h"
-#include "CGCXXABI.h"
-
-namespace clang {
-namespace CodeGen {
-
-/// Holds intermediate data about a path to a vbptr inside a base subobject.
-struct VBTablePath {
-  VBTablePath(const VBTableInfo &VBInfo)
-    : VBInfo(VBInfo), NextBase(VBInfo.VBPtrSubobject.getBase()) { }
-
-  /// All the data needed to build a vbtable, minus the GlobalVariable whose
-  /// name we haven't computed yet.
-  VBTableInfo VBInfo;
-
-  /// Next base to use for disambiguation.  Can be null if we've already
-  /// disambiguated this path once.
-  const CXXRecordDecl *NextBase;
-
-  /// Path is not really a full path like a CXXBasePath.  It holds the subset of
-  /// records that need to be mangled into the vbtable symbol name in order to get
-  /// a unique name.
-  llvm::SmallVector<const CXXRecordDecl *, 1> Path;
-};
-
-VBTableBuilder::VBTableBuilder(CodeGenModule &CGM,
-                               const CXXRecordDecl *MostDerived)
-    : CGM(CGM), MostDerived(MostDerived),
-      DerivedLayout(CGM.getContext().getASTRecordLayout(MostDerived)) {}
-
-void VBTableBuilder::enumerateVBTables(VBTableVector &VBTables) {
-  VBTablePathVector Paths;
-  findUnambiguousPaths(MostDerived, BaseSubobject(MostDerived,
-                                                  CharUnits::Zero()), Paths);
-  for (VBTablePathVector::iterator I = Paths.begin(), E = Paths.end();
-       I != E; ++I) {
-    VBTablePath *P = *I;
-    P->VBInfo.GV = getAddrOfVBTable(P->VBInfo.ReusingBase, P->Path);
-    VBTables.push_back(P->VBInfo);
-  }
-}
-
-
-void VBTableBuilder::findUnambiguousPaths(const CXXRecordDecl *ReusingBase,
-                                          BaseSubobject CurSubobject,
-                                          VBTablePathVector &Paths) {
-  size_t PathsStart = Paths.size();
-  bool ReuseVBPtrFromBase = true;
-  const CXXRecordDecl *CurBase = CurSubobject.getBase();
-  const ASTRecordLayout &Layout = CGM.getContext().getASTRecordLayout(CurBase);
-
-  // If this base has a vbptr, then we've found a path.  These are not full
-  // paths, so we don't use CXXBasePath.
-  if (Layout.hasOwnVBPtr()) {
-    ReuseVBPtrFromBase = false;
-    VBTablePath *Info = new VBTablePath(
-      VBTableInfo(ReusingBase, CurSubobject, /*GV=*/0));
-    Paths.push_back(Info);
-  }
-
-  // Recurse onto any bases which themselves have virtual bases.
-  for (CXXRecordDecl::base_class_const_iterator I = CurBase->bases_begin(),
-       E = CurBase->bases_end(); I != E; ++I) {
-    const CXXRecordDecl *Base = I->getType()->getAsCXXRecordDecl();
-    if (!Base->getNumVBases())
-      continue;  // Bases without virtual bases have no vbptrs.
-    CharUnits NextOffset;
-    const CXXRecordDecl *NextReusingBase = Base;
-    if (I->isVirtual()) {
-      if (!VBasesSeen.insert(Base))
-        continue;  // Don't visit virtual bases twice.
-      NextOffset = DerivedLayout.getVBaseClassOffset(Base);
-    } else {
-      NextOffset = (CurSubobject.getBaseOffset() +
-                    Layout.getBaseClassOffset(Base));
-
-      // If CurBase didn't have a vbptr, then ReusingBase will reuse the vbptr
-      // from the first non-virtual base with vbases for its vbptr.
-      if (ReuseVBPtrFromBase) {
-        NextReusingBase = ReusingBase;
-        ReuseVBPtrFromBase = false;
-      }
-    }
-
-    size_t NumPaths = Paths.size();
-    findUnambiguousPaths(NextReusingBase, BaseSubobject(Base, NextOffset),
-                         Paths);
-
-    // Tag paths through this base with the base itself.  We might use it to
-    // disambiguate.
-    for (size_t I = NumPaths, E = Paths.size(); I != E; ++I)
-      Paths[I]->NextBase = Base;
-  }
-
-  bool AmbiguousPaths = rebucketPaths(Paths, PathsStart);
-  if (AmbiguousPaths)
-    rebucketPaths(Paths, PathsStart, /*SecondPass=*/true);
-
-#ifndef NDEBUG
-  // Check that the paths are in fact unique.
-  for (size_t I = PathsStart + 1, E = Paths.size(); I != E; ++I) {
-    assert(Paths[I]->Path != Paths[I - 1]->Path && "vbtable paths are not unique");
-  }
-#endif
-}
-
-static bool pathCompare(VBTablePath *LHS, VBTablePath *RHS) {
-  return LHS->Path < RHS->Path;
-}
-
-void VBTableBuilder::extendPath(VBTablePath *P, bool SecondPass) {
-  assert(P->NextBase || SecondPass);
-  if (P->NextBase) {
-    P->Path.push_back(P->NextBase);
-    P->NextBase = 0;  // Prevent the path from being extended twice.
-  }
-}
-
-bool VBTableBuilder::rebucketPaths(VBTablePathVector &Paths, size_t PathsStart,
-                                   bool SecondPass) {
-  // What we're essentially doing here is bucketing together ambiguous paths.
-  // Any bucket with more than one path in it gets extended by NextBase, which
-  // is usually the direct base from which the vbptr was inherited.  This code
-  // uses a sorted vector to implement a multiset to form the buckets.  Note
-  // that the ordering is based on pointers, but it doesn't change our output
-  // order.  The current algorithm is designed to match MSVC 2012's names.
-  // TODO: Implement MSVC 2010 or earlier names to avoid extra vbtable cruft.
-  VBTablePathVector PathsSorted(&Paths[PathsStart], &Paths.back() + 1);
-  std::sort(PathsSorted.begin(), PathsSorted.end(), pathCompare);
-  bool AmbiguousPaths = false;
-  for (size_t I = 0, E = PathsSorted.size(); I != E;) {
-    // Scan forward to find the end of the bucket.
-    size_t BucketStart = I;
-    do {
-      ++I;
-    } while (I != E && PathsSorted[BucketStart]->Path == PathsSorted[I]->Path);
-
-    // If this bucket has multiple paths, extend them all.
-    if (I - BucketStart > 1) {
-      AmbiguousPaths = true;
-      for (size_t II = BucketStart; II != I; ++II)
-        extendPath(PathsSorted[II], SecondPass);
-    }
-  }
-  return AmbiguousPaths;
-}
-
-llvm::GlobalVariable *
-VBTableBuilder::getAddrOfVBTable(const CXXRecordDecl *ReusingBase,
-                                 ArrayRef<const CXXRecordDecl *> BasePath) {
-  // Caching at this layer is redundant with the caching in enumerateVBTables().
-
-  SmallString<256> OutName;
-  llvm::raw_svector_ostream Out(OutName);
-  MicrosoftMangleContext &Mangler =
-      cast<MicrosoftMangleContext>(CGM.getCXXABI().getMangleContext());
-  Mangler.mangleCXXVBTable(MostDerived, BasePath, Out);
-  Out.flush();
-  StringRef Name = OutName.str();
-
-  llvm::ArrayType *VBTableType =
-    llvm::ArrayType::get(CGM.IntTy, 1 + ReusingBase->getNumVBases());
-
-  assert(!CGM.getModule().getNamedGlobal(Name) &&
-         "vbtable with this name already exists: mangling bug?");
-  llvm::GlobalVariable *VBTable =
-    CGM.CreateOrReplaceCXXRuntimeVariable(Name, VBTableType,
-                                          llvm::GlobalValue::ExternalLinkage);
-  VBTable->setUnnamedAddr(true);
-  return VBTable;
-}
-
-void VBTableInfo::EmitVBTableDefinition(
-    CodeGenModule &CGM, const CXXRecordDecl *RD,
-    llvm::GlobalVariable::LinkageTypes Linkage) const {
-  assert(RD->getNumVBases() && ReusingBase->getNumVBases() &&
-         "should only emit vbtables for classes with vbtables");
-
-  const ASTRecordLayout &BaseLayout =
-    CGM.getContext().getASTRecordLayout(VBPtrSubobject.getBase());
-  const ASTRecordLayout &DerivedLayout =
-    CGM.getContext().getASTRecordLayout(RD);
-
-  SmallVector<llvm::Constant *, 4> Offsets(1 + ReusingBase->getNumVBases(), 0);
-
-  // The offset from ReusingBase's vbptr to itself always leads.
-  CharUnits VBPtrOffset = BaseLayout.getVBPtrOffset();
-  Offsets[0] = llvm::ConstantInt::get(CGM.IntTy, -VBPtrOffset.getQuantity());
-
-  MicrosoftVTableContext &Context = CGM.getMicrosoftVTableContext();
-  for (CXXRecordDecl::base_class_const_iterator I = ReusingBase->vbases_begin(),
-       E = ReusingBase->vbases_end(); I != E; ++I) {
-    const CXXRecordDecl *VBase = I->getType()->getAsCXXRecordDecl();
-    CharUnits Offset = DerivedLayout.getVBaseClassOffset(VBase);
-    assert(!Offset.isNegative());
-    // Make it relative to the subobject vbptr.
-    Offset -= VBPtrSubobject.getBaseOffset() + VBPtrOffset;
-    unsigned VBIndex = Context.getVBTableIndex(ReusingBase, VBase);
-    assert(Offsets[VBIndex] == 0 && "The same vbindex seen twice?");
-    Offsets[VBIndex] = llvm::ConstantInt::get(CGM.IntTy, Offset.getQuantity());
-  }
-
-  assert(Offsets.size() ==
-         cast<llvm::ArrayType>(cast<llvm::PointerType>(GV->getType())
-                               ->getElementType())->getNumElements());
-  llvm::ArrayType *VBTableType =
-    llvm::ArrayType::get(CGM.IntTy, Offsets.size());
-  llvm::Constant *Init = llvm::ConstantArray::get(VBTableType, Offsets);
-  GV->setInitializer(Init);
-
-  // Set the correct linkage.
-  GV->setLinkage(Linkage);
-
-  // Set the right visibility.
-  CGM.setTypeVisibility(GV, RD, CodeGenModule::TVK_ForVTable);
-}
-
-} // namespace CodeGen
-} // namespace clang
diff --git a/lib/CodeGen/MicrosoftVBTables.h b/lib/CodeGen/MicrosoftVBTables.h
deleted file mode 100644
index 4ad8e07..0000000
--- a/lib/CodeGen/MicrosoftVBTables.h
+++ /dev/null
@@ -1,129 +0,0 @@
-//===--- MicrosoftVBTables.h - Virtual Base Table Emission ----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class generates data about MSVC virtual base tables.
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/AST/BaseSubobject.h"
-#include "clang/Basic/LLVM.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/IR/GlobalVariable.h"
-#include <vector>
-
-namespace clang {
-
-class ASTRecordLayout;
-
-namespace CodeGen {
-
-class CodeGenModule;
-
-struct VBTableInfo {
-  VBTableInfo(const CXXRecordDecl *ReusingBase, BaseSubobject VBPtrSubobject,
-              llvm::GlobalVariable *GV)
-    : ReusingBase(ReusingBase), VBPtrSubobject(VBPtrSubobject), GV(GV) { }
-
-  /// The vbtable will hold all of the virtual bases of ReusingBase.  This may
-  /// or may not be the same class as VBPtrSubobject.Base.  A derived class will
-  /// reuse the vbptr of the first non-virtual base subobject that has one.
-  const CXXRecordDecl *ReusingBase;
-
-  /// The vbptr is stored inside this subobject.
-  BaseSubobject VBPtrSubobject;
-
-  /// The GlobalVariable for this vbtable.
-  llvm::GlobalVariable *GV;
-
-  /// \brief Emits a definition for GV by setting its initializer.
-  void EmitVBTableDefinition(CodeGenModule &CGM, const CXXRecordDecl *RD,
-                             llvm::GlobalVariable::LinkageTypes Linkage) const;
-};
-
-// These are embedded in a DenseMap and the elements are large, so we don't want
-// SmallVector.
-typedef std::vector<VBTableInfo> VBTableVector;
-
-struct VBTablePath;
-
-typedef llvm::SmallVector<VBTablePath *, 6> VBTablePathVector;
-
-/// Produces MSVC-compatible vbtable data.  The symbols produced by this builder
-/// match those produced by MSVC 2012, which is different from MSVC 2010.
-///
-/// Unlike Itanium, which uses only one vtable per class, MSVC uses a different
-/// symbol for every "address point" installed in base subobjects.  As a result,
-/// we have to compute unique symbols for every table.  Since there can be
-/// multiple non-virtual base subobjects of the same class, combining the most
-/// derived class with the base containing the vtable is insufficient.  The most
-/// trivial algorithm would be to mangle in the entire path from base to most
-/// derived, but that would be too easy and would create unnecessarily large
-/// symbols.  ;)
-///
-/// MSVC 2012 appears to minimize the vbtable names using the following
-/// algorithm.  First, walk the class hierarchy in the usual order, depth first,
-/// left to right, to find all of the subobjects which contain a vbptr field.
-/// Visiting each class node yields a list of inheritance paths to vbptrs.  Each
-/// record with a vbptr creates an initially empty path.
-///
-/// To combine paths from child nodes, the paths are compared to check for
-/// ambiguity.  Paths are "ambiguous" if multiple paths have the same set of
-/// components in the same order.  Each group of ambiguous paths is extended by
-/// appending the class of the base from which it came.  If the current class
-/// node produced an ambiguous path, its path is extended with the current class.
-/// After extending paths, MSVC again checks for ambiguity, and extends any
-/// ambiguous path which wasn't already extended.  Because each node yields an
-/// unambiguous set of paths, MSVC doesn't need to extend any path more than once
-/// to produce an unambiguous set of paths.
-///
-/// The VBTableBuilder class attempts to implement this algorithm by repeatedly
-/// bucketing paths together by sorting them.
-///
-/// TODO: Presumably vftables use the same algorithm.
-///
-/// TODO: Implement the MSVC 2010 name mangling scheme to avoid emitting
-/// duplicate vbtables with different symbols.
-class VBTableBuilder {
-public:
-  VBTableBuilder(CodeGenModule &CGM, const CXXRecordDecl *MostDerived);
-
-  void enumerateVBTables(VBTableVector &VBTables);
-
-private:
-  bool hasVBPtr(const CXXRecordDecl *RD);
-
-  llvm::GlobalVariable *getAddrOfVBTable(const CXXRecordDecl *ReusingBase,
-                                      ArrayRef<const CXXRecordDecl *> BasePath);
-
-  /// Enumerates paths to bases with vbptrs.  The path elements are compressed
-  /// to contain only the classes necessary to form an unambiguous path.
-  void findUnambiguousPaths(const CXXRecordDecl *ReusingBase,
-                            BaseSubobject CurSubobject,
-                            VBTablePathVector &Paths);
-
-  void extendPath(VBTablePath *Info, bool SecondPass);
-
-  bool rebucketPaths(VBTablePathVector &Paths, size_t PathsStart,
-                     bool SecondPass = false);
-
-  CodeGenModule &CGM;
-
-  const CXXRecordDecl *MostDerived;
-
-  /// Caches the layout of the most derived class.
-  const ASTRecordLayout &DerivedLayout;
-
-  /// Set of vbases to avoid re-visiting the same vbases.
-  llvm::SmallPtrSet<const CXXRecordDecl*, 4> VBasesSeen;
-};
-
-} // namespace CodeGen
-
-} // namespace clang
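The path-disambiguation scheme described in the deleted header is easiest to
see on a concrete hierarchy. A minimal sketch (the hierarchy and the mangled
names are illustrative, following the MSVC 2012 scheme the comment describes):

  struct A { virtual void f(); };
  struct B : virtual A {};  // B has its own vbptr
  struct C : virtual A {};  // C has its own vbptr
  struct D : B, C {};       // D reuses B's vbptr for its own virtual bases

  // Both vbptr records start with an empty path, so the two paths collide.
  // Each is then extended with the direct base it was reached through,
  // yielding two distinct vbtable symbols for D, along the lines of
  // ??_8D@@7BB@@@ and ??_8D@@7BC@@@.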
diff --git a/lib/CodeGen/ModuleBuilder.cpp b/lib/CodeGen/ModuleBuilder.cpp
index bc7acbc..7873f44 100644
--- a/lib/CodeGen/ModuleBuilder.cpp
+++ b/lib/CodeGen/ModuleBuilder.cpp
@@ -12,30 +12,31 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/CodeGen/ModuleBuilder.h"
-#include "CodeGenModule.h"
 #include "CGDebugInfo.h"
+#include "CodeGenModule.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/Expr.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Frontend/CodeGenOptions.h"
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include <memory>
 using namespace clang;
 
 namespace {
   class CodeGeneratorImpl : public CodeGenerator {
     DiagnosticsEngine &Diags;
-    OwningPtr<const llvm::DataLayout> TD;
+    std::unique_ptr<const llvm::DataLayout> TD;
     ASTContext *Ctx;
     const CodeGenOptions CodeGenOpts;  // Intentionally copied in.
   protected:
-    OwningPtr<llvm::Module> M;
-    OwningPtr<CodeGen::CodeGenModule> Builder;
+    std::unique_ptr<llvm::Module> M;
+    std::unique_ptr<CodeGen::CodeGenModule> Builder;
+
   public:
     CodeGeneratorImpl(DiagnosticsEngine &diags, const std::string& ModuleName,
                       const CodeGenOptions &CGO, llvm::LLVMContext& C)
@@ -44,15 +45,13 @@
 
     virtual ~CodeGeneratorImpl() {}
 
-    virtual llvm::Module* GetModule() {
+    llvm::Module* GetModule() override {
       return M.get();
     }
 
-    virtual llvm::Module* ReleaseModule() {
-      return M.take();
-    }
+    llvm::Module *ReleaseModule() override { return M.release(); }
 
-    virtual void Initialize(ASTContext &Context) {
+    void Initialize(ASTContext &Context) override {
       Ctx = &Context;
 
       M->setTargetTriple(Ctx->getTargetInfo().getTriple().getTriple());
@@ -65,14 +64,14 @@
         HandleDependentLibrary(CodeGenOpts.DependentLibraries[i]);
     }
 
-    virtual void HandleCXXStaticMemberVarInstantiation(VarDecl *VD) {
+    void HandleCXXStaticMemberVarInstantiation(VarDecl *VD) override {
       if (Diags.hasErrorOccurred())
         return;
 
       Builder->HandleCXXStaticMemberVarInstantiation(VD);
     }
 
-    virtual bool HandleTopLevelDecl(DeclGroupRef DG) {
+    bool HandleTopLevelDecl(DeclGroupRef DG) override {
       if (Diags.hasErrorOccurred())
         return true;
 
@@ -86,7 +85,7 @@
     /// to (e.g. struct, union, enum, class) is completed. This allows the
     /// client to hack on the type, which can occur at any point in the file
     /// (because these can be defined in declspecs).
-    virtual void HandleTagDeclDefinition(TagDecl *D) {
+    void HandleTagDeclDefinition(TagDecl *D) override {
       if (Diags.hasErrorOccurred())
         return;
 
@@ -95,10 +94,8 @@
       // In C++, we may have member functions that need to be emitted at this 
       // point.
       if (Ctx->getLangOpts().CPlusPlus && !D->isDependentContext()) {
-        for (DeclContext::decl_iterator M = D->decls_begin(), 
-                                     MEnd = D->decls_end();
-             M != MEnd; ++M)
-          if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(*M))
+        for (auto *M : D->decls())
+          if (auto *Method = dyn_cast<CXXMethodDecl>(M))
             if (Method->doesThisDeclarationHaveABody() &&
                 (Method->hasAttr<UsedAttr>() || 
                  Method->hasAttr<ConstructorAttr>()))
@@ -106,7 +103,7 @@
       }
     }
 
-    virtual void HandleTagDeclRequiredDefinition(const TagDecl *D) LLVM_OVERRIDE {
+    void HandleTagDeclRequiredDefinition(const TagDecl *D) override {
       if (Diags.hasErrorOccurred())
         return;
 
@@ -115,8 +112,10 @@
           DI->completeRequiredType(RD);
     }
 
-    virtual void HandleTranslationUnit(ASTContext &Ctx) {
+    void HandleTranslationUnit(ASTContext &Ctx) override {
       if (Diags.hasErrorOccurred()) {
+        if (Builder)
+          Builder->clear();
         M.reset();
         return;
       }
@@ -125,30 +124,30 @@
         Builder->Release();
     }
 
-    virtual void CompleteTentativeDefinition(VarDecl *D) {
+    void CompleteTentativeDefinition(VarDecl *D) override {
       if (Diags.hasErrorOccurred())
         return;
 
       Builder->EmitTentativeDefinition(D);
     }
 
-    virtual void HandleVTable(CXXRecordDecl *RD, bool DefinitionRequired) {
+    void HandleVTable(CXXRecordDecl *RD, bool DefinitionRequired) override {
       if (Diags.hasErrorOccurred())
         return;
 
       Builder->EmitVTable(RD, DefinitionRequired);
     }
 
-    virtual void HandleLinkerOptionPragma(llvm::StringRef Opts) {
+    void HandleLinkerOptionPragma(llvm::StringRef Opts) override {
       Builder->AppendLinkerOptions(Opts);
     }
 
-    virtual void HandleDetectMismatch(llvm::StringRef Name,
-                                      llvm::StringRef Value) {
+    void HandleDetectMismatch(llvm::StringRef Name,
+                              llvm::StringRef Value) override {
       Builder->AddDetectMismatch(Name, Value);
     }
 
-    virtual void HandleDependentLibrary(llvm::StringRef Lib) {
+    void HandleDependentLibrary(llvm::StringRef Lib) override {
       Builder->AddDependentLib(Lib);
     }
   };
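For reference, the OwningPtr-to-std::unique_ptr conversion above relies on the
C++11 ownership-transfer idiom; a minimal sketch (variable names hypothetical):

  std::unique_ptr<llvm::Module> M;  // previously OwningPtr<llvm::Module>
  llvm::Module *Raw = M.release();  // previously M.take(): gives up ownership
  M.reset();                        // unchanged: deletes the owned module, if any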
diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp
index 7446e67..c602b1f 100644
--- a/lib/CodeGen/TargetInfo.cpp
+++ b/lib/CodeGen/TargetInfo.cpp
@@ -114,6 +114,9 @@
   case Ignore:
     OS << "Ignore";
     break;
+  case InAlloca:
+    OS << "InAlloca Offset=" << getInAllocaFieldIndex();
+    break;
   case Indirect:
     OS << "Indirect Align=" << getIndirectAlign()
        << " ByVal=" << getIndirectByVal()
@@ -206,14 +209,12 @@
 
   // If this is a C++ record, check the bases first.
   if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD))
-    for (CXXRecordDecl::base_class_const_iterator i = CXXRD->bases_begin(),
-           e = CXXRD->bases_end(); i != e; ++i)
-      if (!isEmptyRecord(Context, i->getType(), true))
+    for (const auto &I : CXXRD->bases())
+      if (!isEmptyRecord(Context, I.getType(), true))
         return false;
 
-  for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
-         i != e; ++i)
-    if (!isEmptyField(Context, *i, AllowArrays))
+  for (const auto *I : RD->fields())
+    if (!isEmptyField(Context, I, AllowArrays))
       return false;
   return true;
 }
@@ -239,10 +240,9 @@
 
   // If this is a C++ record, check the bases first.
   if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
-    for (CXXRecordDecl::base_class_const_iterator i = CXXRD->bases_begin(),
-           e = CXXRD->bases_end(); i != e; ++i) {
+    for (const auto &I : CXXRD->bases()) {
       // Ignore empty records.
-      if (isEmptyRecord(Context, i->getType(), true))
+      if (isEmptyRecord(Context, I.getType(), true))
         continue;
 
       // If we already found an element then this isn't a single-element struct.
@@ -251,16 +251,14 @@
 
       // If this is non-empty and not a single element struct, the composite
       // cannot be a single element struct.
-      Found = isSingleElementStruct(i->getType(), Context);
+      Found = isSingleElementStruct(I.getType(), Context);
       if (!Found)
         return 0;
     }
   }
 
   // Check for single element.
-  for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
-         i != e; ++i) {
-    const FieldDecl *FD = *i;
+  for (const auto *FD : RD->fields()) {
     QualType FT = FD->getType();
 
     // Ignore empty fields.
@@ -336,10 +334,7 @@
 
   uint64_t Size = 0;
 
-  for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
-         i != e; ++i) {
-    const FieldDecl *FD = *i;
-
+  for (const auto *FD : RD->fields()) {
     if (!is32Or64BitBasicType(FD->getType(), Context))
       return false;
 
@@ -371,15 +366,14 @@
   ABIArgInfo classifyReturnType(QualType RetTy) const;
   ABIArgInfo classifyArgumentType(QualType RetTy) const;
 
-  virtual void computeInfo(CGFunctionInfo &FI) const {
+  void computeInfo(CGFunctionInfo &FI) const override {
     FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
-    for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-         it != ie; ++it)
-      it->info = classifyArgumentType(it->type);
+    for (auto &I : FI.arguments())
+      I.info = classifyArgumentType(I.type);
   }
 
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 };
 
 class DefaultTargetCodeGenInfo : public TargetCodeGenInfo {
@@ -395,7 +389,7 @@
 
 ABIArgInfo DefaultABIInfo::classifyArgumentType(QualType Ty) const {
   if (isAggregateTypeForABI(Ty)) {
-    // Records with non trivial destructors/constructors should not be passed
+    // Records with non-trivial destructors/constructors should not be passed
     // by value.
     if (isRecordReturnIndirect(Ty, getCXXABI()))
       return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
@@ -440,9 +434,9 @@
   ABIArgInfo classifyReturnType(QualType RetTy) const;
   ABIArgInfo classifyArgumentType(QualType RetTy) const;
 
-  virtual void computeInfo(CGFunctionInfo &FI) const;
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 };
 
 class PNaClTargetCodeGenInfo : public TargetCodeGenInfo {
@@ -454,9 +448,8 @@
 void PNaClABIInfo::computeInfo(CGFunctionInfo &FI) const {
     FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
 
-    for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-         it != ie; ++it)
-      it->info = classifyArgumentType(it->type);
+    for (auto &I : FI.arguments())
+      I.info = classifyArgumentType(I.type);
   }
 
 llvm::Value *PNaClABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
@@ -526,6 +519,16 @@
 // X86-32 ABI Implementation
 //===----------------------------------------------------------------------===//
 
+/// \brief Similar to llvm::CCState, but for Clang.
+struct CCState {
+  CCState(unsigned CC) : CC(CC), FreeRegs(0) {}
+
+  unsigned CC;
+  unsigned FreeRegs;
+  unsigned StackOffset;
+  bool UseInAlloca;
+};
+
 /// X86_32ABIInfo - The X86-32 ABI information.
 class X86_32ABIInfo : public ABIInfo {
   enum Class {
@@ -544,30 +547,37 @@
     return (Size == 8 || Size == 16 || Size == 32 || Size == 64);
   }
 
-  static bool shouldReturnTypeInRegister(QualType Ty, ASTContext &Context, 
-                                          unsigned callingConvention);
+  bool shouldReturnTypeInRegister(QualType Ty, ASTContext &Context,
+                                  bool IsInstanceMethod) const;
 
   /// getIndirectResult - Give a source type \arg Ty, return a suitable result
   /// such that the argument will be passed in memory.
-  ABIArgInfo getIndirectResult(QualType Ty, bool ByVal,
-                               unsigned &FreeRegs) const;
+  ABIArgInfo getIndirectResult(QualType Ty, bool ByVal, CCState &State) const;
+
+  ABIArgInfo getIndirectReturnResult(CCState &State) const;
 
   /// \brief Return the alignment to use for the given type on the stack.
   unsigned getTypeStackAlignInBytes(QualType Ty, unsigned Align) const;
 
   Class classify(QualType Ty) const;
-  ABIArgInfo classifyReturnType(QualType RetTy,
-                                unsigned callingConvention) const;
-  ABIArgInfo classifyArgumentType(QualType RetTy, unsigned &FreeRegs,
-                                  bool IsFastCall) const;
-  bool shouldUseInReg(QualType Ty, unsigned &FreeRegs,
-                      bool IsFastCall, bool &NeedsPadding) const;
+  ABIArgInfo classifyReturnType(QualType RetTy, CCState &State,
+                                bool IsInstanceMethod) const;
+  ABIArgInfo classifyArgumentType(QualType RetTy, CCState &State) const;
+  bool shouldUseInReg(QualType Ty, CCState &State, bool &NeedsPadding) const;
+
+  /// \brief Rewrite the function info so that all memory arguments use
+  /// inalloca.
+  void rewriteWithInAlloca(CGFunctionInfo &FI) const;
+
+  void addFieldToArgStruct(SmallVector<llvm::Type *, 6> &FrameFields,
+                           unsigned &StackOffset, ABIArgInfo &Info,
+                           QualType Type) const;
 
 public:
 
-  virtual void computeInfo(CGFunctionInfo &FI) const;
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 
   X86_32ABIInfo(CodeGen::CodeGenTypes &CGT, bool d, bool p, bool w,
                 unsigned r)
@@ -585,24 +595,25 @@
       const llvm::Triple &Triple, const CodeGenOptions &Opts);
 
   void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
-                           CodeGen::CodeGenModule &CGM) const;
+                           CodeGen::CodeGenModule &CGM) const override;
 
-  int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const {
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const override {
     // Darwin uses different dwarf register numbers for EH.
     if (CGM.getTarget().getTriple().isOSDarwin()) return 5;
     return 4;
   }
 
   bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
-                               llvm::Value *Address) const;
+                               llvm::Value *Address) const override;
 
   llvm::Type* adjustInlineAsmType(CodeGen::CodeGenFunction &CGF,
                                   StringRef Constraint,
-                                  llvm::Type* Ty) const {
+                                  llvm::Type* Ty) const override {
     return X86AdjustInlineAsmType(CGF, Constraint, Ty);
   }
 
-  llvm::Constant *getUBSanFunctionSignature(CodeGen::CodeGenModule &CGM) const {
+  llvm::Constant *
+  getUBSanFunctionSignature(CodeGen::CodeGenModule &CGM) const override {
     unsigned Sig = (0xeb << 0) |  // jmp rel8
                    (0x06 << 8) |  //           .+0x08
                    ('F' << 16) |
@@ -616,9 +627,8 @@
 
 /// shouldReturnTypeInRegister - Determine if the given type should be
 /// passed in a register (for the Darwin ABI).
-bool X86_32ABIInfo::shouldReturnTypeInRegister(QualType Ty,
-                                               ASTContext &Context,
-                                               unsigned callingConvention) {
+bool X86_32ABIInfo::shouldReturnTypeInRegister(QualType Ty, ASTContext &Context,
+                                               bool IsInstanceMethod) const {
   uint64_t Size = Context.getTypeSize(Ty);
 
   // Type must be register sized.
@@ -644,7 +654,7 @@
   // Arrays are treated like records.
   if (const ConstantArrayType *AT = Context.getAsConstantArrayType(Ty))
     return shouldReturnTypeInRegister(AT->getElementType(), Context,
-                                      callingConvention);
+                                      IsInstanceMethod);
 
   // Otherwise, it must be a record type.
   const RecordType *RT = Ty->getAs<RecordType>();
@@ -654,31 +664,35 @@
 
   // For thiscall conventions, structures will never be returned in
   // a register.  This is for compatibility with the MSVC ABI.
-  if (callingConvention == llvm::CallingConv::X86_ThisCall && 
-      RT->isStructureType()) {
+  if (IsWin32StructABI && IsInstanceMethod && RT->isStructureType())
     return false;
-  }
 
   // Structure types are passed in register if all fields would be
   // passed in a register.
-  for (RecordDecl::field_iterator i = RT->getDecl()->field_begin(),
-         e = RT->getDecl()->field_end(); i != e; ++i) {
-    const FieldDecl *FD = *i;
-
+  for (const auto *FD : RT->getDecl()->fields()) {
     // Empty fields are ignored.
     if (isEmptyField(Context, FD, true))
       continue;
 
     // Check fields recursively.
-    if (!shouldReturnTypeInRegister(FD->getType(), Context, 
-                                    callingConvention))
+    if (!shouldReturnTypeInRegister(FD->getType(), Context, IsInstanceMethod))
       return false;
   }
   return true;
 }
 
-ABIArgInfo X86_32ABIInfo::classifyReturnType(QualType RetTy, 
-                                            unsigned callingConvention) const {
+ABIArgInfo X86_32ABIInfo::getIndirectReturnResult(CCState &State) const {
+  // If the return value is indirect, then the hidden argument is consuming one
+  // integer register.
+  if (State.FreeRegs) {
+    --State.FreeRegs;
+    return ABIArgInfo::getIndirectInReg(/*Align=*/0, /*ByVal=*/false);
+  }
+  return ABIArgInfo::getIndirect(/*Align=*/0, /*ByVal=*/false);
+}
+
+ABIArgInfo X86_32ABIInfo::classifyReturnType(QualType RetTy, CCState &State,
+                                             bool IsInstanceMethod) const {
   if (RetTy->isVoidType())
     return ABIArgInfo::getIgnore();
 
@@ -701,7 +715,7 @@
         return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),
                                                             Size));
 
-      return ABIArgInfo::getIndirect(0);
+      return getIndirectReturnResult(State);
     }
 
     return ABIArgInfo::getDirect();
@@ -710,21 +724,20 @@
   if (isAggregateTypeForABI(RetTy)) {
     if (const RecordType *RT = RetTy->getAs<RecordType>()) {
       if (isRecordReturnIndirect(RT, getCXXABI()))
-        return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
+        return getIndirectReturnResult(State);
 
       // Structures with flexible arrays are always indirect.
       if (RT->getDecl()->hasFlexibleArrayMember())
-        return ABIArgInfo::getIndirect(0);
+        return getIndirectReturnResult(State);
     }
 
     // If specified, structs and unions are always indirect.
     if (!IsSmallStructInRegABI && !RetTy->isAnyComplexType())
-      return ABIArgInfo::getIndirect(0);
+      return getIndirectReturnResult(State);
 
     // Small structures which are register sized are generally returned
     // in a register.
-    if (X86_32ABIInfo::shouldReturnTypeInRegister(RetTy, getContext(), 
-                                                  callingConvention)) {
+    if (shouldReturnTypeInRegister(RetTy, getContext(), IsInstanceMethod)) {
       uint64_t Size = getContext().getTypeSize(RetTy);
 
       // As a special-case, if the struct is a "single-element" struct, and
@@ -742,7 +755,7 @@
       return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),Size));
     }
 
-    return ABIArgInfo::getIndirect(0);
+    return getIndirectReturnResult(State);
   }
 
   // Treat an enum type as its underlying type.
@@ -765,13 +778,11 @@
 
   // If this is a C++ record, check the bases first.
   if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD))
-    for (CXXRecordDecl::base_class_const_iterator i = CXXRD->bases_begin(),
-           e = CXXRD->bases_end(); i != e; ++i)
-      if (!isRecordWithSSEVectorType(Context, i->getType()))
+    for (const auto &I : CXXRD->bases())
+      if (!isRecordWithSSEVectorType(Context, I.getType()))
         return false;
 
-  for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
-       i != e; ++i) {
+  for (const auto *i : RD->fields()) {
     QualType FT = i->getType();
 
     if (isSSEVectorType(Context, FT))
@@ -806,10 +817,10 @@
 }
 
 ABIArgInfo X86_32ABIInfo::getIndirectResult(QualType Ty, bool ByVal,
-                                            unsigned &FreeRegs) const {
+                                            CCState &State) const {
   if (!ByVal) {
-    if (FreeRegs) {
-      --FreeRegs; // Non byval indirects just use one pointer.
+    if (State.FreeRegs) {
+      --State.FreeRegs; // Non-byval indirects just use one pointer.
       return ABIArgInfo::getIndirectInReg(0, false);
     }
     return ABIArgInfo::getIndirect(0, false);
@@ -819,15 +830,12 @@
   unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8;
   unsigned StackAlign = getTypeStackAlignInBytes(Ty, TypeAlign);
   if (StackAlign == 0)
-    return ABIArgInfo::getIndirect(4);
+    return ABIArgInfo::getIndirect(4, /*ByVal=*/true);
 
   // If the stack alignment is less than the type alignment, realign the
   // argument.
-  if (StackAlign < TypeAlign)
-    return ABIArgInfo::getIndirect(StackAlign, /*ByVal=*/true,
-                                   /*Realign=*/true);
-
-  return ABIArgInfo::getIndirect(StackAlign);
+  bool Realign = TypeAlign > StackAlign;
+  return ABIArgInfo::getIndirect(StackAlign, /*ByVal=*/true, Realign);
 }
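A sketch of the realignment condition above (numbers illustrative): a byval
argument whose type demands more alignment than its stack slot provides takes
the Realign path; otherwise plain byval suffices.

  // TypeAlign = 32, StackAlign = 16 -> getIndirect(16, ByVal=true, Realign=true)
  // TypeAlign = 16, StackAlign = 16 -> getIndirect(16, ByVal=true, Realign=false)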
 
 X86_32ABIInfo::Class X86_32ABIInfo::classify(QualType Ty) const {
@@ -843,8 +851,8 @@
   return Integer;
 }
 
-bool X86_32ABIInfo::shouldUseInReg(QualType Ty, unsigned &FreeRegs,
-                                   bool IsFastCall, bool &NeedsPadding) const {
+bool X86_32ABIInfo::shouldUseInReg(QualType Ty, CCState &State,
+                                   bool &NeedsPadding) const {
   NeedsPadding = false;
   Class C = classify(Ty);
   if (C == Float)
@@ -856,14 +864,14 @@
   if (SizeInRegs == 0)
     return false;
 
-  if (SizeInRegs > FreeRegs) {
-    FreeRegs = 0;
+  if (SizeInRegs > State.FreeRegs) {
+    State.FreeRegs = 0;
     return false;
   }
 
-  FreeRegs -= SizeInRegs;
+  State.FreeRegs -= SizeInRegs;
 
-  if (IsFastCall) {
+  if (State.CC == llvm::CallingConv::X86_FastCall) {
     if (Size > 32)
       return false;
 
@@ -876,7 +884,7 @@
     if (Ty->isReferenceType())
       return true;
 
-    if (FreeRegs)
+    if (State.FreeRegs)
       NeedsPadding = true;
 
     return false;
@@ -886,20 +894,26 @@
 }
 
 ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
-                                               unsigned &FreeRegs,
-                                               bool IsFastCall) const {
+                                               CCState &State) const {
   // FIXME: Set alignment on indirect arguments.
   if (isAggregateTypeForABI(Ty)) {
     if (const RecordType *RT = Ty->getAs<RecordType>()) {
-      if (IsWin32StructABI)
-        return getIndirectResult(Ty, true, FreeRegs);
+      // Check with the C++ ABI first.
+      CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI());
+      if (RAA == CGCXXABI::RAA_Indirect) {
+        return getIndirectResult(Ty, false, State);
+      } else if (RAA == CGCXXABI::RAA_DirectInMemory) {
+        // The field index doesn't matter; we'll fix it up later.
+        return ABIArgInfo::getInAlloca(/*FieldIndex=*/0);
+      }
 
-      if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI()))
-        return getIndirectResult(Ty, RAA == CGCXXABI::RAA_DirectInMemory, FreeRegs);
+      // Structs are always byval on win32, regardless of what they contain.
+      if (IsWin32StructABI)
+        return getIndirectResult(Ty, true, State);
 
       // Structures with flexible arrays are always indirect.
       if (RT->getDecl()->hasFlexibleArrayMember())
-        return getIndirectResult(Ty, true, FreeRegs);
+        return getIndirectResult(Ty, true, State);
     }
 
     // Ignore empty structs/unions.
@@ -909,7 +923,7 @@
     llvm::LLVMContext &LLVMContext = getVMContext();
     llvm::IntegerType *Int32 = llvm::Type::getInt32Ty(LLVMContext);
     bool NeedsPadding;
-    if (shouldUseInReg(Ty, FreeRegs, IsFastCall, NeedsPadding)) {
+    if (shouldUseInReg(Ty, State, NeedsPadding)) {
       unsigned SizeInRegs = (getContext().getTypeSize(Ty) + 31) / 32;
       SmallVector<llvm::Type*, 3> Elements(SizeInRegs, Int32);
       llvm::Type *Result = llvm::StructType::get(LLVMContext, Elements);
@@ -923,9 +937,10 @@
     // optimizations.
     if (getContext().getTypeSize(Ty) <= 4*32 &&
         canExpandIndirectArgument(Ty, getContext()))
-      return ABIArgInfo::getExpandWithPadding(IsFastCall, PaddingType);
+      return ABIArgInfo::getExpandWithPadding(
+          State.CC == llvm::CallingConv::X86_FastCall, PaddingType);
 
-    return getIndirectResult(Ty, true, FreeRegs);
+    return getIndirectResult(Ty, true, State);
   }
 
   if (const VectorType *VT = Ty->getAs<VectorType>()) {
@@ -950,7 +965,7 @@
     Ty = EnumTy->getDecl()->getIntegerType();
 
   bool NeedsPadding;
-  bool InReg = shouldUseInReg(Ty, FreeRegs, IsFastCall, NeedsPadding);
+  bool InReg = shouldUseInReg(Ty, State, NeedsPadding);
 
   if (Ty->isPromotableIntegerType()) {
     if (InReg)
@@ -963,32 +978,107 @@
 }
 
 void X86_32ABIInfo::computeInfo(CGFunctionInfo &FI) const {
-  FI.getReturnInfo() = classifyReturnType(FI.getReturnType(),
-                                          FI.getCallingConvention());
-
-  unsigned CC = FI.getCallingConvention();
-  bool IsFastCall = CC == llvm::CallingConv::X86_FastCall;
-  unsigned FreeRegs;
-  if (IsFastCall)
-    FreeRegs = 2;
+  CCState State(FI.getCallingConvention());
+  if (State.CC == llvm::CallingConv::X86_FastCall)
+    State.FreeRegs = 2;
   else if (FI.getHasRegParm())
-    FreeRegs = FI.getRegParm();
+    State.FreeRegs = FI.getRegParm();
   else
-    FreeRegs = DefaultNumRegisterParameters;
+    State.FreeRegs = DefaultNumRegisterParameters;
 
-  // If the return value is indirect, then the hidden argument is consuming one
-  // integer register.
-  if (FI.getReturnInfo().isIndirect() && FreeRegs) {
-    --FreeRegs;
-    ABIArgInfo &Old = FI.getReturnInfo();
-    Old = ABIArgInfo::getIndirectInReg(Old.getIndirectAlign(),
-                                       Old.getIndirectByVal(),
-                                       Old.getIndirectRealign());
+  FI.getReturnInfo() =
+      classifyReturnType(FI.getReturnType(), State, FI.isInstanceMethod());
+
+  // On win32, use the x86_cdeclmethodcc convention for cdecl methods that use
+  // sret.  This convention swaps the order of the first two parameters behind
+  // the scenes to match MSVC.
+  if (IsWin32StructABI && FI.isInstanceMethod() &&
+      FI.getCallingConvention() == llvm::CallingConv::C &&
+      FI.getReturnInfo().isIndirect())
+    FI.setEffectiveCallingConvention(llvm::CallingConv::X86_CDeclMethod);
+
+  bool UsedInAlloca = false;
+  for (auto &I : FI.arguments()) {
+    I.info = classifyArgumentType(I.type, State);
+    UsedInAlloca |= (I.info.getKind() == ABIArgInfo::InAlloca);
   }
 
-  for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-       it != ie; ++it)
-    it->info = classifyArgumentType(it->type, FreeRegs, IsFastCall);
+  // If we needed to use inalloca for any argument, do a second pass and rewrite
+  // all the memory arguments to use inalloca.
+  if (UsedInAlloca)
+    rewriteWithInAlloca(FI);
+}
+
+void
+X86_32ABIInfo::addFieldToArgStruct(SmallVector<llvm::Type *, 6> &FrameFields,
+                                   unsigned &StackOffset,
+                                   ABIArgInfo &Info, QualType Type) const {
+  // Insert padding bytes to respect alignment.  For x86_32, each argument is
+  // 4-byte aligned.
+  unsigned Align = 4U;
+  if (Info.getKind() == ABIArgInfo::Indirect && Info.getIndirectByVal())
+    Align = std::max(Align, Info.getIndirectAlign());
+  if (StackOffset & (Align - 1)) {
+    unsigned OldOffset = StackOffset;
+    StackOffset = llvm::RoundUpToAlignment(StackOffset, Align);
+    unsigned NumBytes = StackOffset - OldOffset;
+    assert(NumBytes);
+    llvm::Type *Ty = llvm::Type::getInt8Ty(getVMContext());
+    Ty = llvm::ArrayType::get(Ty, NumBytes);
+    FrameFields.push_back(Ty);
+  }
+
+  Info = ABIArgInfo::getInAlloca(FrameFields.size());
+  FrameFields.push_back(CGT.ConvertTypeForMem(Type));
+  StackOffset += getContext().getTypeSizeInChars(Type).getQuantity();
+}
+
+void X86_32ABIInfo::rewriteWithInAlloca(CGFunctionInfo &FI) const {
+  assert(IsWin32StructABI && "inalloca only supported on win32");
+
+  // Build a packed struct type for all of the arguments in memory.
+  SmallVector<llvm::Type *, 6> FrameFields;
+
+  unsigned StackOffset = 0;
+
+  // Put the sret parameter into the inalloca struct if it's in memory.
+  ABIArgInfo &Ret = FI.getReturnInfo();
+  if (Ret.isIndirect() && !Ret.getInReg()) {
+    CanQualType PtrTy = getContext().getPointerType(FI.getReturnType());
+    addFieldToArgStruct(FrameFields, StackOffset, Ret, PtrTy);
+    // On Windows, the hidden sret parameter is always returned in eax.
+    Ret.setInAllocaSRet(IsWin32StructABI);
+  }
+
+  // Skip the 'this' parameter in ecx.
+  CGFunctionInfo::arg_iterator I = FI.arg_begin(), E = FI.arg_end();
+  if (FI.getCallingConvention() == llvm::CallingConv::X86_ThisCall)
+    ++I;
+
+  // Put arguments passed in memory into the struct.
+  for (; I != E; ++I) {
+
+    // Leave ignored and inreg arguments alone.
+    switch (I->info.getKind()) {
+    case ABIArgInfo::Indirect:
+      assert(I->info.getIndirectByVal());
+      break;
+    case ABIArgInfo::Ignore:
+      continue;
+    case ABIArgInfo::Direct:
+    case ABIArgInfo::Extend:
+      if (I->info.getInReg())
+        continue;
+      break;
+    default:
+      break;
+    }
+
+    addFieldToArgStruct(FrameFields, StackOffset, I->info, I->type);
+  }
+
+  FI.setArgStruct(llvm::StructType::get(getVMContext(), FrameFields,
+                                        /*isPacked=*/true));
 }
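The end-to-end effect of the inalloca rewrite is easiest to see on a whole
signature. A hedged sketch (function and types hypothetical) for a win32 cdecl
function whose second parameter has a non-trivial copy constructor, i.e. is
classified RAA_DirectInMemory:

  struct S { S(const S &); int x; };
  void f(int a, S s, int b);
  // All three arguments are memory arguments, so they are packed in order
  // into one 4-byte-aligned frame struct and passed as a single pointer:
  //   define void @f(<{ i32, %struct.S, i32 }>* inalloca)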
 
 llvm::Value *X86_32ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
@@ -1219,10 +1309,10 @@
     return false;
   }
 
-  virtual void computeInfo(CGFunctionInfo &FI) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
 
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 };
 
 /// WinX86_64ABIInfo - The Windows X86_64 ABI information.
@@ -1233,10 +1323,10 @@
 public:
   WinX86_64ABIInfo(CodeGen::CodeGenTypes &CGT) : ABIInfo(CGT) {}
 
-  virtual void computeInfo(CGFunctionInfo &FI) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
 
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 };
 
 class X86_64TargetCodeGenInfo : public TargetCodeGenInfo {
@@ -1248,12 +1338,12 @@
     return static_cast<const X86_64ABIInfo&>(TargetCodeGenInfo::getABIInfo());
   }
 
-  int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const {
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const override {
     return 7;
   }
 
   bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
-                               llvm::Value *Address) const {
+                               llvm::Value *Address) const override {
     llvm::Value *Eight8 = llvm::ConstantInt::get(CGF.Int8Ty, 8);
 
     // 0-15 are the 16 integer registers.
@@ -1264,12 +1354,12 @@
 
   llvm::Type* adjustInlineAsmType(CodeGen::CodeGenFunction &CGF,
                                   StringRef Constraint,
-                                  llvm::Type* Ty) const {
+                                  llvm::Type* Ty) const override {
     return X86AdjustInlineAsmType(CGF, Constraint, Ty);
   }
 
   bool isNoProtoCallVariadic(const CallArgList &args,
-                             const FunctionNoProtoType *fnType) const {
+                             const FunctionNoProtoType *fnType) const override {
     // The default CC on x86-64 sets %al to the number of SSA
     // registers used, and GCC sets this when calling an unprototyped
     // function, so we override the default behavior.  However, don't do
@@ -1293,7 +1383,8 @@
     return TargetCodeGenInfo::isNoProtoCallVariadic(args, fnType);
   }
 
-  llvm::Constant *getUBSanFunctionSignature(CodeGen::CodeGenModule &CGM) const {
+  llvm::Constant *
+  getUBSanFunctionSignature(CodeGen::CodeGenModule &CGM) const override {
     unsigned Sig = (0xeb << 0) |  // jmp rel8
                    (0x0a << 8) |  //           .+0x0c
                    ('F' << 16) |
@@ -1319,14 +1410,14 @@
     : X86_32TargetCodeGenInfo(CGT, d, p, w, RegParms) {}
 
   void getDependentLibraryOption(llvm::StringRef Lib,
-                                 llvm::SmallString<24> &Opt) const {
+                                 llvm::SmallString<24> &Opt) const override {
     Opt = "/DEFAULTLIB:";
     Opt += qualifyWindowsLibrary(Lib);
   }
 
   void getDetectMismatchOption(llvm::StringRef Name,
                                llvm::StringRef Value,
-                               llvm::SmallString<32> &Opt) const {
+                               llvm::SmallString<32> &Opt) const override {
     Opt = "/FAILIFMISMATCH:\"" + Name.str() + "=" + Value.str() + "\"";
   }
 };
@@ -1336,12 +1427,12 @@
   WinX86_64TargetCodeGenInfo(CodeGen::CodeGenTypes &CGT)
     : TargetCodeGenInfo(new WinX86_64ABIInfo(CGT)) {}
 
-  int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const {
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const override {
     return 7;
   }
 
   bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
-                               llvm::Value *Address) const {
+                               llvm::Value *Address) const override {
     llvm::Value *Eight8 = llvm::ConstantInt::get(CGF.Int8Ty, 8);
 
     // 0-15 are the 16 integer registers.
@@ -1351,14 +1442,14 @@
   }
 
   void getDependentLibraryOption(llvm::StringRef Lib,
-                                 llvm::SmallString<24> &Opt) const {
+                                 llvm::SmallString<24> &Opt) const override {
     Opt = "/DEFAULTLIB:";
     Opt += qualifyWindowsLibrary(Lib);
   }
 
   void getDetectMismatchOption(llvm::StringRef Name,
                                llvm::StringRef Value,
-                               llvm::SmallString<32> &Opt) const {
+                               llvm::SmallString<32> &Opt) const override {
     Opt = "/FAILIFMISMATCH:\"" + Name.str() + "=" + Value.str() + "\"";
   }
 };
@@ -1642,12 +1733,11 @@
 
     // If this is a C++ record, classify the bases first.
     if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
-      for (CXXRecordDecl::base_class_const_iterator i = CXXRD->bases_begin(),
-             e = CXXRD->bases_end(); i != e; ++i) {
-        assert(!i->isVirtual() && !i->getType()->isDependentType() &&
+      for (const auto &I : CXXRD->bases()) {
+        assert(!I.isVirtual() && !I.getType()->isDependentType() &&
                "Unexpected base class!");
         const CXXRecordDecl *Base =
-          cast<CXXRecordDecl>(i->getType()->getAs<RecordType>()->getDecl());
+          cast<CXXRecordDecl>(I.getType()->getAs<RecordType>()->getDecl());
 
         // Classify this field.
         //
@@ -1657,7 +1747,7 @@
         Class FieldLo, FieldHi;
         uint64_t Offset =
           OffsetBase + getContext().toBits(Layout.getBaseClassOffset(Base));
-        classify(i->getType(), Offset, FieldLo, FieldHi, isNamedArg);
+        classify(I.getType(), Offset, FieldLo, FieldHi, isNamedArg);
         Lo = merge(Lo, FieldLo);
         Hi = merge(Hi, FieldHi);
         if (Lo == Memory || Hi == Memory)
@@ -1887,19 +1977,18 @@
 
     // If this is a C++ record, check the bases first.
     if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
-      for (CXXRecordDecl::base_class_const_iterator i = CXXRD->bases_begin(),
-           e = CXXRD->bases_end(); i != e; ++i) {
-        assert(!i->isVirtual() && !i->getType()->isDependentType() &&
+      for (const auto &I : CXXRD->bases()) {
+        assert(!I.isVirtual() && !I.getType()->isDependentType() &&
                "Unexpected base class!");
         const CXXRecordDecl *Base =
-          cast<CXXRecordDecl>(i->getType()->getAs<RecordType>()->getDecl());
+          cast<CXXRecordDecl>(I.getType()->getAs<RecordType>()->getDecl());
 
         // If the base is after the span we care about, ignore it.
         unsigned BaseOffset = Context.toBits(Layout.getBaseClassOffset(Base));
         if (BaseOffset >= EndBit) continue;
 
         unsigned BaseStart = BaseOffset < StartBit ? StartBit-BaseOffset :0;
-        if (!BitsContainNoUserData(i->getType(), BaseStart,
+        if (!BitsContainNoUserData(I.getType(), BaseStart,
                                    EndBit-BaseOffset, Context))
           return false;
       }
@@ -2643,7 +2732,7 @@
       return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
 
     // FIXME: mingw-w64-gcc emits 128-bit struct as i128
-    if (Size == 128 && getTarget().getTriple().getOS() == llvm::Triple::MinGW32)
+    if (Size == 128 && getTarget().getTriple().isWindowsGNUEnvironment())
       return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),
                                                           Size));
 
@@ -2668,9 +2757,8 @@
   QualType RetTy = FI.getReturnType();
   FI.getReturnInfo() = classify(RetTy, true);
 
-  for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-       it != ie; ++it)
-    it->info = classify(it->type, false);
+  for (auto &I : FI.arguments())
+    I.info = classify(I.type, false);
 }
 
 llvm::Value *WinX86_64ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
@@ -2701,9 +2789,9 @@
  public:
   NaClX86_64ABIInfo(CodeGen::CodeGenTypes &CGT, bool HasAVX)
       : ABIInfo(CGT), PInfo(CGT), NInfo(CGT, HasAVX) {}
-  virtual void computeInfo(CGFunctionInfo &FI) const;
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
  private:
   PNaClABIInfo PInfo;  // Used for generating calls with pnaclcall callingconv.
   X86_64ABIInfo NInfo; // Used for everything else.
@@ -2739,13 +2827,13 @@
 public:
   PPC32TargetCodeGenInfo(CodeGenTypes &CGT) : DefaultTargetCodeGenInfo(CGT) {}
 
-  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const {
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override {
     // This is recovered from gcc output.
     return 1; // r1 is the dedicated stack pointer
   }
 
   bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
-                               llvm::Value *Address) const;
+                               llvm::Value *Address) const override;
 };
 
 }
@@ -2811,29 +2899,27 @@
   // floating-point value) to avoid pushing them to memory on function
   // entry.  This would require changing the logic in PPCISelLowering
   // when lowering the parameters in the caller and args in the callee.
-  virtual void computeInfo(CGFunctionInfo &FI) const {
+  void computeInfo(CGFunctionInfo &FI) const override {
     FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
-    for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-         it != ie; ++it) {
+    for (auto &I : FI.arguments()) {
       // We rely on the default argument classification for the most part.
       // One exception:  An aggregate containing a single floating-point
       // or vector item must be passed in a register if one is available.
-      const Type *T = isSingleElementStruct(it->type, getContext());
+      const Type *T = isSingleElementStruct(I.type, getContext());
       if (T) {
         const BuiltinType *BT = T->getAs<BuiltinType>();
         if (T->isVectorType() || (BT && BT->isFloatingPoint())) {
           QualType QT(T, 0);
-          it->info = ABIArgInfo::getDirectInReg(CGT.ConvertType(QT));
+          I.info = ABIArgInfo::getDirectInReg(CGT.ConvertType(QT));
           continue;
         }
       }
-      it->info = classifyArgumentType(it->type);
+      I.info = classifyArgumentType(I.type);
     }
   }
 
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, 
-                                 QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 };
 
 class PPC64_SVR4_TargetCodeGenInfo : public TargetCodeGenInfo {
@@ -2841,26 +2927,26 @@
   PPC64_SVR4_TargetCodeGenInfo(CodeGenTypes &CGT)
     : TargetCodeGenInfo(new PPC64_SVR4_ABIInfo(CGT)) {}
 
-  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const {
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override {
     // This is recovered from gcc output.
     return 1; // r1 is the dedicated stack pointer
   }
 
   bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
-                               llvm::Value *Address) const;
+                               llvm::Value *Address) const override;
 };
 
 class PPC64TargetCodeGenInfo : public DefaultTargetCodeGenInfo {
 public:
   PPC64TargetCodeGenInfo(CodeGenTypes &CGT) : DefaultTargetCodeGenInfo(CGT) {}
 
-  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const {
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override {
     // This is recovered from gcc output.
     return 1; // r1 is the dedicated stack pointer
   }
 
   bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
-                               llvm::Value *Address) const;
+                               llvm::Value *Address) const override;
 };
 
 }
@@ -3049,6 +3135,569 @@
 }
 
 //===----------------------------------------------------------------------===//
+// ARM64 ABI Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class ARM64ABIInfo : public ABIInfo {
+public:
+  enum ABIKind {
+    AAPCS = 0,
+    DarwinPCS
+  };
+
+private:
+  ABIKind Kind;
+
+public:
+  ARM64ABIInfo(CodeGenTypes &CGT, ABIKind Kind) : ABIInfo(CGT), Kind(Kind) {}
+
+private:
+  ABIKind getABIKind() const { return Kind; }
+  bool isDarwinPCS() const { return Kind == DarwinPCS; }
+
+  ABIArgInfo classifyReturnType(QualType RetTy) const;
+  ABIArgInfo classifyArgumentType(QualType RetTy, unsigned &AllocatedVFP,
+                                  bool &IsHA, unsigned &AllocatedGPR,
+                                  bool &IsSmallAggr) const;
+  bool isIllegalVectorType(QualType Ty) const;
+
+  virtual void computeInfo(CGFunctionInfo &FI) const {
+    // To correctly handle Homogeneous Aggregates, we need to keep track of the
+    // number of SIMD and Floating-point registers allocated so far.
+    // If the argument is an HFA or an HVA and there are sufficient unallocated
+    // SIMD and Floating-point registers, then the argument is allocated to SIMD
+    // and Floating-point Registers (with one register per member of the HFA or
+    // HVA). Otherwise, the NSRN (the next SIMD and Floating-point register
+    // number) is set to 8.
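+    // Illustration (hypothetical type): struct HFA { float a, b, c, d; } is
+    // an HFA with Base = float and four members; while at least four VFP
+    // registers remain free it is expanded into them, otherwise the leftover
+    // VFP registers are padded out as unavailable (see below).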
+    unsigned AllocatedVFP = 0;
+    // To correctly handle small aggregates, we need to keep track of the number
+    // of GPRs allocated so far. If the small aggregate can't fit entirely
+    // into registers, it is passed on the stack. We don't allow the
+    // aggregate to be partially in registers.
+    unsigned AllocatedGPR = 0;
+    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
+    for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
+         it != ie; ++it) {
+      unsigned PreAllocation = AllocatedVFP, PreGPR = AllocatedGPR;
+      bool IsHA = false, IsSmallAggr = false;
+      const unsigned NumVFPs = 8;
+      const unsigned NumGPRs = 8;
+      it->info = classifyArgumentType(it->type, AllocatedVFP, IsHA,
+                                      AllocatedGPR, IsSmallAggr);
+      // If we do not have enough VFP registers for the HA, any VFP registers
+      // that are unallocated are marked as unavailable. To achieve this, we add
+      // padding of (NumVFPs - PreAllocation) floats.
+      if (IsHA && AllocatedVFP > NumVFPs && PreAllocation < NumVFPs) {
+        llvm::Type *PaddingTy = llvm::ArrayType::get(
+            llvm::Type::getFloatTy(getVMContext()), NumVFPs - PreAllocation);
+        if (isDarwinPCS())
+          it->info = ABIArgInfo::getExpandWithPadding(false, PaddingTy);
+        else {
+          // Under AAPCS the 64-bit stack slot alignment means we can't pass HAs
+          // as sequences of floats since they'll get "holes" inserted as
+          // padding by the back end.
+          uint32_t NumStackSlots = getContext().getTypeSize(it->type);
+          NumStackSlots = llvm::RoundUpToAlignment(NumStackSlots, 64) / 64;
+
+          llvm::Type *CoerceTy = llvm::ArrayType::get(
+              llvm::Type::getDoubleTy(getVMContext()), NumStackSlots);
+          it->info = ABIArgInfo::getDirect(CoerceTy, 0, PaddingTy);
+        }
+      }
+      // If we do not have enough GPRs for the small aggregate, any GPRs that
+      // are unallocated are marked as unavailable.
+      if (IsSmallAggr && AllocatedGPR > NumGPRs && PreGPR < NumGPRs) {
+        llvm::Type *PaddingTy = llvm::ArrayType::get(
+            llvm::Type::getInt32Ty(getVMContext()), NumGPRs - PreGPR);
+        it->info =
+            ABIArgInfo::getDirect(it->info.getCoerceToType(), 0, PaddingTy);
+      }
+    }
+  }
+
+  llvm::Value *EmitDarwinVAArg(llvm::Value *VAListAddr, QualType Ty,
+                               CodeGenFunction &CGF) const;
+
+  llvm::Value *EmitAAPCSVAArg(llvm::Value *VAListAddr, QualType Ty,
+                              CodeGenFunction &CGF) const;
+
+  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                                 CodeGenFunction &CGF) const {
+    return isDarwinPCS() ? EmitDarwinVAArg(VAListAddr, Ty, CGF)
+                         : EmitAAPCSVAArg(VAListAddr, Ty, CGF);
+  }
+};
+
+class ARM64TargetCodeGenInfo : public TargetCodeGenInfo {
+public:
+  ARM64TargetCodeGenInfo(CodeGenTypes &CGT, ARM64ABIInfo::ABIKind Kind)
+      : TargetCodeGenInfo(new ARM64ABIInfo(CGT, Kind)) {}
+
+  StringRef getARCRetainAutoreleasedReturnValueMarker() const {
+    return "mov\tfp, fp\t\t; marker for objc_retainAutoreleaseReturnValue";
+  }
+
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const { return 31; }
+
+  virtual bool doesReturnSlotInterfereWithArgs() const { return false; }
+};
+}
+
+static bool isHomogeneousAggregate(QualType Ty, const Type *&Base,
+                                   ASTContext &Context,
+                                   uint64_t *HAMembers = 0);
+
+ABIArgInfo ARM64ABIInfo::classifyArgumentType(QualType Ty,
+                                              unsigned &AllocatedVFP,
+                                              bool &IsHA,
+                                              unsigned &AllocatedGPR,
+                                              bool &IsSmallAggr) const {
+  // Handle illegal vector types here.
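+  // For example, <3 x char> (24 bits) is widened to an i32 in a GPR, while
+  // <3 x i32> (96 bits) is passed indirectly.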
+  if (isIllegalVectorType(Ty)) {
+    uint64_t Size = getContext().getTypeSize(Ty);
+    if (Size <= 32) {
+      llvm::Type *ResType = llvm::Type::getInt32Ty(getVMContext());
+      AllocatedGPR++;
+      return ABIArgInfo::getDirect(ResType);
+    }
+    if (Size == 64) {
+      llvm::Type *ResType =
+          llvm::VectorType::get(llvm::Type::getInt32Ty(getVMContext()), 2);
+      AllocatedVFP++;
+      return ABIArgInfo::getDirect(ResType);
+    }
+    if (Size == 128) {
+      llvm::Type *ResType =
+          llvm::VectorType::get(llvm::Type::getInt32Ty(getVMContext()), 4);
+      AllocatedVFP++;
+      return ABIArgInfo::getDirect(ResType);
+    }
+    AllocatedGPR++;
+    return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
+  }
+  if (Ty->isVectorType())
+    // Size of a legal vector should be either 64 or 128.
+    AllocatedVFP++;
+  if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
+    if (BT->getKind() == BuiltinType::Half ||
+        BT->getKind() == BuiltinType::Float ||
+        BT->getKind() == BuiltinType::Double ||
+        BT->getKind() == BuiltinType::LongDouble)
+      AllocatedVFP++;
+  }
+
+  if (!isAggregateTypeForABI(Ty)) {
+    // Treat an enum type as its underlying type.
+    if (const EnumType *EnumTy = Ty->getAs<EnumType>())
+      Ty = EnumTy->getDecl()->getIntegerType();
+
+    if (!Ty->isFloatingType() && !Ty->isVectorType()) {
+      int RegsNeeded = getContext().getTypeSize(Ty) > 64 ? 2 : 1;
+      AllocatedGPR += RegsNeeded;
+    }
+    return (Ty->isPromotableIntegerType() && isDarwinPCS()
+                ? ABIArgInfo::getExtend()
+                : ABIArgInfo::getDirect());
+  }
+
+  // Structures with either a non-trivial destructor or a non-trivial
+  // copy constructor are always indirect.
+  if (isRecordReturnIndirect(Ty, getCXXABI())) {
+    AllocatedGPR++;
+    return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
+  }
+
+  // Empty records are always ignored on Darwin, but actually passed in C++
+  // mode elsewhere for GNU compatibility.
+  if (isEmptyRecord(getContext(), Ty, true)) {
+    if (!getContext().getLangOpts().CPlusPlus || isDarwinPCS())
+      return ABIArgInfo::getIgnore();
+
+    ++AllocatedGPR;
+    return ABIArgInfo::getDirect(llvm::Type::getInt8Ty(getVMContext()));
+  }
+
+  // Homogeneous Floating-point Aggregates (HFAs) need to be expanded.
+  const Type *Base = 0;
+  uint64_t Members = 0;
+  if (isHomogeneousAggregate(Ty, Base, getContext(), &Members)) {
+    AllocatedVFP += Members;
+    IsHA = true;
+    return ABIArgInfo::getExpand();
+  }
+
+  // Aggregates <= 16 bytes are passed directly in registers or on the stack.
+  uint64_t Size = getContext().getTypeSize(Ty);
+  if (Size <= 128) {
+    Size = 64 * ((Size + 63) / 64); // round up to multiple of 8 bytes
+    AllocatedGPR += Size / 64;
+    IsSmallAggr = true;
+    // We use a pair of i64 for a 16-byte aggregate with 8-byte alignment;
+    // for aggregates with 16-byte alignment, we use i128.
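+    // For example, struct { long a, b; } (16 bytes, 8-byte aligned) is
+    // coerced to [2 x i64], struct { __int128 x; } (16-byte aligned) to
+    // i128, and a 4-byte struct to a single i64.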
+    if (getContext().getTypeAlign(Ty) < 128 && Size == 128) {
+      llvm::Type *BaseTy = llvm::Type::getInt64Ty(getVMContext());
+      return ABIArgInfo::getDirect(llvm::ArrayType::get(BaseTy, Size / 64));
+    }
+    return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size));
+  }
+
+  AllocatedGPR++;
+  return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
+}
+
+ABIArgInfo ARM64ABIInfo::classifyReturnType(QualType RetTy) const {
+  if (RetTy->isVoidType())
+    return ABIArgInfo::getIgnore();
+
+  // Large vector types should be returned via memory.
+  if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 128)
+    return ABIArgInfo::getIndirect(0);
+
+  if (!isAggregateTypeForABI(RetTy)) {
+    // Treat an enum type as its underlying type.
+    if (const EnumType *EnumTy = RetTy->getAs<EnumType>())
+      RetTy = EnumTy->getDecl()->getIntegerType();
+
+    return (RetTy->isPromotableIntegerType() ? ABIArgInfo::getExtend()
+                                             : ABIArgInfo::getDirect());
+  }
+
+  // Structures with either a non-trivial destructor or a non-trivial
+  // copy constructor are always indirect.
+  if (isRecordReturnIndirect(RetTy, getCXXABI()))
+    return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
+
+  if (isEmptyRecord(getContext(), RetTy, true))
+    return ABIArgInfo::getIgnore();
+
+  const Type *Base = 0;
+  if (isHomogeneousAggregate(RetTy, Base, getContext()))
+    // Homogeneous Floating-point Aggregates (HFAs) are returned directly.
+    return ABIArgInfo::getDirect();
+
+  // Aggregates <= 16 bytes are returned directly in registers or on the stack.
+  uint64_t Size = getContext().getTypeSize(RetTy);
+  if (Size <= 128) {
+    Size = 64 * ((Size + 63) / 64); // round up to multiple of 8 bytes
+    return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size));
+  }
+
+  return ABIArgInfo::getIndirect(0);
+}
+
+/// isIllegalVectorType - check whether the vector type is legal for ARM64.
+bool ARM64ABIInfo::isIllegalVectorType(QualType Ty) const {
+  if (const VectorType *VT = Ty->getAs<VectorType>()) {
+    // Check whether VT is legal.
+    unsigned NumElements = VT->getNumElements();
+    uint64_t Size = getContext().getTypeSize(VT);
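+    // For example, <3 x i32> (non-power-of-2 element count) and <16 x i16>
+    // (256 bits) are rejected below, while <2 x float> and <4 x i32> are
+    // legal.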
+    // NumElements should be a power of 2 between 1 and 16.
+    if ((NumElements & (NumElements - 1)) != 0 || NumElements > 16)
+      return true;
+    return Size != 64 && (Size != 128 || NumElements == 1);
+  }
+  return false;
+}
+
+static llvm::Value *EmitAArch64VAArg(llvm::Value *VAListAddr, QualType Ty,
+                                     int AllocatedGPR, int AllocatedVFP,
+                                     bool IsIndirect, CodeGenFunction &CGF) {
+  // The AArch64 va_list type and handling is specified in the Procedure Call
+  // Standard, section B.4:
+  //
+  // struct {
+  //   void *__stack;
+  //   void *__gr_top;
+  //   void *__vr_top;
+  //   int __gr_offs;
+  //   int __vr_offs;
+  // };
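+  //
+  // __gr_offs and __vr_offs start out negative (minus the size of the
+  // corresponding register save area) and grow towards zero as registers
+  // are consumed; a non-negative offset means that register class is
+  // exhausted and further arguments of that class live on the stack.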
+
+  llvm::BasicBlock *MaybeRegBlock = CGF.createBasicBlock("vaarg.maybe_reg");
+  llvm::BasicBlock *InRegBlock = CGF.createBasicBlock("vaarg.in_reg");
+  llvm::BasicBlock *OnStackBlock = CGF.createBasicBlock("vaarg.on_stack");
+  llvm::BasicBlock *ContBlock = CGF.createBasicBlock("vaarg.end");
+  auto &Ctx = CGF.getContext();
+
+  llvm::Value *reg_offs_p = 0, *reg_offs = 0;
+  int reg_top_index;
+  int RegSize;
+  if (AllocatedGPR) {
+    assert(!AllocatedVFP && "Arguments never split between int & VFP regs");
+    // 3 is the field number of __gr_offs
+    reg_offs_p = CGF.Builder.CreateStructGEP(VAListAddr, 3, "gr_offs_p");
+    reg_offs = CGF.Builder.CreateLoad(reg_offs_p, "gr_offs");
+    reg_top_index = 1; // field number for __gr_top
+    RegSize = 8 * AllocatedGPR;
+  } else {
+    assert(!AllocatedGPR && "Argument must go in VFP or int regs");
+    // 4 is the field number of __vr_offs.
+    reg_offs_p = CGF.Builder.CreateStructGEP(VAListAddr, 4, "vr_offs_p");
+    reg_offs = CGF.Builder.CreateLoad(reg_offs_p, "vr_offs");
+    reg_top_index = 2; // field number for __vr_top
+    RegSize = 16 * AllocatedVFP;
+  }
+
+  //=======================================
+  // Find out where argument was passed
+  //=======================================
+
+  // If reg_offs >= 0 we're already using the stack for this type of
+  // argument. We don't want to keep updating reg_offs (in case it overflows,
+  // though anyone passing 2GB of arguments, each at most 16 bytes, deserves
+  // whatever they get).
+  llvm::Value *UsingStack = 0;
+  UsingStack = CGF.Builder.CreateICmpSGE(
+      reg_offs, llvm::ConstantInt::get(CGF.Int32Ty, 0));
+
+  CGF.Builder.CreateCondBr(UsingStack, OnStackBlock, MaybeRegBlock);
+
+  // Otherwise, at least some kind of argument could go in these registers;
+  // the question is whether this particular type is too big.
+  CGF.EmitBlock(MaybeRegBlock);
+
+  // Integer arguments may need to be aligned to an even register (for
+  // example, a "struct { __int128 a; };" gets passed in x_2N, x_{2N+1}). In
+  // this case we align __gr_offs to calculate the potential address.
+  if (AllocatedGPR && !IsIndirect && Ctx.getTypeAlign(Ty) > 64) {
+    int Align = Ctx.getTypeAlign(Ty) / 8;
+
+    reg_offs = CGF.Builder.CreateAdd(
+        reg_offs, llvm::ConstantInt::get(CGF.Int32Ty, Align - 1),
+        "align_regoffs");
+    reg_offs = CGF.Builder.CreateAnd(
+        reg_offs, llvm::ConstantInt::get(CGF.Int32Ty, -Align),
+        "aligned_regoffs");
+  }
+
+  // Update the gr_offs/vr_offs pointer for next call to va_arg on this
+  // va_list.
+  llvm::Value *NewOffset = 0;
+  NewOffset = CGF.Builder.CreateAdd(
+      reg_offs, llvm::ConstantInt::get(CGF.Int32Ty, RegSize), "new_reg_offs");
+  CGF.Builder.CreateStore(NewOffset, reg_offs_p);
+
+  // Now we're in a position to decide whether this argument really was in
+  // registers or not.
+  llvm::Value *InRegs = 0;
+  InRegs = CGF.Builder.CreateICmpSLE(
+      NewOffset, llvm::ConstantInt::get(CGF.Int32Ty, 0), "inreg");
+
+  CGF.Builder.CreateCondBr(InRegs, InRegBlock, OnStackBlock);
+
+  //=======================================
+  // Argument was in registers
+  //=======================================
+
+  // Now we emit the code for if the argument was originally passed in
+  // registers. First start the appropriate block:
+  CGF.EmitBlock(InRegBlock);
+
+  llvm::Value *reg_top_p = 0, *reg_top = 0;
+  reg_top_p =
+      CGF.Builder.CreateStructGEP(VAListAddr, reg_top_index, "reg_top_p");
+  reg_top = CGF.Builder.CreateLoad(reg_top_p, "reg_top");
+  llvm::Value *BaseAddr = CGF.Builder.CreateGEP(reg_top, reg_offs);
+  llvm::Value *RegAddr = 0;
+  llvm::Type *MemTy = llvm::PointerType::getUnqual(CGF.ConvertTypeForMem(Ty));
+
+  if (IsIndirect) {
+    // If it's been passed indirectly (actually a struct), whatever we find from
+    // stored registers or on the stack will actually be a struct **.
+    MemTy = llvm::PointerType::getUnqual(MemTy);
+  }
+
+  const Type *Base = 0;
+  uint64_t NumMembers;
+  if (isHomogeneousAggregate(Ty, Base, Ctx, &NumMembers) && NumMembers > 1) {
+    // Homogeneous aggregates passed in registers will have their elements split
+    // and stored 16 bytes apart regardless of size (they're notionally in qN,
+    // qN+1, ...). We reload and store into a temporary local variable
+    // contiguously.
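+    // For example, on a little-endian target an HFA of four floats is
+    // reloaded from offsets 0, 16, 32 and 48 of its first register slot
+    // into a contiguous [4 x float] temporary.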
+    assert(!IsIndirect && "Homogeneous aggregates should be passed directly");
+    llvm::Type *BaseTy = CGF.ConvertType(QualType(Base, 0));
+    llvm::Type *HFATy = llvm::ArrayType::get(BaseTy, NumMembers);
+    llvm::Value *Tmp = CGF.CreateTempAlloca(HFATy);
+    int Offset = 0;
+
+    if (CGF.CGM.getDataLayout().isBigEndian() && Ctx.getTypeSize(Base) < 128)
+      Offset = 16 - Ctx.getTypeSize(Base) / 8;
+    for (unsigned i = 0; i < NumMembers; ++i) {
+      llvm::Value *BaseOffset =
+          llvm::ConstantInt::get(CGF.Int32Ty, 16 * i + Offset);
+      llvm::Value *LoadAddr = CGF.Builder.CreateGEP(BaseAddr, BaseOffset);
+      LoadAddr = CGF.Builder.CreateBitCast(
+          LoadAddr, llvm::PointerType::getUnqual(BaseTy));
+      llvm::Value *StoreAddr = CGF.Builder.CreateStructGEP(Tmp, i);
+
+      llvm::Value *Elem = CGF.Builder.CreateLoad(LoadAddr);
+      CGF.Builder.CreateStore(Elem, StoreAddr);
+    }
+
+    RegAddr = CGF.Builder.CreateBitCast(Tmp, MemTy);
+  } else {
+    // Otherwise the object is contiguous in memory
+    unsigned BeAlign = reg_top_index == 2 ? 16 : 8;
+    if (CGF.CGM.getDataLayout().isBigEndian() && !isAggregateTypeForABI(Ty) &&
+        Ctx.getTypeSize(Ty) < (BeAlign * 8)) {
+      int Offset = BeAlign - Ctx.getTypeSize(Ty) / 8;
+      BaseAddr = CGF.Builder.CreatePtrToInt(BaseAddr, CGF.Int64Ty);
+
+      BaseAddr = CGF.Builder.CreateAdd(
+          BaseAddr, llvm::ConstantInt::get(CGF.Int64Ty, Offset), "align_be");
+
+      BaseAddr = CGF.Builder.CreateIntToPtr(BaseAddr, CGF.Int8PtrTy);
+    }
+
+    RegAddr = CGF.Builder.CreateBitCast(BaseAddr, MemTy);
+  }
+
+  CGF.EmitBranch(ContBlock);
+
+  //=======================================
+  // Argument was on the stack
+  //=======================================
+  CGF.EmitBlock(OnStackBlock);
+
+  llvm::Value *stack_p = 0, *OnStackAddr = 0;
+  stack_p = CGF.Builder.CreateStructGEP(VAListAddr, 0, "stack_p");
+  OnStackAddr = CGF.Builder.CreateLoad(stack_p, "stack");
+
+  // Again, stack arguments may need realignment. In this case both integer
+  // and floating-point ones might be affected.
+  if (!IsIndirect && Ctx.getTypeAlign(Ty) > 64) {
+    int Align = Ctx.getTypeAlign(Ty) / 8;
+
+    OnStackAddr = CGF.Builder.CreatePtrToInt(OnStackAddr, CGF.Int64Ty);
+
+    OnStackAddr = CGF.Builder.CreateAdd(
+        OnStackAddr, llvm::ConstantInt::get(CGF.Int64Ty, Align - 1),
+        "align_stack");
+    OnStackAddr = CGF.Builder.CreateAnd(
+        OnStackAddr, llvm::ConstantInt::get(CGF.Int64Ty, -Align),
+        "align_stack");
+
+    OnStackAddr = CGF.Builder.CreateIntToPtr(OnStackAddr, CGF.Int8PtrTy);
+  }
+
+  uint64_t StackSize;
+  if (IsIndirect)
+    StackSize = 8;
+  else
+    StackSize = Ctx.getTypeSize(Ty) / 8;
+
+  // All stack slots are 8 bytes
+  StackSize = llvm::RoundUpToAlignment(StackSize, 8);
+
+  llvm::Value *StackSizeC = llvm::ConstantInt::get(CGF.Int32Ty, StackSize);
+  llvm::Value *NewStack =
+      CGF.Builder.CreateGEP(OnStackAddr, StackSizeC, "new_stack");
+
+  // Write the new value of __stack for the next call to va_arg
+  CGF.Builder.CreateStore(NewStack, stack_p);
+
+  if (CGF.CGM.getDataLayout().isBigEndian() && !isAggregateTypeForABI(Ty) &&
+      Ctx.getTypeSize(Ty) < 64) {
+    int Offset = 8 - Ctx.getTypeSize(Ty) / 8;
+    OnStackAddr = CGF.Builder.CreatePtrToInt(OnStackAddr, CGF.Int64Ty);
+
+    OnStackAddr = CGF.Builder.CreateAdd(
+        OnStackAddr, llvm::ConstantInt::get(CGF.Int64Ty, Offset), "align_be");
+
+    OnStackAddr = CGF.Builder.CreateIntToPtr(OnStackAddr, CGF.Int8PtrTy);
+  }
+
+  OnStackAddr = CGF.Builder.CreateBitCast(OnStackAddr, MemTy);
+
+  CGF.EmitBranch(ContBlock);
+
+  //=======================================
+  // Tidy up
+  //=======================================
+  CGF.EmitBlock(ContBlock);
+
+  llvm::PHINode *ResAddr = CGF.Builder.CreatePHI(MemTy, 2, "vaarg.addr");
+  ResAddr->addIncoming(RegAddr, InRegBlock);
+  ResAddr->addIncoming(OnStackAddr, OnStackBlock);
+
+  if (IsIndirect)
+    return CGF.Builder.CreateLoad(ResAddr, "vaarg.addr");
+
+  return ResAddr;
+}
+
+llvm::Value *ARM64ABIInfo::EmitAAPCSVAArg(llvm::Value *VAListAddr, QualType Ty,
+                                          CodeGenFunction &CGF) const {
+  unsigned AllocatedGPR = 0, AllocatedVFP = 0;
+  bool IsHA = false, IsSmallAggr = false;
+  ABIArgInfo AI =
+      classifyArgumentType(Ty, AllocatedVFP, IsHA, AllocatedGPR, IsSmallAggr);
+
+  return EmitAArch64VAArg(VAListAddr, Ty, AllocatedGPR, AllocatedVFP,
+                          AI.isIndirect(), CGF);
+}
+
+llvm::Value *ARM64ABIInfo::EmitDarwinVAArg(llvm::Value *VAListAddr, QualType Ty,
+                                           CodeGenFunction &CGF) const {
+  // The LLVM va_arg instruction does not handle aggregates or illegal
+  // vector types, so we lower those cases here; for all other cases we
+  // return null so that the LLVM va_arg instruction is used instead.
+  if (!isAggregateTypeForABI(Ty) && !isIllegalVectorType(Ty))
+    return 0;
+
+  uint64_t Size = CGF.getContext().getTypeSize(Ty) / 8;
+  uint64_t Align = CGF.getContext().getTypeAlign(Ty) / 8;
+
+  const Type *Base = 0;
+  bool isHA = isHomogeneousAggregate(Ty, Base, getContext());
+
+  bool isIndirect = false;
+  // Arguments bigger than 16 bytes which aren't homogeneous aggregates should
+  // be passed indirectly.
+  if (Size > 16 && !isHA) {
+    isIndirect = true;
+    Size = 8;
+    Align = 8;
+  }
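+  // For example, a 32-byte non-HFA struct is passed via pointer, so va_arg
+  // reads an 8-byte pointer from the list, while a 32-byte HFA of four
+  // doubles is read in place.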
+
+  llvm::Type *BP = llvm::Type::getInt8PtrTy(CGF.getLLVMContext());
+  llvm::Type *BPP = llvm::PointerType::getUnqual(BP);
+
+  CGBuilderTy &Builder = CGF.Builder;
+  llvm::Value *VAListAddrAsBPP = Builder.CreateBitCast(VAListAddr, BPP, "ap");
+  llvm::Value *Addr = Builder.CreateLoad(VAListAddrAsBPP, "ap.cur");
+
+  if (isEmptyRecord(getContext(), Ty, true)) {
+    // These are ignored for parameter passing purposes.
+    llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty));
+    return Builder.CreateBitCast(Addr, PTy);
+  }
+
+  const uint64_t MinABIAlign = 8;
+  if (Align > MinABIAlign) {
+    llvm::Value *Offset = llvm::ConstantInt::get(CGF.Int32Ty, Align - 1);
+    Addr = Builder.CreateGEP(Addr, Offset);
+    llvm::Value *AsInt = Builder.CreatePtrToInt(Addr, CGF.Int64Ty);
+    llvm::Value *Mask = llvm::ConstantInt::get(CGF.Int64Ty, ~(Align - 1));
+    llvm::Value *Aligned = Builder.CreateAnd(AsInt, Mask);
+    Addr = Builder.CreateIntToPtr(Aligned, BP, "ap.align");
+  }
+
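+  // For example, a 12-byte struct with 4-byte alignment needs no extra
+  // alignment here and advances ap by 16 below; an indirect argument always
+  // advances ap by 8.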
+  uint64_t Offset = llvm::RoundUpToAlignment(Size, MinABIAlign);
+  llvm::Value *NextAddr = Builder.CreateGEP(
+      Addr, llvm::ConstantInt::get(CGF.Int32Ty, Offset), "ap.next");
+  Builder.CreateStore(NextAddr, VAListAddrAsBPP);
+
+  if (isIndirect)
+    Addr = Builder.CreateLoad(Builder.CreateBitCast(Addr, BPP));
+  llvm::Type *PTy = llvm::PointerType::getUnqual(CGF.ConvertType(Ty));
+  llvm::Value *AddrTyped = Builder.CreateBitCast(Addr, PTy);
+
+  return AddrTyped;
+}
+
+//===----------------------------------------------------------------------===//
 // ARM ABI Implementation
 //===----------------------------------------------------------------------===//
 
@@ -3064,35 +3713,62 @@
 
 private:
   ABIKind Kind;
+  mutable int VFPRegs[16];
+  const unsigned NumVFPs;
+  const unsigned NumGPRs;
+  mutable unsigned AllocatedGPRs;
+  mutable unsigned AllocatedVFPs;
 
 public:
-  ARMABIInfo(CodeGenTypes &CGT, ABIKind _Kind) : ABIInfo(CGT), Kind(_Kind) {
+  ARMABIInfo(CodeGenTypes &CGT, ABIKind _Kind) : ABIInfo(CGT), Kind(_Kind),
+    NumVFPs(16), NumGPRs(4) {
     setRuntimeCC();
+    resetAllocatedRegs();
   }
 
   bool isEABI() const {
-    StringRef Env = getTarget().getTriple().getEnvironmentName();
-    return (Env == "gnueabi" || Env == "eabi" ||
-            Env == "android" || Env == "androideabi");
+    switch (getTarget().getTriple().getEnvironment()) {
+    case llvm::Triple::Android:
+    case llvm::Triple::EABI:
+    case llvm::Triple::EABIHF:
+    case llvm::Triple::GNUEABI:
+    case llvm::Triple::GNUEABIHF:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  bool isEABIHF() const {
+    switch (getTarget().getTriple().getEnvironment()) {
+    case llvm::Triple::EABIHF:
+    case llvm::Triple::GNUEABIHF:
+      return true;
+    default:
+      return false;
+    }
   }
 
   ABIKind getABIKind() const { return Kind; }
 
 private:
-  ABIArgInfo classifyReturnType(QualType RetTy) const;
-  ABIArgInfo classifyArgumentType(QualType RetTy, int *VFPRegs,
-                                  unsigned &AllocatedVFP,
-                                  bool &IsHA) const;
+  ABIArgInfo classifyReturnType(QualType RetTy, bool isVariadic) const;
+  ABIArgInfo classifyArgumentType(QualType RetTy, bool &IsHA, bool isVariadic,
+                                  bool &IsCPRC) const;
   bool isIllegalVectorType(QualType Ty) const;
 
-  virtual void computeInfo(CGFunctionInfo &FI) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
 
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 
   llvm::CallingConv::ID getLLVMDefaultCC() const;
   llvm::CallingConv::ID getABIDefaultCC() const;
   void setRuntimeCC();
+
+  void markAllocatedGPRs(unsigned Alignment, unsigned NumRequired) const;
+  void markAllocatedVFPs(unsigned Alignment, unsigned NumRequired) const;
+  void resetAllocatedRegs(void) const;
 };
 
 class ARMTargetCodeGenInfo : public TargetCodeGenInfo {
@@ -3104,16 +3780,16 @@
     return static_cast<const ARMABIInfo&>(TargetCodeGenInfo::getABIInfo());
   }
 
-  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const {
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override {
     return 13;
   }
 
-  StringRef getARCRetainAutoreleasedReturnValueMarker() const {
+  StringRef getARCRetainAutoreleasedReturnValueMarker() const override {
     return "mov\tr7, r7\t\t@ marker for objc_retainAutoreleaseReturnValue";
   }
 
   bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
-                               llvm::Value *Address) const {
+                               llvm::Value *Address) const override {
     llvm::Value *Four8 = llvm::ConstantInt::get(CGF.Int8Ty, 4);
 
     // 0-15 are the 16 integer registers.
@@ -3121,13 +3797,13 @@
     return false;
   }
 
-  unsigned getSizeOfUnwindException() const {
+  unsigned getSizeOfUnwindException() const override {
     if (getABIInfo().isEABI()) return 88;
     return TargetCodeGenInfo::getSizeOfUnwindException();
   }
 
   void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
-                           CodeGen::CodeGenModule &CGM) const {
+                           CodeGen::CodeGenModule &CGM) const override {
     const FunctionDecl *FD = dyn_cast<FunctionDecl>(D);
     if (!FD)
       return;
@@ -3176,24 +3852,40 @@
   // allocated to the lowest-numbered sequence of such registers.
   // C.2.vfp If the argument is a VFP CPRC then any VFP registers that are
   // unallocated are marked as unavailable. 
-  unsigned AllocatedVFP = 0;
-  int VFPRegs[16] = { 0 };
-  FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
-  for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-       it != ie; ++it) {
-    unsigned PreAllocation = AllocatedVFP;
+  resetAllocatedRegs();
+
+  FI.getReturnInfo() = classifyReturnType(FI.getReturnType(), FI.isVariadic());
+  for (auto &I : FI.arguments()) {
+    unsigned PreAllocationVFPs = AllocatedVFPs;
+    unsigned PreAllocationGPRs = AllocatedGPRs;
     bool IsHA = false;
+    bool IsCPRC = false;
     // 6.1.2.3 There is one VFP co-processor register class using registers
     // s0-s15 (d0-d7) for passing arguments.
-    const unsigned NumVFPs = 16;
-    it->info = classifyArgumentType(it->type, VFPRegs, AllocatedVFP, IsHA);
+    I.info = classifyArgumentType(I.type, IsHA, FI.isVariadic(), IsCPRC);
+    assert((IsCPRC || !IsHA) && "Homogeneous aggregates must be CPRCs");
     // If we do not have enough VFP registers for the HA, any VFP registers
     // that are unallocated are marked as unavailable. To achieve this, we add
-    // padding of (NumVFPs - PreAllocation) floats.
-    if (IsHA && AllocatedVFP > NumVFPs && PreAllocation < NumVFPs) {
+    // padding of (NumVFPs - PreAllocationVFPs) floats.
+    // Note that IsHA will only be set when using the AAPCS-VFP calling
+    // convention and the callee is not variadic.
+    if (IsHA && AllocatedVFPs > NumVFPs && PreAllocationVFPs < NumVFPs) {
       llvm::Type *PaddingTy = llvm::ArrayType::get(
-          llvm::Type::getFloatTy(getVMContext()), NumVFPs - PreAllocation);
-      it->info = ABIArgInfo::getExpandWithPadding(false, PaddingTy);
+          llvm::Type::getFloatTy(getVMContext()), NumVFPs - PreAllocationVFPs);
+      I.info = ABIArgInfo::getExpandWithPadding(false, PaddingTy);
+    }
+
+    // If we have allocated some arguments onto the stack (due to running
+    // out of VFP registers), we cannot split an argument between GPRs and
+    // the stack. If this situation occurs, we add padding to prevent the
+    // GPRs from being used. In this situation, the current argument could
+    // only be allocated by rule C.8, so rule C.6 would mark these GPRs as
+    // unusable anyway.
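+    // For example, if an earlier HFA overflowed the VFP registers onto the
+    // stack, a later non-CPRC argument that would straddle the last GPRs
+    // and the stack is padded so that it goes entirely on the stack.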
+    const bool StackUsed =
+        PreAllocationGPRs > NumGPRs || PreAllocationVFPs > NumVFPs;
+    if (!IsCPRC && PreAllocationGPRs < NumGPRs && AllocatedGPRs > NumGPRs &&
+        StackUsed) {
+      llvm::Type *PaddingTy = llvm::ArrayType::get(
+          llvm::Type::getInt32Ty(getVMContext()), NumGPRs - PreAllocationGPRs);
+      I.info = ABIArgInfo::getExpandWithPadding(false, PaddingTy);
     }
   }
 
@@ -3209,7 +3901,7 @@
 /// Return the default calling convention that LLVM will use.
 llvm::CallingConv::ID ARMABIInfo::getLLVMDefaultCC() const {
   // The default calling convention that LLVM will infer.
-  if (getTarget().getTriple().getEnvironmentName()=="gnueabihf")
+  if (isEABIHF())
     return llvm::CallingConv::ARM_AAPCS_VFP;
   else if (isEABI())
     return llvm::CallingConv::ARM_AAPCS;
@@ -3243,8 +3935,7 @@
 /// contained in the type is returned through it; this is used for the
 /// recursive calls that check aggregate component types.
 static bool isHomogeneousAggregate(QualType Ty, const Type *&Base,
-                                   ASTContext &Context,
-                                   uint64_t *HAMembers = 0) {
+                                   ASTContext &Context, uint64_t *HAMembers) {
   uint64_t Members = 0;
   if (const ConstantArrayType *AT = Context.getAsConstantArrayType(Ty)) {
     if (!isHomogeneousAggregate(AT->getElementType(), Base, Context, &Members))
@@ -3256,9 +3947,7 @@
       return false;
 
     Members = 0;
-    for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
-         i != e; ++i) {
-      const FieldDecl *FD = *i;
+    for (const auto *FD : RD->fields()) {
       uint64_t FldMembers;
       if (!isHomogeneousAggregate(FD->getType(), Base, Context, &FldMembers))
         return false;
@@ -3293,10 +3982,29 @@
     const Type *TyPtr = Ty.getTypePtr();
     if (!Base)
       Base = TyPtr;
-    if (Base != TyPtr &&
-        (!Base->isVectorType() || !TyPtr->isVectorType() ||
-         Context.getTypeSize(Base) != Context.getTypeSize(TyPtr)))
-      return false;
+
+    if (Base != TyPtr) {
+      // Homogeneous aggregates are defined as containing members with the
+      // same machine type. There are two cases in which two members have
+      // different TypePtrs but the same machine type:
+
+      // 1) Vectors of the same length, regardless of the type and number
+      //    of their members.
+      const bool SameLengthVectors =
+           Base->isVectorType() && TyPtr->isVectorType()
+        && (Context.getTypeSize(Base) == Context.getTypeSize(TyPtr));
+
+      // 2) In the 32-bit AAPCS, `double' and `long double' have the same
+      //    machine type. This is not the case for the 64-bit AAPCS.
+      const bool SameSizeDoubles =
+           (   (   Base->isSpecificBuiltinType(BuiltinType::Double)
+                && TyPtr->isSpecificBuiltinType(BuiltinType::LongDouble))
+            || (   Base->isSpecificBuiltinType(BuiltinType::LongDouble)
+                && TyPtr->isSpecificBuiltinType(BuiltinType::Double)))
+        && (Context.getTypeSize(Base) == Context.getTypeSize(TyPtr));
+
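+      // For example, struct { int32x2_t a; float32x2_t b; } is still
+      // homogeneous under case 1, and struct { double d; long double l; }
+      // under case 2 (32-bit AAPCS only).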
+      if (!SameLengthVectors && !SameSizeDoubles)
+        return false;
+    }
   }
 
   // Homogeneous Aggregates can have at most 4 members of the base type.
@@ -3308,12 +4016,15 @@
 
 /// markAllocatedVFPs - update VFPRegs according to the alignment and
 /// number of VFP registers (unit is S register) requested.
-static void markAllocatedVFPs(int *VFPRegs, unsigned &AllocatedVFP,
-                              unsigned Alignment,
-                              unsigned NumRequired) {
+void ARMABIInfo::markAllocatedVFPs(unsigned Alignment,
+                                   unsigned NumRequired) const {
   // Early Exit.
-  if (AllocatedVFP >= 16)
+  if (AllocatedVFPs >= 16) {
+    // We use AllocatedVFPs > 16 to signal that some CPRCs were allocated
+    // on the stack.
+    AllocatedVFPs = 17;
     return;
+  }
   // C.1.vfp If the argument is a VFP CPRC and there are sufficient consecutive
   // VFP registers of the appropriate type unallocated then the argument is
   // allocated to the lowest-numbered sequence of such registers.
@@ -3327,7 +4038,7 @@
     if (FoundSlot) {
       for (unsigned J = I, JEnd = I + NumRequired; J < JEnd; J++)
         VFPRegs[J] = 1;
-      AllocatedVFP += NumRequired;
+      AllocatedVFPs += NumRequired;
       return;
     }
   }
@@ -3335,12 +4046,32 @@
   // unallocated are marked as unavailable.
   for (unsigned I = 0; I < 16; I++)
     VFPRegs[I] = 1;
-  AllocatedVFP = 17; // We do not have enough VFP registers.
+  AllocatedVFPs = 17; // We do not have enough VFP registers.
 }
 
-ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, int *VFPRegs,
-                                            unsigned &AllocatedVFP,
-                                            bool &IsHA) const {
+/// Update AllocatedGPRs to record the number of general purpose registers
+/// which have been allocated. It is valid for AllocatedGPRs to go above 4;
+/// this represents arguments being stored on the stack.
+void ARMABIInfo::markAllocatedGPRs(unsigned Alignment,
+                                   unsigned NumRequired) const {
+  // Alignment is given in 4-byte units: 1 requests 4-byte and 2 requests
+  // 8-byte alignment.
+  assert((Alignment == 1 || Alignment == 2) &&
+         "Alignment must be 4 or 8 bytes");
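+  // For example, with one GPR already allocated, an 8-byte-aligned argument
+  // (Alignment == 2) skips r1 and takes r2-r3: AllocatedGPRs goes 1 -> 4.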
+
+  if (Alignment == 2 && AllocatedGPRs & 0x1)
+    AllocatedGPRs += 1;
+
+  AllocatedGPRs += NumRequired;
+}
+
+void ARMABIInfo::resetAllocatedRegs(void) const {
+  AllocatedGPRs = 0;
+  AllocatedVFPs = 0;
+  for (unsigned i = 0; i < NumVFPs; ++i)
+    VFPRegs[i] = 0;
+}
+
+ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool &IsHA,
+                                            bool isVariadic,
+                                            bool &IsCPRC) const {
   // We update number of allocated VFPs according to
   // 6.1.2.1 The following argument types are VFP CPRCs:
   //   A single-precision floating-point type (including promoted
@@ -3356,55 +4087,82 @@
     if (Size <= 32) {
       llvm::Type *ResType =
           llvm::Type::getInt32Ty(getVMContext());
+      markAllocatedGPRs(1, 1);
       return ABIArgInfo::getDirect(ResType);
     }
     if (Size == 64) {
       llvm::Type *ResType = llvm::VectorType::get(
           llvm::Type::getInt32Ty(getVMContext()), 2);
-      markAllocatedVFPs(VFPRegs, AllocatedVFP, 2, 2);
+      if (getABIKind() == ARMABIInfo::AAPCS || isVariadic) {
+        markAllocatedGPRs(2, 2);
+      } else {
+        markAllocatedVFPs(2, 2);
+        IsCPRC = true;
+      }
       return ABIArgInfo::getDirect(ResType);
     }
     if (Size == 128) {
       llvm::Type *ResType = llvm::VectorType::get(
           llvm::Type::getInt32Ty(getVMContext()), 4);
-      markAllocatedVFPs(VFPRegs, AllocatedVFP, 4, 4);
+      if (getABIKind() == ARMABIInfo::AAPCS || isVariadic) {
+        markAllocatedGPRs(2, 4);
+      } else {
+        markAllocatedVFPs(4, 4);
+        IsCPRC = true;
+      }
       return ABIArgInfo::getDirect(ResType);
     }
+    markAllocatedGPRs(1, 1);
     return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
   }
   // Update VFPRegs for legal vector types.
-  if (const VectorType *VT = Ty->getAs<VectorType>()) {
-    uint64_t Size = getContext().getTypeSize(VT);
-    // Size of a legal vector should be power of 2 and above 64.
-    markAllocatedVFPs(VFPRegs, AllocatedVFP, Size >= 128 ? 4 : 2, Size / 32);
+  if (getABIKind() == ARMABIInfo::AAPCS_VFP && !isVariadic) {
+    if (const VectorType *VT = Ty->getAs<VectorType>()) {
+      uint64_t Size = getContext().getTypeSize(VT);
+      // Size of a legal vector should be a power of 2 and at least 64.
+      markAllocatedVFPs(Size >= 128 ? 4 : 2, Size / 32);
+      IsCPRC = true;
+    }
   }
   // Update VFPRegs for floating point types.
-  if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
-    if (BT->getKind() == BuiltinType::Half ||
-        BT->getKind() == BuiltinType::Float)
-      markAllocatedVFPs(VFPRegs, AllocatedVFP, 1, 1);
-    if (BT->getKind() == BuiltinType::Double ||
-        BT->getKind() == BuiltinType::LongDouble)
-      markAllocatedVFPs(VFPRegs, AllocatedVFP, 2, 2);
+  if (getABIKind() == ARMABIInfo::AAPCS_VFP && !isVariadic) {
+    if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
+      if (BT->getKind() == BuiltinType::Half ||
+          BT->getKind() == BuiltinType::Float) {
+        markAllocatedVFPs(1, 1);
+        IsCPRC = true;
+      }
+      if (BT->getKind() == BuiltinType::Double ||
+          BT->getKind() == BuiltinType::LongDouble) {
+        markAllocatedVFPs(2, 2);
+        IsCPRC = true;
+      }
+    }
   }
 
   if (!isAggregateTypeForABI(Ty)) {
     // Treat an enum type as its underlying type.
-    if (const EnumType *EnumTy = Ty->getAs<EnumType>())
+    if (const EnumType *EnumTy = Ty->getAs<EnumType>()) {
       Ty = EnumTy->getDecl()->getIntegerType();
+    }
 
+    unsigned Size = getContext().getTypeSize(Ty);
+    if (!IsCPRC)
+      markAllocatedGPRs(Size > 32 ? 2 : 1, (Size + 31) / 32);
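+    // For example, an int takes a single GPR, while a long long takes an
+    // even-aligned pair of GPRs.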
     return (Ty->isPromotableIntegerType() ?
             ABIArgInfo::getExtend() : ABIArgInfo::getDirect());
   }
 
-  if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
+  if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) {
+    markAllocatedGPRs(1, 1);
     return ABIArgInfo::getIndirect(0, RAA == CGCXXABI::RAA_DirectInMemory);
+  }
 
   // Ignore empty records.
   if (isEmptyRecord(getContext(), Ty, true))
     return ABIArgInfo::getIgnore();
 
-  if (getABIKind() == ARMABIInfo::AAPCS_VFP) {
+  if (getABIKind() == ARMABIInfo::AAPCS_VFP && !isVariadic) {
     // Homogeneous Aggregates need to be expanded when we can fit the aggregate
     // into VFP registers.
     const Type *Base = 0;
@@ -3415,16 +4173,17 @@
       if (Base->isVectorType()) {
         // ElementSize is in number of floats.
         unsigned ElementSize = getContext().getTypeSize(Base) == 64 ? 2 : 4;
-        markAllocatedVFPs(VFPRegs, AllocatedVFP, ElementSize,
+        markAllocatedVFPs(ElementSize,
                           Members * ElementSize);
       } else if (Base->isSpecificBuiltinType(BuiltinType::Float))
-        markAllocatedVFPs(VFPRegs, AllocatedVFP, 1, Members);
+        markAllocatedVFPs(1, Members);
       else {
         assert(Base->isSpecificBuiltinType(BuiltinType::Double) ||
                Base->isSpecificBuiltinType(BuiltinType::LongDouble));
-        markAllocatedVFPs(VFPRegs, AllocatedVFP, 2, Members * 2);
+        markAllocatedVFPs(2, Members * 2);
       }
       IsHA = true;
+      IsCPRC = true;
       return ABIArgInfo::getExpand();
     }
   }
@@ -3439,7 +4198,9 @@
       getABIKind() == ARMABIInfo::AAPCS)
     ABIAlign = std::min(std::max(TyAlign, (uint64_t)4), (uint64_t)8);
   if (getContext().getTypeSizeInChars(Ty) > CharUnits::fromQuantity(64)) {
-    return ABIArgInfo::getIndirect(0, /*ByVal=*/true,
+    // Update the number of allocated GPRs.
+    markAllocatedGPRs(1, 1);
+    return ABIArgInfo::getIndirect(TyAlign, /*ByVal=*/true,
            /*Realign=*/TyAlign > ABIAlign);
   }
 
@@ -3451,9 +4212,11 @@
   if (getContext().getTypeAlign(Ty) <= 32) {
     ElemTy = llvm::Type::getInt32Ty(getVMContext());
     SizeRegs = (getContext().getTypeSize(Ty) + 31) / 32;
+    markAllocatedGPRs(1, SizeRegs);
   } else {
     ElemTy = llvm::Type::getInt64Ty(getVMContext());
     SizeRegs = (getContext().getTypeSize(Ty) + 63) / 64;
+    markAllocatedGPRs(2, SizeRegs * 2);
   }
 
   llvm::Type *STy =
@@ -3546,13 +4309,16 @@
   return true;
 }
 
-ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy) const {
+ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
+                                          bool isVariadic) const {
   if (RetTy->isVoidType())
     return ABIArgInfo::getIgnore();
 
   // Large vector types should be returned via memory.
-  if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 128)
+  if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 128) {
+    markAllocatedGPRs(1, 1);
     return ABIArgInfo::getIndirect(0);
+  }
 
   if (!isAggregateTypeForABI(RetTy)) {
     // Treat an enum type as its underlying type.
@@ -3565,8 +4331,10 @@
 
   // Structures with either a non-trivial destructor or a non-trivial
   // copy constructor are always indirect.
-  if (isRecordReturnIndirect(RetTy, getCXXABI()))
+  if (isRecordReturnIndirect(RetTy, getCXXABI())) {
+    markAllocatedGPRs(1, 1);
     return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
+  }
 
   // Are we following APCS?
   if (getABIKind() == APCS) {
@@ -3593,6 +4361,7 @@
     }
 
     // Otherwise return in memory.
+    markAllocatedGPRs(1, 1);
     return ABIArgInfo::getIndirect(0);
   }
 
@@ -3602,7 +4371,7 @@
     return ABIArgInfo::getIgnore();
 
   // Check for homogeneous aggregates with AAPCS-VFP.
-  if (getABIKind() == AAPCS_VFP) {
+  if (getABIKind() == AAPCS_VFP && !isVariadic) {
     const Type *Base = 0;
     if (isHomogeneousAggregate(RetTy, Base, getContext())) {
       assert(Base && "Base class should be set for homogeneous aggregate");
@@ -3623,6 +4392,7 @@
     return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
   }
 
+  markAllocatedGPRs(1, 1);
   return ABIArgInfo::getIndirect(0);
 }
 
@@ -3720,9 +4490,9 @@
  public:
   NaClARMABIInfo(CodeGen::CodeGenTypes &CGT, ARMABIInfo::ABIKind Kind)
       : ABIInfo(CGT), PInfo(CGT), NInfo(CGT, Kind) {}
-  virtual void computeInfo(CGFunctionInfo &FI) const;
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
  private:
   PNaClABIInfo PInfo; // Used for generating calls with pnaclcall callingconv.
   ARMABIInfo NInfo; // Used for everything else.
@@ -3770,10 +4540,10 @@
   ABIArgInfo tryUseRegs(QualType Ty, int &FreeRegs, int RegsNeeded, bool IsInt,
                         llvm::Type *DirectTy = 0) const;
 
-  virtual void computeInfo(CGFunctionInfo &FI) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
 
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 };
 
 class AArch64TargetCodeGenInfo : public TargetCodeGenInfo {
@@ -3785,12 +4555,12 @@
     return static_cast<const AArch64ABIInfo&>(TargetCodeGenInfo::getABIInfo());
   }
 
-  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const {
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override {
     return 31;
   }
 
   bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
-                               llvm::Value *Address) const {
+                               llvm::Value *Address) const override {
     // 0-31 are x0-x30 and sp: 8 bytes each
     llvm::Value *Eight8 = llvm::ConstantInt::get(CGF.Int8Ty, 8);
     AssignToArrayRange(CGF.Builder, Address, Eight8, 0, 31);
@@ -3813,9 +4583,8 @@
                                            FreeIntRegs, FreeVFPRegs);
 
   FreeIntRegs = FreeVFPRegs = 8;
-  for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-       it != ie; ++it) {
-    it->info = classifyGenericType(it->type, FreeIntRegs, FreeVFPRegs);
+  for (auto &I : FI.arguments()) {
+    I.info = classifyGenericType(I.type, FreeIntRegs, FreeVFPRegs);
 
   }
 }
@@ -3958,208 +4727,12 @@
 
 llvm::Value *AArch64ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
                                        CodeGenFunction &CGF) const {
-  // The AArch64 va_list type and handling is specified in the Procedure Call
-  // Standard, section B.4:
-  //
-  // struct {
-  //   void *__stack;
-  //   void *__gr_top;
-  //   void *__vr_top;
-  //   int __gr_offs;
-  //   int __vr_offs;
-  // };
-
-  assert(!CGF.CGM.getDataLayout().isBigEndian()
-         && "va_arg not implemented for big-endian AArch64");
-
   int FreeIntRegs = 8, FreeVFPRegs = 8;
   Ty = CGF.getContext().getCanonicalType(Ty);
   ABIArgInfo AI = classifyGenericType(Ty, FreeIntRegs, FreeVFPRegs);
 
-  llvm::BasicBlock *MaybeRegBlock = CGF.createBasicBlock("vaarg.maybe_reg");
-  llvm::BasicBlock *InRegBlock = CGF.createBasicBlock("vaarg.in_reg");
-  llvm::BasicBlock *OnStackBlock = CGF.createBasicBlock("vaarg.on_stack");
-  llvm::BasicBlock *ContBlock = CGF.createBasicBlock("vaarg.end");
-
-  llvm::Value *reg_offs_p = 0, *reg_offs = 0;
-  int reg_top_index;
-  int RegSize;
-  if (FreeIntRegs < 8) {
-    assert(FreeVFPRegs == 8 && "Arguments never split between int & VFP regs");
-    // 3 is the field number of __gr_offs
-    reg_offs_p = CGF.Builder.CreateStructGEP(VAListAddr, 3, "gr_offs_p");
-    reg_offs = CGF.Builder.CreateLoad(reg_offs_p, "gr_offs");
-    reg_top_index = 1; // field number for __gr_top
-    RegSize = 8 * (8 - FreeIntRegs);
-  } else {
-    assert(FreeVFPRegs < 8 && "Argument must go in VFP or int regs");
-    // 4 is the field number of __vr_offs.
-    reg_offs_p = CGF.Builder.CreateStructGEP(VAListAddr, 4, "vr_offs_p");
-    reg_offs = CGF.Builder.CreateLoad(reg_offs_p, "vr_offs");
-    reg_top_index = 2; // field number for __vr_top
-    RegSize = 16 * (8 - FreeVFPRegs);
-  }
-
-  //=======================================
-  // Find out where argument was passed
-  //=======================================
-
-  // If reg_offs >= 0 we're already using the stack for this type of
-  // argument. We don't want to keep updating reg_offs (in case it overflows,
-  // though anyone passing 2GB of arguments, each at most 16 bytes, deserves
-  // whatever they get).
-  llvm::Value *UsingStack = 0;
-  UsingStack = CGF.Builder.CreateICmpSGE(reg_offs,
-                                         llvm::ConstantInt::get(CGF.Int32Ty, 0));
-
-  CGF.Builder.CreateCondBr(UsingStack, OnStackBlock, MaybeRegBlock);
-
-  // Otherwise, at least some kind of argument could go in these registers, the
-  // quesiton is whether this particular type is too big.
-  CGF.EmitBlock(MaybeRegBlock);
-
-  // Integer arguments may need to correct register alignment (for example a
-  // "struct { __int128 a; };" gets passed in x_2N, x_{2N+1}). In this case we
-  // align __gr_offs to calculate the potential address.
-  if (FreeIntRegs < 8 && AI.isDirect() && getContext().getTypeAlign(Ty) > 64) {
-    int Align = getContext().getTypeAlign(Ty) / 8;
-
-    reg_offs = CGF.Builder.CreateAdd(reg_offs,
-                                 llvm::ConstantInt::get(CGF.Int32Ty, Align - 1),
-                                 "align_regoffs");
-    reg_offs = CGF.Builder.CreateAnd(reg_offs,
-                                    llvm::ConstantInt::get(CGF.Int32Ty, -Align),
-                                    "aligned_regoffs");
-  }
-
-  // Update the gr_offs/vr_offs pointer for next call to va_arg on this va_list.
-  llvm::Value *NewOffset = 0;
-  NewOffset = CGF.Builder.CreateAdd(reg_offs,
-                                    llvm::ConstantInt::get(CGF.Int32Ty, RegSize),
-                                    "new_reg_offs");
-  CGF.Builder.CreateStore(NewOffset, reg_offs_p);
-
-  // Now we're in a position to decide whether this argument really was in
-  // registers or not.
-  llvm::Value *InRegs = 0;
-  InRegs = CGF.Builder.CreateICmpSLE(NewOffset,
-                                     llvm::ConstantInt::get(CGF.Int32Ty, 0),
-                                     "inreg");
-
-  CGF.Builder.CreateCondBr(InRegs, InRegBlock, OnStackBlock);
-
-  //=======================================
-  // Argument was in registers
-  //=======================================
-
-  // Now we emit the code for if the argument was originally passed in
-  // registers. First start the appropriate block:
-  CGF.EmitBlock(InRegBlock);
-
-  llvm::Value *reg_top_p = 0, *reg_top = 0;
-  reg_top_p = CGF.Builder.CreateStructGEP(VAListAddr, reg_top_index, "reg_top_p");
-  reg_top = CGF.Builder.CreateLoad(reg_top_p, "reg_top");
-  llvm::Value *BaseAddr = CGF.Builder.CreateGEP(reg_top, reg_offs);
-  llvm::Value *RegAddr = 0;
-  llvm::Type *MemTy = llvm::PointerType::getUnqual(CGF.ConvertTypeForMem(Ty));
-
-  if (!AI.isDirect()) {
-    // If it's been passed indirectly (actually a struct), whatever we find from
-    // stored registers or on the stack will actually be a struct **.
-    MemTy = llvm::PointerType::getUnqual(MemTy);
-  }
-
-  const Type *Base = 0;
-  uint64_t NumMembers;
-  if (isHomogeneousAggregate(Ty, Base, getContext(), &NumMembers)
-      && NumMembers > 1) {
-    // Homogeneous aggregates passed in registers will have their elements split
-    // and stored 16-bytes apart regardless of size (they're notionally in qN,
-    // qN+1, ...). We reload and store into a temporary local variable
-    // contiguously.
-    assert(AI.isDirect() && "Homogeneous aggregates should be passed directly");
-    llvm::Type *BaseTy = CGF.ConvertType(QualType(Base, 0));
-    llvm::Type *HFATy = llvm::ArrayType::get(BaseTy, NumMembers);
-    llvm::Value *Tmp = CGF.CreateTempAlloca(HFATy);
-
-    for (unsigned i = 0; i < NumMembers; ++i) {
-      llvm::Value *BaseOffset = llvm::ConstantInt::get(CGF.Int32Ty, 16 * i);
-      llvm::Value *LoadAddr = CGF.Builder.CreateGEP(BaseAddr, BaseOffset);
-      LoadAddr = CGF.Builder.CreateBitCast(LoadAddr,
-                                           llvm::PointerType::getUnqual(BaseTy));
-      llvm::Value *StoreAddr = CGF.Builder.CreateStructGEP(Tmp, i);
-
-      llvm::Value *Elem = CGF.Builder.CreateLoad(LoadAddr);
-      CGF.Builder.CreateStore(Elem, StoreAddr);
-    }
-
-    RegAddr = CGF.Builder.CreateBitCast(Tmp, MemTy);
-  } else {
-    // Otherwise the object is contiguous in memory
-    RegAddr = CGF.Builder.CreateBitCast(BaseAddr, MemTy);
-  }
-
-  CGF.EmitBranch(ContBlock);
-
-  //=======================================
-  // Argument was on the stack
-  //=======================================
-  CGF.EmitBlock(OnStackBlock);
-
-  llvm::Value *stack_p = 0, *OnStackAddr = 0;
-  stack_p = CGF.Builder.CreateStructGEP(VAListAddr, 0, "stack_p");
-  OnStackAddr = CGF.Builder.CreateLoad(stack_p, "stack");
-
-  // Again, stack arguments may need realigmnent. In this case both integer and
-  // floating-point ones might be affected.
-  if (AI.isDirect() && getContext().getTypeAlign(Ty) > 64) {
-    int Align = getContext().getTypeAlign(Ty) / 8;
-
-    OnStackAddr = CGF.Builder.CreatePtrToInt(OnStackAddr, CGF.Int64Ty);
-
-    OnStackAddr = CGF.Builder.CreateAdd(OnStackAddr,
-                                 llvm::ConstantInt::get(CGF.Int64Ty, Align - 1),
-                                 "align_stack");
-    OnStackAddr = CGF.Builder.CreateAnd(OnStackAddr,
-                                    llvm::ConstantInt::get(CGF.Int64Ty, -Align),
-                                    "align_stack");
-
-    OnStackAddr = CGF.Builder.CreateIntToPtr(OnStackAddr, CGF.Int8PtrTy);
-  }
-
-  uint64_t StackSize;
-  if (AI.isDirect())
-    StackSize = getContext().getTypeSize(Ty) / 8;
-  else
-    StackSize = 8;
-
-  // All stack slots are 8 bytes
-  StackSize = llvm::RoundUpToAlignment(StackSize, 8);
-
-  llvm::Value *StackSizeC = llvm::ConstantInt::get(CGF.Int32Ty, StackSize);
-  llvm::Value *NewStack = CGF.Builder.CreateGEP(OnStackAddr, StackSizeC,
-                                                "new_stack");
-
-  // Write the new value of __stack for the next call to va_arg
-  CGF.Builder.CreateStore(NewStack, stack_p);
-
-  OnStackAddr = CGF.Builder.CreateBitCast(OnStackAddr, MemTy);
-
-  CGF.EmitBranch(ContBlock);
-
-  //=======================================
-  // Tidy up
-  //=======================================
-  CGF.EmitBlock(ContBlock);
-
-  llvm::PHINode *ResAddr = CGF.Builder.CreatePHI(MemTy, 2, "vaarg.addr");
-  ResAddr->addIncoming(RegAddr, InRegBlock);
-  ResAddr->addIncoming(OnStackAddr, OnStackBlock);
-
-  if (AI.isDirect())
-    return ResAddr;
-
-  return CGF.Builder.CreateLoad(ResAddr, "vaarg.addr");
+  return EmitAArch64VAArg(VAListAddr, Ty, 8 - FreeIntRegs, 8 - FreeVFPRegs,
+                          AI.isIndirect(), CGF);
 }
 
 //===----------------------------------------------------------------------===//
@@ -4175,18 +4748,18 @@
   ABIArgInfo classifyReturnType(QualType RetTy) const;
   ABIArgInfo classifyArgumentType(QualType Ty) const;
 
-  virtual void computeInfo(CGFunctionInfo &FI) const;
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CFG) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CFG) const override;
 };
 
 class NVPTXTargetCodeGenInfo : public TargetCodeGenInfo {
 public:
   NVPTXTargetCodeGenInfo(CodeGenTypes &CGT)
     : TargetCodeGenInfo(new NVPTXABIInfo(CGT)) {}
-    
-  virtual void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
-                                   CodeGen::CodeGenModule &M) const;
+
+  void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
+                           CodeGen::CodeGenModule &M) const override;
 private:
   static void addKernelMetadata(llvm::Function *F);
 };
@@ -4218,9 +4791,8 @@
 
 void NVPTXABIInfo::computeInfo(CGFunctionInfo &FI) const {
   FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
-  for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-       it != ie; ++it)
-    it->info = classifyArgumentType(it->type);
+  for (auto &I : FI.arguments())
+    I.info = classifyArgumentType(I.type);
 
   // Always honor user-specified calling convention.
   if (FI.getCallingConvention() != llvm::CallingConv::C)
@@ -4259,7 +4831,7 @@
     // CUDA __global__ functions get a kernel metadata entry.  Since
     // __global__ functions cannot be called from the device, we do not
     // need to set the noinline attribute.
-    if (FD->getAttr<CUDAGlobalAttr>())
+    if (FD->hasAttr<CUDAGlobalAttr>())
       addKernelMetadata(F);
   }
 }
@@ -4300,15 +4872,14 @@
   ABIArgInfo classifyReturnType(QualType RetTy) const;
   ABIArgInfo classifyArgumentType(QualType ArgTy) const;
 
-  virtual void computeInfo(CGFunctionInfo &FI) const {
+  void computeInfo(CGFunctionInfo &FI) const override {
     FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
-    for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-         it != ie; ++it)
-      it->info = classifyArgumentType(it->type);
+    for (auto &I : FI.arguments())
+      I.info = classifyArgumentType(I.type);
   }
 
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 };
 
 class SystemZTargetCodeGenInfo : public TargetCodeGenInfo {
@@ -4360,9 +4931,8 @@
 
     // If this is a C++ record, check the bases first.
     if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD))
-      for (CXXRecordDecl::base_class_const_iterator I = CXXRD->bases_begin(),
-             E = CXXRD->bases_end(); I != E; ++I) {
-        QualType Base = I->getType();
+      for (const auto &I : CXXRD->bases()) {
+        QualType Base = I.getType();
 
         // Empty bases don't affect things either way.
         if (isEmptyRecord(getContext(), Base, true))
@@ -4376,10 +4946,7 @@
       }
 
     // Check the fields.
-    for (RecordDecl::field_iterator I = RD->field_begin(),
-           E = RD->field_end(); I != E; ++I) {
-      const FieldDecl *FD = *I;
-
+    for (const auto *FD : RD->fields()) {
       // Empty bitfields don't affect things either way.
       // Unlike isSingleElementStruct(), empty structure and array fields
       // do count.  So do anonymous bitfields that aren't zero-sized.
@@ -4452,7 +5019,7 @@
   llvm::Type *IndexTy = RegCount->getType();
   llvm::Value *MaxRegsV = llvm::ConstantInt::get(IndexTy, MaxRegs);
   llvm::Value *InRegs = CGF.Builder.CreateICmpULT(RegCount, MaxRegsV,
-						  "fits_in_regs");
+                                                 "fits_in_regs");
 
   llvm::BasicBlock *InRegBlock = CGF.createBasicBlock("vaarg.in_reg");
   llvm::BasicBlock *InMemBlock = CGF.createBasicBlock("vaarg.in_mem");
@@ -4535,15 +5102,22 @@
     return true;
 
   switch (Triple.getOS()) {
-  case llvm::Triple::Cygwin:
-  case llvm::Triple::MinGW32:
   case llvm::Triple::AuroraUX:
   case llvm::Triple::DragonFly:
   case llvm::Triple::FreeBSD:
   case llvm::Triple::OpenBSD:
   case llvm::Triple::Bitrig:
-  case llvm::Triple::Win32:
     return true;
+  case llvm::Triple::Win32:
+    switch (Triple.getEnvironment()) {
+    case llvm::Triple::UnknownEnvironment:
+    case llvm::Triple::Cygnus:
+    case llvm::Triple::GNU:
+    case llvm::Triple::MSVC:
+      return true;
+    default:
+      return false;
+    }
   default:
     return false;
   }
@@ -4611,7 +5185,7 @@
   MSP430TargetCodeGenInfo(CodeGenTypes &CGT)
     : TargetCodeGenInfo(new DefaultABIInfo(CGT)) {}
   void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
-                           CodeGen::CodeGenModule &M) const;
+                           CodeGen::CodeGenModule &M) const override;
 };
 
 }
@@ -4660,9 +5234,9 @@
 
   ABIArgInfo classifyReturnType(QualType RetTy) const;
   ABIArgInfo classifyArgumentType(QualType RetTy, uint64_t &Offset) const;
-  virtual void computeInfo(CGFunctionInfo &FI) const;
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 };
 
 class MIPSTargetCodeGenInfo : public TargetCodeGenInfo {
@@ -4672,12 +5246,12 @@
     : TargetCodeGenInfo(new MipsABIInfo(CGT, IsO32)),
       SizeOfUnwindException(IsO32 ? 24 : 32) {}
 
-  int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const {
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &CGM) const override {
     return 29;
   }
 
   void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
-                           CodeGen::CodeGenModule &CGM) const {
+                           CodeGen::CodeGenModule &CGM) const override {
     const FunctionDecl *FD = dyn_cast<FunctionDecl>(D);
     if (!FD) return;
     llvm::Function *Fn = cast<llvm::Function>(GV);
@@ -4690,9 +5264,9 @@
   }
 
   bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
-                               llvm::Value *Address) const;
+                               llvm::Value *Address) const override;
 
-  unsigned getSizeOfUnwindException() const {
+  unsigned getSizeOfUnwindException() const override {
     return SizeOfUnwindException;
   }
 };
@@ -4901,9 +5475,8 @@
   // Check if a pointer to an aggregate is passed as a hidden argument.  
   uint64_t Offset = RetInfo.isIndirect() ? MinABIStackAlignInBytes : 0;
 
-  for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-       it != ie; ++it)
-    it->info = classifyArgumentType(it->type, Offset);
+  for (auto &I : FI.arguments())
+    I.info = classifyArgumentType(I.type, Offset);
 }
 
 llvm::Value* MipsABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
@@ -4984,8 +5557,8 @@
   TCETargetCodeGenInfo(CodeGenTypes &CGT)
     : DefaultTargetCodeGenInfo(CGT) {}
 
-  virtual void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
-                                   CodeGen::CodeGenModule &M) const;
+  void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
+                           CodeGen::CodeGenModule &M) const override;
 };
 
 void TCETargetCodeGenInfo::SetTargetAttributes(const Decl *D,
@@ -5000,9 +5573,8 @@
     if (FD->hasAttr<OpenCLKernelAttr>()) {
       // OpenCL C Kernel functions are not subject to inlining
       F->addFnAttr(llvm::Attribute::NoInline);
-          
-      if (FD->hasAttr<ReqdWorkGroupSizeAttr>()) {
-
+      const ReqdWorkGroupSizeAttr *Attr = FD->getAttr<ReqdWorkGroupSizeAttr>();
+      if (Attr) {
         // Convert the reqd_work_group_size() attributes to metadata.
         llvm::LLVMContext &Context = F->getContext();
         llvm::NamedMDNode *OpenCLMetadata = 
@@ -5012,14 +5584,11 @@
         Operands.push_back(F);
 
         Operands.push_back(llvm::Constant::getIntegerValue(M.Int32Ty, 
-                             llvm::APInt(32, 
-                             FD->getAttr<ReqdWorkGroupSizeAttr>()->getXDim())));
+                             llvm::APInt(32, Attr->getXDim())));
         Operands.push_back(llvm::Constant::getIntegerValue(M.Int32Ty,
-                             llvm::APInt(32,
-                               FD->getAttr<ReqdWorkGroupSizeAttr>()->getYDim())));
+                             llvm::APInt(32, Attr->getYDim())));
         Operands.push_back(llvm::Constant::getIntegerValue(M.Int32Ty, 
-                             llvm::APInt(32, 
-                               FD->getAttr<ReqdWorkGroupSizeAttr>()->getZDim())));
+                             llvm::APInt(32, Attr->getZDim())));
 
         // Add a boolean constant operand for "required" (true) or "hint" (false)
         // for implementing the work_group_size_hint attr later. Currently 
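
The rewrite above also fixes a repeated-lookup wart: the old code called hasAttr<> once and then getAttr<> three more times for the X/Y/Z dimensions. Since getAttr<> returns null when the attribute is absent, a single cached pointer serves as both the presence test and the accessor. A standalone sketch of the idiom (made-up types mirroring clang's getAttr convention):

struct WorkGroupSize { int XDim, YDim, ZDim; };

struct FakeDecl {
  const WorkGroupSize *WG = nullptr;
  // nullptr means "attribute not present", like Decl::getAttr<T>().
  const WorkGroupSize *getWorkGroupSizeAttr() const { return WG; }
};

void emitMetadata(const FakeDecl &FD) {
  // One lookup, reused for all three dimensions:
  if (const WorkGroupSize *Attr = FD.getWorkGroupSizeAttr()) {
    int Dims[3] = { Attr->XDim, Attr->YDim, Attr->ZDim };
    (void)Dims; // would become metadata operands in the real code
  }
}
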
@@ -5050,10 +5619,10 @@
   ABIArgInfo classifyReturnType(QualType RetTy) const;
   ABIArgInfo classifyArgumentType(QualType RetTy) const;
 
-  virtual void computeInfo(CGFunctionInfo &FI) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
 
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 };
 
 class HexagonTargetCodeGenInfo : public TargetCodeGenInfo {
@@ -5061,7 +5630,7 @@
   HexagonTargetCodeGenInfo(CodeGenTypes &CGT)
     :TargetCodeGenInfo(new HexagonABIInfo(CGT)) {}
 
-  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const {
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override {
     return 29;
   }
 };
@@ -5070,9 +5639,8 @@
 
 void HexagonABIInfo::computeInfo(CGFunctionInfo &FI) const {
   FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
-  for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-       it != ie; ++it)
-    it->info = classifyArgumentType(it->type);
+  for (auto &I : FI.arguments())
+    I.info = classifyArgumentType(I.type);
 }
 
 ABIArgInfo HexagonABIInfo::classifyArgumentType(QualType Ty) const {
@@ -5205,9 +5773,9 @@
 
 private:
   ABIArgInfo classifyType(QualType RetTy, unsigned SizeLimit) const;
-  virtual void computeInfo(CGFunctionInfo &FI) const;
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  void computeInfo(CGFunctionInfo &FI) const override;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 
   // Coercion type builder for structs passed in registers. The coercion type
   // serves two purposes:
@@ -5346,6 +5914,11 @@
   if (!isAggregateTypeForABI(Ty))
     return ABIArgInfo::getDirect();
 
+  // If a C++ object has either a non-trivial copy constructor or a non-trivial
+  // destructor, it is passed with an explicit indirect pointer / sret pointer.
+  if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
+    return ABIArgInfo::getIndirect(0, RAA == CGCXXABI::RAA_DirectInMemory);
+
   // This is a small aggregate type that should be passed in registers.
   // Build a coercion type from the LLVM struct type.
   llvm::StructType *StrTy = dyn_cast<llvm::StructType>(CGT.ConvertType(Ty));
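
The new check brings SparcV9 in line with the other targets: a record with a non-trivial copy constructor or destructor must have a stable address the callee can run those members against, so it cannot be broken apart into registers. A worked example of what the classification above distinguishes (illustrative types only):

struct Trivial { long a, b; };   // small POD aggregate: coerced into registers

struct NonTrivial {              // non-trivial destructor, so getRecordArgABI
  ~NonTrivial();                 // reports it and the argument is passed
  long a, b;                     // indirectly via a hidden pointer
};

void callee(Trivial T, NonTrivial N); // T direct, N indirect under this ABI
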
@@ -5382,6 +5955,7 @@
 
   switch (AI.getKind()) {
   case ABIArgInfo::Expand:
+  case ABIArgInfo::InAlloca:
     llvm_unreachable("Unsupported ABI kind for va_arg");
 
   case ABIArgInfo::Extend:
@@ -5417,9 +5991,8 @@
 
 void SparcV9ABIInfo::computeInfo(CGFunctionInfo &FI) const {
   FI.getReturnInfo() = classifyType(FI.getReturnType(), 32 * 8);
-  for (CGFunctionInfo::arg_iterator it = FI.arg_begin(), ie = FI.arg_end();
-       it != ie; ++it)
-    it->info = classifyType(it->type, 16 * 8);
+  for (auto &I : FI.arguments())
+    I.info = classifyType(I.type, 16 * 8);
 }
 
 namespace {
@@ -5427,24 +6000,65 @@
 public:
   SparcV9TargetCodeGenInfo(CodeGenTypes &CGT)
     : TargetCodeGenInfo(new SparcV9ABIInfo(CGT)) {}
+
+  int getDwarfEHStackPointer(CodeGen::CodeGenModule &M) const override {
+    return 14;
+  }
+
+  bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
+                               llvm::Value *Address) const override;
 };
 } // end anonymous namespace
 
+bool
+SparcV9TargetCodeGenInfo::initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
+                                                llvm::Value *Address) const {
+  // This is calculated from the LLVM and GCC tables and verified
+  // against gcc output.  AFAIK all ABIs use the same encoding.
+
+  CodeGen::CGBuilderTy &Builder = CGF.Builder;
+
+  llvm::IntegerType *i8 = CGF.Int8Ty;
+  llvm::Value *Four8 = llvm::ConstantInt::get(i8, 4);
+  llvm::Value *Eight8 = llvm::ConstantInt::get(i8, 8);
+
+  // 0-31: the 8-byte general-purpose registers
+  AssignToArrayRange(Builder, Address, Eight8, 0, 31);
+
+  // 32-63: f0-31, the 4-byte floating-point registers
+  AssignToArrayRange(Builder, Address, Four8, 32, 63);
+
+  //   Y   = 64
+  //   PSR = 65
+  //   WIM = 66
+  //   TBR = 67
+  //   PC  = 68
+  //   NPC = 69
+  //   FSR = 70
+  //   CSR = 71
+  AssignToArrayRange(Builder, Address, Eight8, 64, 71);
+
+  // 72-87: d0-15, the 8-byte floating-point registers
+  AssignToArrayRange(Builder, Address, Eight8, 72, 87);
+
+  return false;
+}
+
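
The table being filled here maps DWARF register numbers to the byte width of each register, which the EH runtime needs in order to save and restore them during unwinding. AssignToArrayRange stores a constant into Address[First..Last]; a standalone model of the same layout using a plain byte array instead of IRBuilder stores:

#include <cstdint>
#include <cstring>

// Model of AssignToArrayRange: Table[First..Last] = SizeInBytes.
static void assignRange(uint8_t *Table, uint8_t SizeInBytes,
                        unsigned First, unsigned Last) {
  std::memset(Table + First, SizeInBytes, Last - First + 1);
}

void buildSparcV9RegSizes(uint8_t Table[88]) {
  assignRange(Table, 8, 0, 31);   // %g/%o/%l/%i: 8-byte GPRs
  assignRange(Table, 4, 32, 63);  // f0-f31: 4-byte FPRs
  assignRange(Table, 8, 64, 71);  // Y, PSR, WIM, TBR, PC, NPC, FSR, CSR
  assignRange(Table, 8, 72, 87);  // d0-d15: 8-byte FPRs
}
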
 
 //===----------------------------------------------------------------------===//
-// Xcore ABI Implementation
+// XCore ABI Implementation
 //===----------------------------------------------------------------------===//
 namespace {
 class XCoreABIInfo : public DefaultABIInfo {
 public:
   XCoreABIInfo(CodeGen::CodeGenTypes &CGT) : DefaultABIInfo(CGT) {}
-  virtual llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
-                                 CodeGenFunction &CGF) const;
+  llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
+                         CodeGenFunction &CGF) const override;
 };
 
-class XcoreTargetCodeGenInfo : public TargetCodeGenInfo {
+class XCoreTargetCodeGenInfo : public TargetCodeGenInfo {
 public:
-  XcoreTargetCodeGenInfo(CodeGenTypes &CGT)
+  XCoreTargetCodeGenInfo(CodeGenTypes &CGT)
     :TargetCodeGenInfo(new XCoreABIInfo(CGT)) {}
 };
 } // End anonymous namespace.
@@ -5468,6 +6082,7 @@
   uint64_t ArgSize = 0;
   switch (AI.getKind()) {
   case ABIArgInfo::Expand:
+  case ABIArgInfo::InAlloca:
     llvm_unreachable("Unsupported ABI kind for va_arg");
   case ABIArgInfo::Ignore:
     Val = llvm::UndefValue::get(ArgPtrTy);
@@ -5520,11 +6135,22 @@
   case llvm::Triple::mips64el:
     return *(TheTargetCodeGenInfo = new MIPSTargetCodeGenInfo(Types, false));
 
+  case llvm::Triple::arm64: {
+    ARM64ABIInfo::ABIKind Kind = ARM64ABIInfo::AAPCS;
+    if (strcmp(getTarget().getABI(), "darwinpcs") == 0)
+      Kind = ARM64ABIInfo::DarwinPCS;
+
+    return *(TheTargetCodeGenInfo = new ARM64TargetCodeGenInfo(Types, Kind));
+  }
+
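
The new arm64 case picks the ABI variant from the -target-abi string, exactly as the arm/thumb case below does with "apcs-gnu" and "aapcs". The selection logic in isolation (enum and function names are illustrative):

#include <cstring>

enum class ARM64Kind { AAPCS, DarwinPCS }; // mirrors ARM64ABIInfo::ABIKind

ARM64Kind selectARM64Kind(const char *ABIName) {
  // Default to AAPCS; only an explicit "darwinpcs" switches conventions.
  if (std::strcmp(ABIName, "darwinpcs") == 0)
    return ARM64Kind::DarwinPCS;
  return ARM64Kind::AAPCS;
}
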
   case llvm::Triple::aarch64:
+  case llvm::Triple::aarch64_be:
     return *(TheTargetCodeGenInfo = new AArch64TargetCodeGenInfo(Types));
 
   case llvm::Triple::arm:
+  case llvm::Triple::armeb:
   case llvm::Triple::thumb:
+  case llvm::Triple::thumbeb:
     {
       ARMABIInfo::ABIKind Kind = ARMABIInfo::AAPCS;
       if (strcmp(getTarget().getABI(), "apcs-gnu") == 0)
@@ -5572,7 +6198,7 @@
     bool IsDarwinVectorABI = Triple.isOSDarwin();
     bool IsSmallStructInRegABI =
         X86_32TargetCodeGenInfo::isStructReturnInRegABI(Triple, CodeGenOpts);
-    bool IsWin32FloatStructABI = (Triple.getOS() == llvm::Triple::Win32);
+    bool IsWin32FloatStructABI = Triple.isWindowsMSVCEnvironment();
 
     if (Triple.getOS() == llvm::Triple::Win32) {
       return *(TheTargetCodeGenInfo =
@@ -5610,7 +6236,6 @@
   case llvm::Triple::sparcv9:
     return *(TheTargetCodeGenInfo = new SparcV9TargetCodeGenInfo(Types));
   case llvm::Triple::xcore:
-    return *(TheTargetCodeGenInfo = new XcoreTargetCodeGenInfo(Types));
-
+    return *(TheTargetCodeGenInfo = new XCoreTargetCodeGenInfo(Types));
   }
 }
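
Every case in this switch returns *(TheTargetCodeGenInfo = new ...): the assignment caches the object so that, given the early-return cache check at the top of the function (outside this hunk), later calls hand back the same instance. A standalone sketch of the cache-on-first-use idiom (made-up types):

struct CodeGenInfo { virtual ~CodeGenInfo() {} };
struct SomeTargetInfo : CodeGenInfo {};

struct Cache {
  CodeGenInfo *TheInfo = nullptr;

  const CodeGenInfo &get() {
    if (TheInfo)                               // hit: reuse the cached object
      return *TheInfo;
    return *(TheInfo = new SomeTargetInfo());  // miss: build, cache, return
  }
};
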
diff --git a/lib/CodeGen/TargetInfo.h b/lib/CodeGen/TargetInfo.h
index f631f31..6c3ab64 100644
--- a/lib/CodeGen/TargetInfo.h
+++ b/lib/CodeGen/TargetInfo.h
@@ -17,8 +17,8 @@
 
 #include "clang/AST/Type.h"
 #include "clang/Basic/LLVM.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace llvm {
   class Constant;
@@ -123,6 +123,10 @@
       return Ty;
     }
 
+    /// doesReturnSlotInterfereWithArgs - Return true if the target uses an
+    /// argument slot for an 'sret' type.
+    virtual bool doesReturnSlotInterfereWithArgs() const { return true; }
+
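
The default above is deliberately conservative: answering true makes callers assume the 'sret' return slot may occupy or clobber an argument slot. A target whose indirect-return pointer travels in a dedicated register (ARM64's x8 is the classic case) could override it; a standalone model of such an override (not the real clang classes):

struct TargetHooks {
  virtual ~TargetHooks() {}
  virtual bool doesReturnSlotInterfereWithArgs() const { return true; }
};

struct HypotheticalTarget : TargetHooks {
  // Made-up target whose sret pointer never displaces an argument slot:
  bool doesReturnSlotInterfereWithArgs() const override { return false; }
};
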
     /// Retrieve the address of a function to call immediately before
     /// calling objc_retainAutoreleasedReturnValue.  The
     /// implementation of objc_autoreleaseReturnValue sniffs the
@@ -176,7 +180,7 @@
     /// However, some platforms make the conventions identical except
     /// for passing additional out-of-band information to a variadic
     /// function: for example, x86-64 passes the number of SSE
-    /// arguments in %al.  On these platforms, it is desireable to
+    /// arguments in %al.  On these platforms, it is desirable to
     /// call unprototyped functions using the variadic convention so
     /// that unprototyped calls to varargs functions still succeed.
     ///